## Importing Packages

In [34]:
import pandas as pd
import numpy as np
import tqdm
import pickle
from pprint import pprint
import os

import warnings
warnings.filterwarnings('ignore')

#sklearn
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split

import gensim
from gensim import corpora, models
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim_models as gensimvis

In [2]:
with open('processed_tweets.pickle', 'rb') as read_file:
    df = pickle.load(read_file)

## Train-Test Split

In [3]:
X_train, X_test = train_test_split(df.tweet, test_size=0.2, random_state=42)
X_train

20180    [congrats, lead, nonprofit, organization, dedi...
17391    [youre, pet, owner, new, york, know, toxic, in...
9976                 [restock, feel, good, brooklynwineco]
6257     [since, percent, excess, heat, retain, earth, ...
6245     [shorebird, watch, big, business, new, jersey,...
                               ...                        
11964    [ooh, yourcbdstorebk, union, berkeley, cbd, st...
21575    [meet, horticultural, therapist, garden, progr...
5390     [alone, together, gowanus, community, group, s...
860      [new, york, city, experience, dangerous, heat,...
15795    [congratulations, raise, money, thank, partici...
Name: tweet, Length: 17672, dtype: object

In [4]:
X_test

19530    [bundle, new, yorkers, go, dip, freeze, every,...
20241    [congratulations, award, grant, historic, hous...
15031                            [refresh, flower, street]
9960     [mask, thermometers, oximeters, near, ave, bea...
735      [deeply, thankful, haul, trash, recycle, treat...
                               ...                        
16606      [get_repost, member, dl, dd, show, chefs, take]
21677    [know, secure, food, box, family, four, please...
7948                                  [missamericanpienyc]
4630     [dont, trash, tree, mulch, drop, bbp, anytime,...
8439     [virtual, learn, great, option, yourchild, lea...
Name: tweet, Length: 4418, dtype: object

In [5]:
train_list_of_lists = list(X_train.values)

## Bigram-Trigram Models

(I did not incorporate bigrams and trigrams into the model yet)

In [6]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(train_list_of_lists, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[train_list_of_lists], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [7]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [8]:
data_words_bigrams = make_bigrams(train_list_of_lists)

## Bag of Words

In [11]:
id2word = Dictionary(train_list_of_lists)
corpus = [id2word.doc2bow(text) for text in train_list_of_lists]

In [12]:
sample = corpus[3000]

for i in range(len(sample)):
    print("Word {} (\"{}\") appears {} time(s).".format(sample[i][0], 
                                                     id2word[sample[i][0]], 
                                                     sample[i][1]))

Word 173 ("class") appears 1 time(s).
Word 572 ("get_repost") appears 1 time(s).
Word 660 ("wine") appears 1 time(s).
Word 692 ("january") appears 1 time(s).
Word 715 ("pizza") appears 1 time(s).
Word 732 ("february") appears 1 time(s).
Word 833 ("thursday") appears 2 time(s).
Word 1022 ("month") appears 1 time(s).
Word 1091 ("dd") appears 1 time(s).
Word 3509 ("pair") appears 2 time(s).


## LDA with Bag of Words

In [13]:
# Build LDA model
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=4, 
                     random_state=42,
                     chunksize=100,
                     passes=100,
                     update_every=5,
                     alpha='auto',
                     per_word_topics=True)

pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]


[(0,
  '0.019*"water" + 0.010*"river" + 0.008*"protect" + 0.008*"new" + '
  '0.008*"clean" + 0.006*"state" + 0.006*"hudson" + 0.006*"environmental" + '
  '0.006*"society" + 0.006*"littoral"'),
 (1,
  '0.014*"us" + 0.011*"new" + 0.011*"thank" + 0.010*"join" + 0.009*"today" + '
  '0.009*"get" + 0.008*"day" + 0.008*"make" + 0.008*"help" + 0.008*"learn"'),
 (2,
  '0.025*"park" + 0.016*"brooklyn" + 0.012*"st" + 0.010*"come" + '
  '0.009*"get_repost" + 0.008*"slope" + 0.008*"good" + 0.008*"corner" + '
  '0.007*"amaze" + 0.007*"house"'),
 (3,
  '0.011*"high" + 0.010*"line" + 0.009*"june" + 0.008*"friday" + '
  '0.008*"ticket" + 0.007*"april" + 0.006*"bio" + 0.006*"business" + '
  '0.006*"update" + 0.006*"staff"')]


In [23]:
pyLDAvis.enable_notebook()
LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
LDAvis_prepared

In [12]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=train_list_of_lists, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.28826282195420916


In [24]:
lda_model_bow = gensim.models.LdaMulticore(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=4, 
                                            random_state=42,
                                            chunksize=100,
                                            passes=100,
                                            update_every=5,
                                            alpha='auto',
                                            per_word_topics=True,
                                            workers=2)

for idx, topic in lda_model_bow.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.013*"park" + 0.010*"open" + 0.008*"come" + 0.008*"brooklyn" + 0.008*"st" + 0.007*"get" + 0.006*"get_repost" + 0.005*"slope" + 0.005*"day" + 0.005*"us"
Topic: 1 
Words: 0.007*"grant" + 0.007*"garden" + 0.007*"us" + 0.006*"learn" + 0.006*"receive" + 0.005*"make" + 0.005*"gift" + 0.005*"community" + 0.005*"see" + 0.005*"today"
Topic: 2 
Words: 0.016*"water" + 0.010*"new" + 0.009*"river" + 0.007*"protect" + 0.007*"high" + 0.006*"line" + 0.006*"state" + 0.006*"change" + 0.005*"hudson" + 0.005*"fish"
Topic: 3 
Words: 0.017*"thank" + 0.012*"us" + 0.010*"help" + 0.010*"support" + 0.009*"join" + 0.008*"community" + 0.008*"work" + 0.008*"new" + 0.007*"get" + 0.006*"need"


In [25]:
LDAvis_prepared_2 = gensimvis.prepare(lda_model_bow, corpus, id2word)
LDAvis_prepared_2

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [26]:
for index, score in sorted(lda_model_bow[corpus[3000]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_bow.print_topic(index, 4)))


Score: 0.9402359127998352	 
Topic: 0.013*"park" + 0.010*"open" + 0.008*"come" + 0.008*"brooklyn"

Score: 0.020550765097141266	 
Topic: 0.007*"grant" + 0.007*"garden" + 0.007*"us" + 0.006*"learn"

Score: 0.019865620881319046	 
Topic: 0.017*"thank" + 0.012*"us" + 0.010*"help" + 0.010*"support"

Score: 0.019347691908478737	 
Topic: 0.016*"water" + 0.010*"new" + 0.009*"river" + 0.007*"protect"


In [27]:
# Compute Coherence Score
coherence_model_lda_2 = CoherenceModel(model=lda_model_bow, texts=train_list_of_lists, dictionary=id2word, coherence='c_v')
coherence_lda_2 = coherence_model_lda_2.get_coherence()
print('Coherence Score: ', coherence_lda_2)

Coherence Score:  0.28302921569964656


## LDA with TF-IDF

In [29]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.42466773753051157),
 (1, 0.1766020120629213),
 (2, 0.23014024341235387),
 (3, 0.2832750364803111),
 (4, 0.2366724011621648),
 (5, 0.2832750364803111),
 (6, 0.18979599067454203),
 (7, 0.2143401059677086),
 (8, 0.24462258623808159),
 (9, 0.33186344059309086),
 (10, 0.29674778220154957),
 (11, 0.19428450256362492),
 (12, 0.19638562074430463),
 (13, 0.24300529787323882),
 (14, 0.2088991128043566)]


In [30]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus=corpus_tfidf,
                                                id2word=id2word,
                                                num_topics=4, 
                                                random_state=42,
                                                chunksize=100,
                                                passes=100,
                                                update_every=5,
                                                alpha='auto',
                                                per_word_topics=True,
                                                workers=4)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.006*"thank" + 0.005*"us" + 0.004*"join" + 0.004*"support" + 0.004*"community" + 0.003*"learn" + 0.003*"today" + 0.003*"work" + 0.003*"help" + 0.003*"program"
Topic: 1 Word: 0.003*"order" + 0.003*"new" + 0.002*"delivery" + 0.002*"get" + 0.002*"today" + 0.002*"plastic" + 0.002*"open" + 0.002*"day" + 0.002*"park" + 0.002*"slope"
Topic: 2 Word: 0.005*"st" + 0.004*"corner" + 0.003*"open" + 0.003*"union" + 0.003*"store" + 0.003*"ave" + 0.003*"president" + 0.003*"wine" + 0.002*"happy" + 0.002*"come"
Topic: 3 Word: 0.006*"water" + 0.004*"gowanus" + 0.003*"river" + 0.003*"new" + 0.002*"protect" + 0.002*"climate" + 0.002*"clean" + 0.002*"quality" + 0.002*"drink" + 0.002*"ocean"


In [31]:
LDAvis_prepared_3 = gensimvis.prepare(lda_model_tfidf, corpus_tfidf, id2word)
LDAvis_prepared_3

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [32]:
for index, score in sorted(lda_model_tfidf[corpus[3000]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 4)))


Score: 0.5621809959411621	 
Topic: 0.005*"st" + 0.004*"corner" + 0.003*"open" + 0.003*"union"

Score: 0.39833658933639526	 
Topic: 0.006*"thank" + 0.005*"us" + 0.004*"join" + 0.004*"support"

Score: 0.02010035328567028	 
Topic: 0.003*"order" + 0.003*"new" + 0.002*"delivery" + 0.002*"get"

Score: 0.01938203163444996	 
Topic: 0.006*"water" + 0.004*"gowanus" + 0.003*"river" + 0.003*"new"


In [33]:
# Compute Coherence Score
coherence_model_lda_3 = CoherenceModel(model=lda_model_tfidf, texts=train_list_of_lists, dictionary=id2word, coherence='c_v')
coherence_lda_3 = coherence_model_lda_3.get_coherence()
print('Coherence Score: ', coherence_lda_3)

Coherence Score:  0.2654761237827161


## Hyperparameter Tuning

In [35]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=42,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=train_list_of_lists, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [37]:
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()

  0%|          | 0/540 [03:02<?, ?it/s]
Process SpawnPoolWorker-131:
Process SpawnPoolWorker-130:
Process SpawnPoolWorker-133:
Process SpawnPoolWorker-129:
Process SpawnPoolWorker-127:
Process SpawnPoolWorker-132:
Process SpawnPoolWorker-128:
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/examsherpa/opt/anaconda3/envs/nlp-env/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/examsherpa/opt/anaconda3/envs/nlp-env/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/examsherpa/opt/anaconda3/envs/nlp-env/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/examsherpa/opt/anaconda3/envs/nlp-env/lib/python3.8/multiprocessing/process.py", line 108, in run
    sel

KeyboardInterrupt: 