In [22]:
import pandas as pd
#sklearn
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from pprint import pprint

from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings('ignore')

import gensim
from gensim import corpora, models
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

import pickle

import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline




In [11]:
with open('processed_tweets.pickle', 'rb') as read_file:
    df = pickle.load(read_file)

In [4]:
X_train, X_test = train_test_split(df.tweet, test_size=0.2, random_state=42)
X_train

20180    [congrats, lead, nonprofit, organization, dedi...
17391    [youre, pet, owner, new, york, know, toxic, in...
9976                 [restock, feel, good, brooklynwineco]
6257     [since, percent, excess, heat, retain, earth, ...
6245     [shorebird, watch, big, business, new, jersey,...
                               ...                        
11964    [ooh, yourcbdstorebk, union, berkeley, cbd, st...
21575    [meet, horticultural, therapist, garden, progr...
5390     [alone, together, gowanus, community, group, s...
860      [new, york, city, experience, dangerous, heat,...
15795    [congratulations, raise, money, thank, partici...
Name: tweet, Length: 17672, dtype: object

In [5]:
X_test

19530    [bundle, new, yorkers, go, dip, freeze, every,...
20241    [congratulations, award, grant, historic, hous...
15031                            [refresh, flower, street]
9960     [mask, thermometers, oximeters, near, ave, bea...
735      [deeply, thankful, haul, trash, recycle, treat...
                               ...                        
16606      [get_repost, member, dl, dd, show, chefs, take]
21677    [know, secure, food, box, family, four, please...
7948                                  [missamericanpienyc]
4630     [dont, trash, tree, mulch, drop, bbp, anytime,...
8439     [virtual, learn, great, option, yourchild, lea...
Name: tweet, Length: 4418, dtype: object

In [33]:
train_list_of_lists = list(X_train.values)

In [7]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(train_list_of_lists, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[train_list_of_lists], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [35]:
id2word = Dictionary(train_list_of_lists)
id2word[0]

'bounty'

In [37]:
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in train_list_of_lists]
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)]]


In [10]:
# Build LDA model
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=4, 
                     random_state=42,
                     chunksize=100,
                     passes=100,
                     update_every=5,
                     alpha='auto',
                     per_word_topics=True)

pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]


[(0,
  '0.019*"water" + 0.010*"river" + 0.008*"protect" + 0.008*"new" + '
  '0.008*"clean" + 0.006*"state" + 0.006*"hudson" + 0.006*"environmental" + '
  '0.006*"society" + 0.006*"littoral"'),
 (1,
  '0.014*"us" + 0.011*"new" + 0.011*"thank" + 0.010*"join" + 0.009*"today" + '
  '0.009*"get" + 0.008*"day" + 0.008*"make" + 0.008*"help" + 0.008*"learn"'),
 (2,
  '0.025*"park" + 0.016*"brooklyn" + 0.012*"st" + 0.010*"come" + '
  '0.009*"get_repost" + 0.008*"slope" + 0.008*"good" + 0.008*"corner" + '
  '0.007*"amaze" + 0.007*"house"'),
 (3,
  '0.011*"high" + 0.010*"line" + 0.009*"june" + 0.008*"friday" + '
  '0.008*"ticket" + 0.007*"april" + 0.006*"bio" + 0.006*"business" + '
  '0.006*"update" + 0.006*"staff"')]


In [12]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=train_list_of_lists, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.28826282195420916


In [15]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.42466773753051157),
 (1, 0.1766020120629213),
 (2, 0.23014024341235387),
 (3, 0.2832750364803111),
 (4, 0.2366724011621648),
 (5, 0.2832750364803111),
 (6, 0.18979599067454203),
 (7, 0.2143401059677086),
 (8, 0.24462258623808159),
 (9, 0.33186344059309086),
 (10, 0.29674778220154957),
 (11, 0.19428450256362492),
 (12, 0.19638562074430463),
 (13, 0.24300529787323882),
 (14, 0.2088991128043566)]


In [16]:
lda_model = gensim.models.LdaMulticore(corpus, num_topics=4, id2word=id2word, passes=2, workers=2)

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.009*"us" + 0.008*"join" + 0.007*"learn" + 0.005*"park" + 0.005*"program" + 0.004*"nyc" + 0.004*"come" + 0.004*"get" + 0.004*"see" + 0.004*"new"
Topic: 1 
Words: 0.009*"day" + 0.006*"open" + 0.005*"see" + 0.005*"time" + 0.005*"us" + 0.005*"new" + 0.005*"get" + 0.005*"make" + 0.004*"park" + 0.004*"saturday"
Topic: 2 
Words: 0.008*"water" + 0.007*"park" + 0.006*"help" + 0.006*"please" + 0.006*"us" + 0.006*"make" + 0.006*"get" + 0.006*"work" + 0.005*"today" + 0.005*"thank"
Topic: 3 
Words: 0.016*"new" + 0.010*"thank" + 0.007*"today" + 0.007*"us" + 0.006*"open" + 0.005*"brooklyn" + 0.005*"get" + 0.005*"york" + 0.005*"one" + 0.005*"order"


In [38]:
bow_doc_300 = X_train[300]
bow_doc_300

['prepare',
 'memorial',
 'day',
 'formerly',
 'know',
 'decoration',
 'day',
 'search',
 'historic',
 'newspapers',
 'nehfunded']

In [20]:
for index, score in sorted(lda_model[corpus[300]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 4)))


Score: 0.7345105409622192	 
Topic: 0.008*"water" + 0.007*"park" + 0.006*"help" + 0.006*"please"

Score: 0.09021734446287155	 
Topic: 0.009*"day" + 0.006*"open" + 0.005*"see" + 0.005*"time"

Score: 0.08998871594667435	 
Topic: 0.016*"new" + 0.010*"thank" + 0.007*"today" + 0.007*"us"

Score: 0.08528342843055725	 
Topic: 0.009*"us" + 0.008*"join" + 0.007*"learn" + 0.005*"park"


In [17]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=4, id2word=id2word, passes=2, workers=4)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.003*"open" + 0.003*"thank" + 0.002*"new" + 0.002*"see" + 0.002*"today" + 0.002*"water" + 0.002*"get" + 0.002*"great" + 0.002*"day" + 0.002*"park"
Topic: 1 Word: 0.004*"us" + 0.003*"join" + 0.003*"open" + 0.003*"get" + 0.003*"new" + 0.003*"day" + 0.002*"please" + 0.002*"help" + 0.002*"make" + 0.002*"thank"
Topic: 2 Word: 0.003*"park" + 0.003*"thank" + 0.003*"today" + 0.003*"get" + 0.002*"see" + 0.002*"new" + 0.002*"day" + 0.002*"us" + 0.002*"open" + 0.002*"come"
Topic: 3 Word: 0.004*"thank" + 0.004*"new" + 0.003*"today" + 0.003*"park" + 0.003*"water" + 0.002*"us" + 0.002*"support" + 0.002*"join" + 0.002*"start" + 0.002*"learn"


In [21]:
for index, score in sorted(lda_model_tfidf[corpus[300]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 4)))


Score: 0.7292246222496033	 
Topic: 0.004*"thank" + 0.004*"new" + 0.003*"today" + 0.003*"park"

Score: 0.0921914353966713	 
Topic: 0.003*"open" + 0.003*"thank" + 0.002*"new" + 0.002*"see"

Score: 0.08941055834293365	 
Topic: 0.004*"us" + 0.003*"join" + 0.003*"open" + 0.003*"get"

Score: 0.08917337656021118	 
Topic: 0.003*"park" + 0.003*"thank" + 0.003*"today" + 0.003*"get"
