# Topic Modeling

In [64]:
import pandas as pd
import numpy as np

# cluster detection
import sklearn
import sklearn.feature_extraction.text
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.datasets
import sklearn.cluster
import sklearn.decomposition
import sklearn.metrics

import nltk # for collocations
import scipy #For hierarchical clustering and some visuals
#import scipy.cluster.hierarchy
import requests #For downloading our datasets
import matplotlib.pyplot as plt #For graphics
import matplotlib.cm #Still for graphics
import seaborn as sns #Makes the graphics look nicer

### 1. Get data

In [65]:
comments_df = pd.read_csv('../data/comments_df.csv')

Note: normalized_tokens column is string type, not list

In [67]:
normalized_tokens = comments_df['normalized_tokens'][0]
print(type(normalized_tokens))

<class 'str'>


In [68]:
from ast import literal_eval

def converter(x):
    return literal_eval(x)

comments_df = pd.read_csv('../data/comments_df.csv', converters={'tokens':converter, 'normalized_tokens':converter})

In [69]:
comments_df.head(20)

Unnamed: 0.1,Unnamed: 0,username,flair_text,body,tokens_new,normalized_tokens,normalized_tokens_count,word_count
0,0,Tungsten_,,Thanks to everyone who engaged in insightful a...,"['Thanks', 'to', 'everyone', 'who', 'engaged',...","[thank, engage, insightful, respectful, discou...",9,20
1,1,ProudBlackMatt,Chinese-American,I would prefer using a process that takes into...,"['I', 'would', 'prefer', 'using', 'a', 'proces...","[prefer, process, take, account, poverty, inst...",52,103
2,2,TomatoCanned,,"u/Tungsten_, Thanks for creating a section jus...","['u/Tungsten_,', 'Thanks', 'for', 'creating', ...","[u/tungsten_,, thank, create, section, discuss...",126,269
3,3,bad-fengshui,,As with anything related to Asians in politics...,"['As', 'with', 'anything', 'related', 'to', 'A...","[relate, asians, politic, m, see, lot, non, as...",25,59
4,4,Pancake_muncher,,Yet colleges will allow alumni and doners in e...,"['Yet', 'colleges', 'will', 'allow', 'alumni',...","[college, allow, alumnus, doner, easily, consi...",19,40
5,5,suberry,,I just hated Affirmative Action as a distracti...,"['I', 'just', 'hated', 'Affirmative', 'Action'...","[hate, affirmative, action, distraction, banda...",78,171
6,6,Puzzled-Painter3301,,My own feeling is that I was never in love wit...,"['My', 'own', 'feeling', 'is', 'that', 'I', 'w...","[feeling, love, affirmative, action, possible,...",102,231
7,7,e9967780,,Anti Asian racism whether against East Asians ...,"['Anti', 'Asian', 'racism', 'whether', 'agains...","[anti, asian, racism, east, asians, south, asi...",21,46
8,8,,,Can we overturn legacy and athlete admissions ...,"['Can', 'we', 'overturn', 'legacy', 'and', 'at...","[overturn, legacy, athlete, admission, point, ...",15,29
9,9,OkartoIceCream,,"I want to remind people that in California, on...","['I', 'want', 'to', 'remind', 'people', 'that'...","[want, remind, people, california, progressive...",104,200


In [6]:
normalized_tokens = comments_df['normalized_tokens'][0]
tokens_new = comments_df['tokens'][0]
print(type(normalized_tokens))
print(type(tokens_new))

<class 'list'>
<class 'list'>


Token columns are now lists.

In [11]:
comments_df.head(10)

Unnamed: 0.1,Unnamed: 0,username,flair_text,body,tokens,normalized_tokens,word_count,normalized_tokens_count
0,0,Tungsten_,,Thanks to everyone who engaged in insightful a...,"[Thanks, to, everyone, who, engaged, in, insig...","[thank, engage, insightful, respectful, discou...",20,9
1,1,ProudBlackMatt,Chinese-American,I would prefer using a process that takes into...,"[I, would, prefer, using, a, process, that, ta...","[prefer, process, take, account, poverty, inst...",103,52
2,2,TomatoCanned,,"u/Tungsten_, Thanks for creating a section jus...","[u/Tungsten_,, Thanks, for, creating, a, secti...","[u/tungsten_,, thank, create, section, discuss...",269,126
3,3,bad-fengshui,,As with anything related to Asians in politics...,"[As, with, anything, related, to, Asians, in, ...","[relate, asians, politic, m, see, lot, non, as...",59,25
4,4,Pancake_muncher,,Yet colleges will allow alumni and doners in e...,"[Yet, colleges, will, allow, alumni, and, done...","[college, allow, alumnus, doner, easily, consi...",40,19
5,5,suberry,,I just hated Affirmative Action as a distracti...,"[I, just, hated, Affirmative, Action, as, a, d...","[hate, affirmative, action, distraction, banda...",171,78
6,6,Puzzled-Painter3301,,My own feeling is that I was never in love wit...,"[My, own, feeling, is, that, I, was, never, in...","[feeling, love, affirmative, action, possible,...",231,102
7,7,e9967780,,Anti Asian racism whether against East Asians ...,"[Anti, Asian, racism, whether, against, East, ...","[anti, asian, racism, east, asians, south, asi...",46,21
8,8,,,Can we overturn legacy and athlete admissions ...,"[Can, we, overturn, legacy, and, athlete, admi...","[overturn, legacy, athlete, admission, point, ...",29,15
9,9,OkartoIceCream,,"I want to remind people that in California, on...","[I, want, to, remind, people, that, in, Califo...","[want, remind, people, california, progressive...",200,104


Text has already been tokenized, lemmatized, normalized.

### 1.1 Collocations

In [13]:
comments_df.tail(5)

Unnamed: 0.1,Unnamed: 0,username,flair_text,body,tokens,normalized_tokens,word_count,normalized_tokens_count
3278,3618,aduogetsatastegouda,,But that's irrelevant. The right not to be dis...,"[But, that, 's, irrelevant, The, right, not, t...","[irrelevant, right, discriminate, base, race, ...",84,38
3279,3619,rentonwong,Support Asian-American Media!,"Despite my dislike of AA, at least 2/3rds of A...","[Despite, my, dislike, of, AA, at, least, 2/3r...","[despite, dislike, aa, 2/3rds, asian, american...",32,19
3280,3620,rentonwong,Support Asian-American Media!,> If 1/3 of a racial minority's members say th...,"[>, If, 1/3, of, a, racial, minority, 's, memb...","[>, racial, minority, member, want, discrimina...",61,27
3281,3621,,,I'm just annoyed at how there's so much handwa...,"[I, 'm, just, annoyed, at, how, there, 's, so,...","[m, annoyed, handwaving, consequence, pro, aa,...",117,48
3282,3622,rentonwong,Support Asian-American Media!,The current system as it stands preserves whil...,"[The, current, system, as, it, stands, preserv...","[current, system, stand, preserve, privilege, ...",102,49


In [8]:
bigrams = nltk.collocations.BigramCollocationFinder.from_words(comments_df['normalized_tokens'].sum())
print(f'There are {bigrams.N} bigrams in the finder.')

There are 129802 bigrams in the finder.


Note: Why are there so many bigrams? 130955 of them?

In [9]:
def bigramScoring(count, wordsTuple, total):
    return count
bigrams.nbest(bigramScoring, 50)

[('affirmative', 'action'),
 ('asian', 'americans'),
 ('asian', 'american'),
 ('white', 'people'),
 ('high', 'school'),
 ('college', 'admission'),
 ('race', 'base'),
 ('asian', 'student'),
 ('legacy', 'admission'),
 ('test', 'score'),
 ('ivy', 'league'),
 ('white', 'student'),
 ('high', 'education'),
 ('black', 'hispanic'),
 ('support', 'affirmative'),
 ('black', 'people'),
 ('student', 'body'),
 ('asian', 'applicant'),
 ('model', 'minority'),
 ('black', 'latino'),
 ('chinese', 'americans'),
 ('middle', 'class'),
 ('supreme', 'court'),
 ('black', 'student'),
 ('asian', 'kid'),
 ('sit', 'score'),
 ('feel', 'like'),
 ('african', 'american'),
 ('admission', 'officer'),
 ('east', 'asians'),
 ('m', 'sure'),
 ('admission', 'process'),
 ('asian', 'people'),
 ('minority', 'group'),
 ('holistic', 'admission'),
 ('white', 'supremacy'),
 ('african', 'americans'),
 ('base', 'affirmative'),
 ('personality', 'score'),
 ('american', 'student'),
 ('elite', 'school'),
 ('low', 'income'),
 ('united', 's

In [14]:
bgs = nltk.collocations.BigramCollocationFinder.from_words(comments_df['normalized_tokens'].sum())

In [11]:
print(comments_df['normalized_tokens'].sum())



In [12]:
fdist = nltk.FreqDist(bgs)
for k,v in fdist.items():
    print(k,v)

('thank', 'engage') 1
('engage', 'insightful') 1
('insightful', 'respectful') 1
('respectful', 'discourse') 1
('discourse', 'news') 1
('news', 'thread') 1
('thread', 'lock') 1
('lock', 'comment') 1
('comment', 'prefer') 1
('prefer', 'process') 3
('process', 'take') 4
('take', 'account') 10
('account', 'poverty') 3
('poverty', 'instead') 3
('instead', 'generation') 1
('generation', 'family') 1
('family', 'come') 3
('come', 'america') 4
('america', 'painfully') 1
('painfully', 'poor') 1
('poor', 'show') 1
('show', 'money') 1
('money', 'education') 1
('education', 'work') 2
('work', 'kitchen') 1
('kitchen', 'laundromat') 1
('laundromat', 'notice') 1
('notice', 'lot') 1
('lot', 'people') 23
('people', 'big') 3
('big', 'reddit') 1
('reddit', 'board') 1
('board', 'talk') 1
('talk', 'shit') 1
('shit', 'chinese') 1
('chinese', 'billionaire') 1
('billionaire', 'boogeyman') 1
('boogeyman', 'fearmongere') 1
('fearmongere', 'like') 1
('like', 'erase') 1
('erase', 'visible') 1
('visible', 'asian') 

KeyboardInterrupt: 

In [15]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
bgs.score_ngrams(bigram_measures.likelihood_ratio)[:40]
# other options include student_t, chi_sq, likelihood_ratio, pmi

[(('affirmative', 'action'), 15226.4342078786),
 (('asian', 'americans'), 3585.2902643882453),
 (('asian', 'american'), 1865.1904979227793),
 (('ivy', 'league'), 1183.4430297823765),
 (('supreme', 'court'), 971.0789262940524),
 (('high', 'school'), 899.7342577793054),
 (('test', 'score'), 848.2646167185108),
 (('white', 'people'), 783.9618131394609),
 (('united', 'states'), 771.8831626004769),
 (('model', 'minority'), 704.6066692035264),
 (('college', 'admission'), 696.5088624079676),
 (('student', 'body'), 692.6797459082904),
 (('middle', 'class'), 646.0384289558522),
 (('black', 'hispanic'), 627.3234302611875),
 (('race', 'base'), 625.6423922927731),
 (('legacy', 'admission'), 623.9503054537987),
 (('black', 'latino'), 596.0896798979427),
 (('high', 'education'), 550.8718417094477),
 (('edward', 'blum'), 540.2974416669242),
 (('sit', 'score'), 531.8083060341709),
 (('admission', 'officer'), 501.58876730325113),
 (('graduation', 'rate'), 443.5077791786115),
 (('african', 'american'), 

In [16]:
bgs.score_ngrams(bigram_measures.student_t)[:40]

[(('affirmative', 'action'), 37.69224105284759),
 (('asian', 'americans'), 23.090381305967437),
 (('asian', 'american'), 17.711920574842694),
 (('white', 'people'), 13.3812770752901),
 (('high', 'school'), 12.287735844210761),
 (('college', 'admission'), 11.452436044208717),
 (('race', 'base'), 10.2855629978962),
 (('legacy', 'admission'), 10.123050116515795),
 (('test', 'score'), 9.435707031071585),
 (('ivy', 'league'), 9.420340198490383),
 (('high', 'education'), 9.053814146665587),
 (('black', 'hispanic'), 8.999958991282993),
 (('student', 'body'), 8.803481284306),
 (('model', 'minority'), 8.711603985534374),
 (('support', 'affirmative'), 8.64643726586451),
 (('asian', 'student'), 8.5615742827561),
 (('black', 'latino'), 8.514927734109916),
 (('supreme', 'court'), 8.415932453697216),
 (('middle', 'class'), 8.375928105017985),
 (('white', 'student'), 8.094991214610424),
 (('chinese', 'americans'), 8.089565211940107),
 (('black', 'people'), 7.9536490914350715),
 (('asian', 'applicant'

In [21]:
trigrams = nltk.collocations.TrigramCollocationFinder.from_words(comments_df['normalized_tokens'].sum())

In [22]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()
trigrams.score_ngrams(trigram_measures.likelihood_ratio)[:40]

[(('support', 'affirmative', 'action'), 23385.340467626396),
 (('base', 'affirmative', 'action'), 23055.67523406557),
 (('affirmative', 'action', 'program'), 23053.65651325944),
 (('affirmative', 'action', 'policy'), 22967.50265134967),
 (('opponent', 'affirmative', 'action'), 22965.70702544531),
 (('oppose', 'affirmative', 'action'), 22951.76800655381),
 (('rid', 'affirmative', 'action'), 22951.747058966674),
 (('benefit', 'affirmative', 'action'), 22936.611196017577),
 (('affirmative', 'action', 'help'), 22922.729946090803),
 (('opposition', 'affirmative', 'action'), 22918.39458770029),
 (('affirmative', 'action', 'base'), 22914.642282778324),
 (('anti', 'affirmative', 'action'), 22909.472604514005),
 (('dismantle', 'affirmative', 'action'), 22902.060628516592),
 (('affirmative', 'action', 'c94b5a9c'), 22893.559688642898),
 (('restore', 'affirmative', 'action'), 22893.559688642898),
 (('pro', 'affirmative', 'action'), 22890.117348688935),
 (('favor', 'affirmative', 'action'), 22889.4

In [19]:
[s for s in dir(trigram_measures) if s[0] != '_']

['chi_sq',
 'jaccard',
 'likelihood_ratio',
 'mi_like',
 'pmi',
 'poisson_stirling',
 'raw_freq',
 'student_t']

***Come back to collocations and n-grams after finding out what role it would play:***
- Would some of the statistically significant n-grams be terms in the td-idf matrix?

### 2. CountVectorizer

First, before vectorizing, convert normalized_tokens column from list to string type

In [23]:
comments_df['normalized_tokens_str'] = comments_df['normalized_tokens'].apply(lambda x: ' '.join(item for item in x))

In [24]:
print(comments_df['normalized_tokens_str'][0])
comments_df.head(5)

thank engage insightful respectful discourse news thread lock comment


Unnamed: 0.1,Unnamed: 0,username,flair_text,body,tokens,normalized_tokens,word_count,normalized_tokens_count,normalized_tokens_str
0,0,Tungsten_,,Thanks to everyone who engaged in insightful a...,"[Thanks, to, everyone, who, engaged, in, insig...","[thank, engage, insightful, respectful, discou...",20,9,thank engage insightful respectful discourse n...
1,1,ProudBlackMatt,Chinese-American,I would prefer using a process that takes into...,"[I, would, prefer, using, a, process, that, ta...","[prefer, process, take, account, poverty, inst...",103,52,prefer process take account poverty instead ge...
2,2,TomatoCanned,,"u/Tungsten_, Thanks for creating a section jus...","[u/Tungsten_,, Thanks, for, creating, a, secti...","[u/tungsten_,, thank, create, section, discuss...",269,126,"u/tungsten_, thank create section discuss read..."
3,3,bad-fengshui,,As with anything related to Asians in politics...,"[As, with, anything, related, to, Asians, in, ...","[relate, asians, politic, m, see, lot, non, as...",59,25,relate asians politic m see lot non asian peop...
4,4,Pancake_muncher,,Yet colleges will allow alumni and doners in e...,"[Yet, colleges, will, allow, alumni, and, done...","[college, allow, alumnus, doner, easily, consi...",40,19,college allow alumnus doner easily consider me...


In [25]:
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer()
count_vector = count_vectorizer.fit_transform(comments_df['normalized_tokens_str'])

In [26]:
print(count_vector.shape)

(3283, 10416)


- 3283 rows, 10416 columns/unique tokens

What's the average number of tokens per comment? -> About 40 tokens per comment

In [34]:
# avg num of tokens per row
total_num_comments = len(comments_df['normalized_tokens'].sum())

avg_num_token_per_comment = total_num_comments/comments_df.shape[0]

print(avg_num_token_per_comment)

39.53761803228754


### 3. TD-IDF Vectorizer

In [27]:
tdidf_transformer = sklearn.feature_extraction.text.TfidfTransformer()
tdidf_vector = tdidf_transformer.fit_transform(count_vector)

In [28]:
list(zip(count_vectorizer.vocabulary_.keys(), tdidf_vector.data))[:20]

[('thank', 0.2780200460567064),
 ('engage', 0.26195927494969323),
 ('insightful', 0.43215032574037665),
 ('respectful', 0.2575158754452915),
 ('discourse', 0.3727913011076437),
 ('news', 0.416606600805565),
 ('thread', 0.3353399263238714),
 ('lock', 0.35292279562425094),
 ('comment', 0.22687459584571354),
 ('prefer', 0.08876180082890424),
 ('process', 0.16869934059802721),
 ('take', 0.10273522940576957),
 ('account', 0.09721605705964464),
 ('poverty', 0.12282557059523336),
 ('instead', 0.11604808805308388),
 ('generation', 0.11604808805308388),
 ('family', 0.10873273422159661),
 ('come', 0.1451705776706874),
 ('america', 0.1381517221258386),
 ('painfully', 0.11937042691013038)]

### 4. Prune Matrix of features

In [29]:
#initialize
prune_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(min_df=3, max_features=1000, stop_words='english', norm='l2', ngram_range=(1,2)) # why norm=l2?
#train
pruned_vec = prune_vectorizer.fit_transform(comments_df['normalized_tokens_str'])

- min document freq=3 because low document frequency inflates td-idf
- An idea: visualize document freq of each word

In [30]:
pruned_vec

<3283x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 76130 stored elements in Compressed Sparse Row format>

Now, matrix is only 1000 terms/columns

In [32]:
# try to find term in matrix
termtofind = 'hello'
try:
    print(prune_vectorizer.vocabulary_[termtofind])
except KeyError:
    print(f'"{termtofind}" is missing')
    print('The available words are: {} ...'.format(list(prune_vectorizer.vocabulary_.keys())[:10]))

"hello" is missing
The available words are: ['thank', 'news', 'thread', 'comment', 'prefer', 'process', 'account', 'poverty', 'instead', 'generation'] ...


### Gensim

1. Create corpus object
2. Create Dictionary that maps tokens to ids

In [70]:
from gensim.models.phrases import Phrases
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel
from gensim.corpora import Dictionary

In [35]:
#dict_ = gensim.corpora.Dictionary(comments_df['normalized_tokens'])

In [71]:
comments = []
comments_df = comments_df.reset_index()

for ind, row in comments_df.iterrows():
    comments.append(row['normalized_tokens'])

In [72]:
print(comments[0])

['thank', 'engage', 'insightful', 'respectful', 'discourse', 'news', 'thread', 'lock', 'comment']


In [73]:
bigram = Phrases(comments, min_count=1, threshold=1)
texts = [bigram[line] for line in comments]

print(texts[1])

['prefer_process', 'take_account', 'poverty_instead', 'generation', 'family_come', 'america', 'painfully', 'poor', 'show', 'money', 'education', 'work', 'kitchen', 'laundromat', 'notice', 'lot_people', 'big', 'reddit', 'board', 'talk', 'shit', 'chinese', 'billionaire', 'boogeyman', 'fearmongere', 'like', 'erase', 'visible', 'asian', 'race', 'come_america', 'refugee', 'reduce_asians', 'monolithic', 'rich', 'asian_stereotype', 'help_chinese', 'people_know', 'come_america', 'bag', 'cash']


In [74]:
dict_ = Dictionary(comments)
corpus = [dict_.doc2bow(comment) for comment in comments]

print(corpus[1])

[(9, 1), (10, 3), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 3), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 2), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1)]


In [75]:
lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=dict_)
lda_model.show_topics()

[(0,
  '0.022*"aa" + 0.013*"people" + 0.011*"school" + 0.011*"admission" + 0.011*"race" + 0.009*"asian" + 0.009*"action" + 0.008*"think" + 0.008*"college" + 0.007*"affirmative"'),
 (1,
  '0.013*"action" + 0.012*"affirmative" + 0.008*"asian" + 0.007*"like" + 0.007*"say" + 0.005*"think" + 0.005*"minority" + 0.005*"student" + 0.005*"group" + 0.005*"comment"'),
 (2,
  '0.023*"affirmative" + 0.022*"action" + 0.017*"asian" + 0.011*"college" + 0.011*"black" + 0.011*"people" + 0.009*"white" + 0.009*"admission" + 0.009*"student" + 0.009*"americans"'),
 (3,
  '0.026*"asian" + 0.011*"think" + 0.010*"school" + 0.009*"americans" + 0.009*"american" + 0.008*"student" + 0.008*"like" + 0.007*"action" + 0.007*"affirmative" + 0.006*"say"'),
 (4,
  '0.018*"action" + 0.015*"asian" + 0.015*"affirmative" + 0.011*"race" + 0.010*"admission" + 0.009*"system" + 0.008*"group" + 0.007*"high" + 0.007*"white" + 0.006*"american"'),
 (5,
  '0.014*"score" + 0.013*"asian" + 0.013*"admission" + 0.010*"asians" + 0.010*"ha

In [76]:
hdp_model = HdpModel(corpus=corpus, id2word=dict_)
hdp_model.show_topics()

[(0,
  '0.017*asian + 0.010*action + 0.010*affirmative + 0.009*people + 0.009*white + 0.008*school + 0.007*student + 0.007*think + 0.007*asians + 0.006*like + 0.006*race + 0.006*americans + 0.006*admission + 0.006*minority + 0.006*black + 0.005*college + 0.005*harvard + 0.005*aa + 0.005*american + 0.004*say'),
 (1,
  '0.012*asian + 0.008*action + 0.008*affirmative + 0.006*white + 0.006*people + 0.005*race + 0.005*asians + 0.005*student + 0.005*admission + 0.005*harvard + 0.005*think + 0.005*like + 0.004*aa + 0.004*americans + 0.004*school + 0.004*minority + 0.004*black + 0.003*college + 0.003*issue + 0.003*m'),
 (2,
  '0.008*asian + 0.004*people + 0.004*white + 0.004*action + 0.004*affirmative + 0.003*asians + 0.003*race + 0.003*student + 0.003*chinese + 0.003*admission + 0.003*like + 0.003*black + 0.003*college + 0.003*school + 0.003*think + 0.003*applicant + 0.002*group + 0.002*aa + 0.002*americans + 0.002*minority'),
 (3,
  '0.007*asian + 0.006*affirmative + 0.006*action + 0.005*adm

#### LDA with unigrams

In [37]:
bigram = gensim.models.phrases.Phrases(corpus)
texts = [bigram[line] for line in corpus]
texts = [bigram[line] for line in corpus]

print(texts[0])

TypeError: decoding to str: need a bytes-like object, tuple found

Serialize the corpus using the sparse coordinate Matrix Market format. It wraps a term-document matrix on disk and present it as object that supports iteration over matrix rows. This is important when corpus is large.

In [50]:
gensim.corpora.MmCorpus.serialize('comments.mm', corpus)
comments_mm = gensim.corpora.MmCorpus('comments.mm')

In [55]:
comments_lda = gensim.models.ldamodel.LdaModel(corpus=comments_mm, id2word=dict_, num_topics=10, alpha='auto', eta='auto')

In [52]:
comment1Bow = dict_.doc2bow(comments_df['normalized_tokens'][0])
comment1lda = comments_lds[comment1Bow]
print("The topics of the text: {}".format(comments_df['body'][0]))
print("are: {}".format(comment1lda))

The topics of the text: Thanks to everyone who engaged in insightful and respectful discourse about the news. 

This thread is now locked for comments.
are: [(3, 0.012458167), (7, 0.01029491), (8, 0.92109084)]


In [56]:
ldaDF = pd.DataFrame({
        'id': comments_df.index,
        'body' : comments_df['body'],
        'topics' : [comments_lda[dict_.doc2bow(l)] for l in comments_df['normalized_tokens']]
    })

In [58]:
#Dict to temporally hold the probabilities
topicsProbDict = {i : [0] * len(ldaDF) for i in range(comments_lda.num_topics)}

#Load them into the dict
for index, topicTuples in enumerate(ldaDF['topics']):
    for topicNum, prob in topicTuples:
        topicsProbDict[topicNum][index] = prob

#Update the DataFrame
for topicNum in range(comments_lda.num_topics):
    ldaDF['topic_{}'.format(topicNum)] = topicsProbDict[topicNum]

ldaDF[0::100]

Unnamed: 0,id,body,topics,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
0,0,Thanks to everyone who engaged in insightful a...,"[(3, 0.0106167495), (4, 0.011217218), (6, 0.92...",0.0,0.0,0.0,0.010617,0.011217,0.0,0.920128,0.0,0.0,0.0
100,100,"Why couldn't they, though? In my mind there's ...","[(0, 0.010108083), (1, 0.01100792), (3, 0.0133...",0.010108,0.011008,0.0,0.013325,0.014079,0.011092,0.01155,0.010627,0.010043,0.898984
200,200,You sure about that?\n\nWon't the fact that un...,"[(3, 0.68547374), (4, 0.29744232)]",0.0,0.0,0.0,0.685474,0.297442,0.0,0.0,0.0,0.0,0.0
300,300,"I didn't, someone else did. I'll upvote you.","[(0, 0.042363152), (1, 0.046131656), (2, 0.038...",0.042363,0.046132,0.03849,0.587529,0.058939,0.046487,0.0484,0.044534,0.042093,0.045034
400,400,I posted the link to give more context in my r...,"[(5, 0.124859564), (9, 0.8714679)]",0.0,0.0,0.0,0.0,0.0,0.12486,0.0,0.0,0.0,0.871468
500,500,But while the percentage of “students of color...,"[(1, 0.18496393), (3, 0.21084195), (4, 0.45112...",0.0,0.184964,0.0,0.210842,0.451125,0.0,0.130737,0.0,0.0,0.021136
600,600,I have zero faith that this would actually be ...,"[(0, 0.020511512), (1, 0.022337135), (2, 0.018...",0.020512,0.022337,0.018636,0.800261,0.028561,0.022508,0.023436,0.021564,0.02038,0.021805
700,700,You have some misunderstanding about the natur...,"[(4, 0.97912586)]",0.0,0.0,0.0,0.0,0.979126,0.0,0.0,0.0,0.0,0.0
800,800,>except perhaps at the very top\n\nSo that's n...,"[(0, 0.027631233), (1, 0.030089056), (2, 0.025...",0.027631,0.030089,0.025105,0.036407,0.038455,0.03032,0.726116,0.029048,0.027455,0.029375
900,900,But why chime in to make a dig on this kid bef...,"[(4, 0.95921713)]",0.0,0.0,0.0,0.0,0.959217,0.0,0.0,0.0,0.0,0.0


In [59]:
comments_lda.show_topic(1)

[('asian', 0.01784408),
 ('asians', 0.013170793),
 ('think', 0.010688461),
 ('like', 0.009915449),
 ('black', 0.0090882415),
 ('school', 0.0069274665),
 ('go', 0.006630172),
 ('people', 0.006596642),
 ('m', 0.0065906732),
 ('harvard', 0.0058167796)]

In [60]:
comments_lda.show_topic(2)

[('asian', 0.021551589),
 ('people', 0.010022552),
 ('affirmative', 0.009944041),
 ('white', 0.00979761),
 ('action', 0.009154163),
 ('americans', 0.009126073),
 ('think', 0.00794701),
 ('school', 0.006344072),
 ('way', 0.005961804),
 ('asians', 0.0054197386)]

In [61]:
comments_lda.show_topic(3)

[('action', 0.038633417),
 ('affirmative', 0.03834887),
 ('asian', 0.022084871),
 ('people', 0.010441354),
 ('minority', 0.010119891),
 ('school', 0.009288386),
 ('americans', 0.009026395),
 ('support', 0.008406855),
 ('college', 0.008172725),
 ('asians', 0.007327383)]

In [62]:
topicsDict = {}
for topicNum in range(comments_lda.num_topics):
    topicWords = [w for w, p in comments_lda.show_topic(topicNum)]
    topicsDict['Topic_{}'.format(topicNum)] = topicWords

wordRanksDF = pd.DataFrame(topicsDict)
wordRanksDF

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9
0,white,asian,asian,action,asian,asian,asian,asian,people,people
1,people,asians,people,affirmative,people,admission,white,race,like,chinese
2,asian,think,affirmative,asian,race,race,student,minority,aa,asian
3,>,like,white,people,affirmative,aa,>,people,think,aa
4,harvard,black,action,minority,action,student,school,asians,white,think
5,like,school,americans,school,asians,school,americans,americans,asian,minority
6,asians,go,think,americans,white,action,admission,racism,school,good
7,aa,people,school,support,admission,college,action,white,harvard,student
8,student,m,way,college,black,american,affirmative,m,get,school
9,group,harvard,asians,asians,think,americans,black,action,issue,university
