### Topic modeling

In [28]:
import pandas as pd
import pickle
import scipy.sparse
from gensim import matutils, models

In [29]:
# Load the pickled document-term matrix (one row per document, one column per term)
# and display it as the cell output.
data_dtm = pd.read_pickle("../data/docterm_matrix.pkl")
data_dtm

Unnamed: 0,aapi,aba,abandon,abandoned,abel,abest,abilities,ability,able,abortion,...,yorker,yorkers,youd,youll,young,youre,youve,zeal,zero,zucker
left,0,1,1,0,1,0,0,2,2,0,...,0,0,0,0,3,0,0,0,0,0
right,2,0,0,1,0,1,1,0,3,10,...,1,3,1,4,6,2,2,1,4,1


#### LDA with gensim

In [30]:
# Flip the matrix to term-document orientation (terms as rows, documents as columns).
# NOTE: the result is assigned back to the same name, so running this cell a second
# time transposes the frame back to document-term orientation.
data_dtm = data_dtm.T
data_dtm.head()

Unnamed: 0,left,right
aapi,0,2
aba,1,0
abandon,1,0
abandoned,0,1
abel,1,0


In [31]:
# Pipeline: pandas DataFrame -> scipy CSR sparse matrix -> gensim streaming corpus.
sparse_counts = scipy.sparse.csr_matrix(data_dtm)
# documents_columns=True is gensim's default; spelled out here because the matrix
# was transposed so that each column is one document.
corpus = matutils.Sparse2Corpus(sparse_counts, documents_columns=True)
corpus

<gensim.matutils.Sparse2Corpus at 0x7fa01130c450>

In [32]:
# Gensim also requires a dictionary of all the terms and their respective
# locations in the term-document matrix, keyed by location: {index: term}.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you trust.
with open("../data/cv.pkl", "rb") as cv_file:  # context manager closes the file handle
    cv = pickle.load(cv_file)
# cv.vocabulary_ maps term -> index; invert it to index -> term for gensim.
id2word = {index: term for term, index in cv.vocabulary_.items()}

In [33]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes.
# LDA training is stochastic, so a fixed random_state is passed to make the fitted
# topics reproducible across kernel restarts and re-runs of this cell.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=2,
    passes=10,
    random_state=42,
)
lda.print_topics()

[(0,
  '0.006*"biden" + 0.005*"click" + 0.004*"people" + 0.004*"just" + 0.004*"new" + 0.004*"like" + 0.004*"states" + 0.004*"news" + 0.004*"president" + 0.003*"democrats"'),
 (1,
  '0.006*"biden" + 0.004*"people" + 0.004*"like" + 0.004*"new" + 0.003*"mr" + 0.003*"american" + 0.003*"government" + 0.002*"times" + 0.002*"google" + 0.002*"white"')]