### Topic modeling

In [22]:
import pandas as pd
import pickle
import scipy.sparse
from gensim import matutils, models

In [3]:
# Load the pickled document-term matrix (one row per document, one column per term).
# Note: read_pickle unpickles the file, so only point this at trusted data.
dtm_path = '../data/docterm_matrix.pkl'
data_dtm = pd.read_pickle(dtm_path)
data_dtm

Unnamed: 0,abest,acosta,acting,administration,affects,ahmed,airlines,align,allows,ambitions,...,welloff,white,whoops,wilfred,willingness,woke,world,year,york,zero
left,0,0,0,0,1,0,0,0,1,1,...,1,0,0,0,1,0,0,0,0,0
right,1,1,2,3,0,1,2,1,0,0,...,0,2,1,1,0,1,1,3,1,1


#### LDA with gensim

In [6]:
# Flip the matrix so terms run down the rows and documents across the columns.
# NOTE: this rebinds data_dtm to its own transpose, so running the cell a
# second time flips it back - run it exactly once per kernel session.
data_dtm = data_dtm.T
data_dtm.head()

Unnamed: 0,left,right
abest,0,1
acosta,0,1
acting,0,2
administration,0,3
affects,1,0


In [7]:
# Convert the term-document DataFrame into gensim's corpus format:
# DataFrame -> SciPy CSR sparse matrix -> gensim corpus.
sparse_counts = scipy.sparse.csr_matrix(data_dtm)
# After the transpose each column is one document, which is what
# documents_columns=True (gensim's default) states explicitly.
corpus = matutils.Sparse2Corpus(sparse_counts, documents_columns=True)
corpus

<gensim.matutils.Sparse2Corpus at 0x7fa012685c90>

In [26]:
# Gensim also requires a dictionary mapping each term's position (column index)
# in the term-document matrix to the term itself.
# The pickled object is read for its `vocabulary_` attribute (term -> index).
# SECURITY: pickle.load can execute arbitrary code; only load a file you created
# yourself or otherwise trust.
with open("../data/cv.pkl", "rb") as cv_file:  # context manager closes the handle
    cv = pickle.load(cv_file)

# Invert term -> index into index -> term, the orientation LdaModel's id2word expects.
id2word = {index: term for term, index in cv.vocabulary_.items()}

In [27]:
# Fit LDA on the corpus (term-document matrix) using id2word (index -> term).
# Besides those two inputs we choose the number of topics and the number of
# training passes over the corpus.
# random_state pins the model's random initialisation so that a
# Restart & Run All reproduces the same topics instead of a new set each run.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10, random_state=42)
lda.print_topics()

[(0,
  '0.003*"biden" + 0.003*"carlson" + 0.003*"tucker" + 0.003*"new" + 0.003*"fox" + 0.003*"news" + 0.003*"tonight" + 0.003*"discuss" + 0.003*"reacts" + 0.003*"united"'),
 (1,
  '0.012*"biden" + 0.011*"carlson" + 0.010*"tucker" + 0.008*"new" + 0.007*"host" + 0.007*"angle" + 0.007*"ingraham" + 0.007*"reacts" + 0.007*"discuss" + 0.007*"tonight"')]