In [None]:
from gensim.models import LdaModel
import gensim
from gensim.models.coherencemodel import CoherenceModel
import pandas as pd
import numpy as np
import spacy
import gc
from gensim.test.utils import datapath

In [None]:
# Load the quotes dataset used for training.
from ast import literal_eval

df = pd.read_csv('politics_biz_tech_quotes.csv.gz')

# The 'stems' column comes back from the CSV as the string form of a list;
# parse it into a real Python list. Applying literal_eval to the single column
# gives the same Series as the row-wise df.apply(..., axis=1) without building
# a row object for every record.
df['stems'] = df['stems'].apply(literal_eval)

# Restrict the train dataset to one topic.
pol = df[df['topic'] == 'politics&biz&tech']

# Report the share of those quotes whose score is above 0.5.
# NOTE(review): this only reports the share -- `pol` itself is NOT filtered by
# score in this cell.
# The boolean mean is the fraction of rows above the threshold (NaN instead of
# a ZeroDivisionError when `pol` is empty); it is scaled by 100 so the number
# actually matches the '%' label printed next to it.
high_conf_share = (pol['score'] > 0.5).mean()
print('With 0.5 confidence, retaining only', 100 * high_conf_share, '%')
pol.head()

With 0.7 confidence, retaining only 0.8999959009913745 %


Unnamed: 0,quoteId,lemmas,stems,geoNames,score,topic,spectrum
0,2016-01-15-001323,"['start', 'new', 'tradition', 'field', 'stream...","[start, new, tradit, field, stream, shop, pass...",[],0.765691,politics&biz&tech,"{'politics&biz&tech': 0.7656912803649902, 'spo..."
2,2016-01-15-001439,"['current', 'allegation', 'misrepresentation',...","[current, alleg, misrepresent, primari, goal, ...",[],0.955535,politics&biz&tech,"{'politics&biz&tech': 0.9555345000699162, 'spo..."
5,2016-01-15-001063,"['debate', 'troop', 'afghanistan', 'go', 'reop...","[debat, troop, afghanistan, go, reopen]","['US', 'Afghanistan']",0.759259,politics&biz&tech,"{'politics&biz&tech': 0.7592592481523752, 'spo..."
6,2016-01-15-001143,"['reemergence', 'afghanistan', 'issue']","[reemerg, afghanistan, issu]",['Afghanistan'],0.638896,politics&biz&tech,"{'politics&biz&tech': 0.6388958431780338, 'spo..."
7,2016-01-15-001107,"['james', 'madison', 'visit', 'professorship',...","[jame, madison, visit, professorship, amend, i...",[],0.650817,politics&biz&tech,"{'politics&biz&tech': 0.6508172042667866, 'spo..."


In [None]:
# Load the id2word dictionary stored as '1_layer_model.txt.id2word'.
dictionary_10 = gensim.corpora.Dictionary.load('1_layer_model.txt.id2word')

# Positional split: rows before `rate` (the first 80% of `pol`, in its current
# order) form the training set, the remaining rows form the test set.
rate = int(0.8 * len(pol))
train_df, test_df = pol.iloc[:rate], pol.iloc[rate:]

# Turn every stem list into its bag-of-words representation.
train_corpus = list(map(dictionary_10.doc2bow, train_df['stems']))
test_corpus = list(map(dictionary_10.doc2bow, test_df['stems']))

# Each corpus must contain exactly one document per dataframe row.
print('train_df: ', len(train_df), 'train_corpus', len(train_corpus))
print('test_df: ', len(test_df), 'test_corpus', len(test_corpus))

train_df:  409855 train_corpus 409855
test_df:  102464 test_corpus 102464


In [None]:
# Check on the baseline LDA model being used: load '1_layer_model.txt' and
# pretty-print its topics.
# `datapath` is not re-imported here: this cell never uses it, and the import
# cell at the top of the notebook already brings it into scope.
from pprint import pprint

temp_lda = LdaModel.load('1_layer_model.txt')
pprint(temp_lda.print_topics())

[(0,
  '0.018*"need" + 0.014*"peopl" + 0.012*"govern" + 0.011*"issu" + '
  '0.010*"system" + 0.009*"problem" + 0.008*"secur" + 0.007*"concern" + '
  '0.007*"countri" + 0.007*"health"'),
 (1,
  '0.026*"work" + 0.022*"great" + 0.018*"peopl" + 0.015*"famili" + '
  '0.013*"commun" + 0.012*"school" + 0.011*"want" + 0.011*"year" + '
  '0.009*"help" + 0.009*"life"'),
 (2,
  '0.044*"go" + 0.032*"think" + 0.032*"like" + 0.030*"know" + 0.029*"want" + '
  '0.026*"peopl" + 0.025*"thing" + 0.017*"time" + 0.015*"come" + 0.015*"say"'),
 (3,
  '0.014*"new" + 0.013*"busi" + 0.012*"year" + 0.008*"market" + '
  '0.008*"develop" + 0.008*"compani" + 0.006*"product" + 0.006*"build" + '
  '0.006*"continu" + 0.006*"high"'),
 (4,
  '0.028*"state" + 0.012*"court" + 0.012*"law" + 0.010*"offic" + 0.010*"decis" '
  '+ 0.009*"case" + 0.009*"presid" + 0.009*"polic" + 0.009*"unit" + '
  '0.008*"govern"'),
 (5,
  '0.018*"woman" + 0.012*"fight" + 0.012*"man" + 0.010*"campaign" + '
  '0.010*"trump" + 0.009*"say" + 0.009

In [None]:
# Sweep over candidate topic counts: for each value train an LDA model on the
# training corpus, save it, and print its c_v coherence on the train and test
# texts.
n_topics_to_try = [4, 5, 6]
lda_models = []
for n in n_topics_to_try:
    train_model = gensim.models.LdaMulticore(
        train_corpus,
        num_topics=n,
        id2word=dictionary_10,
        passes=10,
        alpha=0.001,
        workers=5,
        # Fixed seed so reruns of the sweep are comparable. gensim documents
        # that results can still vary somewhat with multiple workers because
        # of OS scheduling of the worker processes.
        random_state=42,
    )

    # Save the model to file.
    # NOTE(review): models are written under the "1_layer_model" prefix. An
    # unused `temp_file = datapath("./2_layer_model" + str(n))` line was dropped
    # from this cell (its path was never passed to save()); confirm which
    # prefix is actually intended.
    train_model.save("1_layer_model" + str(n) + ".txt")
    lda_models.append(train_model)

    # Compute metrics: the same model is scored against the train texts and
    # against the test texts.
    coherence_train = CoherenceModel(
        model=train_model,
        texts=train_df.stems.values,
        dictionary=dictionary_10,
        coherence='c_v',
    ).get_coherence()
    coherence_test = CoherenceModel(
        model=train_model,
        texts=test_df.stems.values,
        dictionary=dictionary_10,
        coherence='c_v',
    ).get_coherence()

    print('###################')
    print('N_topics:\t', n)
    print('Coherence_train:\t', coherence_train)
    print('Coherence_test:\t', coherence_test)

###################
N_topics:	 4
Coherence_train:	 0.40363980986328296
Coherence_test:	 0.37056612240811865
###################
N_topics:	 5
Coherence_train:	 0.40190590141345134
Coherence_test:	 0.3850878906817027
###################
N_topics:	 6
Coherence_train:	 0.4172401046563407
Coherence_test:	 0.39500101084866585


In [None]:
# Continue the topic-count sweep with 7, 8 and 9 topics. Trained models are
# appended to the existing `lda_models` list, so that list must already exist
# when this cell runs.
n_topics_to_try = [7, 8, 9]
for n in n_topics_to_try:
    train_model = gensim.models.LdaMulticore(
        train_corpus,
        num_topics=n,
        id2word=dictionary_10,
        passes=10,
        alpha=0.001,
        workers=5,
        # Fixed seed so reruns of the sweep are comparable. gensim documents
        # that results can still vary somewhat with multiple workers because
        # of OS scheduling of the worker processes.
        random_state=42,
    )

    # Save the model to file.
    # NOTE(review): models are written under the "1_layer_model" prefix. An
    # unused `temp_file = datapath("./2_layer_model" + str(n))` line was dropped
    # from this cell (its path was never passed to save()); confirm which
    # prefix is actually intended.
    train_model.save("1_layer_model" + str(n) + ".txt")
    lda_models.append(train_model)

    # Compute metrics: the same model is scored against the train texts and
    # against the test texts.
    coherence_train = CoherenceModel(
        model=train_model,
        texts=train_df.stems.values,
        dictionary=dictionary_10,
        coherence='c_v',
    ).get_coherence()
    coherence_test = CoherenceModel(
        model=train_model,
        texts=test_df.stems.values,
        dictionary=dictionary_10,
        coherence='c_v',
    ).get_coherence()

    print('###################')
    print('N_topics:\t', n)
    print('Coherence_train:\t', coherence_train)
    print('Coherence_test:\t', coherence_test)

###################
N_topics:	 7
Coherence_train:	 0.4817316083656948
Coherence_test:	 0.4583992125523017
###################
N_topics:	 8
Coherence_train:	 0.4869139428337849
Coherence_test:	 0.4435242418396107
###################
N_topics:	 9
Coherence_train:	 0.49759265710935374
Coherence_test:	 0.47577877610542785


In [None]:
# Continue the topic-count sweep with 20 and 30 topics. Trained models are
# appended to the existing `lda_models` list, so that list must already exist
# when this cell runs.
n_topics_to_try = [20, 30]
for n in n_topics_to_try:
    train_model = gensim.models.LdaMulticore(
        train_corpus,
        num_topics=n,
        id2word=dictionary_10,
        passes=10,
        alpha=0.001,
        workers=5,
        # Fixed seed so reruns of the sweep are comparable. gensim documents
        # that results can still vary somewhat with multiple workers because
        # of OS scheduling of the worker processes.
        random_state=42,
    )

    # Save the model to file.
    # NOTE(review): models are written under the "1_layer_model" prefix. An
    # unused `temp_file = datapath("./2_layer_model" + str(n))` line was dropped
    # from this cell (its path was never passed to save()); confirm which
    # prefix is actually intended.
    train_model.save("1_layer_model" + str(n) + ".txt")
    lda_models.append(train_model)

    # Compute metrics: the same model is scored against the train texts and
    # against the test texts.
    coherence_train = CoherenceModel(
        model=train_model,
        texts=train_df.stems.values,
        dictionary=dictionary_10,
        coherence='c_v',
    ).get_coherence()
    coherence_test = CoherenceModel(
        model=train_model,
        texts=test_df.stems.values,
        dictionary=dictionary_10,
        coherence='c_v',
    ).get_coherence()

    print('###################')
    print('N_topics:\t', n)
    print('Coherence_train:\t', coherence_train)
    print('Coherence_test:\t', coherence_test)

###################
N_topics:	 20
Coherence_train:	 0.5291168052327155
Coherence_test:	 0.47003389878826096
###################
N_topics:	 30
Coherence_train:	 0.47302952264310877
Coherence_test:	 0.4374878364580619


In [None]:
# Topic-count sweep for 15 and 25 topics: train, save, and print c_v coherence
# on the train and test texts.
n_topics_to_try = [15, 25]
# NOTE(review): this rebinds `lda_models` to a fresh list, so models collected
# by earlier cells are no longer reachable through it -- confirm that is
# intended.
lda_models = []
for n in n_topics_to_try:
    # NOTE(review): no `alpha` is passed here, so gensim's default prior is
    # used, whereas the sweeps that pass alpha=0.001 use a fixed value; scores
    # from the two groups are therefore not directly comparable.
    train_model = gensim.models.LdaMulticore(
        train_corpus,
        num_topics=n,
        id2word=dictionary_10,
        passes=10,
        workers=5,
        # Fixed seed so reruns of the sweep are comparable. gensim documents
        # that results can still vary somewhat with multiple workers because
        # of OS scheduling of the worker processes.
        random_state=42,
    )

    # Save the model to file.
    # NOTE(review): models are written under the "1_layer_model" prefix. An
    # unused `temp_file = datapath("./2_layer_model" + str(n))` line was dropped
    # from this cell (its path was never passed to save()); confirm which
    # prefix is actually intended.
    train_model.save("1_layer_model" + str(n) + ".txt")
    lda_models.append(train_model)

    # Compute metrics: the same model is scored against the train texts and
    # against the test texts.
    coherence_train = CoherenceModel(
        model=train_model,
        texts=train_df.stems.values,
        dictionary=dictionary_10,
        coherence='c_v',
    ).get_coherence()
    coherence_test = CoherenceModel(
        model=train_model,
        texts=test_df.stems.values,
        dictionary=dictionary_10,
        coherence='c_v',
    ).get_coherence()

    print('###################')
    print('N_topics:\t', n)
    print('Coherence_train:\t', coherence_train)
    print('Coherence_test:\t', coherence_test)

###################
N_topics:	 15
Coherence_train:	 0.5556450955862073
Coherence_test:	 0.5000772783501467
###################
N_topics:	 25
Coherence_train:	 0.5213544194659454
Coherence_test:	 0.4454921089975931


In [None]:
# Topic-count sweep for 10, 12, 17, 22 and 25 topics: train, save, and print
# c_v coherence on the train and test texts.
# NOTE(review): 25 is also trained by the [15, 25] sweep cell; running it again
# here overwrites "1_layer_model25.txt".
n_topics_to_try = [10, 12, 17, 22, 25]
# NOTE(review): this rebinds `lda_models` to a fresh list, so models collected
# by earlier cells are no longer reachable through it -- confirm that is
# intended.
lda_models = []
for n in n_topics_to_try:
    # NOTE(review): no `alpha` is passed here, so gensim's default prior is
    # used, whereas the sweeps that pass alpha=0.001 use a fixed value; scores
    # from the two groups are therefore not directly comparable.
    train_model = gensim.models.LdaMulticore(
        train_corpus,
        num_topics=n,
        id2word=dictionary_10,
        passes=10,
        workers=5,
        # Fixed seed so reruns of the sweep are comparable. gensim documents
        # that results can still vary somewhat with multiple workers because
        # of OS scheduling of the worker processes.
        random_state=42,
    )

    # Save the model to file.
    # NOTE(review): models are written under the "1_layer_model" prefix. An
    # unused `temp_file = datapath("./2_layer_model" + str(n))` line was dropped
    # from this cell (its path was never passed to save()); confirm which
    # prefix is actually intended.
    train_model.save("1_layer_model" + str(n) + ".txt")
    lda_models.append(train_model)

    # Compute metrics: the same model is scored against the train texts and
    # against the test texts.
    coherence_train = CoherenceModel(
        model=train_model,
        texts=train_df.stems.values,
        dictionary=dictionary_10,
        coherence='c_v',
    ).get_coherence()
    coherence_test = CoherenceModel(
        model=train_model,
        texts=test_df.stems.values,
        dictionary=dictionary_10,
        coherence='c_v',
    ).get_coherence()

    print('###################')
    print('N_topics:\t', n)
    print('Coherence_train:\t', coherence_train)
    print('Coherence_test:\t', coherence_test)

###################
N_topics:	 10
Coherence_train:	 0.5211101649056407
Coherence_test:	 0.4741554900816647
###################
N_topics:	 12
Coherence_train:	 0.5327665367666964
Coherence_test:	 0.4818774490373609
###################
N_topics:	 17
Coherence_train:	 0.5466070181812305
Coherence_test:	 0.4734530859832064
###################
N_topics:	 22
Coherence_train:	 0.5416167440363293
Coherence_test:	 0.47076413495714353
###################
N_topics:	 25
Coherence_train:	 0.51155527113694
Coherence_test:	 0.44446176773356894
