In [1]:
import argparse
import re
import os
import warnings
warnings.filterwarnings('ignore')
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
import time
from pprint import pprint

In [2]:
# Read the preprocessed corpus: every line of the file is one document, and its
# tokens are separated by the literal sequence "', '".
# NOTE(review): the split keeps each line's trailing newline on the last token
# (and presumably any list brackets/quotes on the first and last token) --
# confirm against whatever wrote data_words.txt.
data_words_path = "data_words.txt"

with open(data_words_path, "r", encoding='utf-8') as handle:
    data_words = [raw_line.split("', '") for raw_line in handle.readlines()]

In [3]:
# Build the token <-> integer-id mapping over all documents, then prune it with
# the thresholds no_below=5 and no_above=0.5.
id2word = corpora.Dictionary(data_words)
id2word.filter_extremes(no_below=5, no_above=0.5)
# Bag-of-words corpus: one doc2bow() result per document, in document order.
corpus = list(map(id2word.doc2bow, data_words))

In [4]:
def topic_diversity(lda_model, t):
    """Return the share of unique words among the top words of ``t`` topics.

    The score is ``unique top words / total top words`` and lies in [0, 1]:
    1.0 means no top word is shared between topics, lower values mean the
    topics repeat the same words.

    Args:
        lda_model: Topic model exposing gensim's
            ``show_topics(num_topics=..., formatted=False)`` method.
        t: Number of topics to inspect.

    Returns:
        float: The diversity score, or 0.0 when the model yields no words.
    """
    # Ask the model for (word, probability) pairs instead of parsing the
    # printed topic string. Stripping digits and punctuation from the string
    # form mangles or drops vocabulary tokens that contain digits
    # (e.g. "covid19" -> "covid", "2000" -> nothing), which skews the score.
    topics = lda_model.show_topics(num_topics=t, formatted=False)
    topiclist = [word for _, word_probs in topics for word, _ in word_probs]
    if not topiclist:
        # No topics/words to compare: avoid a ZeroDivisionError.
        return 0.0
    return len(set(topiclist)) / len(topiclist)

In [5]:
def build_lda_model(corpus, dictionary, topic, alpha, beta, seed, texts=None):
    """Train one LDA model, score it, and record the scores in a marker file.

    Only the results are kept: the trained model itself is not persisted.
    A small text file named after the hyperparameters and scores is written
    to the current working directory instead.

    Args:
        corpus: Bag-of-words corpus passed to ``gensim.models.LdaModel``.
        dictionary: Dictionary passed to the model as ``id2word`` and to the
            coherence computation.
        topic: Number of topics.
        alpha: Value passed to ``LdaModel`` as ``alpha``.
        beta: Value passed to ``LdaModel`` as ``eta``.
        seed: Value passed to ``LdaModel`` as ``random_state``.
        texts: Tokenised documents used for the 'c_v' coherence score.
            Defaults to the notebook-level ``data_words``.

    Returns:
        tuple[str, str]: ``(coherence_score, diversity_score)``, each
        formatted with three decimals.
    """
    if texts is None:
        # Backward-compatible default: the notebook's tokenised documents.
        texts = data_words

    lda_model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=dictionary,
                                       num_topics=topic,
                                       random_state=seed,
                                       chunksize=100,
                                       passes=10,
                                       alpha=alpha,
                                       eta=beta)
    pprint(lda_model.print_topics())

    # Score against the dictionary that was passed in. The original used the
    # global ``id2word`` here, silently ignoring the ``dictionary`` argument.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts,
                                         dictionary=dictionary, coherence='c_v')
    coherence_score = '%.3f' % coherence_model_lda.get_coherence()
    diversity_score = '%.3f' % topic_diversity(lda_model, topic)

    model_name = ('topic_' + str(topic) + '_alpha_' + str(alpha) + '_beta_' + str(beta)
                  + '_coh_' + coherence_score + '_div_' + diversity_score)
    model_path = model_name + ".txt"
    # Only the name (hyperparameters + scores) is written. The original called
    # lda_model.save(model_path) first and then truncated that same file, which
    # left an unloadable model plus orphaned side-car files on disk.
    with open(model_path, 'w', encoding='utf-8') as f:
        f.write(model_name)
    return coherence_score, diversity_score

In [6]:
# Settings for a single run, in the order build_lda_model expects them:
# number of topics, alpha, beta and random seed.
t, a, b, s = 30, 0.1, 0.1, 42
print(build_lda_model(corpus, id2word, t, a, b, s))

[(4,
  '0.097*"language" + 0.044*"dialect" + 0.038*"spanish" + 0.035*"dutch" + '
  '0.035*"zuid_gelder" + 0.031*"speak" + 0.021*"word" + 0.018*"english" + '
  '0.013*"netherland" + 0.012*"french"'),
 (26,
  '0.015*"reverend" + 0.000*"wilson" + 0.000*"postman" + 0.000*"bryan" + '
  '0.000*"primus" + 0.000*"menlo_park" + 0.000*"irishman" + 0.000*"arle" + '
  '0.000*"boone" + 0.000*"niagara_fall"'),
 (25,
  '0.009*"bear" + 0.009*"family" + 0.008*"death" + 0.008*"die" + 0.007*"write" '
  '+ 0.007*"child" + 0.006*"return" + 0.005*"young" + 0.005*"school" + '
  '0.005*"say"'),
 (18,
  '0.235*"island" + 0.048*"shark" + 0.040*"wale" + 0.040*"australian" + '
  '0.033*"tiger" + 0.022*"extinct" + 0.015*"wombat" + 0.013*"australia" + '
  '0.013*"tonga" + 0.012*"woodpecker"'),
 (8,
  '0.016*"octavian" + 0.004*"mark_antony" + 0.002*"cleopatra" + '
  '0.002*"caesarion" + 0.001*"octavia_minor" + 0.001*"ides_march" + '
  '0.000*"gaius_julius" + 0.000*"antony_cleopatra" + 0.000*"triumvirate" + '
  '0.00