# Topic Modeling with LDA (Gensim)

## Prerequisites

In [None]:
import pandas as pd
import string
import os
import time

# Optional Mallet backend: flag that enables the Mallet cell further down, and
# the path handed to the Mallet wrapper.
use_mallet  = False
mallet_path = '../lib/mallet-2.0.8/bin/mallet'

# Inputs: serialized dictionary and bag-of-words corpus, both in corpus_dir.
corpus_dir      = '../corpus/'
corpus_filename = os.path.join(corpus_dir, 'gps_ngrams_bow.mm')
dict_filename   = os.path.join(corpus_dir, 'gps_ngrams.dict')

# Outputs: one directory per topic count ('{}' is filled with the number of
# topics), each holding a model file with the same name.
model_filename  = 'topic_model.lda'
model_dir       = '../model/{}_topics/'

## Load Dictionary and Corpus

In [None]:
from gensim import corpora, models

# Load the gensim Dictionary previously saved at dict_filename and print its
# summary representation.
dictionary = corpora.Dictionary.load(dict_filename)
print(dictionary)

# Open the bag-of-words corpus stored at corpus_filename through gensim's
# MmCorpus reader and print its summary representation.
corpus_bow = corpora.MmCorpus(corpus_filename)
print(corpus_bow)

## Model Computations

In [None]:
# Number of topics for the single-model run in the "Compute LDA" cell; it also
# selects the '../model/<num_topics>_topics/' directory used to cache the model.
num_topics = 150

In [None]:
def compute_lda(corpus, id2word, num_topics):
    """Build a gensim ``LdaModel`` and print how long construction took.

    Args:
        corpus: Corpus handed to ``models.LdaModel`` as its first argument.
        id2word: Mapping forwarded to ``LdaModel`` as ``id2word``.
        num_topics: Number of topics requested from the model.

    Returns:
        The constructed ``models.LdaModel`` instance.
    """
    print('Computing LDA for', num_topics, 'topics...')

    began = time.time()
    lda = models.LdaModel(corpus, id2word=id2word, num_topics=num_topics)
    elapsed = time.time() - began

    # divmod splits the elapsed seconds into (whole minutes, remaining seconds).
    print("--- took %d:%.2d minutes ---" % divmod(elapsed, 60))

    return lda

def compute_lda_mallet(corpus, id2word, num_topics):
    """Build an LDA model through gensim's Mallet wrapper and print the duration.

    Args:
        corpus: Corpus forwarded to ``LdaMallet`` as ``corpus``.
        id2word: Mapping forwarded to ``LdaMallet`` as ``id2word``.
        num_topics: Number of topics requested from the model.

    Returns:
        The constructed ``models.wrappers.LdaMallet`` instance.

    The Mallet location is read from the module-level ``mallet_path``.
    """
    # NOTE(review): ``models.wrappers`` is only available in older gensim
    # releases -- confirm the installed version still ships it.
    print('Computing LDA (Mallet) for', num_topics, 'topics...')

    start_time = time.time()
    # Use the function's own arguments; the previous version silently ignored
    # them and always trained on the notebook globals corpus_bow / dictionary.
    model_lda_mallet = models.wrappers.LdaMallet(
        mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word
    )
    # divmod splits the elapsed seconds into (whole minutes, remaining seconds).
    print("--- took %d:%.2d minutes ---" % divmod(time.time() - start_time, 60))

    return model_lda_mallet

def save_model(model_lda, num_topics):
    """Save ``model_lda`` into the directory reserved for ``num_topics`` topics.

    The target directory is ``model_dir`` formatted with ``num_topics``; it is
    created (including parents) when missing. The file name is the module-level
    ``model_filename``.

    Args:
        model_lda: Object exposing a ``save(filename)`` method.
        num_topics: Topic count used to select the output directory.

    Returns:
        None.
    """
    pathname = model_dir.format(num_topics)

    # exist_ok=True replaces the separate exists() check, which could race with
    # another process creating the directory between the check and makedirs().
    os.makedirs(pathname, exist_ok=True)

    filename = pathname + model_filename

    print('Saving LDA model to ', filename)
    model_lda.save(filename)

def load_model(num_topics):
    """Load the saved LDA model for ``num_topics`` topics.

    The file location is ``model_dir`` formatted with ``num_topics`` followed by
    ``model_filename``.

    Args:
        num_topics: Topic count selecting which saved model to read.

    Returns:
        Whatever ``models.ldamodel.LdaModel.load`` returns for that file.
    """
    model_path = model_dir.format(num_topics) + model_filename
    print('Loading LDA model from', model_path)
    loaded = models.ldamodel.LdaModel.load(model_path)
    return loaded

def get_model(corpus, id2word, num_topics):
    """Return the LDA model for ``num_topics``, training it if loading fails.

    First tries ``load_model(num_topics)``. If that raises ``IOError``, the
    error is printed, a new model is built with ``compute_lda`` and stored with
    ``save_model``. The resulting model is printed before being returned.

    Args:
        corpus: Corpus used when a new model has to be computed.
        id2word: Mapping used when a new model has to be computed.
        num_topics: Topic count identifying the saved model / model to train.

    Returns:
        The loaded or newly computed model.
    """
    try:
        model_lda = load_model(num_topics)
    except IOError as e:
        # Read the attributes instead of unpacking e.args: args only holds an
        # (errno, strerror) pair for OS-level errors, and unpacking anything
        # else raised ValueError here, aborting the fallback to training.
        # When strerror is unset, fall back to the exception's own message.
        print("I/O error({0}): {1}".format(e.errno, e.strerror or e))
        model_lda = compute_lda(corpus, id2word, num_topics)
        save_model(model_lda, num_topics)

    print(model_lda)
    return model_lda

def compute_coherence(model, corpus, id2word, num_topics, coherence='u_mass'):
    """Return the coherence score gensim's ``CoherenceModel`` computes for ``model``.

    Args:
        model: Topic model passed to ``CoherenceModel`` as ``model``.
        corpus: Corpus passed as ``corpus``.
        id2word: Passed to ``CoherenceModel`` as ``dictionary``.
        num_topics: Accepted for signature compatibility; not used here.
        coherence: Name of the coherence measure (default ``'u_mass'``).

    Returns:
        The value of ``CoherenceModel.get_coherence()``.
    """
    return models.CoherenceModel(
        model=model,
        corpus=corpus,
        dictionary=id2word,
        coherence=coherence,
    ).get_coherence()

def create_models(corpus, dictionary, num_topics_list):
    """Obtain a model for every topic count and collect its coherence score.

    Args:
        corpus: Corpus forwarded to ``get_model`` and ``compute_coherence``.
        dictionary: Mapping forwarded to ``get_model`` and ``compute_coherence``.
        num_topics_list: Iterable of topic counts to evaluate, in order.

    Returns:
        A list of ``(num_topics, coherence)`` tuples, one per entry of
        ``num_topics_list``, in iteration order.
    """
    def _score(k):
        # Load-or-train the model for k topics, then pair k with its coherence.
        lda = get_model(corpus, dictionary, k)
        return k, compute_coherence(lda, corpus, dictionary, k)

    return [_score(k) for k in num_topics_list]

## Compute LDA

In [None]:
# Get the model for `num_topics` (loaded from disk, or trained and saved when
# loading raises IOError), then report its coherence score.
model_lda = get_model(corpus=corpus_bow, id2word=dictionary, num_topics=num_topics)
coherence = compute_coherence(
    model=model_lda, corpus=corpus_bow, id2word=dictionary, num_topics=num_topics
)
print("coherence: {:.3f}".format(coherence))

In [None]:
# model_lda.print_topics(10)
# model_lda.top_topics(corpus_bow)

### Using Mallet

In [None]:
# Optional Mallet run; only executed when `use_mallet` is set to True.
if use_mallet:
    # NOTE: this rebinds the notebook-wide `num_topics` for any cell run later.
    num_topics = 10
    mallet = compute_lda_mallet(corpus_bow, dictionary, num_topics)
    # Print the score explicitly: the notebook does not echo an expression
    # nested inside an `if` block, so the value was previously discarded.
    coherence_mallet = compute_coherence(mallet, corpus_bow, dictionary, num_topics)
    print("coherence (Mallet): {:.3f}".format(coherence_mallet))

## Calculate coherence for multiple models

 See also:
   * https://radimrehurek.com/gensim/models/coherencemodel.html
   * https://rare-technologies.com/what-is-topic-coherence/

In [None]:
#coherence_vals = create_models(corpus_bow, dictionary, range(20, 501, 3))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Plot coherence against the number of topics.
# `coherence_vals` is expected to be the list of (num_topics, coherence) pairs
# returned by create_models. It is only defined once the create_models call in
# the previous cell (currently commented out) has been run; otherwise this cell
# fails with a NameError.
# zip(*...) transposes the pairs into an x-sequence and a y-sequence for plot().
plt.plot(*zip(*coherence_vals))
plt.xlabel("Num Topics")
plt.ylabel("Coherence Score")
#plt.legend(("Coherence_values"), loc='best')
plt.show()