## Calculate coherence for a pretrained model

The following workflow describes how to use gensim to calculate coherence measures for an LDA model that has already identified topics.

## Extract the top 10 terms from the term frequency table

In [1]:
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel

TopicTermFreq = pd.read_csv('/Users/dankoban/Documents/EM6575/mallet_command_line/ct_tidy_topics.csv')

num_topics = 50
num_top_terms = 10

topic_term_list = []
top_terms = []
for k in range(1, num_topics+1):    
    top_terms_topic_k = TopicTermFreq[TopicTermFreq['topic'] == (k-1)].sort_values('count', ascending = False)['term'].tolist()[0:num_top_terms]
    top_terms_topic_k = [term.replace('.', '') for term in top_terms_topic_k]
    top_terms_topic_k = [term.replace("'", '') for term in top_terms_topic_k]
    top_terms_topic_k = [term.replace("-", '') for term in top_terms_topic_k]
    top_terms = top_terms + top_terms_topic_k
    topic_term_list.append(top_terms_topic_k)

top_terms = list(set(top_terms))

## Load the raw text files and parse to retain top term vocab only

In [2]:
texts = []
counter = 0
with open('/Users/dankoban/Documents/CT_LDA/CT_data/mallet_input_data_crowdtangle.txt','r') as infile:
    for line in infile:
        line = line.split(' ')                  
        line = [term.replace('.', '') for term in line]
        line = [term.replace("'", '') for term in line]
        line = [term.replace("-", '') for term in line]        
        line = [word.lower() for word in line if word.lower() in top_terms]
        line = list(set(line))
        counter += 1
        if counter %500000 == 0:
            print(counter)
        texts.append(line)      

500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
4500000
5000000
5500000
6000000
6500000
7000000
7500000
8000000
8500000
9000000
9500000
10000000
10500000
11000000


## Transform the raw text into bag of words dictionary and corpus

In [3]:
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

## Calculate topic coherence 

In [4]:
from gensim.models import CoherenceModel

cm = CoherenceModel(topics=topic_term_list, corpus=corpus, dictionary=dictionary, coherence='u_mass')
coherence_scores = cm.get_coherence_per_topic()
coherence_scores

[-3.978878204157565,
 -2.299720968385133,
 -2.005407063176433,
 -3.172322365847731,
 -2.240740997598709,
 -2.388352787702119,
 -2.036640510975703,
 -2.900492531931636,
 -1.8349444344914034,
 -3.0766114248037457,
 -2.3270905023555954,
 -4.283486157007621,
 -2.3964789147181302,
 -2.236048173669421,
 -3.06967317604051,
 -3.428580898919883,
 -2.756076446123321,
 -2.237700586151633,
 -2.636235271806184,
 -2.037812383769127,
 -2.2296224707148427,
 -3.255973765727164,
 -2.119104969117333,
 -2.2424927644913515,
 -3.1675029766354825,
 -2.051539249502048,
 -3.02294447200405,
 -3.185954107693092,
 -2.7205895282298695,
 -2.4588847425999365,
 -2.85354086518332,
 -2.045536705113718,
 -2.386392181929321,
 -3.046281174340204,
 -3.607602698791733,
 -3.263780561646673,
 -2.6155231395562053,
 -2.6199529274512274,
 -2.9974466794375765,
 -3.0241401095924902,
 -3.5443366793934654,
 -3.4611927361080577,
 -3.1652307615092443,
 -3.030180916820792,
 -2.264092538185675,
 -3.714548811509984,
 -2.8918982345643016,