## Calculate coherence for a pretrained model

The following workflow describes how to use gensim to calculate coherence measures for an LDA model that has already identified topics.

## Extract the top 10 terms for each topic from the term frequency table

In [1]:
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel
import re

# Topic-term frequency table; the code below reads its 'topic', 'term' and
# 'count' columns.
# Alternate input file:
#TopicTermFreq = pd.read_csv('/Users/dankoban/Documents/EM6575/mallet_command_line/ct_tidy_topics.csv')
TopicTermFreq = pd.read_csv('/Users/dankoban/Documents/EM6575/twitter/twitter_tidy_topics.csv')

num_topics = 50       # topics are looked up by id 0 .. num_topics-1
num_top_terms = 10    # number of highest-count terms kept per topic

topic_term_list = []  # one list of top terms per topic, in topic-id order
top_terms = []        # every top term from every topic (de-duplicated below)
for topic_id in range(num_topics):
    topic_rows = TopicTermFreq[TopicTermFreq['topic'] == topic_id]
    ranked_terms = topic_rows.sort_values('count', ascending=False)['term'].tolist()
    # Strip non-word characters AND lowercase, so each term is normalized the
    # same way as the document tokens it is later matched against (those are
    # stripped and lowercased too). Without the lowercase step a mixed-case
    # term could never match any document token.
    # NOTE(review): stripping can map two distinct terms to the same string
    # (or to ''), which would leave duplicates/empties in a topic -- confirm
    # the input terms are already plain words.
    top_terms_topic_k = [
        re.sub(r'\W+', '', term).lower()
        for term in ranked_terms[:num_top_terms]
    ]
    top_terms.extend(top_terms_topic_k)
    topic_term_list.append(top_terms_topic_k)

# Unique vocabulary across all topics.
top_terms = list(set(top_terms))

## Load the raw text files and keep only the top-term vocabulary in each document

In [2]:
# Set copy of the vocabulary: membership tests run once per token over the
# whole input, so a hash lookup replaces a linear scan of the list.
top_term_set = set(top_terms)
non_word = re.compile(r'\W+')

texts = []    # one list of distinct top terms per input line
counter = 0   # number of lines read so far (stays 0 for an empty file)
# Alternate input file:
#with open('/Users/dankoban/Documents/CT_LDA/CT_data/mallet_input_data_crowdtangle.txt','r') as infile:
# NOTE(review): encoding is pinned to UTF-8 so decoding does not depend on the
# platform default -- confirm the input file is UTF-8.
with open('/Users/dankoban/Documents/EM6575/twitter/hashtag model/mallet_nocomma_no@.txt', 'r', encoding='utf-8') as infile:
    for counter, raw_line in enumerate(infile, start=1):
        # Split on single spaces, strip non-word characters, then lowercase
        # each token once.
        tokens = (non_word.sub('', token).lower() for token in raw_line.split(' '))
        # Keep only vocabulary terms, de-duplicated within the line.
        texts.append(list({token for token in tokens if token in top_term_set}))
        # Progress report every 500,000 lines.
        if counter % 500000 == 0:
            print(counter)

500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
4500000
5000000
5500000
6000000
6500000
7000000
7500000
8000000
8500000
9000000
9500000
10000000
10500000
11000000
11500000
12000000
12500000
13000000
13500000
14000000


## Transform the raw text into bag of words dictionary and corpus

In [3]:
# Build the token <-> id mapping from the filtered documents, then encode
# every document as a bag of words using that mapping.
dictionary = Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

## Calculate topic coherence 

In [4]:
from gensim.models import CoherenceModel

# Score each topic's top-term list against the bag-of-words corpus using the
# 'u_mass' coherence measure.
cm = CoherenceModel(
    topics=topic_term_list,
    dictionary=dictionary,
    corpus=corpus,
    coherence='u_mass',
)
coherence_scores = cm.get_coherence_per_topic()
# Bare last expression so the notebook displays the per-topic scores.
coherence_scores

[-4.57445971827279,
 -6.505849751978521,
 -3.751673392963046,
 -3.22298514577627,
 -7.446415826076183,
 -2.4565096954705905,
 -4.563694261699008,
 -4.388998511867815,
 -3.3976821578401495,
 -4.5552923597313715,
 -3.748189597795187,
 -3.5002872992080896,
 -4.012745382782841,
 -4.9423134665416875,
 -4.397528202154902,
 -5.110074660499582,
 -4.590958969332789,
 -3.969075293240008,
 -4.4237498912949045,
 -3.3842137900712386,
 -4.383665342479561,
 -5.350602127901752,
 -4.0775899008318754,
 -5.081703266289324,
 -3.41509585372713,
 -3.078583941350838,
 -4.9168828548249985,
 -4.36313042608254,
 -3.3002984518204515,
 -2.841411617231853,
 -3.891200577798426,
 -4.354412544978026,
 -3.1269961580901464,
 -3.657985318372107,
 -3.0707961606251724,
 -2.8366252927093925,
 -5.007230803507584,
 -8.11118489988223,
 -5.513449719159767,
 -4.158425464853597,
 -3.559006628027826,
 -3.5630438274885132,
 -7.179693651748433,
 -4.030279671955433,
 -4.103612838157457,
 -3.695535243025001,
 -2.983323282684404,
 -3.