### Unveil how pyLDAvis works by mapping its topics to gensim and ranking words by topic and lambda value

In [1]:
import os
import pickle
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from gensim import corpora, models
from gensim.models import LdaModel

import pyLDAvis.gensim

In [2]:
# Locations of the inputs (model, dictionary, corpus, hand-made label definitions)
# and of the CSV this notebook writes. All paths are relative to the working directory.
model_path = '../../model/mallet_as_gensim_weights_50_2019_02_12'
dictionary_path = '../../data/processed/dictionary.dict'
corpus_path = '../../data/processed/corpus_bow_new.mm'
label_definition_path = '../../data/processed/Topic Definition_2019_04_02.npy'
save_file_path = os.path.join(
    '../../data/results/temp_results/',
    'mapping_file_for_mallet_as_gensim_weights_50_2019_04_02.csv',
)

# Echo the working directory and the two dated inputs so the run is traceable.
for shown in (os.getcwd(), model_path, label_definition_path):
    print(shown)

/mnt/notebook/poc
../../model/mallet_as_gensim_weights_50_2019_02_12
../../data/processed/Topic Definition_2019_04_02.npy


#### Import Model, Dictionary, and Label (manually created)

In [3]:
# Load the saved LDA model, the dictionary and the corpus from the paths
# configured in the previous cell. These three objects are the inputs to
# pyLDAvis.gensim.prepare further down.
lda_model = LdaModel.load(model_path)
old_dict = corpora.Dictionary.load(dictionary_path)
old_corp = corpora.MmCorpus(corpus_path)

In [4]:
# The label file holds Python objects (it is turned into a dict right below), so it
# has to be unpickled: NumPy >= 1.16.3 refuses to do that unless allow_pickle=True
# is passed explicitly.
# NOTE: unpickling can execute arbitrary code - only load label files you trust.
label_topic_dict = np.load(label_definition_path, allow_pickle=True)
label_topic_dict = dict(label_topic_dict.tolist())

#### Use LDAviz to find ranking by topic size

In [8]:
# Time the pyLDAvis preparation step (the recorded run took about 155 seconds).
# `time` is imported in the import cell at the top of the notebook.
start_time = time.time()
pyLDA_data = pyLDAvis.gensim.prepare(lda_model, old_corp, old_dict)
print("--- %s seconds ---" % (time.time() - start_time))

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


--- 154.9616448879242 seconds ---


In [9]:
# Show the prepared pyLDAvis data as this cell's output.
pyLDAvis.display(pyLDA_data)

#### Get topic index mapping between gensim lda and ldaViz

In [10]:
# topic_order is read positionally: position i (0-based) corresponds to LDAviz
# topic i + 1, and the value stored there is the gensim topic id + 1.
topic_order = pyLDA_data.topic_order

# Shift the stored values down by one to recover the gensim ids.
topic_map_df = pd.DataFrame({'Gensim topic id': np.array(topic_order) - 1})

# LDAviz numbers its topics from 1 in the same row order.
topic_map_df['LDA viz topic id'] = np.arange(1, len(topic_map_df) + 1)

#### Get "relevant terms" using a lambda value of 0.65, which exactly matches the ldaViz output above

In [31]:
lambda_value = 0.65
top_n_terms = 30  # number of highest-scoring terms kept per category

# Work on a copy so the helper columns added below are not written back into
# pyLDA_data.topic_info, which the pyLDAvis display cell also reads.
df = pyLDA_data.topic_info.copy()

# Score = lambda * logprob + (1 - lambda) * log(Freq / Total)
# lambda = 0: only the log(Freq / Total) term counts, which favours words that occur
#             almost exclusively in the current topic.
# lambda = 1: only logprob counts, i.e. words are ranked by how frequent they are
#             within the current topic.
df['Score'] = lambda_value * df.logprob + (1 - lambda_value) * np.log(df.Freq / df.Total)

# method='first' gives every term in a category its own whole-number rank. The
# default ('average') gives tied terms the same fractional rank, which produces
# duplicate (Category, rank) pairs and makes the pivot below fail.
df['rank'] = df.groupby('Category')['Score'].rank(ascending=False, method='first')
df = df[df['rank'] <= top_n_terms]

# One row per category, one column per rank, cell value = term.
df = df[['Category', 'Term', 'rank']].reset_index().pivot(index='Category', columns='rank', values='Term')
df.head()

rank,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,21.0,22.0,23.0,24.0,25.0,26.0,27.0,28.0,29.0,30.0
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Default,bank,gdp,fiscal,tax,authority,revenue,inflation,government,deficit,exchange_rate,...,financial,project,risk,market,trade,data,credit,increase,external,expenditure
Topic1,authority,note,agree,mission,concern,view,stress,acknowledge,argue,emphasize,...,reiterate,assessment,urge,explain,issue,confident,risk,discussion,underscore,commitment
Topic10,tax,revenue,vat,exemption,administration,tax_base,collection,custom,income_tax,taxpayer,...,introduce,personal_income_tax,income_tax_rate,excise,increase,evasion,gst,duty,include,property_tax
Topic11,fiscal,deficit,budget,stabilizer,fiscal_policy,target,automatic,stimulus,gdp,surplus,...,year,authority,cyclical,fiscal_stance,rule,balance,discretionary,structural_balance,room,sgp
Topic12,labor_market,employment,worker,wage,job,labor,unemployment,skill,work,benefit,...,participation,hour,bargaining,youth,creation,young,active_labor_market,reduce,reform,unemployment_rate


#### Generate Topic-Label Mapping by applying IoU (intersection over union) to manually created labels

In [28]:
def calculate_intersection_over_union(list_a, list_b):
    """Return the intersection-over-union (Jaccard index) of two collections.

    Both inputs are reduced to sets first, so duplicates and ordering are ignored.

    Parameters
    ----------
    list_a, list_b : iterable of hashable items

    Returns
    -------
    float
        len(A & B) / len(A | B), a value between 0 and 1. When both inputs are
        empty the union is empty too; 0.0 is returned instead of dividing by zero.
    """
    set_a, set_b = set(list_a), set(list_b)

    union_size = len(set_a | set_b)
    if union_size == 0:
        # Nothing to compare: treat two empty collections as having no overlap.
        return 0.0

    return len(set_a & set_b) / union_size

def map_topic_label(model_dict, label_dict):
    """Assign to every model topic the label whose word list overlaps it most.

    Parameters
    ----------
    model_dict : dict
        Maps a topic key to that topic's word list.
    label_dict : dict
        Maps a label key to the word list that defines the label.

    Returns
    -------
    dict
        Maps each key of ``model_dict`` to the key of ``label_dict`` with the
        highest intersection-over-union score; the first label wins on ties.
    """
    label_names = list(label_dict.keys())
    assignments = {}

    for topic_id, topic_words in model_dict.items():
        # Score this topic against every label, in label_dict order.
        scores = [
            calculate_intersection_over_union(topic_words, label_words)
            for label_words in label_dict.values()
        ]
        best_position = np.array(scores).argmax()
        assignments[topic_id] = label_names[best_position]

    return assignments

# NOTE(review): model_topic_dict is built in the cell executed as In [5], which sits
# below this cell in the file. On a fresh kernel run top to bottom, this line is
# reached before that name exists.
topic_label_dict = map_topic_label(model_dict=model_topic_dict, label_dict=label_topic_dict)
topic_label_dict

# Attach the matched label to every gensim topic id and write the mapping table to disk.
topic_map_df['label'] = topic_map_df['Gensim topic id'].apply(
    lambda gensim_id: topic_label_dict[gensim_id]
)
topic_map_df.to_csv(save_file_path)

#### Get (Topic ID ~ Topic Word List) mapping in the original gensim lda model

In [5]:
# Request every topic the loaded model holds (rather than a hard-coded 50), with
# the 15 top words of each. With formatted=False each entry is a
# (topic id, [(word, probability), ...]) pair.
model_topic_list = lda_model.show_topics(num_topics=lda_model.num_topics, num_words=15, formatted=False)
# Despite its name, model_topic_list is a dict from here on: topic id -> [(word, probability), ...].
model_topic_list = dict(model_topic_list)

# Keep only the words, dropping the probabilities: topic id -> [word, ...].
model_topic_dict = {
    topic_id: [word for word, _ in word_probs]
    for topic_id, word_probs in model_topic_list.items()
}