## This file builds a Mallet LDA model, and saves results/visualization

#### Terminology:
1. raw_doc: unprocessed raw documents read from the txt file
2. docs: lemmatized corpus
3. corpus_bow: bag-of-words corpus
4. corpus_tfidf: tf-idf corpus

#### Changes from the earlier version:
1. Filter out documents with too few words

### Load dictionary and pre-built functions

In [None]:
from gensim import corpora, models
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel 
import numpy as np
import sys
import os
import gensim
import pickle
#from collections import Counter
#import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import pyLDAvis
import pyLDAvis.gensim

In [None]:
## Global folder paths; all of them are relative paths built from data_folder / model_folder
data_folder = '../../data/'
model_folder = '../../model/'
raw_data_path = os.path.join(data_folder,'raw/article_IV_corpus.txt')
data_processed_folder = os.path.join(data_folder,'processed')
results_folder = os.path.join(data_folder,'results','temp_results')
## Path to the Mallet binary; machine-specific, so update it before running
mallet_path = '/mnt/packages/Mallet/bin/mallet' # update this path

In [None]:
# %load topic_models.py
# python_root = './scripts'
# sys.path.insert(0, python_root)

#%%
def prepare_data(data_folder,save=True):
    """Build the gensim dictionary plus bag-of-words and tf-idf corpora.

    Loads ``lemma_corpus.p`` from ``data_folder``, flattens every paragraph
    (an iterable of sentences, each an iterable of tokens) into a single
    token list, builds a filtered dictionary and converts the documents.

    Args:
        data_folder (str): folder that contains ``lemma_corpus.p``; it is also
            the output folder when ``save`` is true.
        save (bool): when true, write ``dictionary.dict``, ``corpus_bow.mm``
            and ``corpus_tfidf.mm`` into ``data_folder``.
    Returns:
        tuple: ``(docs, dictionary, corpus_bow, corpus_tfidf)``.
    """
    ## read and transform data
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle, which the bare open() call did not.
    with open(os.path.join(data_folder, 'lemma_corpus.p'), "rb") as corpus_file:
        contents = pickle.load(corpus_file)
    print('length of lemmentized corpus: {}'.format(len(contents)))

    # one flat token list per paragraph
    docs = [[w for sentence in paragraph for w in sentence] for paragraph in contents]

    # build dictionary
    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=10000)

    # convert documents into bag-of-words vectors
    corpus_bow = [dictionary.doc2bow(text) for text in docs]

    ## compute tfidf feature vectors
    tfidf = models.TfidfModel(corpus_bow) # smartirs = 'atc' https://radimrehurek.com/gensim/models/tfidfmodel.html
    corpus_tfidf = tfidf[corpus_bow]

    ## save dictionary and corpora
    if save:
        dictionary_save_path = os.path.join(data_folder, 'dictionary.dict')
        dictionary.compactify()
        dictionary.save(dictionary_save_path)
        corpora.MmCorpus.serialize(os.path.join(data_folder, 'corpus_bow.mm'), corpus_bow)
        corpora.MmCorpus.serialize(os.path.join(data_folder, 'corpus_tfidf.mm'), corpus_tfidf)
    return docs,dictionary,corpus_bow,corpus_tfidf

#%%

### Load and process data

#### Load original text to look through later

In [None]:
# Stream the raw text file line by line. Lines of 50 characters or fewer
# (measured before stripping, so the trailing newline counts) are dropped;
# the survivors have surrounding spaces and newlines removed.
with open(raw_data_path, mode='r', encoding='utf8') as raw_file:
    raw_doc = [line.strip(' \n') for line in raw_file if len(line) > 50]

n_raw_docs = len(raw_doc)
print('Length of raw documents {}'.format(n_raw_docs))

#### Load lemmatized corpus

In [None]:
rerun = True
if rerun:                ## rebuild docs, dictionary and corpora from the lemmatized pickle (save=False: nothing is written to disk)
    mode = 'all'
    docs,dictionary,corpus_bow,corpus_tfidf = prepare_data(data_processed_folder,save=False)
    # corpus_bow = [c for c in corpus_bow_full if len(c)>0]

# NOTE: when rerun is False, corpus_bow must already exist in the session.
print('Length of bag-of-word corpus: {}'.format(len(corpus_bow)))

#### Filter out paragraphs that have 20 or fewer words or contain '<Title>'

In [None]:
# raw_doc, corpus_bow and docs are paired purely by position below, and zip()
# silently stops at the shortest input, so a length mismatch would pair
# paragraphs with the wrong vectors. Warn loudly instead of failing silently.
if not (len(raw_doc) == len(corpus_bow) == len(docs)):
    print('WARNING: raw_doc ({}), corpus_bow ({}) and docs ({}) differ in length; '
          'zip() truncates to the shortest and rows may be misaligned'.format(
              len(raw_doc), len(corpus_bow), len(docs)))

# keep paragraphs with more than 20 whitespace-separated words and no '<Title>' marker
tuple_temp = [(x, y, z) for (x, y, z) in zip(raw_doc, corpus_bow, docs) if len(x.split())>20 and ('<Title>' not in x) ]

if tuple_temp:
    raw_doc_new, corpus_bow_new, docs_new = zip(*tuple_temp)
else:
    # zip(*[]) yields nothing, so unpacking it into three names would raise ValueError
    raw_doc_new, corpus_bow_new, docs_new = (), (), ()

print('Length of corpus without "<Title>" and has more than 20 words: {}'.format(len(raw_doc_new)))

# preview the first surviving paragraph (None when the filter removed everything)
raw_doc_new[0] if raw_doc_new else None

### Run LDA using Mallet 

In [None]:
n_topics = 50
n_words = 20
# np.random.seed only seeds NumPy inside this Python process. Mallet is launched
# as an external program (see mallet_path), so its sampler is seeded through the
# wrapper's random_seed argument instead (0 would mean "use the system clock").
# NOTE(review): multi-threaded Mallet runs may still vary between runs - confirm.
np.random.seed(seed=1)
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus_bow_new, num_topics=n_topics, 
                                             id2word= dictionary, alpha= 1, optimize_interval=10, 
                                             iterations = 2000, random_seed=1,
                                             prefix=os.path.join(model_folder,"mallet_{}_topics_".format(n_topics)))
# NOTE(review): an earlier remark here read "initial alpha = 5/ n_topics = 5/ 50 = 0.1",
# which does not match alpha=1 passed above - confirm which value is intended.

In [None]:
# Score the trained model with the 'c_v' coherence measure, using the filtered
# token lists (docs_new) as reference texts.
n_docs_scored = len(docs_new)
print('calculating coherence socre for {} documents ......'.format(n_docs_scored))
coherence_model_lda = CoherenceModel(
    model=ldamallet,
    texts=docs_new,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
def save_results_to_excel(model = ldamallet, n_topics = n_topics, n_words = n_words):
    '''Save topic key words, top documents and document-topic weights to Excel.

    Writes ``Mallet_<n_topics>_topics_<YYYY_MM_DD>.xlsx`` into ``results_folder``
    with three sheets: the key words and highest-weighted paragraph per topic,
    the topic/key-word table, and the non-zero document-topic weights in long
    format.

    Reads the notebook globals ``raw_doc_new``, ``corpus_bow_new`` and
    ``results_folder``. The default argument values are captured when this
    cell is executed, so re-run the cell after retraining the model.

    Args:
        model: trained topic model exposing ``show_topic`` and ``model[bow]``.
        n_topics (int): number of topics in ``model``.
        n_words (int): number of key words to list per topic.
    '''
    topic_cols = ['T' + str(i) for i in range(n_topics)]

    # 1. Make a topic-key word table (built directly from the word lists
    #    instead of overwriting a float-typed zero frame with strings)
    topics = [model.show_topic(topicid=i, topn=n_words) for i in range(n_topics)]
    topic_df = pd.DataFrame([[pair[0] for pair in topic] for topic in topics],
                            columns=['word' + str(x) for x in range(n_words)])

    print("First step done")

    # 2. Make a document-topic dataframe. The whole corpus is transformed in one
    #    call rather than once per paragraph, and weights are collected in a
    #    NumPy matrix instead of cell-by-cell .loc writes.
    weights = np.zeros((len(raw_doc_new), n_topics))
    for row, doc_topics in enumerate(model[corpus_bow_new]):
        for topic_id, weight in doc_topics:
            weights[row, int(topic_id)] = weight

    docs_df = pd.DataFrame(weights, columns=topic_cols)
    docs_df.insert(0, 'paragraph', list(raw_doc_new))

    print("Second step done")

    # 3. Make dataframe for topic-top document
    top_document_per_topic = []
    for t_id, col in enumerate(topic_cols):
        top_doc = str(raw_doc_new[docs_df[col].idxmax()])
        print("Topic {}:".format(t_id))
        print(topics[t_id])
        print(top_doc)
        top_document_per_topic.append(top_doc)

    topic_labels = topic_df.apply(lambda row: ','.join(row.astype(str)), axis=1)
    top_document_per_topic_df = pd.DataFrame({'topic': topic_labels})
    top_document_per_topic_df['top_document'] = top_document_per_topic

    # transform document-topic mapping from wide to long format; value_vars
    # follows n_topics (it used to be hard-coded to 50 topics)
    docs_df_long = pd.melt(docs_df, id_vars=['paragraph'], value_vars=topic_cols)
    # rename() returns a new frame, so the column assignment below does not
    # write into a filtered view. The 'probabiilty' spelling is kept on purpose
    # so the output schema stays compatible with earlier result files.
    docs_df_long = docs_df_long[docs_df_long['value'] > 0].rename(
        columns={'variable': 'topic', 'value': 'probabiilty'})
    docs_df_long['topic'] = docs_df_long['topic'].apply(lambda x: x.replace('T', ''))

    # save results to excel; the context manager writes and closes the workbook
    # even on error (ExcelWriter.save() no longer exists in pandas 2)
    now = datetime.datetime.now().strftime("%Y_%m_%d")
    out_path = os.path.join(results_folder, 'Mallet_{}_topics_{}.xlsx'.format(n_topics, now))
    with pd.ExcelWriter(out_path) as writer:
        top_document_per_topic_df.to_excel(writer, sheet_name='Topic and Top Document')
        topic_df.to_excel(writer, sheet_name='Toipc and Key Word')
        docs_df_long.to_excel(writer, sheet_name='Document and Topic')
    


### Some conversion from Mallet to Gensim LDA [for visualization](https://github.com/jerielizabeth/Gospel-of-Health-Notebooks/blob/master/blogPosts/pyLDAvis_and_Mallet.ipynb)

In [None]:
# The first step was to extract the data from the MALLET statefile and into a pandas dataframe.

import gzip
import os
import pandas as pd

# Path of the model's state file; it is opened with gzip by the helpers below
statefile = ldamallet.fstate() 

def extract_params(statefile):
    """Extract the alpha and beta values from the statefile.

    Only the first three lines are read, so the (potentially very large)
    state file is not decompressed into memory as a whole.

    Args:
        statefile (str): Path to statefile produced by MALLET.
    Returns:
        tuple: ``(alpha, beta)``. ``alpha`` is the list of strings obtained by
        splitting the text after ``:`` on line 2 on single spaces; when that
        text starts with a space the first item is an empty string, which
        callers are expected to skip. ``beta`` is the float after ``:`` on
        line 3.
    Raises:
        ValueError: if the file has fewer than three lines.
    """
    from itertools import islice

    with gzip.open(statefile, 'rb') as state:
        header = [raw.decode('utf8').strip() for raw in islice(state, 3)]
    if len(header) < 3:
        raise ValueError(
            "state file {!r} has fewer than 3 lines; cannot read alpha/beta".format(statefile))

    # line 0 is the column header, line 1 carries alpha, line 2 carries beta
    alpha_line, beta_line = header[1], header[2]
    return (alpha_line.split(":")[1].split(" "), float(beta_line.split(":")[1]))


def state_to_df(statefile):
    """Transform state file into pandas dataframe.

    The gzipped statefile is parsed as space-separated text. Row 0 is used as
    the column header; rows 1 and 2 (the lines ``extract_params`` reads the
    alpha and beta hyperparameters from) are skipped.

    Tokens in the ``type`` column are read verbatim as strings. With pandas'
    default NA handling, words such as "null", "nan" or "NA" would be turned
    into missing values and later collapse into one vocabulary entry, so the
    default NA list is disabled and "NA" is treated as missing only in the
    ``source`` column.

    Args:
        statefile (str): Path to statefile produced by MALLET.
    Returns:
        dataframe: topic assignment for each token in each document of the model
    """
    return pd.read_csv(statefile,
                       compression='gzip',
                       sep=' ',
                       skiprows=[1,2],
                       dtype={'type': str},
                       keep_default_na=False,
                       na_values={'source': ['NA']}
                       )

# Hyperparameters read from the header lines of the state file
params = extract_params(statefile)

# The first item of params[0] is skipped (extract_params splits on single spaces,
# presumably leaving an empty leading string - verify against the state file)
alpha = [float(x) for x in params[0][1:]]
beta = params[1]
print("{}, {}".format(alpha, beta))

# Token-level table parsed from the state file
df = state_to_df(statefile)

# Make sure every entry of the 'type' column is a string before grouping/counting
df['type'] = df.type.astype(str)
df[:10]

# Get document lengths from statefile (number of rows per '#doc' value).
# NOTE: this rebinds `docs`, which earlier held the token lists returned by
# prepare_data(); re-run the data-loading cells before reusing those.
docs = df.groupby('#doc')['type'].count().reset_index(name ='doc_length')

docs[:10]

# Get vocab and term frequencies from statefile, sorted alphabetically by term
vocab = df['type'].value_counts().reset_index()
vocab.columns = ['type', 'term_freq']
vocab = vocab.sort_values(by='type', ascending=True)

vocab[:10]


In [None]:
# Topic-term matrix from state file
# https://ldavis.cpsievert.me/reviews/reviews.html

import sklearn.preprocessing

def pivot_and_smooth(df, smooth_value, rows_variable, cols_variable, values_variable):
    """
    Pivot a long-format count table into a smoothed, row-normalized matrix.

    Args:
        df (dataframe): aggregated dataframe in long format
        smooth_value (float): value added to every cell of the pivoted matrix
            to account for the priors
        rows_variable (str): dataframe column whose values become the matrix rows
        cols_variable (str): dataframe column whose values become the matrix columns
        values_variable (str): dataframe column providing the cell values
    Returns:
        dataframe: L1-normalized rows; it is built from the bare array, so the
        row and column labels are default integer positions, not the pivot labels.
    """
    wide = df.pivot(index=rows_variable, columns=cols_variable, values=values_variable)
    # combinations missing from the long table count as zero before smoothing
    smoothed = wide.fillna(value=0).values + smooth_value
    return pd.DataFrame(sklearn.preprocessing.normalize(smoothed, norm='l1', axis=1))

# Count how many rows of the state table fall into each (topic, term) pair
phi_df = df.groupby(['topic', 'type'])['type'].count().reset_index(name ='token_count')
phi_df = phi_df.sort_values(by='type', ascending=True)

phi_df[:10]

# Topic-term matrix: counts smoothed with beta, then each row normalized
phi = pivot_and_smooth(phi_df, beta, 'topic', 'type', 'token_count')

# phi[:10]

# Count how many rows of the state table fall into each (document, topic) pair
theta_df = df.groupby(['#doc', 'topic'])['topic'].count().reset_index(name ='topic_count')

theta_df[:10]

# Document-topic matrix: counts smoothed with alpha (a list with one value per
# entry read from the state file), then each row normalized
theta = pivot_and_smooth(theta_df, alpha , '#doc', 'topic', 'topic_count')

In [None]:
# Create html visualization using pyLDAvis
# Note: the inputs are assembled by hand from the state file because, per the
# original author, results obtained through malletmodel2ldamodel() are wrong.

# Keyword arguments that are later unpacked into pyLDAvis.prepare(**data)
data = {'topic_term_dists': phi, 
        'doc_topic_dists': theta,
        'doc_lengths': list(docs['doc_length']),
        'vocab': list(vocab['type']),
        'term_frequency': list(vocab['term_freq'])
       }

In [None]:
# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()
# ldamodel = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)
# Build the visualization from the matrices derived from the state file
viz_data= pyLDAvis.prepare(**data)
viz_data
# pyLDAvis.display(viz_data)

In [None]:
# Date stamp (YYYY_MM_DD) used in the output file names; `now` is reused when
# the model itself is saved further down.
now = datetime.datetime.now().strftime("%Y_%m_%d")
html_path = os.path.join(results_folder, 'Mallet_{}_topics_{}.html'.format(n_topics, now))
pyLDAvis.save_html(viz_data, html_path)

In [None]:
## a better way to print 
def print_topics_gensim(topic_model, total_topics=1,
                        weight_threshold=0.0001,
                        display_weights=False,
                        num_terms=None):
    """Print the terms of the first ``total_topics`` topics, one topic at a time.

    Args:
        topic_model: object exposing ``show_topic(index, topn=...)`` that
            returns ``(term, weight)`` pairs.
        total_topics (int): number of topics to print, starting at index 0;
            headers are numbered from 1.
        weight_threshold (float): terms whose absolute weight is below this
            value are left out.
        display_weights (bool): print ``(term, weight)`` pairs with weights
            rounded to 4 decimals instead of bare terms.
        num_terms (int or None): passed as ``topn`` and also used to cap the
            printed list; a falsy value prints everything returned.
    """
    header_suffix = ' with weights' if display_weights else ' without weights'

    for topic_no in range(total_topics):
        kept = []
        for term, weight in topic_model.show_topic(topic_no, topn=num_terms):
            if abs(weight) >= weight_threshold:
                kept.append((term, round(weight, 4)))

        if num_terms:
            kept = kept[:num_terms]

        shown = kept if display_weights else [term for term, _ in kept]

        print('Topic #' + str(topic_no + 1) + header_suffix)
        print(shown)
        print()
     

In [None]:
# Print the top n_words terms (without weights) for each of the n_topics topics
print_topics_gensim(ldamallet,total_topics=n_topics,display_weights=False,num_terms=n_words)

### Save Model and Results

In [None]:
# Persist the trained wrapper; `now` is the date stamp set in the cell that
# saves the pyLDAvis html, so that cell must have been run first.
ldamallet.save(os.path.join(model_folder,'mallet_weights_{}_{}'.format(n_topics, now)))
#test=gensim.models.wrappers.LdaMallet.load(os.path.join(model_folder,'mallet_weights'))

In [None]:
## this may take long time on domino 
# save_results_to_excel()