# <span style="color:blue">Prepare models for visualization (generation of Bins.js file) - 10 topics</span>

### Load packages

In [1]:
import gensim, os, time
import numpy as np
import pandas as pd
from gensim.test.utils import datapath
from numpy import savetxt

### Constants

In [2]:
# for reproducible results
random_state = 33
# if the test is True, the process runs on a smaller subset of raw data (json files)
test = False

if test:
    # path where the model files are stored
    path_model = r'D:\master\data science\semestre 4\M2.979 - tfm\data\bins_test\model'
    # path where all the files related to the visualization of the models are stored
    path_viz = r'D:\master\data science\semestre 4\M2.979 - tfm\data\bins_test\viz'
else:
    # path where the model files are stored
    path_model = r'D:\master\data science\semestre 4\M2.979 - tfm\data\bins\model'
    # path where all the files related to the visualization of the models are stored
    path_viz = r'D:\master\data science\semestre 4\M2.979 - tfm\data\bins\viz'

MMM_last_day = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
num_topics = 10
num_words = 10

### Function documents inside bin/month for visualization (Bins.js file)

In [3]:
def doc_bin_viz(bins_js_text, path_model, yyyy_MMM, doc_id_viz, df_export_doc_id):
    
    time_start = time.time()
    
    yyyy = yyyy_MMM[0:4]
    MMM = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'].index(yyyy_MMM[5:8])
    # path + name of the file that contains the model gotten at the end
    file_doc_topic_matrix = os.path.join(path_model, yyyy_MMM, 'model_document_topic_matrix.csv')
    df = pd.read_csv(file_doc_topic_matrix)
    df = df.sort_values(by='Topic', ascending=True)
    topics = df.Topic.unique()
    bins_js_text = bins_js_text + '"' + str(MMM) + '": {"tweet_Ids": ['
    
    documents = df.Document.unique()   
    last_document = len(documents) - 1
    iter_document = 0
    for index, row in df.iterrows():
        if iter_document < last_document:
            doc_id_viz += 1
            bins_js_text = bins_js_text + str(doc_id_viz) + ', '
            row = {'doc_id':int(row['Document']), 'doc_id_viz':doc_id_viz}
            df_export_doc_id.append(row)
        else:
            doc_id_viz += 1
            bins_js_text = bins_js_text + str(doc_id_viz) + '], '
            row = {'doc_id':int(row['Document']), 'doc_id_viz':int(doc_id_viz)}
            df_export_doc_id.append(row)
        iter_document += 1

    bins_js_text = bins_js_text + '"start_time": "' + str(MMM + 1) + '/1/' + str(yyyy) + ' 1:1", "bin_id": ' + str(MMM) + ', '
    
    # dataframe with doc ids and doc ids formatted for visualization
    df_export_doc_id = pd.DataFrame(df_export_doc_id)
    file_viz = os.path.join(path_viz, 'doc_id_2_doc_id_viz')
    df_export_doc_id.to_csv(file_viz + '.csv', index=False, header=True)
    
    time_end = time.time()
    hour, rem = divmod(time_end - time_start, 3600)
    minute, second = divmod(rem, 60)
    print('doc_bin_viz - time elapsed - {:0>2}:{:0>2}:{:05.2f}'.format(int(hour), int(minute), second))
    
    return bins_js_text, doc_id_viz

### Function topic-document matrix for visualization (Bins.js file)

In [4]:
def topic_doc_matrix_viz(bins_js_text, path_model, yyyy_MMM):
    
    time_start = time.time()
    
    yyyy = yyyy_MMM[0:4]
    MMM = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'].index(yyyy_MMM[5:8])
    # path + name of the file that contains the model gotten at the end
    file_doc_topic_matrix = os.path.join(path_model, yyyy_MMM, 'model_document_topic_matrix.csv')
    df = pd.read_csv(file_doc_topic_matrix, names=['doc_id', 'Topic', 'Probability'])
    
    file_viz = os.path.join(path_viz, 'doc_id_2_doc_id_viz')
    df_viz = pd.read_csv(file_viz + '.csv', names=['doc_id_viz', 'Document'])

    df = pd.merge(left=df, right=df_viz, left_on='doc_id', right_on='doc_id_viz')
    df = df.sort_values(by='Topic', ascending=True)
    topics = df.Topic.unique()
    
    bins_js_text = bins_js_text + '"topic_model": {"topic_doc": {'

    last_topic = len(topics) - 1
    iter_topic = 0
    for topic in topics:
        df_topic = df.loc[df['Topic'] == topic]
        bins_js_text = bins_js_text + '"' + str(MMM) + '_' + str(topic) + '": {'

        last_item = len(df_topic) - 1
        iter_index = 0
        for ind in df_topic.index:
            if iter_index < last_item:
                bins_js_text = bins_js_text + '"' + str(df_topic['Document'][ind]) + '": ' + str(df_topic['Probability'][ind]) + ', '
            else:
                if iter_topic < last_topic:
                    bins_js_text = bins_js_text + '"' + str(df_topic['Document'][ind]) + '": ' + str(df_topic['Probability'][ind]) + '},'
                else:
                    bins_js_text = bins_js_text + '"' + str(df_topic['Document'][ind]) + '": ' + str(df_topic['Probability'][ind]) + '}},'
            iter_index += 1
        iter_topic += 1
        
    time_end = time.time()
    hour, rem = divmod(time_end - time_start, 3600)
    minute, second = divmod(rem, 60)
    print('topic_doc_matrix_viz - time elapsed - {:0>2}:{:0>2}:{:05.2f}'.format(int(hour), int(minute), second))
    
    return bins_js_text

### Function document-topic matrix for visualization (Bins.js file)

In [5]:
def doc_topic_matrix_viz(bins_js_text, path_model, yyyy_MMM):
    
    time_start = time.time()
    
    yyyy = yyyy_MMM[0:4]
    MMM = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'].index(yyyy_MMM[5:8])
    # path + name of the file that contains the model gotten at the end
    file_doc_topic_matrix_full = os.path.join(path_model, yyyy_MMM, 'model_document_topic_matrix_full.csv')
    df = pd.read_csv(file_doc_topic_matrix_full)
    
    file_viz = os.path.join(path_viz, 'doc_id_2_doc_id_viz')
    df_viz = pd.read_csv(file_viz + '.csv')

    df = pd.merge(left=df, right=df_viz, left_on='Document', right_on='doc_id')
    df = df.sort_values(by='doc_id_viz', ascending=True)
    
    bins_js_text = bins_js_text + '"doc_topic": {'
                                                                                                     
    documents = df.Document.unique()                                                                                                    
    last_document = len(documents) - 1
    iter_document = 0
    for index, row in df.iterrows():
        bins_js_text = bins_js_text + '"' + str(int(row['doc_id_viz'])) + '": {'
        bins_js_text = bins_js_text + '"' + str(MMM) + '_0": ' + str(row['Topic0']) + ', '
        bins_js_text = bins_js_text + '"' + str(MMM) + '_1": ' + str(row['Topic1']) + ', '
        bins_js_text = bins_js_text + '"' + str(MMM) + '_2": ' + str(row['Topic2']) + ', '
        bins_js_text = bins_js_text + '"' + str(MMM) + '_3": ' + str(row['Topic3']) + ', '
        bins_js_text = bins_js_text + '"' + str(MMM) + '_4": ' + str(row['Topic4']) + ', '
        bins_js_text = bins_js_text + '"' + str(MMM) + '_5": ' + str(row['Topic5']) + ', '
        bins_js_text = bins_js_text + '"' + str(MMM) + '_6": ' + str(row['Topic6']) + ', '
        bins_js_text = bins_js_text + '"' + str(MMM) + '_7": ' + str(row['Topic7']) + ', '
        bins_js_text = bins_js_text + '"' + str(MMM) + '_8": ' + str(row['Topic8']) + ', '
        if iter_document < last_document:
            bins_js_text = bins_js_text + '"' + str(MMM) + '_9": ' + str(row['Topic9']) + '}, '
        else:
            bins_js_text = bins_js_text + '"' + str(MMM) + '_9": ' + str(row['Topic9']) + '}}, '
        iter_document += 1
    
    time_end = time.time()
    hour, rem = divmod(time_end - time_start, 3600)
    minute, second = divmod(rem, 60)
    print('doc_topic_matrix_viz - time elapsed - {:0>2}:{:0>2}:{:05.2f}'.format(int(hour), int(minute), second))
    
    return bins_js_text

### Format topic-term matrix for visualization (Bins.js file)

In [6]:
def topic_term_matrix_viz(bins_js_text, path_model, yyyy_MMM):
    
    time_start = time.time()
    
    yyyy = yyyy_MMM[0:4]
    MMM = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'].index(yyyy_MMM[5:8])
    # load trained/saved model
    file_model = os.path.join(path_model, yyyy_MMM, 'model')
    temp_file = datapath(file_model)
    model = gensim.models.ldamodel.LdaModel.load(temp_file)

    x = model.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)
    topics_words = [(tp[0], ['"' + wd[0] + '": '+ str(wd[1]) for wd in tp[1]]) for tp in x]
    last_item = len(topics_words) - 1
    iter_index = 0
    bins_js_text = bins_js_text + '"topic_word": {'
    for topic,words in topics_words:
        if iter_index < last_item:
            bins_js_text = bins_js_text + '"' + str(MMM) + '_' + str(topic) + '": {' + ", ".join(words) + '},'
        else:
            bins_js_text = bins_js_text + '"' + str(MMM) + '_' + str(topic) + '": {' + ", ".join(words) + '}}, "topic_prob": ['
        iter_index += 1

    iter_index = 0
    for topic,words in topics_words:
        if iter_index < last_item:
            bins_js_text = bins_js_text + '"' + str(MMM) + '_' + str(topic) + '",'
        else:
            bins_js_text = bins_js_text + '"' + str(MMM) + '_' + str(topic) + '"]}, "end_time": "'
        iter_index += 1    

    bins_js_text = bins_js_text + str(MMM + 1) + '/' + str(MMM_last_day[MMM]) + '/' + yyyy + ' 23:59"}'
    
    time_end = time.time()
    hour, rem = divmod(time_end - time_start, 3600)
    minute, second = divmod(rem, 60)
    print('topic_term_matrix_viz - time elapsed - {:0>2}:{:0>2}:{:05.2f}'.format(int(hour), int(minute), second))
    
    return bins_js_text

### <span style="color:blue">Generate Bins.js file</span>

In [None]:
bins_js_text = 'function populate_bins_Instagram_2019(){var bin_data = {'

# INITIALITZATION
# to build a dataframe with doc ids and the new doc ids for visualization (id have to be reasign)
df_export_doc_id = []
# to assign the doc ids for visualization
doc_id_viz = 0

yyyy_MMM = '2019_Jan'
print('----- ' + yyyy_MMM + ' -----')
result = doc_bin_viz(bins_js_text, path_model, yyyy_MMM, doc_id_viz, df_export_doc_id)
bins_js_text = result[0]
doc_id_viz = result[1]
bins_js_text = topic_doc_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = doc_topic_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = topic_term_matrix_viz(bins_js_text, path_model, yyyy_MMM) + ', '

yyyy_MMM = '2019_Feb'
print('----- ' + yyyy_MMM + ' -----')
result = doc_bin_viz(bins_js_text, path_model, yyyy_MMM, doc_id_viz, df_export_doc_id)
bins_js_text = result[0]
doc_id_viz = result[1]
bins_js_text = topic_doc_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = doc_topic_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = topic_term_matrix_viz(bins_js_text, path_model, yyyy_MMM) + ', '

yyyy_MMM = '2019_Mar'
print('----- ' + yyyy_MMM + ' -----')
result = doc_bin_viz(bins_js_text, path_model, yyyy_MMM, doc_id_viz, df_export_doc_id)
bins_js_text = result[0]
doc_id_viz = result[1]
bins_js_text = topic_doc_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = doc_topic_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = topic_term_matrix_viz(bins_js_text, path_model, yyyy_MMM) + ', '

yyyy_MMM = '2019_Apr'
print('----- ' + yyyy_MMM + ' -----')
result = doc_bin_viz(bins_js_text, path_model, yyyy_MMM, doc_id_viz, df_export_doc_id)
bins_js_text = result[0]
doc_id_viz = result[1]
bins_js_text = topic_doc_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = doc_topic_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = topic_term_matrix_viz(bins_js_text, path_model, yyyy_MMM) + ', '

yyyy_MMM = '2019_May'
print('----- ' + yyyy_MMM + ' -----')
result = doc_bin_viz(bins_js_text, path_model, yyyy_MMM, doc_id_viz, df_export_doc_id)
bins_js_text = result[0]
doc_id_viz = result[1]
bins_js_text = topic_doc_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = doc_topic_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = topic_term_matrix_viz(bins_js_text, path_model, yyyy_MMM) + ', '

yyyy_MMM = '2019_Jun'
print('----- ' + yyyy_MMM + ' -----')
result = doc_bin_viz(bins_js_text, path_model, yyyy_MMM, doc_id_viz, df_export_doc_id)
bins_js_text = result[0]
doc_id_viz = result[1]
bins_js_text = topic_doc_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = doc_topic_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = topic_term_matrix_viz(bins_js_text, path_model, yyyy_MMM) + ', '

yyyy_MMM = '2019_Jul'
print('----- ' + yyyy_MMM + ' -----')
result = doc_bin_viz(bins_js_text, path_model, yyyy_MMM, doc_id_viz, df_export_doc_id)
bins_js_text = result[0]
doc_id_viz = result[1]
bins_js_text = topic_doc_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = doc_topic_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = topic_term_matrix_viz(bins_js_text, path_model, yyyy_MMM) + ', '

yyyy_MMM = '2019_Aug'
print('----- ' + yyyy_MMM + ' -----')
result = doc_bin_viz(bins_js_text, path_model, yyyy_MMM, doc_id_viz, df_export_doc_id)
bins_js_text = result[0]
doc_id_viz = result[1]
bins_js_text = topic_doc_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = doc_topic_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = topic_term_matrix_viz(bins_js_text, path_model, yyyy_MMM) + ', '

yyyy_MMM = '2019_Sep'
print('----- ' + yyyy_MMM + ' -----')
result = doc_bin_viz(bins_js_text, path_model, yyyy_MMM, doc_id_viz, df_export_doc_id)
bins_js_text = result[0]
doc_id_viz = result[1]
bins_js_text = topic_doc_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = doc_topic_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = topic_term_matrix_viz(bins_js_text, path_model, yyyy_MMM) + ', '

yyyy_MMM = '2019_Oct'
print('----- ' + yyyy_MMM + ' -----')
result = doc_bin_viz(bins_js_text, path_model, yyyy_MMM, doc_id_viz, df_export_doc_id)
bins_js_text = result[0]
doc_id_viz = result[1]
bins_js_text = topic_doc_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = doc_topic_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = topic_term_matrix_viz(bins_js_text, path_model, yyyy_MMM) + ', '

yyyy_MMM = '2019_Nov'
print('----- ' + yyyy_MMM + ' -----')
result = doc_bin_viz(bins_js_text, path_model, yyyy_MMM, doc_id_viz, df_export_doc_id)
bins_js_text = result[0]
doc_id_viz = result[1]
bins_js_text = topic_doc_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = doc_topic_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = topic_term_matrix_viz(bins_js_text, path_model, yyyy_MMM) + ', '

yyyy_MMM = '2019_Dec'
print('----- ' + yyyy_MMM + ' -----')
result = doc_bin_viz(bins_js_text, path_model, yyyy_MMM, doc_id_viz, df_export_doc_id)
bins_js_text = result[0]
doc_id_viz = result[1]
bins_js_text = topic_doc_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = doc_topic_matrix_viz(bins_js_text, path_model, yyyy_MMM)
bins_js_text = topic_term_matrix_viz(bins_js_text, path_model, yyyy_MMM)

bins_js_text = bins_js_text + '};readBinJSON(bin_data);}'

----- 2019_Jan -----
doc_bin_viz - time elapsed - 00:00:06.12
topic_doc_matrix_viz - time elapsed - 00:00:53.99
doc_topic_matrix_viz - time elapsed - 01:41:15.78
topic_term_matrix_viz - time elapsed - 00:00:00.41
----- 2019_Feb -----
doc_bin_viz - time elapsed - 00:17:31.48
topic_doc_matrix_viz - time elapsed - 00:18:43.07
doc_topic_matrix_viz - time elapsed - 05:08:41.07
topic_term_matrix_viz - time elapsed - 00:00:00.51
----- 2019_Mar -----
doc_bin_viz - time elapsed - 00:45:37.89


  if (await self.run_code(code, result,  async_=asy)):


topic_doc_matrix_viz - time elapsed - 00:24:21.39
doc_topic_matrix_viz - time elapsed - 11:27:32.99
topic_term_matrix_viz - time elapsed - 00:00:00.80
----- 2019_Apr -----
doc_bin_viz - time elapsed - 01:27:41.19


  if (await self.run_code(code, result,  async_=asy)):


topic_doc_matrix_viz - time elapsed - 00:00:00.94
doc_topic_matrix_viz - time elapsed - 18:08:38.52
topic_term_matrix_viz - time elapsed - 00:00:01.00
----- 2019_May -----
doc_bin_viz - time elapsed - 01:54:08.12


  if (await self.run_code(code, result,  async_=asy)):


topic_doc_matrix_viz - time elapsed - 00:00:00.73
doc_topic_matrix_viz - time elapsed - 27:30:53.41
topic_term_matrix_viz - time elapsed - 00:00:01.54
----- 2019_Jun -----
doc_bin_viz - time elapsed - 02:57:31.88


  if (await self.run_code(code, result,  async_=asy)):


topic_doc_matrix_viz - time elapsed - 00:00:01.03


In [None]:
bins_js_text

In [None]:
file_viz = os.path.join(path_viz, 'Bins.js')

with open(file_viz, 'w') as text_file:
    text_file.write(bins_js_text)