# <span style="color:blue">Prepare models for visualization (generation of TopicSimilarity.js file) - 10 topics</span>

### Load packages

In [1]:
import gensim, os, time
import dateutil.parser
import numpy as np
import pandas as pd
from gensim.test.utils import datapath
from numpy import savetxt

### Constants

In [2]:
# seed kept for reproducible results
random_state = 33
# True  -> the process runs on a smaller subset of the raw data (json files)
# False -> the process runs on the full data set
test = False

# both output folders live under the same data folder; a test run uses 'bins_test' instead of 'bins'
_bins_dir = r'D:\master\data science\semestre 4\M2.979 - tfm\data' + '\\' + ('bins_test' if test else 'bins')
# path where the model files are stored
path_model = _bins_dir + r'\model'
# path where all the files related to the visualization of the models are stored
path_viz = _bins_dir + r'\viz'

# year whose monthly models are processed
yyyy = 2019
# number of topics requested from each model and number of top words taken per topic
num_topics = 10
num_words = 10

### Function to get the node (topic) weights

In [3]:
def nodes_viz(topicsimilarity_js, path_model, yyyy):
    """Append one JSON node object per (month, topic) to ``topicsimilarity_js``.

    For every month of ``yyyy`` the file
    ``<path_model>/<yyyy>_<Mon>/model_document_topic_matrix.csv`` is read and the
    number of rows per value of its ``Topic`` column is counted. Each topic
    becomes ``{"name": "<month index>_<topic>", "value": <v>}`` where ``v`` is the
    topic's row count expressed as an integer percentage (truncated) of the
    largest count of that month, so the biggest topic of a month always gets 100.

    Parameters
    ----------
    topicsimilarity_js : str
        Text accumulated so far; the node objects are appended to it.
    path_model : str
        Directory holding one ``<yyyy>_<Mon>`` sub-directory per month.
    yyyy : int or str
        Year used to build the sub-directory names.

    Returns
    -------
    str
        ``topicsimilarity_js`` followed by the comma-separated node objects
        (never with a trailing comma).

    Raises
    ------
    ValueError
        If the CSV of a month contains no row with a topic.
    """
    time_start = time.time()

    yyyy_str = str(yyyy)

    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    nodes = []
    for month_index, month in enumerate(months):
        file_doc_topic_matrix = os.path.join(path_model, yyyy_str + '_' + month, 'model_document_topic_matrix.csv')
        doc_topics = pd.read_csv(file_doc_topic_matrix)

        # number of rows per topic, ordered by ascending topic id
        topic_counts = doc_topics.groupby('Topic', sort=True).size()
        if topic_counts.empty:
            raise ValueError('no topics found in ' + file_doc_topic_matrix)

        # read the maximum by value instead of by position in a label-indexed row
        count_max = topic_counts.max()

        for topic, count in topic_counts.items():
            # weight relative to the most populated topic of the month (0-100, truncated)
            value = int(100 * count / count_max)
            nodes.append('{"name": "' + str(month_index) + '_' + str(int(topic)) + '", "value": ' + str(value) + '}')

    time_end = time.time()
    hour, rem = divmod(time_end - time_start, 3600)
    minute, second = divmod(rem, 60)
    # the label text is kept unchanged so existing logs/outputs still match
    print('documents_viz - time elapsed - {:0>2}:{:0>2}:{:05.2f}'.format(int(hour), int(minute), second))

    # join() places the separators, so no "is this the last element" bookkeeping is needed
    return topicsimilarity_js + ','.join(nodes)

### Function to get the links between nodes (topics)

In [4]:
def _topic_word_sets(path_model, yyyy_str, month):
    """Load the LDA model of one month and return ``[(topic_id, set_of_top_words), ...]``."""
    file_model = os.path.join(path_model, yyyy_str + '_' + month, 'model')
    # NOTE(review): datapath() comes from gensim.test.utils and is kept from the
    # original code - confirm it is still wanted for these model paths.
    model = gensim.models.ldamodel.LdaModel.load(datapath(file_model))
    # num_topics / num_words are the notebook-level constants
    shown = model.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)
    # keep only the word of every (word, weight) entry; a set makes the overlap count cheap
    return [(topic_id, {entry[0] for entry in entries}) for topic_id, entries in shown]


def _node_id(month_index, topic):
    """Return the numeric node id written to the file: month index and topic id concatenated.

    Going through ``int`` drops the leading zero of month 0 (``"03"`` -> ``"3"``),
    which is what the original ``lstrip('0')`` achieved for the source ids.
    """
    return str(int(str(month_index) + str(topic)))


def links_viz(topicsimilarity_js, path_model, yyyy):
    """Append the JSON link objects between topics of consecutive months.

    The model stored in ``<path_model>/<yyyy>_<Mon>/model`` is loaded once for
    every month. For each topic of a month, the topic of the following month
    sharing the largest number of top words is selected (the first one wins on
    ties). When at least one word is shared, the object
    ``{"source": <id>, "target": <id>, "value": <shared words>}`` is emitted,
    where an id is the month index followed by the topic id.

    Relies on the notebook-level constants ``num_topics`` and ``num_words``.

    Parameters
    ----------
    topicsimilarity_js : str
        Text accumulated so far; the link objects are appended to it.
    path_model : str
        Directory holding one ``<yyyy>_<Mon>`` sub-directory per month.
    yyyy : int or str
        Year used to build the sub-directory names.

    Returns
    -------
    str
        ``topicsimilarity_js`` followed by the comma-separated link objects
        (never with a trailing comma, even if the last topic has no link).
    """
    time_start = time.time()

    yyyy_str = str(yyyy)

    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    # every model is loaded a single time, although it serves as target and as source
    topics_by_month = [_topic_word_sets(path_model, yyyy_str, month) for month in months]

    links = []
    for month_index, (topics_current, topics_next) in enumerate(zip(topics_by_month, topics_by_month[1:])):
        for topic, words in topics_current:
            best_topic, best_overlap = None, 0
            for topic_next, words_next in topics_next:
                # number of top words shared by the two topics
                overlap = len(words & words_next)
                # strict comparison: the first best candidate is kept on ties
                if overlap > best_overlap:
                    best_topic, best_overlap = topic_next, overlap

            # topics without any shared word produce no link
            if best_overlap > 0:
                links.append('{"source": ' + _node_id(month_index, topic)
                             + ', "target": ' + _node_id(month_index + 1, best_topic)
                             + ', "value": ' + str(best_overlap) + '}')

    time_end = time.time()
    hour, rem = divmod(time_end - time_start, 3600)
    minute, second = divmod(rem, 60)
    # the label text is kept unchanged so existing logs/outputs still match
    print('documents_viz - time elapsed - {:0>2}:{:0>2}:{:05.2f}'.format(int(hour), int(minute), second))

    return topicsimilarity_js + ','.join(links)

### <span style="color:blue">Generate TopicSimilarity.js file</span>

In [5]:
# opening of the generated JavaScript: declares the function, the sim_data object and starts its "nodes" array
# NOTE(review): the year in the function name is hard-coded while the data year comes from yyyy - confirm they must match
topicsimilarity_js = 'function populate_similarity_Instagram_2019(){var sim_data = {"nodes": ['

In [6]:
# append the node objects (one per month and topic) to the text built so far
topicsimilarity_js = nodes_viz(topicsimilarity_js, path_model, yyyy)

documents_viz - time elapsed - 00:00:02.31


In [7]:
# close the "nodes" array and open the "links" array
topicsimilarity_js += '], "links": ['

In [8]:
# append the link objects between topics of consecutive months
topicsimilarity_js = links_viz(topicsimilarity_js, path_model, yyyy)

documents_viz - time elapsed - 00:00:01.31


In [9]:
# close the "links" array and the sim_data object, call the reader and end the function
topicsimilarity_js += ']};readSimilarityJSON(sim_data);}'

In [10]:
# display the generated JavaScript text for a visual check
topicsimilarity_js

'function populate_similarity_Instagram_2019(){var sim_data = {"nodes": [{"name": "0_0", "value": 8},{"name": "0_1", "value": 8},{"name": "0_2", "value": 8},{"name": "0_3", "value": 10},{"name": "0_4", "value": 6},{"name": "0_5", "value": 4},{"name": "0_6", "value": 9},{"name": "0_7", "value": 100},{"name": "0_8", "value": 4},{"name": "0_9", "value": 1},{"name": "1_0", "value": 7},{"name": "1_1", "value": 7},{"name": "1_2", "value": 100},{"name": "1_3", "value": 12},{"name": "1_4", "value": 8},{"name": "1_5", "value": 5},{"name": "1_6", "value": 6},{"name": "1_7", "value": 5},{"name": "1_8", "value": 6},{"name": "1_9", "value": 5},{"name": "2_0", "value": 7},{"name": "2_1", "value": 2},{"name": "2_2", "value": 4},{"name": "2_3", "value": 13},{"name": "2_4", "value": 5},{"name": "2_5", "value": 4},{"name": "2_6", "value": 9},{"name": "2_7", "value": 9},{"name": "2_8", "value": 100},{"name": "2_9", "value": 3},{"name": "3_0", "value": 100},{"name": "3_1", "value": 9},{"name": "3_2", "val

In [11]:
# make sure the visualization folder exists before writing into it
os.makedirs(path_viz, exist_ok=True)
file_viz = os.path.join(path_viz, 'TopicSimilarity.js')

# explicit encoding so the generated file does not depend on the platform default
with open(file_viz, 'w', encoding='utf-8') as text_file:
    text_file.write(topicsimilarity_js)