# <span style="color:blue">Prepare models for visualization (generation of Doc.js file) - 10 topics</span>

### Load packages

In [1]:
import json

import gensim, os, re, time
import dateutil.parser
import numpy as np
import pandas as pd
from gensim.test.utils import datapath
from numpy import savetxt

### Constants

In [2]:
# for reproducible results
random_state = 33
# if test is True, the process runs on a smaller subset of raw data (json files)
test = False

# The three folders share one data directory; only the bin folder
# ('bins_test' for a test run, 'bins' otherwise) and the leaf folder differ.
#   path_doc  - path where the doc files are stored
#   path_meta - path where the meta files are stored
#   path_viz  - path where all the files related to the visualization of the models are stored
path_doc, path_meta, path_viz = (
    r'D:\master\data science\semestre 4\M2.979 - tfm\data\{}\{}'.format(
        'bins_test' if test else 'bins', leaf_folder)
    for leaf_folder in ('doc', 'meta', 'viz')
)

### Function to remove double quotes (`"`) from the original documents

In [3]:
def documents_content_viz(path_doc, yyyy_MMM):
    """Write a copy of one month's documents with all double quotes removed.

    Reads ``<path_doc>/<yyyy_MMM>/<yyyy_MMM>.csv`` (no header row; the columns
    are taken as ``id_doc`` and ``content``), strips every ``"`` from the
    ``content`` column and saves the result next to the input as
    ``<yyyy_MMM>_viz.csv``, without index or header. The elapsed time is printed.

    Every content value goes through ``str()`` first, so a missing value is
    written out as the text ``nan``.

    Parameters
    ----------
    path_doc : str
        Folder that holds one sub-folder per month.
    yyyy_MMM : str
        Month identifier, used both as sub-folder name and as file stem
        (e.g. ``'2019_Jan'``).
    """
    started_at = time.time()

    month_stem = os.path.join(path_doc, yyyy_MMM, yyyy_MMM)
    documents = pd.read_csv(month_stem + '.csv', names=['id_doc', 'content'])

    # the pattern matches a single double-quote character
    quote_pattern = re.compile(r'(")')

    def strip_quotes(value):
        return quote_pattern.sub('', str(value))

    documents['content'] = documents['content'].map(strip_quotes)
    documents.to_csv(month_stem + '_viz.csv', index=False, header=False)

    # split the elapsed seconds into hours / minutes / seconds for the report
    hours, remainder = divmod(time.time() - started_at, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('documents_content_viz - time elapsed - {:0>2}:{:0>2}:{:05.2f}'.format(int(hours), int(minutes), seconds))

### Function to get documents metadata

In [4]:
def documents_viz(doc_js, path_doc, path_meta, yyyy_MMM):
    """Append one month's documents to the Doc.js source, one entry per document.

    The month's quote-stripped documents
    (``<path_doc>/<yyyy_MMM>/<yyyy_MMM>_viz.csv``) are joined with their
    metadata (``<path_meta>/<yyyy_MMM>/<yyyy_MMM>.csv``) and with the
    ``doc_id_2_doc_id_viz.csv`` mapping, and the resulting entries are appended
    in ascending ``doc_id_viz`` order. Every entry, including the last one, is
    followed by a comma. The elapsed time is printed.

    Note: the mapping file is looked up under the notebook-level ``path_viz``
    constant, which is not a parameter of this function.

    Parameters
    ----------
    doc_js : str
        JavaScript source built so far.
    path_doc : str
        Folder holding one sub-folder per month with the ``_viz.csv`` documents.
    path_meta : str
        Folder holding one sub-folder per month with the metadata CSV.
    yyyy_MMM : str
        Month identifier, used as sub-folder name and file stem (e.g. ``'2019_Jan'``).

    Returns
    -------
    str
        ``doc_js`` followed by the month's entries.
    """
    time_start = time.time()

    file_doc = os.path.join(path_doc, yyyy_MMM, yyyy_MMM)
    df_doc = pd.read_csv(file_doc + '_viz.csv', names=['id_doc', 'content'])

    file_meta = os.path.join(path_meta, yyyy_MMM, yyyy_MMM)
    df_meta = pd.read_csv(file_meta + '.csv', names=['id_meta', 'file', 'author_followers', 'author_full_name', 'author_id', 'author_image', 'author_name', 'author_url', 'date', 'date_from_provider', 'id', 'id_from_provider', 'image_url', 'link', 'location_latitude', 'location_longitude', 'place_country_code', 'place_name', 'place_street_address', 'provider', 'social_likes', 'social_replies'])

    df_merged = pd.merge(left=df_doc, right=df_meta, left_on='id_doc', right_on='id_meta')

    file_viz = os.path.join(path_viz, 'doc_id_2_doc_id_viz')
    df_viz = pd.read_csv(file_viz + '.csv')

    df_merged = pd.merge(left=df_merged, right=df_viz, left_on='id_doc', right_on='doc_id')
    df_merged = df_merged.sort_values(by='doc_id_viz', ascending=True)

    def js_string(value):
        # json.dumps yields a quoted literal with quotes, backslashes and line
        # breaks escaped, so free text cannot break the generated JavaScript;
        # ensure_ascii=False leaves ordinary text exactly as it was written before.
        return json.dumps(str(value), ensure_ascii=False)

    entries = []
    for _, row in df_merged.iterrows():
        parsed_date = dateutil.parser.parse(str(row['date']))
        # Non-padded month/day/hour/minute built from the fields: the '%#m'-style
        # strftime flags that produce this format are only available on Windows.
        tweet_date = '{}/{}/{} {}:{}'.format(parsed_date.month, parsed_date.day, parsed_date.year, parsed_date.hour, parsed_date.minute)
        entries.append(
            '"' + str(int(row['id_doc'])) + '": {"tweet_id": ' + str(int(row['doc_id_viz']))
            + ', "author": ' + js_string(row['author_full_name'])
            + ', "tweet_date": "' + tweet_date + '"'
            + ', "text": ' + js_string(row['content'])
            + ', "author_url": ' + js_string(row['author_url'])
            + '},'
        )
    # one join instead of growing the string inside the loop
    doc_js = doc_js + ''.join(entries)

    time_end = time.time()
    hour, rem = divmod(time_end - time_start, 3600)
    minute, second = divmod(rem, 60)
    print('documents_viz - time elapsed - {:0>2}:{:0>2}:{:05.2f}'.format(int(hour), int(minute), second))

    return doc_js

### Function to get documents metadata (for the last month)

In [5]:
def documents_viz_last(doc_js, path_doc, path_meta, yyyy_MMM):
    """Append the final month's documents to the Doc.js source.

    The month's quote-stripped documents
    (``<path_doc>/<yyyy_MMM>/<yyyy_MMM>_viz.csv``) are joined with their
    metadata (``<path_meta>/<yyyy_MMM>/<yyyy_MMM>.csv``) and with the
    ``doc_id_2_doc_id_viz.csv`` mapping, and the resulting entries are appended
    in ascending ``doc_id_viz`` order. Entries are separated by commas and the
    last entry has no trailing comma, so the object literal can be closed
    right after it. The elapsed time is printed.

    Note: the mapping file is looked up under the notebook-level ``path_viz``
    constant, which is not a parameter of this function.

    Parameters
    ----------
    doc_js : str
        JavaScript source built so far.
    path_doc : str
        Folder holding one sub-folder per month with the ``_viz.csv`` documents.
    path_meta : str
        Folder holding one sub-folder per month with the metadata CSV.
    yyyy_MMM : str
        Month identifier, used as sub-folder name and file stem (e.g. ``'2019_Dec'``).

    Returns
    -------
    str
        ``doc_js`` followed by the month's comma-separated entries.
    """
    time_start = time.time()

    file_doc = os.path.join(path_doc, yyyy_MMM, yyyy_MMM)
    df_doc = pd.read_csv(file_doc + '_viz.csv', names=['id_doc', 'content'])

    file_meta = os.path.join(path_meta, yyyy_MMM, yyyy_MMM)
    df_meta = pd.read_csv(file_meta + '.csv', names=['id_meta', 'file', 'author_followers', 'author_full_name', 'author_id', 'author_image', 'author_name', 'author_url', 'date', 'date_from_provider', 'id', 'id_from_provider', 'image_url', 'link', 'location_latitude', 'location_longitude', 'place_country_code', 'place_name', 'place_street_address', 'provider', 'social_likes', 'social_replies'])

    df_merged = pd.merge(left=df_doc, right=df_meta, left_on='id_doc', right_on='id_meta')

    file_viz = os.path.join(path_viz, 'doc_id_2_doc_id_viz')
    df_viz = pd.read_csv(file_viz + '.csv')

    df_merged = pd.merge(left=df_merged, right=df_viz, left_on='id_doc', right_on='doc_id')
    df_merged = df_merged.sort_values(by='doc_id_viz', ascending=True)

    def js_string(value):
        # json.dumps yields a quoted literal with quotes, backslashes and line
        # breaks escaped, so free text cannot break the generated JavaScript;
        # ensure_ascii=False leaves ordinary text exactly as it was written before.
        return json.dumps(str(value), ensure_ascii=False)

    entries = []
    for _, row in df_merged.iterrows():
        parsed_date = dateutil.parser.parse(str(row['date']))
        # Non-padded month/day/hour/minute built from the fields: the '%#m'-style
        # strftime flags that produce this format are only available on Windows.
        tweet_date = '{}/{}/{} {}:{}'.format(parsed_date.month, parsed_date.day, parsed_date.year, parsed_date.hour, parsed_date.minute)
        entries.append(
            '"' + str(int(row['id_doc'])) + '": {"tweet_id": ' + str(int(row['doc_id_viz']))
            + ', "author": ' + js_string(row['author_full_name'])
            + ', "tweet_date": "' + tweet_date + '"'
            + ', "text": ' + js_string(row['content'])
            + ', "author_url": ' + js_string(row['author_url'])
            + '}'
        )
    # joining with ',' puts a comma between entries but none after the last one
    doc_js = doc_js + ','.join(entries)

    time_end = time.time()
    hour, rem = divmod(time_end - time_start, 3600)
    minute, second = divmod(rem, 60)
    print('documents_viz - time elapsed - {:0>2}:{:0>2}:{:05.2f}'.format(int(hour), int(minute), second))

    return doc_js

### <span style="color:blue">Generate Doc.js file</span>

In [6]:
# Start of the JavaScript source accumulated in doc_js: it opens the function
# populate_tweets_Instagram_2019 and the tweet_data object literal. The cells
# below append the per-document entries and then close both.
doc_js = 'function populate_tweets_Instagram_2019(){var tweet_data ={'

In [7]:
# Process the twelve months of 2019 in calendar order. For each month the
# quote-stripped copy of the documents is written first, then the month's
# entries are appended to doc_js (which must already hold the opening of the
# script). Re-running this cell without resetting doc_js appends the entries again.
months_2019 = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

for month_position, month_abbr in enumerate(months_2019, start=1):
    yyyy_MMM = '2019_' + month_abbr
    print('----- ' + yyyy_MMM + ' -----')
    documents_content_viz(path_doc, yyyy_MMM)
    if month_position < len(months_2019):
        doc_js = documents_viz(doc_js, path_doc, path_meta, yyyy_MMM)
    else:
        # the last month must not leave a trailing comma after its final entry
        doc_js = documents_viz_last(doc_js, path_doc, path_meta, yyyy_MMM)

----- 2019_Jan -----
documents_content_viz - time elapsed - 00:00:02.92
documents_viz - time elapsed - 01:07:52.53
----- 2019_Feb -----
documents_content_viz - time elapsed - 00:00:03.32
documents_viz - time elapsed - 00:00:03.27
----- 2019_Mar -----
documents_content_viz - time elapsed - 00:00:04.20
documents_viz - time elapsed - 00:00:05.06
----- 2019_Apr -----
documents_content_viz - time elapsed - 00:00:04.47
documents_viz - time elapsed - 00:00:06.99
----- 2019_May -----
documents_content_viz - time elapsed - 00:00:05.22
documents_viz - time elapsed - 00:00:05.35
----- 2019_Jun -----
documents_content_viz - time elapsed - 00:00:04.96
documents_viz - time elapsed - 00:00:05.06
----- 2019_Jul -----
documents_content_viz - time elapsed - 00:00:02.45
documents_viz - time elapsed - 00:00:03.19
----- 2019_Aug -----
documents_content_viz - time elapsed - 00:00:03.35
documents_viz - time elapsed - 00:00:04.11
----- 2019_Sep -----
documents_content_viz - time elapsed - 00:00:04.23
document

In [8]:
# Close the tweet_data object literal, pass it to readTweetJSON and end the function.
doc_js += '};readTweetJSON(tweet_data);}'

In [10]:
# Save the assembled script as Doc.js in the visualization folder (UTF-8 text).
file_viz = os.path.join(path_viz, 'Doc.js')

with open(file_viz, mode='w', encoding='utf-8') as text_file:
    text_file.write(doc_js)