In [6]:
'''
STATIC DATA FETCHER

# -----------------------------
Fetch the saved html files for each type of chart
'''
# ---------------------
# Note: set base_path so that the relative paths to the chart folders resolve correctly.
# ---------------------

import os
import sys
import pandas as pd
from glob import glob 
from tqdm import tqdm
import multiprocessing as MP
from joblib import Parallel, delayed


class static_data_fetcher():
    """Read the cached HTML chart fragments stored on disk for one record.

    Every chart type has its own folder under ``base_path``; inside that
    folder the fragments live in ``htmlCache_<sub_DIR>``.

    The glob patterns used here contain ``**`` but are evaluated without
    ``recursive=True``, so ``**`` matches like a single ``*`` (it never
    crosses a directory separator).
    """

    # Failures that mean "this chart is not available" for the best-effort
    # chart types. Anything else (KeyboardInterrupt, programming errors)
    # is allowed to propagate instead of being silently turned into None.
    _UNAVAILABLE_ERRORS = (OSError, UnicodeError)

    def __init__(self, base_path = './'):
        """
        :param base_path: directory that contains the per-chart-type folders.
        """
        # chart type -> folder (relative to base_path) that holds its cache
        self.chart_locationDir = {
            'companyNetworkViz': 'companyNetworkViz',
            'stackedComparison': 'stackedComparison',
            'EmbViz_all' : 'EmbViz_all',
            'HSCodeViz' : 'HSCodeViz',
            'TimeSeries' : 'TimeSeries',
            'sankeyDiagram' : 'sankeyDiagram'
        }
        self.base_path = base_path
        self.htmlCache_dir_name = 'htmlCache'

    @staticmethod
    def _read_text(path):
        """Return the full text of ``path``; the handle is closed even on error."""
        with open(path, 'r') as fh:
            return fh.read()

    def _read_first_match(self, pattern):
        """Return the text of the alphabetically first file matching ``pattern``.

        Returns None when nothing matches or the file cannot be read.
        """
        matches = sorted(glob(pattern))
        if not matches:
            return None
        try:
            return self._read_text(matches[0])
        except self._UNAVAILABLE_ERRORS:
            return None

    def _read_party_pair(self, _dir, PanjivaRecordID):
        """Return ``{'Consignee': html, 'Shipper': html}``, or None unless both load.

        NOTE(review): the record id is only anchored at the start of the file
        name, so an id that is a prefix of a longer id also matches the longer
        id's files. Confirm the naming scheme makes this impossible.
        """
        pair = {}
        for party in ('Consignee', 'Shipper'):
            pattern = os.path.join(_dir, '{}**{}**.html'.format(PanjivaRecordID, party))
            html = self._read_first_match(pattern)
            if html is None:
                # All-or-nothing: a partial pair is reported as unavailable.
                return None
            pair[party] = html
        return pair

    def fetch_saved_html(self, PanjivaRecordID, sub_DIR='01_2016'):
        """Collect the saved HTML of every chart type for one record.

        :param PanjivaRecordID: record id; anything accepted by ``int()``.
        :param sub_DIR: the epoch; selects the ``htmlCache_<sub_DIR>`` folder.
        :return: dict keyed by chart type, in ``chart_locationDir`` order:
            * 'stackedComparison' -> {domain: html}, possibly empty
            * 'sankeyDiagram'     -> {'Sankey Diagram Type <t>': html}, possibly empty
            * 'EmbViz_all', 'companyNetworkViz' -> html string, or None
            * 'TimeSeries', 'HSCodeViz' -> {'Consignee': html, 'Shipper': html}, or None
        :raises ValueError: if ``PanjivaRecordID`` cannot be converted to int.
        :raises OSError: if a matched 'stackedComparison' or 'sankeyDiagram'
            file cannot be read (these two types are not best-effort).
        """
        result = {}
        PanjivaRecordID = int(PanjivaRecordID)
        for figure_type, loc in self.chart_locationDir.items():
            _dir = os.path.join(self.base_path, loc, self.htmlCache_dir_name + '_' + sub_DIR)

            if figure_type == 'stackedComparison':
                # One sub-folder per record; the domain is the part of the
                # file stem after the last '__'.
                record_dir = os.path.join(_dir, str(PanjivaRecordID))
                charts = {}
                for f in glob(os.path.join(record_dir, '**.html')):
                    domain = os.path.basename(f).split('.')[0].split('__')[-1]
                    charts[domain] = self._read_text(f)
                result[figure_type] = charts

            elif figure_type == 'sankeyDiagram':
                # Several diagram types per record; the type is the last
                # '_'-separated token of the file stem.
                charts = {}
                for f in glob(os.path.join(_dir, '**_{}_**.html'.format(PanjivaRecordID))):
                    _type = os.path.basename(f).split('.')[0].split('_')[-1]
                    charts['Sankey Diagram Type {}'.format(_type)] = self._read_text(f)
                result[figure_type] = charts

            elif figure_type == 'EmbViz_all':
                result[figure_type] = self._read_first_match(
                    os.path.join(_dir, '**_{}.html'.format(PanjivaRecordID)))

            elif figure_type == 'companyNetworkViz':
                result[figure_type] = self._read_first_match(
                    os.path.join(_dir, '**_{}_**.html'.format(PanjivaRecordID)))

            elif figure_type in ('TimeSeries', 'HSCodeViz'):
                result[figure_type] = self._read_party_pair(_dir, PanjivaRecordID)

        return result
         
'''
# SAMPLE CALL
'''

# obj = static_data_fetcher(base_path='./')
# result = obj.fetch_saved_html(120901356)

In [8]:
_list =  [120901356,
121983692,
121896803,
121682646,
121852671,
121048913,
121465508,
121684779,
121097684,
121973404,
121211441]

In [11]:
# Create the fetcher in this cell so it runs on a fresh kernel; the only
# other definition of `obj` in the notebook is a commented-out sample call.
# NOTE(review): base_path='./' mirrors that sample call; adjust it if the
# chart folders live elsewhere.
obj = static_data_fetcher(base_path='./')

for record_id in _list:
    # `result` is left bound to the last record on purpose: a later cell
    # inspects it.
    result = obj.fetch_saved_html(record_id)
    # Print the type of each chart entry (str / dict / NoneType) per record.
    print(record_id, [(k, type(v)) for k, v in result.items()])

120901356 [('companyNetworkViz', <class 'str'>), ('stackedComparison', <class 'dict'>), ('EmbViz_all', <class 'str'>), ('HSCodeViz', <class 'dict'>), ('TimeSeries', <class 'dict'>), ('sankeyDiagram', <class 'dict'>)]
121983692 [('companyNetworkViz', <class 'str'>), ('stackedComparison', <class 'dict'>), ('EmbViz_all', <class 'str'>), ('HSCodeViz', <class 'dict'>), ('TimeSeries', <class 'dict'>), ('sankeyDiagram', <class 'dict'>)]
121896803 [('companyNetworkViz', <class 'str'>), ('stackedComparison', <class 'dict'>), ('EmbViz_all', <class 'str'>), ('HSCodeViz', <class 'dict'>), ('TimeSeries', <class 'dict'>), ('sankeyDiagram', <class 'dict'>)]
121682646 [('companyNetworkViz', <class 'str'>), ('stackedComparison', <class 'dict'>), ('EmbViz_all', <class 'str'>), ('HSCodeViz', <class 'dict'>), ('TimeSeries', <class 'dict'>), ('sankeyDiagram', <class 'dict'>)]
121852671 [('companyNetworkViz', <class 'str'>), ('stackedComparison', <class 'dict'>), ('EmbViz_all', <class 'str'>), ('HSCodeViz',

In [15]:
# Display the raw HTML stored under the 'Sankey Diagram Type 2' key.
# NOTE(review): `result` is presumably whatever the fetch loop assigned last
# (i.e. the final id in `_list`); confirm no other cell rebinds it.
result['sankeyDiagram']['Sankey Diagram Type 2']

'<div>            <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js?config=TeX-AMS-MML_SVG"></script><script type="text/javascript">if (window.MathJax) {MathJax.Hub.Config({SVG: {font: "STIX-Web"}});}</script>                <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: \'local\'};</script>\n        <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>                <div id="65c87b7f-a166-44f9-b190-ce7fc122b52b" class="plotly-graph-div" style="height:600px; width:100%;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("65c87b7f-a166-44f9-b190-ce7fc122b52b")) {                    Plotly.newPlot(                        "65c87b7f-a166-44f9-b190-ce7fc122b52b",                        [{"link": {"source": [313, 199, 178, 340, 195, 118, 352, 353, 295, 277, 156, 170, 293, 317, 238, 200, 218, 151