In [1]:
import os
import glob
from functools import reduce

import pandas as pd
import numpy as np
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import pathlib

import ReadParameterFile
import NameMapper
import GraphPlotter

In [2]:
# Run configuration loaded through the project's ReadParameterFile helper. The
# dict displayed in the next cell shows the keys used by this notebook: the
# start/end year and month, the input/output directories, the languages,
# the graph types and the leader/party name groups.
params = ReadParameterFile.get_parameter_dict("semiannually-params")
# Font properties built from the SimHei font file (presumably for Chinese
# labels -- TODO confirm; `prop` is not referenced by the other visible cells).
prop = fm.FontProperties(fname="simhei.ttf")

In [3]:
params

{'end_month': '12',
 'end_year': '2017',
 'graph_types': ['positive', 'negative', 'overall'],
 'input_dir': '../temp',
 'languages': ['english', 'chinese', 'mix'],
 'leader_different': ['lim kit siang',
  'lim guan eng',
  'anwar',
  'mahathir',
  'abdul hadi awang',
  'liow tiong lai',
  'subramaniam',
  'najib'],
 'leader_same_bn': ['liow tiong lai',
  'subramaniam',
  'najib',
  'mah siew keong'],
 'leader_same_pakatan': ['lim kit siang',
  'lim guan eng',
  'anwar',
  'mahathir',
  'abdul hadi awang'],
 'output_dir': './results',
 'party_different': ['dap', 'pkr', 'ppbm', 'pas', 'mca', 'umno', 'mic'],
 'party_same_bn': ['mca', 'mic', 'umno', 'pbb', 'gerakan'],
 'party_same_pakatan': ['dap', 'pkr', 'pas', 'ppbm', 'amanah'],
 'start_month': '7',
 'start_year': '2017'}

In [4]:
# Extracted from : https://stackoverflow.com/a/2186555
def list_all_files(root_path, ext=".csv", front="20"):
    """Recursively collect file paths below ``root_path``.

    A file is kept when its base name starts with ``front`` and ends with
    ``ext``. Returned paths are the walked directory joined with the file name.
    """
    return [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(root_path)
        for filename in filenames
        if filename.startswith(front) and filename.endswith(ext)
    ]

In [5]:
def split_files_by_keyword(files, keyword):
    """Bucket ``files`` by substring match.

    ``keyword`` is an iterable of keywords; the result maps every keyword to
    the list of entries of ``files`` that contain it (possibly an empty list).
    A file can land in several buckets.
    """
    return {
        word: [path for path in files if word in path]
        for word in keyword
    }
            

In [6]:
def filter_files_by_time_range(files, start_year=2017, end_year=2017, start_month=7, end_month=12):
    """Group ``files`` by the ``YYYY_MM`` tag of every month in a date range.

    Parameters
    ----------
    files : list of str
        Candidate file paths; a file belongs to a month when the month's
        ``YYYY_MM`` tag occurs anywhere in the path.
    start_year, end_year, start_month, end_month : int or str
        Inclusive bounds of the range. Strings are accepted because the
        parameter file delivers them as text.

    Returns
    -------
    dict
        ``{"YYYY_MM": [matching files]}`` with one key per month, in
        chronological order. Empty when the start lies after the end.

    Unlike the previous version, ranges that cross a year boundary are
    supported instead of silently producing an empty result.
    """
    start_year = int(start_year)
    end_year = int(end_year)
    start_month = int(start_month)
    end_month = int(end_month)

    # Count months on one absolute axis so a range spanning several years is
    # a plain integer range.
    first = start_year * 12 + (start_month - 1)
    last = end_year * 12 + (end_month - 1)
    dates = [
        "{}_{:02d}".format(index // 12, index % 12 + 1)
        for index in range(first, last + 1)
    ]
    return split_files_by_keyword(files, dates)

In [7]:
def flatten_dict(d):
    """Concatenate all values of ``d`` into one new list.

    ``d`` maps keys to sequences; the result holds the items of every value
    in the dict's iteration order. An empty dict yields an empty list.

    The list is built in a single pass instead of re-copying the accumulated
    result for every key.
    """
    return [item for key in d for item in d[key]]

In [8]:
"""
files : array of files

overall is default behavior
"""
# Module-level name mapping loaded through the project's NameMapper helper.
# aggregate_tables and prepare_data_from_source index it with raw names
# (name_dict[n]) and use the looked-up value as the key of their results.
name_dict = NameMapper.map_name_from_file("name_mapping.csv")
def aggregate_tables(files, positive=False, negative=False, names=None):
    """Accumulate the per-name value columns of several CSV files.

    Parameters
    ----------
    files : iterable of str
        Paths readable by ``pd.read_csv``. Every file needs a ``name`` and a
        ``scale`` column. Everything from the third column onwards is treated
        as the values to add up (assumes ``name`` and ``scale`` are the first
        two columns -- TODO confirm against the CSV producer).
    positive : bool
        Also accumulate, under ``'positive'``, the first row of each name
        whose ``scale`` equals 1.
    negative : bool
        Also accumulate, under ``'negative'``, the first row of each name
        whose ``scale`` equals 2.
    names : dict, optional
        Maps raw ``name`` values to the keys of the result. Defaults to the
        module-level ``name_dict``.

    Returns
    -------
    dict
        ``{mapped_name: {'overall': values, ...}}``. ``'overall'`` is always
        present and is the element-wise sum of the first two rows of a name
        in each file, summed over all files.

    Raises
    ------
    KeyError
        A name in a file is missing from the mapping.
    IndexError
        A name has fewer than two rows, or lacks the requested scale row.
    """
    if names is None:
        names = name_dict

    ret_dict = {}
    for f in files:
        df = pd.read_csv(f, sep=',')
        # dict.fromkeys de-duplicates while keeping first-appearance order,
        # so the key order of the result is reproducible between runs.
        for n in dict.fromkeys(df['name'].tolist()):
            # Results are keyed by the mapped name only; the raw name is
            # never used as a key.
            entry = ret_dict.setdefault(names[n], {})
            is_name = df['name'] == n

            # `.values` replaces DataFrame.as_matrix(), removed in pandas 1.0.
            rows = df.loc[is_name].values
            _accumulate_values(entry, 'overall', rows[0][2:] + rows[1][2:])

            if positive:
                positive_rows = df.loc[is_name & (df['scale'] == 1)].values
                _accumulate_values(entry, 'positive', positive_rows[0][2:])

            if negative:
                negative_rows = df.loc[is_name & (df['scale'] == 2)].values
                _accumulate_values(entry, 'negative', negative_rows[0][2:])

    return ret_dict


def _accumulate_values(entry, kind, values):
    """Add ``values`` element-wise onto ``entry[kind]``, creating it if absent."""
    if kind in entry:
        entry[kind] = entry[kind] + values
    else:
        entry[kind] = values
                

In [9]:
"""
arr : np.array

returns an array of 5 values --> 5 weeks
"""
def summarize_values_to_week(arr):
    """Collapse one month of daily values into five weekly totals.

    ``arr`` must support slicing, and its slices must offer ``.sum()`` (e.g. a
    numpy array). The result is a list of five sums covering positions 0-6,
    7-13, 14-20, 21-27 and the remainder 28-30.
    """
    week_bounds = ((0, 7), (7, 14), (14, 21), (21, 28), (28, 31))
    return [arr[first:stop].sum() for first, stop in week_bounds]

In [10]:
"""
marr : array of dicts

"""
def flatten_map(marr):
    """Transpose a list of dicts into a list of single-key dicts.

    For every key of the first dict in ``marr`` the result contains one
    ``{key: [d[key] for each dict d in marr]}`` entry, in the first dict's key
    order. Every dict in ``marr`` must provide all keys of the first one.
    """
    return [
        {key: [mapping[key] for mapping in marr]}
        for key in marr[0]
    ]

In [11]:
def flatten_inner_array(x):
    """Flatten the nested value lists of ``x`` in place and return ``x``.

    ``x`` is a list of single-key dicts whose value is a list of dicts. That
    value is first transposed with ``flatten_map``; each resulting single-key
    dict then has its list of sequences concatenated into one flat Python
    list.
    """
    for outer in x:
        outer_key = list(outer.keys())[0]
        outer[outer_key] = flatten_map(outer[outer_key])
        for inner in outer[outer_key]:
            inner_key = list(inner.keys())[0]
            joined = np.concatenate(inner[inner_key])
            inner[inner_key] = joined.ravel().tolist()
    return x

In [12]:
def flatten_array_to_map(marr):
    """Merge a sequence of dicts into one new dict.

    Later dicts win on duplicate keys; key order follows first appearance.
    An empty sequence yields ``{}`` (the previous ``reduce``-based version
    raised ``TypeError`` there), and none of the input dicts is modified
    (previously the first one was updated in place and returned).
    """
    merged = {}
    for mapping in marr:
        merged.update(mapping)
    return merged

In [13]:
# Unfiltered result of the most recent load, or None when nothing is cached.
prep_data_cache = None
# Arguments that produced prep_data_cache; a call with different arguments
# reloads instead of reusing data that belongs to another language/candidate.
_prep_data_cache_key = None


def prepare_data_from_source(input_dir, languages, pref_language, candidate, graph_types, filters=None, clear_cache=False):
    """Load, aggregate and optionally filter the plot data.

    Parameters
    ----------
    input_dir : str
        Directory searched recursively by ``list_all_files``.
    languages : list of str
        Keywords used to split the files by language.
    pref_language : str
        The entry of ``languages`` whose files are used.
    candidate : str
        Keyword selecting the file family (the callers pass ``'leader'`` or
        ``'party'``).
    graph_types : list of str
        Keys of each aggregated table that are summarised to weekly values.
    filters : list of str, optional
        Raw names to keep; each is translated through the module-level
        ``name_dict``. Names absent from the data are skipped. ``None`` or an
        empty list keeps everything.
    clear_cache : bool
        Force a reload from disk.

    Returns
    -------
    dict
        ``{mapped_name: {graph_type: flat list of values}}``. Always a new
        dict, so callers cannot corrupt the cache through it.

    The cache keeps the *unfiltered* data together with the arguments that
    produced it, so later calls may use different ``filters`` and still get
    correct results. It does not notice a rebinding of ``name_dict``; pass
    ``clear_cache=True`` (or call ``clear_prep_data_cache``) after changing it.
    """
    global prep_data_cache, _prep_data_cache_key

    cache_key = (input_dir, tuple(languages), pref_language, candidate, tuple(graph_types))
    if clear_cache or cache_key != _prep_data_cache_key:
        prep_data_cache = None

    if not prep_data_cache:
        prep_data_cache = _load_prepared_data(input_dir, languages, pref_language, candidate, graph_types)
        _prep_data_cache_key = cache_key

    return _select_names(prep_data_cache, filters)


def _load_prepared_data(input_dir, languages, pref_language, candidate, graph_types):
    """Read every matching CSV and reshape it to ``{name: {graph_type: values}}``."""
    files = list_all_files(input_dir)
    by_language = split_files_by_keyword(files, languages)
    by_candidate = split_files_by_keyword(by_language[pref_language], [candidate])
    # NOTE(review): the time range is not passed on, so the defaults of
    # filter_files_by_time_range apply rather than a configured range.
    by_month = filter_files_by_time_range(by_candidate[candidate])

    tables = [aggregate_tables(by_month[month], positive=True, negative=True) for month in by_month]
    for table in tables:
        for name in table:
            for kind in graph_types:
                table[name][kind] = summarize_values_to_week(table[name][kind])

    # One single-key dict per name, with the monthly values concatenated.
    per_name = flatten_inner_array(flatten_map(tables))
    merged = flatten_array_to_map(per_name)
    for name in merged:
        merged[name] = flatten_array_to_map(merged[name])
    return merged


def _select_names(data, filters):
    """Return a new dict holding the entries of ``data`` selected by ``filters``."""
    if not filters:
        return dict(data)
    selected = {}
    for raw_name in filters:
        mapped_name = name_dict[raw_name]
        if mapped_name in data:
            selected[mapped_name] = data[mapped_name]
    return selected


def clear_prep_data_cache():
    """Drop the cached data so the next prepare_data_from_source call reloads."""
    global prep_data_cache, _prep_data_cache_key
    prep_data_cache = None
    _prep_data_cache_key = None

In [14]:
#filtered_dict = prepare_data_from_source(params['input_dir'], params['languages'], 'english', 'leader', params['graph_types'])


In [15]:
def plot_semiannual_graph(name, data, category, plot_kind="overall", save=False, show=True, suffix=None):
    """Draw one line per entry of ``data`` over a weekly x-axis.

    Parameters
    ----------
    name : str
        Title prefix; the figure title is ``name + " " + plot_kind``.
    data : dict
        ``{label: {plot_kind: sequence of values}}``; each label becomes a line.
    category : str
        Used for the output sub-directory and as the file name prefix.
    plot_kind : str
        Which series of every entry to plot.
    save : bool
        Write a PNG below ``params['output_dir']/semiannual_plot/<run>/<category>``
        (reads the module-level ``params``) and print its file name.
    show : bool
        Call ``plt.show()``.
    suffix : str, optional
        Appended to the file name. ``"chinese"`` switches the global font
        family to ``simhei``; any other string switches it to ``sans-serif``.

    Every open figure is closed before returning.
    """
    if isinstance(suffix, str):
        if suffix == "chinese":
            plt.rcParams["font.family"] = "simhei"
        else:
            plt.rcParams["font.family"] = "sans-serif"

    weeks_per_month = 5
    series = {label: data[label][plot_kind] for label in data}
    # Size the axis from the data; fall back to six months when there is none.
    n_points = max((len(values) for values in series.values()), default=6 * weeks_per_month)
    weeks = list(range(1, n_points + 1))
    # Label the first week of every month "W1" and the following ones 2..5.
    positions = [(week - 1) % weeks_per_month + 1 for week in weeks]
    tick_labels = ["W1" if position == 1 else str(position) for position in positions]

    dpi = 96
    fig, ax = plt.subplots(figsize=(1366 / dpi, 768 / dpi), dpi=dpi)
    # set_prop_cycle replaces Axes.set_color_cycle, removed in matplotlib 2.0.
    ax.set_prop_cycle(color=['green', 'red', 'blue', 'purple', 'yellow', 'gray', 'orange', 'magenta'])

    ax.set_xlabel("Week")
    ax.set_ylabel("Polarity")
    ax.set_xticks(weeks)
    ax.set_xticklabels(tick_labels)

    graph_title = name + " " + plot_kind
    fig.suptitle(graph_title, fontsize=20)

    for label, values in series.items():
        ax.plot(range(1, len(values) + 1), values, label=label)

    ax.legend()

    if save:
        root_dir = params['output_dir']
        graph_dir = "semiannual_plot"
        current_run_dir = params['start_year'] + "-" + params['start_month'] + "__" + params['end_year'] + "-" + params['end_month']
        endpath = os.path.join(root_dir, graph_dir, current_run_dir, category)

        pathlib.Path(endpath).mkdir(parents=True, exist_ok=True)
        filename = category + "_" + plot_kind
        if isinstance(suffix, str):
            filename += "_" + suffix
        filename += ".png"
        print(filename)
        fig.savefig(os.path.join(endpath, filename), dpi=dpi)

    if show:
        plt.show()

    plt.close('all')
    

In [16]:
def plot_helper(lang):
    """Save the overall/positive/negative plots of every category for ``lang``.

    For each category the data is reloaded through ``prepare_data_from_source``
    (filtered by the matching name list in ``params``) and handed to
    ``GraphPlotter.plot_semiannual_graph`` once per plot kind, with
    ``save=True`` and ``show=False``. Reads the module-level ``params``.

    ``lang`` is one of the configured languages; it selects the input files
    and is passed on as the ``suffix`` of the plot call.
    """
    date = params["start_year"] + "-" + params["start_month"] + " -- " + params["end_year"] + "-" + params["end_month"]

    # (key of the name list in params, file keyword, title prefix)
    plot_specs = [
        ("leader_different", "leader", "Different coalition leader"),
        ("party_different", "party", "Different coalition party"),
        ("party_same_bn", "party", "BN party coalition"),
        ("party_same_pakatan", "party", "Pakatan party coalition"),
        ("leader_same_bn", "leader", "BN leaders coalition"),
        ("leader_same_pakatan", "leader", "Pakatan leaders coalition"),
    ]
    plot_kinds = ("overall", "positive", "negative")

    for cat_type, candidate, title in plot_specs:
        filtered_dict = prepare_data_from_source(
            params['input_dir'], params['languages'], lang, candidate,
            params['graph_types'], params[cat_type], clear_cache=True)
        # Every title carries the date range; "Different coalition party"
        # used to be the only one without it.
        graph_name = title + " " + date
        for plot_kind in plot_kinds:
            GraphPlotter.plot_semiannual_graph(
                graph_name, filtered_dict, cat_type, plot_kind=plot_kind,
                save=True, show=False, suffix=lang, params=params)

In [17]:
# The English and mixed-language runs use the default name mapping.
name_dict = NameMapper.map_name_from_file("name_mapping.csv")
plot_helper("english")
plot_helper("mix")
# Rebind the module-level name_dict before the Chinese run: aggregate_tables
# and prepare_data_from_source read it as a global, so they pick up this
# mapping on their next call. name_dict stays bound to it afterwards.
name_dict = NameMapper.map_name_from_file("name_mapping_cn.csv")
plot_helper("chinese")



leader_different_overall_english.png
leader_different_positive_english.png
leader_different_negative_english.png
party_different_overall_english.png
party_different_positive_english.png
party_different_negative_english.png
party_same_bn_overall_english.png
party_same_bn_positive_english.png
party_same_bn_negative_english.png
party_same_pakatan_overall_english.png
party_same_pakatan_positive_english.png
party_same_pakatan_negative_english.png
leader_same_bn_overall_english.png
leader_same_bn_positive_english.png
leader_same_bn_negative_english.png
leader_same_pakatan_overall_english.png
leader_same_pakatan_positive_english.png
leader_same_pakatan_negative_english.png
leader_different_overall_mix.png
leader_different_positive_mix.png
leader_different_negative_mix.png
party_different_overall_mix.png
party_different_positive_mix.png
party_different_negative_mix.png
party_same_bn_overall_mix.png
party_same_bn_positive_mix.png
party_same_bn_negative_mix.png
party_same_pakatan_overall_mix.png