In [139]:
import os
import glob

import pandas as pd
import numpy as np

import ReadParameterFile

In [83]:
# Load the run configuration from the "semiannually-params" parameter file.
# The resulting dict (displayed in the next cell) holds the date range, the
# language list, the party/leader lists and the input/output directories.
params = ReadParameterFile.get_parameter_dict("semiannually-params")

In [84]:
params

{'end_month': '12',
 'end_year': '2017',
 'graph_types': ['positive', 'negative', 'overall'],
 'input_dir': '../temp',
 'languages': ['english', 'chinese', 'mix'],
 'leader_different': ['lim kit siang',
  'lim guan eng',
  'anwar',
  'mahathir',
  'abdul haji awang',
  'liow tiong lai',
  'subramaniam',
  'najib razak'],
 'leader_same_bn': ['liow tiong lai',
  'subramaniam',
  'najib razak',
  'mah siew keong'],
 'leader_same_pakatan': ['lim kit siang',
  'lim guan eng',
  'anwar',
  'mahathir',
  'abdul hadi awang'],
 'output_dir': '../results',
 'party_different': ['dap', 'pkr', 'ppbm', 'pas', 'mca', 'umno', 'mic'],
 'party_same_bn': ['mca', 'mic', 'umno', 'pbb', 'gerakan'],
 'party_same_pakatan': ['dap', 'pkr', 'pas', 'ppbm', 'amanah'],
 'start_month': '7',
 'start_year': '2017'}

In [85]:
# Extracted from : https://stackoverflow.com/a/2186555
def list_all_files(root_path, ext=".csv", front="20"):
    """Recursively collect file paths below ``root_path``.

    root_path : directory to walk (sub-directories included)
    ext       : required file-name suffix
    front     : required file-name prefix

    Returns a list of paths (directory joined with file name) for every file
    whose name starts with ``front`` and ends with ``ext``.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root_path)
        for name in names
        if name.startswith(front) and name.endswith(ext)
    ]

In [102]:
def split_files_by_keyword(files, keyword):
    """Group file paths by the keywords that occur in them.

    files   : iterable of path strings
    keyword : list of keyword strings, or a single keyword string

    Returns a dict with one entry per keyword; each value is the list of
    paths that contain the keyword as a substring (anywhere in the path,
    directories included). A path may appear under several keywords, and a
    keyword without matches maps to an empty list.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(keyword, str):
        keyword = [keyword]
    # Materialize once so a generator of paths can be scanned for every keyword.
    files = list(files)
    return {kw: [f for f in files if kw in f] for kw in keyword}
            

In [108]:
def filter_files_by_time_range(files, start_year=2017, end_year=2017, start_month=7, end_month=12):
    """Group file paths by month for every month in an inclusive time range.

    files       : list of path strings
    start_year  : first year of the range (int or numeric string)
    end_year    : last year of the range (int or numeric string)
    start_month : first month, 1-12, within ``start_year``
    end_month   : last month, 1-12, within ``end_year``

    Returns a dict keyed by "YYYY_MM" (zero-padded month), one key per month
    from start to end inclusive, each mapping to the paths that contain that
    key. The range may span several years. A start that lies after the end
    gives an empty dict.

    Raises ValueError if a value cannot be converted to int or a month is
    outside 1-12.
    """
    # Parameter files deliver strings, so normalize everything to int first.
    start_year = int(start_year)
    end_year = int(end_year)
    start_month = int(start_month)
    end_month = int(end_month)

    for month in (start_month, end_month):
        if not 1 <= month <= 12:
            raise ValueError("month must be in 1..12, got {}".format(month))

    # Number the months on one linear axis so that ranges crossing a year
    # boundary are enumerated the same way as ranges inside a single year.
    first = start_year * 12 + (start_month - 1)
    last = end_year * 12 + (end_month - 1)

    dates = []
    for index in range(first, last + 1):
        year, month_zero_based = divmod(index, 12)
        dates.append("{}_{:02d}".format(year, month_zero_based + 1))
    return split_files_by_keyword(files, dates)

In [135]:
def flatten_dict(d):
    """Concatenate all values of ``d`` into one flat list.

    d : dict whose values are lists (any iterable of items is accepted)

    Returns a new list holding the items of every value, in the dict's key
    iteration order. The input dict and its values are not modified.
    """
    # Single pass; repeated `ret = ret + d[k]` would re-copy the growing
    # result for every key.
    return [item for key in d for item in d[key]]

In [327]:
def aggregate_tables(files, positive=True, negative=True):
    """Sum the count columns of several CSV tables, grouped by name.

    files    : list of CSV file paths; every file needs a ``name`` and a
               ``scale`` column, and all files must have the same number of
               columns
    positive : also accumulate the ``scale == 1`` row of every name
    negative : also accumulate the ``scale == 2`` row of every name

    The values that are summed are everything from the third column on
    (presumably the first two columns are ``name`` and ``scale`` -- confirm
    against the CSV layout).

    Returns ``{name: {'overall': arr, 'positive': arr, 'negative': arr}}``.
    'overall' is always present and is the sum of the first two rows found
    for the name; 'positive' / 'negative' are present only when requested.
    Each array is the element-wise total over all files.

    Raises IndexError if a name has fewer than two rows in a file, or lacks
    the requested ``scale`` row.
    """
    totals = {}
    for path in files:
        df = pd.read_csv(path, sep=',')
        # unique() keeps first-appearance order, so the key order of the
        # result is reproducible between runs (a set of strings is not).
        for n in df['name'].unique():
            is_name = df['name'] == n
            # `.values` replaces DataFrame.as_matrix(), removed in pandas 1.0.
            rows = df.loc[is_name].values

            parts = {'overall': rows[0][2:] + rows[1][2:]}
            if positive:
                parts['positive'] = df.loc[is_name & (df['scale'] == 1)].values[0][2:]
            if negative:
                parts['negative'] = df.loc[is_name & (df['scale'] == 2)].values[0][2:]

            entry = totals.setdefault(n, {})
            for key, counts in parts.items():
                # Build a new array instead of `+=` so a stored slice is never
                # modified behind the back of the array it was taken from.
                entry[key] = entry[key] + counts if key in entry else counts

    return totals
                

In [220]:
def summarize_values_to_week(arr):
    """Collapse per-day values into five weekly totals.

    arr : np.array of per-day values

    Returns a list of 5 sums: four full 7-element chunks (indices 0-27) and
    a final chunk covering indices 28-30. Elements from index 31 on are
    ignored; chunks beyond the end of ``arr`` sum to zero.
    """
    week_bounds = ((0, 7), (7, 14), (14, 21), (21, 28), (28, 31))
    return [arr[lo:hi].sum() for lo, hi in week_bounds]

In [262]:
# Collect the input CSV paths below the configured input directory
# (list_all_files defaults: file name starts with "20" and ends with ".csv").
files = list_all_files(params['input_dir'])
# Language names used in the next cell to group the paths.
languages = params['languages']

In [322]:
# Group the paths by language: a path is listed under a language when the
# language name occurs anywhere in the path string.
g = split_files_by_keyword(files, languages)

In [323]:
g

{'chinese': ['../temp/chinese/leader/2017_01_chinese__leader.csv',
  '../temp/chinese/leader/2017_02_chinese__leader.csv',
  '../temp/chinese/leader/2017_03_chinese__leader.csv',
  '../temp/chinese/leader/2017_04_chinese__leader.csv',
  '../temp/chinese/leader/2017_05_chinese__leader.csv',
  '../temp/chinese/leader/2017_06_chinese__leader.csv',
  '../temp/chinese/leader/2017_07_chinese__leader.csv',
  '../temp/chinese/leader/2017_08_chinese__leader.csv',
  '../temp/chinese/leader/2017_09_chinese__leader.csv',
  '../temp/chinese/leader/2017_10_chinese__leader.csv',
  '../temp/chinese/leader/2017_11_chinese__leader.csv',
  '../temp/chinese/leader/2017_12_chinese__leader.csv',
  '../temp/chinese/others/leader/2017_01_others_leader.csv',
  '../temp/chinese/others/leader/2017_02_others_leader.csv',
  '../temp/chinese/others/leader/2017_03_others_leader.csv',
  '../temp/chinese/others/leader/2017_04_others_leader.csv',
  '../temp/chinese/others/leader/2017_05_others_leader.csv',
  '../temp/c

In [324]:
# Keep only the English files whose path contains "leader".
english_leader_files = split_files_by_keyword(g['english'], ['leader'])['leader']
# Take the reporting window from the parameter file instead of relying on the
# function defaults, so that editing the parameter file actually changes the
# months that are analysed.
d = filter_files_by_time_range(
    english_leader_files,
    start_year=params['start_year'],
    end_year=params['end_year'],
    start_month=params['start_month'],
    end_month=params['end_month'],
)
d

{'2017_07': ['../temp/english/facebook/leader/2017_07_facebook_leader.csv',
  '../temp/english/others/leader/2017_07_others_leader.csv',
  '../temp/english/twitter/leader/2017_07_twitter_leader.csv'],
 '2017_08': ['../temp/english/facebook/leader/2017_08_facebook_leader.csv',
  '../temp/english/others/leader/2017_08_others_leader.csv',
  '../temp/english/twitter/leader/2017_08_twitter_leader.csv'],
 '2017_09': ['../temp/english/facebook/leader/2017_09_facebook_leader.csv',
  '../temp/english/others/leader/2017_09_others_leader.csv',
  '../temp/english/twitter/leader/2017_09_twitter_leader.csv'],
 '2017_10': ['../temp/english/facebook/leader/2017_10_facebook_leader.csv',
  '../temp/english/others/leader/2017_10_others_leader.csv',
  '../temp/english/twitter/leader/2017_10_twitter_leader.csv'],
 '2017_11': ['../temp/english/facebook/leader/2017_11_facebook_leader.csv',
  '../temp/english/others/leader/2017_11_others_leader.csv',
  '../temp/english/twitter/leader/2017_11_twitter_leader.cs

In [328]:
# One aggregated table per month. The "YYYY_MM" keys sort chronologically, so
# iterating them sorted pins tables[0] to the earliest month regardless of the
# dict's iteration order.
tables = [aggregate_tables(d[month]) for month in sorted(d)]
len(tables)

6

In [329]:
# Collapse every per-day count array in `tables` into weekly totals, in place.
for _t in tables:
    for k in _t:
        for kind in _t[k]:
            # Summarized entries are plain lists; skipping them makes the cell
            # safe to re-run (a list has no .sum(), so a second pass would fail).
            if not isinstance(_t[k][kind], list):
                _t[k][kind] = summarize_values_to_week(_t[k][kind])
tables

[{'abdul hadi awang': {'negative': [1, 0, 0, 0, 0],
   'overall': [2, 0, 0, 0, 0],
   'positive': [1, 0, 0, 0, 0]},
  'ahmad yakob': {'negative': [0, 0, 0, 0, 0],
   'overall': [0, 0, 0, 0, 0],
   'positive': [0, 0, 0, 0, 0]},
  'ahmah zahid hamidi': {'negative': [0, 0, 0, 0, 0],
   'overall': [0, 0, 0, 0, 0],
   'positive': [0, 0, 0, 0, 0]},
  'amar nik abdullah': {'negative': [0, 0, 0, 1, 0],
   'overall': [0, 0, 0, 1, 0],
   'positive': [0, 0, 0, 0, 0]},
  'anwar': {'negative': [4, 9, 10, 4, 4],
   'overall': [8, 21, 26, 13, 5],
   'positive': [4, 12, 16, 9, 1]},
  'azmin ali': {'negative': [0, 1, 0, 0, 0],
   'overall': [1, 1, 2, 1, 0],
   'positive': [1, 0, 2, 1, 0]},
  'charles anthony santiago': {'negative': [0, 0, 0, 0, 0],
   'overall': [0, 0, 0, 0, 0],
   'positive': [0, 0, 0, 0, 0]},
  'chen man hin': {'negative': [0, 0, 0, 0, 0],
   'overall': [0, 0, 0, 0, 0],
   'positive': [0, 0, 0, 0, 0]},
  'gobind singh deo': {'negative': [0, 2, 1, 1, 0],
   'overall': [0, 3, 1, 1, 0],

In [331]:
tables[0]

{'abdul hadi awang': {'negative': [1, 0, 0, 0, 0],
  'overall': [2, 0, 0, 0, 0],
  'positive': [1, 0, 0, 0, 0]},
 'ahmad yakob': {'negative': [0, 0, 0, 0, 0],
  'overall': [0, 0, 0, 0, 0],
  'positive': [0, 0, 0, 0, 0]},
 'ahmah zahid hamidi': {'negative': [0, 0, 0, 0, 0],
  'overall': [0, 0, 0, 0, 0],
  'positive': [0, 0, 0, 0, 0]},
 'amar nik abdullah': {'negative': [0, 0, 0, 1, 0],
  'overall': [0, 0, 0, 1, 0],
  'positive': [0, 0, 0, 0, 0]},
 'anwar': {'negative': [4, 9, 10, 4, 4],
  'overall': [8, 21, 26, 13, 5],
  'positive': [4, 12, 16, 9, 1]},
 'azmin ali': {'negative': [0, 1, 0, 0, 0],
  'overall': [1, 1, 2, 1, 0],
  'positive': [1, 0, 2, 1, 0]},
 'charles anthony santiago': {'negative': [0, 0, 0, 0, 0],
  'overall': [0, 0, 0, 0, 0],
  'positive': [0, 0, 0, 0, 0]},
 'chen man hin': {'negative': [0, 0, 0, 0, 0],
  'overall': [0, 0, 0, 0, 0],
  'positive': [0, 0, 0, 0, 0]},
 'gobind singh deo': {'negative': [0, 2, 1, 1, 0],
  'overall': [0, 3, 1, 1, 0],
  'positive': [0, 1, 0, 0