In [139]:
import os
import glob

import pandas as pd
import numpy as np

import ReadParameterFile

In [83]:
params = ReadParameterFile.get_parameter_dict("semiannually-params")

In [84]:
params

{'end_month': '12',
 'end_year': '2017',
 'graph_types': ['positive', 'negative', 'overall'],
 'input_dir': '../temp',
 'languages': ['english', 'chinese', 'mix'],
 'leader_different': ['lim kit siang',
  'lim guan eng',
  'anwar',
  'mahathir',
  'abdul haji awang',
  'liow tiong lai',
  'subramaniam',
  'najib razak'],
 'leader_same_bn': ['liow tiong lai',
  'subramaniam',
  'najib razak',
  'mah siew keong'],
 'leader_same_pakatan': ['lim kit siang',
  'lim guan eng',
  'anwar',
  'mahathir',
  'abdul hadi awang'],
 'output_dir': '../results',
 'party_different': ['dap', 'pkr', 'ppbm', 'pas', 'mca', 'umno', 'mic'],
 'party_same_bn': ['mca', 'mic', 'umno', 'pbb', 'gerakan'],
 'party_same_pakatan': ['dap', 'pkr', 'pas', 'ppbm', 'amanah'],
 'start_month': '7',
 'start_year': '2017'}

In [85]:
# Extracted from : https://stackoverflow.com/a/2186555
def list_all_files(root_path, ext=".csv", front="20"):
    cfiles = []
    for root, dirs, files in os.walk(root_path):
        for file in files:
            if file.startswith(front) and file.endswith(ext):
                cfiles.append(os.path.join(root, file))
    return cfiles

In [102]:
def split_files_by_keyword(files, keyword):
    groups = {}
    for kw in keyword:
        groups[kw] = [f for f in files if kw in f]
    return groups
            

In [108]:
def filter_files_by_time_range(files, start_year=2017, end_year=2017, start_month=7, end_month=12):
    # transform args to int as keepsafe
    start_year = int(start_year)
    end_year = int(end_year)
    start_month = int(start_month)
    end_month = int(end_month)
    
    dates = []
    if start_year == end_year:
        dates = [str(start_year) + "_{:02d}".format(n) for n in range(start_month, end_month + 1)]
    return split_files_by_keyword(files, dates)

In [135]:
def flatten_dict(d):
    ret = []
    for k in d:
        ret = ret + d[k]
    return ret

In [225]:
"""
files : array of files

overall is default behavior
"""
def aggregate_tables(files, positive=True, negative=True):
    ret_dict = {}
    for f in files:
        df = pd.read_csv(f, sep=',')
        
        # Add all names into dict
        for n in set(df['name'].tolist()):
            # Do overall 
            if n in ret_dict:
                np.add(ret_dict[n]['overall'], np.add(df.loc[df['name']==n].as_matrix()[0][2:],df.loc[df['name']==n].as_matrix()[1][2:]))
            if n not in ret_dict:
                ret_dict[n] = {'overall': np.add(df.loc[df['name']==n].as_matrix()[0][2:],df.loc[df['name']==n].as_matrix()[1][2:])}
                
        if positive:
            for n in set(df['name'].tolist()):
                if "positive" in ret_dict[n]:
                    np.add(ret_dict[n]['positive'], df.loc[(df['name']==n) & (df['scale']==1)].as_matrix()[0][2:])
                else:
                    ret_dict[n]['positive'] = df.loc[(df['name']==n) & (df['scale']==1)].as_matrix()[0][2:]
                    
        if negative:
            for n in set(df['name'].tolist()):
                if "negative" in ret_dict[n]:
                    np.add(ret_dict[n]['negative'], df.loc[(df['name']==n) & (df['scale']==2)].as_matrix()[0][2:])
                else:
                    ret_dict[n]['negative'] = df.loc[(df['name']==n) & (df['scale']==2)].as_matrix()[0][2:]
        
    return ret_dict
                

In [220]:
"""
arr : np.array

returns an array of 5 values --> 5 weeks
"""
def summarize_values_to_week(arr):
    return [arr[0:7].sum(), arr[7:14].sum(), arr[14:21].sum(), arr[21:28].sum(), arr[28:31].sum()]

In [113]:
files = list_all_files(params['input_dir'])
languages = params['languages']

In [121]:
g = split_files_by_keyword(files, languages)

In [122]:
g

{'chinese': ['../temp/chinese/leader/2017_01_chinese__leader.csv',
  '../temp/chinese/leader/2017_02_chinese__leader.csv',
  '../temp/chinese/leader/2017_03_chinese__leader.csv',
  '../temp/chinese/leader/2017_04_chinese__leader.csv',
  '../temp/chinese/leader/2017_05_chinese__leader.csv',
  '../temp/chinese/leader/2017_06_chinese__leader.csv',
  '../temp/chinese/leader/2017_07_chinese__leader.csv',
  '../temp/chinese/leader/2017_08_chinese__leader.csv',
  '../temp/chinese/leader/2017_09_chinese__leader.csv',
  '../temp/chinese/leader/2017_10_chinese__leader.csv',
  '../temp/chinese/leader/2017_11_chinese__leader.csv',
  '../temp/chinese/leader/2017_12_chinese__leader.csv',
  '../temp/chinese/others/leader/2017_01_others_leader.csv',
  '../temp/chinese/others/leader/2017_02_others_leader.csv',
  '../temp/chinese/others/leader/2017_03_others_leader.csv',
  '../temp/chinese/others/leader/2017_04_others_leader.csv',
  '../temp/chinese/others/leader/2017_05_others_leader.csv',
  '../temp/c

In [203]:
d = split_files_by_keyword(g['chinese'], ['leader'])
d = filter_files_by_time_range(d['leader'])
d

{'2017_07': ['../temp/chinese/leader/2017_07_chinese__leader.csv',
  '../temp/chinese/others/leader/2017_07_others_leader.csv'],
 '2017_08': ['../temp/chinese/leader/2017_08_chinese__leader.csv',
  '../temp/chinese/others/leader/2017_08_others_leader.csv'],
 '2017_09': ['../temp/chinese/leader/2017_09_chinese__leader.csv',
  '../temp/chinese/others/leader/2017_09_others_leader.csv'],
 '2017_10': ['../temp/chinese/leader/2017_10_chinese__leader.csv',
  '../temp/chinese/others/leader/2017_10_others_leader.csv'],
 '2017_11': ['../temp/chinese/leader/2017_11_chinese__leader.csv',
  '../temp/chinese/others/leader/2017_11_others_leader.csv'],
 '2017_12': ['../temp/chinese/leader/2017_12_chinese__leader.csv',
  '../temp/chinese/others/leader/2017_12_others_leader.csv']}

In [206]:
df = pd.read_csv(d['2017_09'][0], sep=',')
df

Unnamed: 0,name,scale,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
0,纳吉·阿都拉萨,1,6,6,11,14,17,27,10,23,...,14,10,7,4,8,5,17,10,4,0
1,纳吉·阿都拉萨,2,1,2,0,0,2,2,3,1,...,0,1,2,3,1,1,2,1,0,0
2,安华·依布拉欣,1,2,2,2,2,3,1,2,1,...,1,2,2,1,0,1,0,0,1,0
3,安华·依布拉欣,2,0,2,0,0,1,1,1,0,...,2,1,0,1,0,0,0,0,0,0
4,马哈迪·莫哈末,1,1,14,7,7,6,13,6,16,...,9,8,12,8,3,3,4,5,3,0
5,马哈迪·莫哈末,2,1,3,0,0,4,2,5,1,...,0,2,4,0,0,2,2,0,1,0
6,林冠英,1,0,6,2,7,3,2,1,26,...,1,2,8,18,3,2,1,2,6,0
7,林冠英,2,0,0,0,1,3,0,0,5,...,1,1,0,2,2,2,4,1,0,0
8,廖中莱,1,0,0,0,0,0,0,2,1,...,0,0,0,0,2,0,0,1,0,0
9,廖中莱,2,0,0,0,0,1,0,2,0,...,0,0,1,0,0,1,0,0,0,0


In [226]:
t = aggregate_tables(d['2017_09'])

In [227]:
t

{'卡巴星': {'overall': array([0, 0, 0, 3, 1, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
         0, 1, 0, 0, 0, 0, 0, 0], dtype=object),
  'positive': array([0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
         0, 1, 0, 0, 0, 0, 0, 0], dtype=object)},
 '哈迪阿旺': {'overall': array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0], dtype=object),
  'positive': array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0], dtype=object)},
 '安华·依布拉欣': {'overall': array([2, 4, 2, 2, 4, 2, 3, 1, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 1, 0, 1, 3, 3,
         2, 2, 0, 1, 0, 0, 1, 0], dtype=object),
  'positive': array([2, 2, 2, 2, 3, 1, 2, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 1, 1, 2,
         2, 1, 0, 1, 0, 0, 1, 0], dtype=object)},
 '廖中莱': {'overall': array([0, 0, 0, 0, 1, 0, 4, 1, 0, 0, 2, 3, 0, 0, 1, 0, 1, 3, 1, 1, 0, 0, 0,
         1, 0, 2, 1, 0, 1, 0, 0], dtype=obje

In [222]:
for k in t:
    t[k]['overall'] = summarize_values_to_week(t[k]['overall'])
t

{'卡巴星': {'overall': [6, 1, 0, 3, 0]},
 '哈迪阿旺': {'overall': [1, 1, 0, 0, 0]},
 '安华·依布拉欣': {'overall': [19, 3, 4, 11, 1]},
 '廖中莱': {'overall': [5, 6, 7, 4, 1]},
 '拉菲兹·南利': {'overall': [6, 1, 0, 0, 0]},
 '曹观友': {'overall': [0, 0, 1, 1, 0]},
 '林冠英': {'overall': [25, 76, 44, 47, 9]},
 '潘儉偉': {'overall': [0, 0, 0, 0, 0]},
 '纳吉·阿都拉萨': {'overall': [101, 107, 120, 75, 15]},
 '罗斯玛·曼梳': {'overall': [9, 10, 6, 7, 0]},
 '许子根': {'overall': [4, 11, 1, 5, 0]},
 '郑雨周': {'overall': [0, 3, 0, 0, 0]},
 '马哈迪·莫哈末': {'overall': [69, 57, 34, 57, 9]},
 '魏家祥': {'overall': [1, 0, 3, 1, 0]}}