In [127]:
import os
import glob
from functools import reduce

import pandas as pd
import numpy as np

import ReadParameterFile

In [2]:
# Load the parameter dict named "semiannually-params" through the project's
# ReadParameterFile helper; later cells read directories, languages and graph
# types from it.
params = ReadParameterFile.get_parameter_dict("semiannually-params")

In [3]:
# Display the loaded parameters so the run configuration is visible in the notebook.
params

{'end_month': '12',
 'end_year': '2017',
 'graph_types': ['positive', 'negative', 'overall'],
 'input_dir': '../temp',
 'languages': ['english', 'chinese', 'mix'],
 'leader_different': ['lim kit siang',
  'lim guan eng',
  'anwar',
  'mahathir',
  'abdul haji awang',
  'liow tiong lai',
  'subramaniam',
  'najib razak'],
 'leader_same_bn': ['liow tiong lai',
  'subramaniam',
  'najib razak',
  'mah siew keong'],
 'leader_same_pakatan': ['lim kit siang',
  'lim guan eng',
  'anwar',
  'mahathir',
  'abdul hadi awang'],
 'output_dir': '../results',
 'party_different': ['dap', 'pkr', 'ppbm', 'pas', 'mca', 'umno', 'mic'],
 'party_same_bn': ['mca', 'mic', 'umno', 'pbb', 'gerakan'],
 'party_same_pakatan': ['dap', 'pkr', 'pas', 'ppbm', 'amanah'],
 'start_month': '7',
 'start_year': '2017'}

In [4]:
# Extracted from : https://stackoverflow.com/a/2186555
def list_all_files(root_path, ext=".csv", front="20"):
    """Recursively collect files under ``root_path`` by name prefix and suffix.

    Args:
        root_path: Directory that is walked with ``os.walk``.
        ext: Suffix a file name has to end with to be kept.
        front: Prefix a file name has to start with to be kept.

    Returns:
        A list of paths, each one the matching file name joined onto the
        directory it was found in. The order is whatever ``os.walk`` yields.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root_path)
        for name in names
        if name.startswith(front) and name.endswith(ext)
    ]

In [5]:
def split_files_by_keyword(files, keyword):
    """Bucket ``files`` by the keywords that occur in them.

    Args:
        files: Sequence of path strings; it is scanned once per keyword.
        keyword: Iterable of substrings to look for.

    Returns:
        A dict mapping every keyword to the list of entries of ``files`` that
        contain it (plain substring test on the whole string). An entry can
        show up under several keywords, or under none.
    """
    return {term: [path for path in files if term in path] for term in keyword}
            

In [6]:
def filter_files_by_time_range(files, start_year=2017, end_year=2017, start_month=7, end_month=12):
    """Group ``files`` by month for every month in an inclusive time range.

    A ``"YYYY_MM"`` tag is built for each month from ``start_year``/
    ``start_month`` through ``end_year``/``end_month`` and handed to
    ``split_files_by_keyword``, so a file lands in a month's bucket when its
    path contains that tag.

    The range may span several years. Previously any range with
    ``start_year != end_year`` silently produced an empty result.

    Args:
        files: Sequence of file path strings.
        start_year: First year of the range (int or numeric string).
        end_year: Last year of the range (int or numeric string).
        start_month: First month of the range, 1-12 (int or numeric string).
        end_month: Last month of the range, 1-12 (int or numeric string).

    Returns:
        A dict mapping each ``"YYYY_MM"`` tag in the range, in chronological
        order, to the list of matching files. Empty when the start lies after
        the end.

    Raises:
        ValueError: If one of the bounds cannot be converted with ``int``.
    """
    # Bounds may arrive as strings (for example from a parameter file), so coerce them.
    start_year, end_year = int(start_year), int(end_year)
    start_month, end_month = int(start_month), int(end_month)

    # Put every month on one linear axis so the range can cross year boundaries.
    first = start_year * 12 + (start_month - 1)
    last = end_year * 12 + (end_month - 1)

    dates = []
    for month_index in range(first, last + 1):
        year, month_offset = divmod(month_index, 12)
        dates.append("{}_{:02d}".format(year, month_offset + 1))
    return split_files_by_keyword(files, dates)

In [7]:
def flatten_dict(d):
    """Concatenate all list values of ``d`` into one new list.

    Args:
        d: Mapping whose values are lists.

    Returns:
        A new list holding the elements of every value, in the mapping's
        iteration order. The input lists are not modified.
    """
    # sum() with a list start value performs the same repeated `list + list`.
    return sum((d[key] for key in d), [])

In [8]:
"""
files : array of files

overall is default behavior
"""
def aggregate_tables(files, positive=False, negative=False):
    """Sum the per-name value rows of several CSV tables.

    Every file is read with ``pd.read_csv`` and needs a ``name`` and a
    ``scale`` column. For each row the first two columns are skipped and the
    remaining columns are added up element-wise.
    NOTE(review): this presumably means ``name`` and ``scale`` are the first
    two columns, followed by the count columns -- confirm against the code
    that writes the CSVs.

    Args:
        files: Iterable of CSV paths (anything ``pd.read_csv`` accepts).
        positive: When true, also accumulate the ``scale == 1`` row of each
            name under the key ``'positive'``.
        negative: When true, also accumulate the ``scale == 2`` row of each
            name under the key ``'negative'``.

    Returns:
        A dict mapping each name to a dict with the key ``'overall'`` (always)
        and ``'positive'`` / ``'negative'`` (when requested). Each value is a
        numpy array summed over all files. ``'overall'`` is the sum of the
        first two rows found for the name in each file.

    Raises:
        IndexError: If a name has fewer than two rows in a file, or lacks a
            row for a requested scale.
    """
    ret_dict = {}
    for f in files:
        df = pd.read_csv(f, sep=',')
        for n in set(df['name'].tolist()):
            # Filter once per name instead of once per use.
            name_rows = df.loc[df['name'] == n]
            # `.values` replaces `DataFrame.as_matrix()`, which was deprecated
            # and later removed from pandas; `.values` exists in old and new versions.
            matrix = name_rows.values

            additions = [('overall', matrix[0][2:] + matrix[1][2:])]
            if positive:
                additions.append(
                    ('positive', name_rows.loc[name_rows['scale'] == 1].values[0][2:]))
            if negative:
                additions.append(
                    ('negative', name_rows.loc[name_rows['scale'] == 2].values[0][2:]))

            totals = ret_dict.setdefault(n, {})
            for key, values in additions:
                # Out-of-place addition: never write into an array that may be a view.
                totals[key] = totals[key] + values if key in totals else values

    return ret_dict
                

In [9]:
"""
arr : np.array

returns an array of 5 values --> 5 weeks
"""
def summarize_values_to_week(arr):
    """Collapse a series of values into five consecutive block totals.

    Args:
        arr: ``np.array`` (or any object whose slices provide ``.sum()``).

    Returns:
        A list of five sums covering the index ranges 0-6, 7-13, 14-20, 21-27
        and 28-30. Positions beyond the end of ``arr`` simply contribute
        nothing to their block.
    """
    week_bounds = ((0, 7), (7, 14), (14, 21), (21, 28), (28, 31))
    return [arr[first:last].sum() for first, last in week_bounds]

In [66]:
"""
marr : array of dicts

"""
def flatten_map(marr):
    """Regroup a list of dicts by key.

    Args:
        marr: Non-empty list of dicts. Every dict must contain all keys of
            the first one.

    Returns:
        A list with one single-key dict per key of ``marr[0]``. Each maps the
        key to the list of values that key has across ``marr``, in order.
    """
    return [{key: [entry[key] for entry in marr]} for key in marr[0]]

In [99]:
def flatten_inner_array(x):
    """Flatten the nested value lists of ``x`` in place.

    ``x`` is a list of single-key dicts whose value is a list of dicts. Each
    value is first regrouped with ``flatten_map``. Every resulting single-key
    dict then has its list of sequences concatenated into one flat Python
    list.

    Args:
        x: List of single-key dicts; it is modified in place.

    Returns:
        The same list object ``x`` after modification.
    """
    for entry in x:
        # Each entry is expected to hold exactly one key; only the first is used.
        outer_key = list(entry.keys())[0]
        entry[outer_key] = flatten_map(entry[outer_key])
        for inner in entry[outer_key]:
            inner_key = list(inner.keys())[0]
            joined = np.concatenate(inner[inner_key])
            inner[inner_key] = joined.ravel().tolist()
    return x

In [157]:
def flatten_array_to_map(marr):
    """Merge a sequence of dicts into one new dict.

    Later dicts win when a key occurs more than once. Unlike the earlier
    ``reduce``-based version, the result is a fresh dict, so the first input
    dict is no longer modified in place. An empty sequence yields ``{}``
    instead of raising.

    Args:
        marr: Iterable of dicts.

    Returns:
        A new dict containing the key/value pairs of all dicts in ``marr``.
    """
    merged = {}
    for mapping in marr:
        merged.update(mapping)
    return merged

In [10]:
# Collect every matching CSV below the configured input directory
# (list_all_files defaults: names starting with "20" and ending in ".csv").
files = list_all_files(params['input_dir'])
# Language names used as path keywords when grouping the files.
languages = params['languages']

In [11]:
# Group the files by language: a file belongs to a language when the language
# name occurs anywhere in its path.
g = split_files_by_keyword(files, languages)

In [None]:
# Display the per-language grouping for inspection.
g

In [12]:
# Keep only the Chinese-language files whose path mentions "leader", then
# bucket them by month. The time window is taken from the parameter file
# rather than from the function's hard-coded defaults, so editing the
# parameters actually changes which months are analysed.
d = split_files_by_keyword(g['chinese'], ['leader'])
d = filter_files_by_time_range(
    d['leader'],
    start_year=params['start_year'],
    end_year=params['end_year'],
    start_month=params['start_month'],
    end_month=params['end_month'],
)
d

{'2017_07': ['../temp/chinese/leader/2017_07_chinese__leader.csv',
  '../temp/chinese/others/leader/2017_07_others_leader.csv'],
 '2017_08': ['../temp/chinese/leader/2017_08_chinese__leader.csv',
  '../temp/chinese/others/leader/2017_08_others_leader.csv'],
 '2017_09': ['../temp/chinese/leader/2017_09_chinese__leader.csv',
  '../temp/chinese/others/leader/2017_09_others_leader.csv'],
 '2017_10': ['../temp/chinese/leader/2017_10_chinese__leader.csv',
  '../temp/chinese/others/leader/2017_10_others_leader.csv'],
 '2017_11': ['../temp/chinese/leader/2017_11_chinese__leader.csv',
  '../temp/chinese/others/leader/2017_11_others_leader.csv'],
 '2017_12': ['../temp/chinese/leader/2017_12_chinese__leader.csv',
  '../temp/chinese/others/leader/2017_12_others_leader.csv']}

In [118]:
# Build one aggregated table per month bucket in `d` (in the dict's iteration
# order), including the positive and negative breakdowns.
tables = [aggregate_tables(d[k], positive=True, negative=True) for k in d]
# Number of month tables produced.
len(tables)

6

In [119]:
# Replace every aggregated array with the five totals returned by
# summarize_values_to_week, for each graph type listed in the parameters.
# NOTE(review): this rewrites `tables` in place with plain lists, so the cell
# cannot be re-run without first re-executing the cell that builds `tables`
# (a list slice has no .sum()).
for _t in tables:
    for k in _t:
        for types in params['graph_types']:
            _t[k][types] = summarize_values_to_week(_t[k][types])


In [122]:
# Print the number of names in the first month's table, followed by all tables.
print(len(list(tables[0].keys())), tables)

14 [{'罗斯玛·曼梳': {'overall': [21, 18, 4, 10, 8], 'positive': [19, 14, 4, 10, 8], 'negative': [2, 4, 0, 0, 0]}, '纳吉·阿都拉萨': {'overall': [237, 247, 262, 226, 70], 'positive': [212, 223, 237, 205, 55], 'negative': [25, 24, 25, 21, 15]}, '林冠英': {'overall': [125, 79, 85, 50, 16], 'positive': [69, 61, 67, 47, 14], 'negative': [56, 18, 18, 3, 2]}, '卡巴星': {'overall': [8, 16, 2, 2, 2], 'positive': [6, 14, 0, 2, 2], 'negative': [2, 2, 2, 0, 0]}, '许子根': {'overall': [18, 12, 0, 0, 2], 'positive': [0, 4, 0, 0, 2], 'negative': [18, 8, 0, 0, 0]}, '潘儉偉': {'overall': [2, 0, 0, 0, 0], 'positive': [2, 0, 0, 0, 0], 'negative': [0, 0, 0, 0, 0]}, '拉菲兹·南利': {'overall': [0, 0, 0, 0, 1], 'positive': [0, 0, 0, 0, 1], 'negative': [0, 0, 0, 0, 0]}, '曹观友': {'overall': [5, 0, 1, 2, 0], 'positive': [4, 0, 1, 2, 0], 'negative': [1, 0, 0, 0, 0]}, '廖中莱': {'overall': [12, 10, 2, 4, 3], 'positive': [10, 4, 2, 2, 2], 'negative': [2, 6, 0, 2, 1]}, '安华·依布拉欣': {'overall': [59, 47, 62, 26, 13], 'positive': [44, 29, 50, 20, 10], 

In [164]:
# Regroup the month tables by name: one single-key dict {name: [month dicts]}
# per name found in the first table.
x = flatten_map(tables)

# Regroup each name's month dicts by graph type and concatenate the values into
# one flat list per graph type. flatten_inner_array works in place and returns
# its argument, so x_last is the same list object as x.
x_last = flatten_inner_array(x)
xl2 = x_last
# Merge the single-key dicts into one dict keyed by name.
xl3 = flatten_array_to_map(xl2)

# Each value is still a list of single-key {graph_type: values} dicts; merge
# those into one dict per name.
for n in xl3:
    xl3[n] = flatten_array_to_map(xl3[n])
print(xl3)

{'罗斯玛·曼梳': {'overall': [21, 18, 4, 10, 8, 27, 18, 8, 22, 4, 18, 20, 12, 15, 0, 6, 8, 4, 14, 2, 10, 16, 6, 18, 2, 16, 8, 17, 6, 0], 'positive': [19, 14, 4, 10, 8, 25, 18, 8, 22, 2, 18, 20, 10, 15, 0, 6, 8, 4, 14, 2, 6, 14, 6, 18, 2, 12, 6, 17, 6, 0], 'negative': [2, 4, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 4, 2, 0, 0, 0, 4, 2, 0, 0, 0]}, '纳吉·阿都拉萨': {'overall': [237, 247, 262, 226, 70, 161, 93, 142, 211, 54, 206, 217, 244, 153, 30, 157, 137, 106, 225, 33, 143, 188, 133, 123, 37, 250, 139, 173, 123, 48], 'positive': [212, 223, 237, 205, 55, 144, 73, 117, 172, 50, 183, 190, 210, 133, 28, 134, 113, 93, 189, 24, 105, 144, 115, 112, 32, 219, 110, 152, 107, 44], 'negative': [25, 24, 25, 21, 15, 17, 20, 25, 39, 4, 23, 27, 34, 20, 2, 23, 24, 13, 36, 9, 38, 44, 18, 11, 5, 31, 29, 21, 16, 4]}, '林冠英': {'overall': [125, 79, 85, 50, 16, 102, 41, 43, 71, 21, 52, 159, 96, 98, 18, 77, 51, 69, 86, 17, 186, 239, 93, 65, 13, 66, 108, 96, 156, 24], 'positive': [69, 61, 67, 47, 14, 77, 35, 30

{'罗斯玛·曼梳': {'overall': [21, 18, 4, 10, 8, 27, 18, 8, 22, 4, 18, 20, 12, 15, 0, 6, 8, 4, 14, 2, 10, 16, 6, 18, 2, 16, 8, 17, 6, 0], 'positive': [19, 14, 4, 10, 8, 25, 18, 8, 22, 2, 18, 20, 10, 15, 0, 6, 8, 4, 14, 2, 6, 14, 6, 18, 2, 12, 6, 17, 6, 0], 'negative': [2, 4, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 4, 2, 0, 0, 0, 4, 2, 0, 0, 0]}, '纳吉·阿都拉萨': {'overall': [237, 247, 262, 226, 70, 161, 93, 142, 211, 54, 206, 217, 244, 153, 30, 157, 137, 106, 225, 33, 143, 188, 133, 123, 37, 250, 139, 173, 123, 48], 'positive': [212, 223, 237, 205, 55, 144, 73, 117, 172, 50, 183, 190, 210, 133, 28, 134, 113, 93, 189, 24, 105, 144, 115, 112, 32, 219, 110, 152, 107, 44], 'negative': [25, 24, 25, 21, 15, 17, 20, 25, 39, 4, 23, 27, 34, 20, 2, 23, 24, 13, 36, 9, 38, 44, 18, 11, 5, 31, 29, 21, 16, 4]}, '林冠英': {'overall': [125, 79, 85, 50, 16, 102, 41, 43, 71, 21, 52, 159, 96, 98, 18, 77, 51, 69, 86, 17, 186, 239, 93, 65, 13, 66, 108, 96, 156, 24], 'positive': [69, 61, 67, 47, 14, 77, 35, 30