In [None]:
import os
import glob

import pandas as pd
import numpy as np

import ReadParameterFile

In [None]:
# Load the run configuration named "semiannually-params"; later cells read its
# 'input_dir' and 'languages' entries.
# NOTE(review): ReadParameterFile is a project-local module - how the name is
# resolved to a file is not visible from this notebook.
params = ReadParameterFile.get_parameter_dict("semiannually-params")

In [None]:
# Display the loaded parameters for a quick visual check.
params

In [None]:
# Adapted from: https://stackoverflow.com/a/2186555
def list_all_files(root_path, ext=".csv", front="20"):
    """Recursively collect matching files below ``root_path``.

    A file matches when its base name starts with ``front`` and ends with
    ``ext``.

    Parameters
    ----------
    root_path : str
        Directory that is walked recursively with ``os.walk``.
    ext : str
        Required suffix of the file name (default ``".csv"``).
    front : str
        Required prefix of the file name (default ``"20"``).

    Returns
    -------
    list of str
        Paths built by joining each visited directory with the matching file
        name, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root_path)
        for name in names
        if name.startswith(front) and name.endswith(ext)
    ]

In [None]:
def split_files_by_keyword(files, keyword):
    """Bucket file paths by the keywords that occur in them.

    Parameters
    ----------
    files : list of str
        File paths to classify.
    keyword : iterable of str
        Substrings to look for (despite the singular name, every element is
        treated as its own keyword).

    Returns
    -------
    dict
        Maps each keyword to the list of paths containing it as a substring.
        A path may land in several buckets; a keyword without any match maps
        to an empty list.
    """
    return {kw: [path for path in files if kw in path] for kw in keyword}
            

In [None]:
def filter_files_by_time_range(files, start_year=2017, end_year=2017, start_month=7, end_month=12):
    """Group files by the ``YYYY_MM`` tag found in their path.

    One tag is generated for every month from ``start_year``/``start_month``
    through ``end_year``/``end_month`` (both inclusive), and the files are
    bucketed by those tags with ``split_files_by_keyword``.

    Parameters
    ----------
    files : list of str
        File paths to classify.
    start_year, end_year : int or int-like
        First and last year of the range.
    start_month, end_month : int or int-like
        First month of ``start_year`` and last month of ``end_year``.

    Returns
    -------
    dict
        Maps each ``"YYYY_MM"`` tag to the list of paths containing it. The
        dict is empty when the range itself is empty (end before start).
    """
    # Coerce to int as a safeguard (values may arrive as strings from a config file).
    start_year = int(start_year)
    end_year = int(end_year)
    start_month = int(start_month)
    end_month = int(end_month)

    dates = []
    for year in range(start_year, end_year + 1):
        # Only the first and last year are partial; years in between span
        # January through December. Previously a range crossing a year
        # boundary produced no tags at all and silently returned {}.
        first_month = start_month if year == start_year else 1
        last_month = end_month if year == end_year else 12
        dates.extend(
            "{}_{:02d}".format(year, month)
            for month in range(first_month, last_month + 1)
        )
    return split_files_by_keyword(files, dates)

In [None]:
def flatten_dict(d):
    """Concatenate all list values of ``d`` into one flat list.

    Values are joined with ``+`` in the dict's key iteration order, so every
    value has to be a list. An empty dict yields an empty list.
    """
    return sum((d[key] for key in d), [])

In [None]:
"""
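files : list of CSV file paths to aggregate

The 'overall' totals are always computed (default behavior);
positive=True / negative=True additionally add the totals of the
rows with scale == 1 / scale == 2.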
"""
def _accumulate(entry, key, values):
    """Add ``values`` element-wise onto ``entry[key]``, creating the slot on first use."""
    if key in entry:
        # Out-of-place add: arrays of different numeric dtypes still combine,
        # and no array obtained from a DataFrame is ever mutated.
        entry[key] = entry[key] + values
    else:
        entry[key] = values.copy()


def aggregate_tables(files, positive=False, negative=False):
    """Sum the per-name value rows of several CSV tables.

    Every file is read with ``pd.read_csv`` and must provide a ``name``
    column (plus a ``scale`` column when ``positive`` or ``negative`` is
    requested). For each row the first two columns are skipped and the rest
    is treated as the value vector.
    NOTE(review): presumably those two columns are ``name`` and ``scale`` -
    confirm against the input files.

    Parameters
    ----------
    files : iterable of str
        Paths of the CSV files to aggregate.
    positive : bool
        Also accumulate the first row with ``scale == 1`` of each name under
        the key ``'positive'``.
    negative : bool
        Also accumulate the first row with ``scale == 2`` of each name under
        the key ``'negative'``.

    Returns
    -------
    dict
        ``{name: {'overall': array, ['positive': array], ['negative': array]}}``
        where ``'overall'`` is the sum of the first two rows of that name,
        accumulated over all files.

    Raises
    ------
    IndexError
        If a name has fewer than two rows in a file, or lacks the requested
        ``scale`` row.
    """
    ret_dict = {}
    for f in files:
        df = pd.read_csv(f, sep=',')
        # dict.fromkeys de-duplicates while keeping first-appearance order,
        # so the result no longer depends on set/hash ordering.
        for n in dict.fromkeys(df['name'].tolist()):
            is_name = df['name'] == n
            # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
            # replacement its deprecation message pointed to.
            rows = df.loc[is_name].values
            overall = rows[0][2:] + rows[1][2:]

            entry = ret_dict.setdefault(n, {})
            _accumulate(entry, 'overall', overall)

            if positive:
                pos_rows = df.loc[is_name & (df['scale'] == 1)].values
                _accumulate(entry, 'positive', pos_rows[0][2:])

            if negative:
                neg_rows = df.loc[is_name & (df['scale'] == 2)].values
                _accumulate(entry, 'negative', neg_rows[0][2:])

    return ret_dict
                

In [None]:
"""
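arr : np.array of daily values (entries beyond index 30 are ignored)

returns a list of 5 sums --> 4 full weeks (7 days each) plus the
remaining days at indices 28-30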
"""
def summarize_values_to_week(arr):
    """Collapse a daily series into five weekly totals.

    ``arr`` must support slicing with a ``.sum()`` on the slice (e.g. a numpy
    array). Four 7-day buckets are followed by a short bucket holding
    indices 28-30; buckets beyond the end of ``arr`` sum over an empty slice.
    """
    week_bounds = ((0, 7), (7, 14), (14, 21), (21, 28), (28, 31))
    return [arr[lo:hi].sum() for lo, hi in week_bounds]

In [None]:
# Collect the candidate input files below the configured directory
# (list_all_files defaults: name starts with "20" and ends with ".csv").
files = list_all_files(params['input_dir'])
# Keywords used in the next cell to group the file paths.
languages = params['languages']

In [None]:
# Group the file paths by the language keyword contained in each path.
g = split_files_by_keyword(files, languages)

In [None]:
# Inspect the per-language grouping.
g

In [None]:
# Keep only the Chinese files whose path contains "leader", then bucket them
# per month using the defaults of filter_files_by_time_range (2017_07 .. 2017_12).
leader_files = split_files_by_keyword(g['chinese'], ['leader'])['leader']
d = filter_files_by_time_range(leader_files)
d

In [None]:
# Build one aggregated table per month bucket, including the 'positive' breakdown.
tables = [aggregate_tables(month_files, positive=True) for month_files in d.values()]
len(tables)

In [None]:
# Replace every accumulated series with its five weekly totals.
# NOTE: this rewrites `tables` in place, so the cell is meant to run once after
# the aggregation cell; a second run would hand plain lists to
# summarize_values_to_week, which calls .sum() on slices of its argument.
for _t in tables:
    for k in _t:
        series = _t[k]
        series['overall'] = summarize_values_to_week(series['overall'])

        # The breakdowns exist only when they were requested during aggregation.
        for part in ("positive", "negative"):
            if part in series:
                series[part] = summarize_values_to_week(series[part])
tables

In [None]:
# Print the weekly totals of one specific name for every table in `tables`.
# A table that lacks this name raises KeyError.
for i in tables:
    print(i['卡巴星'])