In [3]:
import json
import os
import sys
from datetime import datetime as dtlib

import pandas as pd
from tqdm import tqdm

sys.path.insert(0, '../')
import utils



In [4]:
# --- Input files -------------------------------------------------------------
# One observation file and one prediction file per model; `models_name` is
# parallel to `preds_path` (the two are zipped together further down).
obs_path = '../../data/clean_data/obs_mtl.csv'
preds_path = [
    '../../data/clean_data/pred_rf2.csv',
    '../../data/clean_data/pred_rf4.csv',
]
models_name = [
    'rf2',
    'rf4',
]

# --- Aggregations used for the year view -------------------------------------
# Parallel lists: the callable and the label used in dict keys / file names.
agg_function = [
    utils.agg_max,
    utils.agg_min,
    utils.agg_maxabs_sign_at_1,
    utils.agg_maxabs_sign_at_4,
]
agg_function_name = [
    'agg_max',
    'agg_min',
    'agg_maxabs_sign_at_1',
    'agg_maxabs_sign_at_4',
]

# --- Normalisations applied to the raw residues ------------------------------
# Parallel lists: the callable and the label used in dict keys / file names.
norm_function = [
    utils.norm_allcolumns_allrows,
    utils.norm_allcolumns_perday,
    utils.norm_percolumn_allrows,
    utils.norm_percolumn_perday,
]
norm_function_name = [
    'norm_allstop_allperiod',
    'norm_allstop_perday',
    'norm_perstop_allperiod',
    'norm_perstop_perday',
]

# --- Normalisations applied to the year-aggregated residues ------------------
# Parallel lists: the callable and the label used in dict keys / file names.
norm_year_function = [
    utils.norm_allcolumns_allrows,
    utils.norm_percolumn_allrows,
    utils.norm_allcolumns_perrow,
]
norm_year_function_name = [
    'norm_allstop_allday',
    'norm_perstop_allday',
    'norm_allstop_perday',
]

# Name of the timestamp column, used as the index when frames are aligned.
index_ = 'Datetime'

# --- Output directories (each ends with a trailing slash) --------------------
json_path = '../../dashboards/data/json/'
csv_path = '../../dashboards/data/csv/'

In [5]:
# Read the observations, then one prediction frame per entry of `preds_path`
# (same order, so `preds` stays parallel to `models_name`).
obs = pd.read_csv(obs_path)
preds = list(map(pd.read_csv, preds_path))

In [6]:
# Make sure both output directories exist before any file is written.
# `exist_ok=True` makes the call a no-op when the directory is already there,
# which removes the check-then-create race of `os.path.exists` + `os.makedirs`.
# Note: if a path exists but is a regular file this now raises instead of
# silently continuing and failing later on the first write.
for out_dir in (json_path, csv_path):
    os.makedirs(out_dir, exist_ok=True)

# Create Dates

In [7]:
# Build the calendar metadata of the observation period.
#
# The timestamp strings are sliced positionally: [:4] is the year, [5:10] the
# 'mm-dd' part, [:10] the full date and [11:] the time-of-day part
# (presumably 'YYYY-MM-DD HH:MM:SS' -- confirm against the CSV).
timestamps = obs[index_].values

dates = {}
dates['year'] = {}
dates['year_list'] = sorted({ts[:4] for ts in timestamps})
dates['timestep_list'] = sorted({ts[11:] for ts in timestamps})

# One [year, 'mm-dd'] row per distinct calendar day, in chronological order.
data = [[ymd[:4], ymd[5:10]] for ymd in sorted({ts[:10] for ts in timestamps})]
days = pd.DataFrame(data=data, columns=['year', 'day']).set_index('year')

for year in dates['year_list']:
    # Compute the day list and parse each date once per year; the original
    # rebuilt the list four times and parsed every date twice, while reusing
    # the outer loop variable name inside a comprehension.
    year_days = days.loc[year].values.flatten().tolist()
    parsed_days = [dtlib.strptime(year + '-' + md, '%Y-%m-%d') for md in year_days]

    # NOTE(review): %B / %A follow the process locale, so the '_en' keys only
    # hold English names when the notebook runs under an English/C locale.
    dates['year'][year] = {
        'mm': [md[:2] for md in year_days],
        'month_en': [day.strftime("%B") for day in parsed_days],
        'mm-dd': year_days,
        'weekday_en': [day.strftime("%A") for day in parsed_days],
    }

# Save Dates

In [6]:
# The file holds a JavaScript assignment (`dates = {...};`), not bare JSON
# (presumably so the dashboards can include it as a script -- confirm).
# Serialise with json.dumps rather than the dict's Python repr: JSON is always
# a valid JavaScript literal, whereas repr only happens to be one as long as
# the dict holds nothing but plain strings and lists.
with open(json_path + 'dates.json', "w") as text_file:
    text_file.write("dates = {};".format(json.dumps(dates)))

# Create Residues

In [7]:
#residues[year][pred_name][norm]
# Residue = observation - prediction, aligned on the timestamp index.
# Index the frames once up front: the original re-ran `set_index` on the full
# observation frame for every (year, model) pair and on each prediction frame
# for every year, although the result never changes.
obs_indexed = obs.set_index(index_)
preds_indexed = [p.set_index(index_) for p in preds]

residues = {}
for y in dates['year_list']:
    # Label slice from the year string up to the next year's string.
    next_year = str(int(y) + 1)
    obs_year = obs_indexed.loc[y:next_year]
    residues[y] = {}
    for pred_indexed, mn in zip(preds_indexed, models_name):
        residues[y]['residues_'+mn] = {}
        # reset_index puts the timestamp back as a regular column.
        residues[y]['residues_'+mn]['no_norm'] = (
            obs_year - pred_indexed.loc[y:next_year]
        ).reset_index(index_)
        

In [8]:
#normalize residues
# For every year and model, store one normalised copy of the raw residues
# under each name in `norm_function_name`, next to the 'no_norm' entry.
for year in tqdm(dates['year_list']):
    for _pred, model in zip(preds, models_name):
        model_residues = residues[year]['residues_' + model]
        for normalize, norm_name in zip(norm_function, norm_function_name):
            model_residues[norm_name] = normalize(model_residues['no_norm'])


100%|██████████| 3/3 [02:24<00:00, 48.20s/it]


# Save Residues

In [193]:
# Write one CSV per (day, model, normalisation) to
# <csv_path>residues/<yyyy>/<mm>/<dd>/residues_<model>__<norm>.csv
#
# Each yearly frame is split into date / time-of-day once and grouped by day.
# The original copied and re-split the whole year for every single day, which
# repeated the same work once per day of the year. Selecting days through the
# group-by also always yields a DataFrame, whereas `.loc` on a day holding a
# single row returned a Series and broke the column selection.
for y in tqdm(dates['year_list']):
    year_days = dates['year'][y]['mm-dd']

    # Create every day directory once, not once per model/normalisation pair.
    day_dirs = {}
    for md in year_days:
        day_dirs[md] = '{}residues/{}/{}/{}/'.format(csv_path, y, md[:2], md[3:])
        os.makedirs(day_dirs[md], exist_ok=True)

    for mn in models_name:
        for nfn in norm_function_name+['no_norm']:
            df = residues[y]['residues_'+mn][nfn].copy()
            # Every column except the timestamp one.
            stop_id = df.set_index(index_).columns.values.tolist()
            stamps = df[index_].tolist()
            df['ymd'] = [ts[:10] for ts in stamps]    # date part
            df[index_] = [ts[11:] for ts in stamps]   # time-of-day part
            by_day = df.groupby('ymd', sort=False)

            # Only the days listed in `dates` are written, as before.
            for md in year_days:
                day_df = by_day.get_group('{}-{}'.format(y, md))
                day_df = day_df.reset_index(drop=True)[[index_]+stop_id]
                path_file = '{}{}__{}.csv'.format(day_dirs[md], 'residues_'+mn, nfn)
                utils.stack_df(day_df).to_csv(path_file, index=False)

100%|██████████| 3/3 [09:36<00:00, 188.70s/it]


# Create Residues aggregated for year visualization

In [9]:
#residues_aggyear[year][pred_name][agg_function_name][norm]
# For every year and model, run each aggregation over the raw ('no_norm')
# residues via utils.agg_year_df and keep the result under 'no_norm'.
residues_aggyear = {}

for year in tqdm(dates['year_list']):
    year_bucket = {}
    residues_aggyear[year] = year_bucket
    for _pred, model in zip(preds, models_name):
        model_key = 'residues_' + model
        model_bucket = {}
        year_bucket[model_key] = model_bucket
        for aggregate, agg_name in zip(agg_function, agg_function_name):
            model_bucket[agg_name] = {}
            raw_residues = residues[year][model_key]['no_norm']
            model_bucket[agg_name]['no_norm'] = utils.agg_year_df(raw_residues, aggregate)

100%|██████████| 3/3 [02:10<00:00, 43.61s/it]


In [10]:
#normalize residues_aggyear
# For every year, model and aggregation, store one normalised copy of the
# aggregated residues under each name in `norm_year_function_name`.
for year in tqdm(dates['year_list']):
    for _pred, model in zip(preds, models_name):
        per_aggregation = residues_aggyear[year]['residues_' + model]
        for _aggregate, agg_name in zip(agg_function, agg_function_name):
            variants = per_aggregation[agg_name]
            for normalize, norm_name in zip(norm_year_function, norm_year_function_name):
                variants[norm_name] = normalize(variants['no_norm'])

100%|██████████| 3/3 [00:04<00:00,  1.49s/it]


# Save Residues aggregated for year visualization

In [11]:
# Write one CSV per (year, model, aggregation, normalisation) to
# <csv_path>residues_visuyear/<yyyy>/residues_<model>__<agg>__<norm>.csv
for y in tqdm(dates['year_list']):
    # The target directory only depends on the year: create it once here
    # instead of re-checking it inside the innermost loop. `exist_ok=True`
    # avoids the check-then-create race of `os.path.exists` + `os.makedirs`.
    path = '{}residues_visuyear/{}/'.format(csv_path, y)
    os.makedirs(path, exist_ok=True)

    for mn in models_name:
        for agg_f_n in agg_function_name:
            for nyfn in norm_year_function_name+['no_norm']:
                path_file = '{}{}__{}__{}.csv'.format(path, 'residues_'+mn, agg_f_n, nyfn)
                df = utils.stack_df(residues_aggyear[y]['residues_'+mn][agg_f_n][nyfn])
                df.to_csv(path_file, index=False)


100%|██████████| 3/3 [00:06<00:00,  2.16s/it]


In [None]:
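# Draft: ad-hoc sanity-check plot (observations vs. rf4 predictions for stop '11' on 2017-03-15), kept commented out in the next cell.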

In [199]:
#import matplotlib.pyplot as plt
#o = obs.set_index('Datetime').loc['2017-03-15 00:00:00':'2017-03-15 23:45:00'][['11']].copy()
#p = preds[1].set_index('Datetime').loc['2017-03-15 00:00:00':'2017-03-15 23:45:00'][['11']].copy()
#o.columns = ['obs']
#p.columns = ['rf4']
#o.join(p).plot(figsize=(10,4))
#plt.show()