In [1]:
import sys 
sys.path.insert(0, '../')
import utils

from tqdm import tqdm
import pandas as pd
from datetime import datetime as dtlib
import os


In [2]:
# --- Input files -------------------------------------------------------------
# Observations plus one prediction file per model. `preds_path` and
# `models_name` are kept in the same order because they are consumed
# position-by-position further down (zip of loaded predictions and names).
obs_path = '../../data/clean_data/obs_mtl.csv'
preds_path = [
    '../../data/clean_data/pred_rf2.csv',
    '../../data/clean_data/pred_rf4.csv',
]
models_name = [
    'rf2',
    'rf4',
]

# --- Aggregation functions and their labels ----------------------------------
# NOTE(review): each *_name list presumably labels the function at the same
# position in the matching *_function list -- confirm against the consumers.
agg_function = [
    utils.agg_max,
    utils.agg_min,
    utils.agg_maxabs_sign_at_1,
    utils.agg_maxabs_sign_at_4,
]
agg_function_name = [
    'agg_max',
    'agg_min',
    'agg_maxabs_sign_at_1',
    'agg_maxabs_sign_at_4',
]

# --- Normalisation functions and their labels --------------------------------
norm_function = [
    utils.norm_allcolumns_allrows,
    utils.norm_allcolumns_perday,
    utils.norm_percolumn_allrows,
    utils.norm_percolumn_perday,
]
norm_function_name = [
    'norm_allstop_allperiod',
    'norm_allstop_perday',
    'norm_perstop_allperiod',
    'norm_perstop_perday',
]

# --- Year-level normalisation functions and their labels ---------------------
norm_year_function = [
    utils.norm_allcolumns_allrows,
    utils.norm_percolumn_allrows,
    utils.norm_allcolumns_perrow,
]
norm_year_function_name = [
    'norm_allstop_allday',
    'norm_perstop_allday',
    'norm_allstop_perday',
]

# Name of the timestamp column used as index throughout the notebook.
index_ = 'Datetime'

# --- Output roots (both end with a trailing slash; paths are built by
# string formatting, not os.path.join) ----------------------------------------
json_path = '../../dashboards/data/json/'
csv_path = '../../dashboards/data/csv/'

In [3]:
# Load the observation table and one prediction table per model.
# `preds` keeps the order of `preds_path`, so preds[k] belongs to models_name[k].
obs = pd.read_csv(obs_path)
preds = [pd.read_csv(pred_file) for pred_file in preds_path]

In [4]:
# Make sure both output roots exist before anything is written.
# exist_ok=True makes the call idempotent and avoids the race between an
# os.path.exists() check and the subsequent creation.
for out_dir in (json_path, csv_path):
    os.makedirs(out_dir, exist_ok=True)

In [5]:
# Dates

In [6]:
# Build the calendar metadata from the observation timestamps.
# The timestamps are handled as strings: characters 0-3 are the year,
# 5-9 the 'mm-dd' part, 0-9 the full day and 11 onward the time step.
timestamps = obs[index_].values

dates = {
    'year': {},
    'year_list': sorted({stamp[:4] for stamp in timestamps}),
    'timestep_list': sorted({stamp[11:] for stamp in timestamps}),
}

# One [year, 'mm-dd'] row per distinct day, indexed by year for lookup.
data = [[day[:4], day[5:10]] for day in sorted({stamp[:10] for stamp in timestamps})]
days = pd.DataFrame(data=data, columns=['year', 'day']).set_index('year')

for year in dates['year_list']:
    # All 'mm-dd' strings available for this year, in sorted order.
    month_days = days.loc[year].values.flatten().tolist()
    # Parse each day once; month and weekday names are both derived from it.
    parsed_days = [dtlib.strptime(year + '-' + md, '%Y-%m-%d') for md in month_days]
    dates['year'][year] = {
        'mm': [md[:2] for md in month_days],
        'month_en': [parsed.strftime("%B") for parsed in parsed_days],
        'mm-dd': month_days,
        'weekday_en': [parsed.strftime("%A") for parsed in parsed_days],
    }

In [7]:
# Create Observation\_data

In [8]:
# Split the observations into one frame per year.
# The label slice year:(year+1) runs on the string timestamps, so it keeps
# every row whose timestamp sorts between 'YYYY' and the next year's 'YYYY'.
obs_by_time = obs.set_index(index_)
observation_data = {
    year: obs_by_time.loc[year:str(int(year) + 1)].reset_index()
    for year in dates['year_list']
}

In [9]:
# Save Observation\_data

In [10]:
# Write one stacked CSV per day: <csv_path>observation/<yyyy>/<mm>/<dd>/observation.csv
for y in tqdm(dates['year_list']):
    # --- per-year preparation (done once instead of once per day) ----------
    year_df = observation_data[y]
    # Every column except the timestamp column.
    stop_id = year_df.set_index(index_).columns.values.tolist()
    stamps = year_df[index_].values
    # 'YYYY-MM-DD' key for each row, aligned on the frame's own index.
    ymd = pd.Series([s[:10] for s in stamps], index=year_df.index)
    # Same rows, but the timestamp column now only holds the time-of-day part.
    prepared = year_df[[index_] + stop_id].copy()
    prepared[index_] = [s[11:] for s in stamps]
    # Grouping once gives cheap per-day lookups; get_group always returns a
    # DataFrame, even when a day has a single row (plain .loc[label] would
    # collapse that case to a Series).
    day_frames = prepared.groupby(ymd, sort=False)

    for md in dates['year'][y]['mm-dd']:
        m = md[:2]
        d = md[3:]
        path = '{}observation/{}/{}/{}/'.format(csv_path, y, m, d)
        os.makedirs(path, exist_ok=True)
        path_file = '{}observation.csv'.format(path)
        df = day_frames.get_group('{}-{}'.format(y, md)).reset_index(drop=True)
        utils.stack_df(df).to_csv(path_file, index=False)

100%|██████████| 3/3 [00:55<00:00, 18.02s/it]


In [11]:
# Create Prediction\_data

In [12]:
# Split every model's predictions into one frame per year:
# prediction_data[year]['preds_<model>'] -> DataFrame for that year.
prediction_data = {year: {} for year in dates['year_list']}
for model_name, model_preds in zip(models_name, preds):
    preds_by_time = model_preds.set_index(index_)
    for year in dates['year_list']:
        # String label slice from 'YYYY' up to the next year's 'YYYY'.
        yearly = preds_by_time.loc[year:str(int(year) + 1)]
        prediction_data[year]['preds_' + model_name] = yearly.reset_index()
        

In [13]:
# Save Prediction\_data

In [14]:
# Write one stacked CSV per day and model: <csv_path>preds/<yyyy>/<mm>/<dd>/preds_<model>.csv
for y in tqdm(dates['year_list']):
    # --- per-year preparation (done once per model instead of once per day) -
    day_frames = {}
    for mn in models_name:
        year_df = prediction_data[y]['preds_'+mn]
        # Every column except the timestamp column.
        stop_id = year_df.set_index(index_).columns.values.tolist()
        stamps = year_df[index_].values
        # 'YYYY-MM-DD' key for each row, aligned on the frame's own index.
        ymd = pd.Series([s[:10] for s in stamps], index=year_df.index)
        # Same rows, but the timestamp column now only holds the time-of-day part.
        prepared = year_df[[index_] + stop_id].copy()
        prepared[index_] = [s[11:] for s in stamps]
        # get_group always returns a DataFrame, even for a single-row day
        # (plain .loc[label] would collapse that case to a Series).
        day_frames[mn] = prepared.groupby(ymd, sort=False)

    for md in dates['year'][y]['mm-dd']:
        m = md[:2]
        d = md[3:]
        # The target directory is shared by all models of the same day.
        path = '{}preds/{}/{}/{}/'.format(csv_path, y, m, d)
        os.makedirs(path, exist_ok=True)
        for mn in models_name:
            path_file = '{}{}.csv'.format(path, 'preds_'+mn)
            df = day_frames[mn].get_group('{}-{}'.format(y, md)).reset_index(drop=True)
            utils.stack_df(df).to_csv(path_file, index=False)

100%|██████████| 3/3 [02:01<00:00, 41.19s/it]
