# RF INVERTED - multi/uni -> UNI (one model per time series)

In [1]:
import sys
sys.path.insert(0, '../../utils/')
from utils import *
from pylab import *
from utils_date import *
import pickle
from tqdm import tqdm
import copy

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

import itertools

In [2]:
def create_xy_dataset(df_Xy, time_series, features_exogenous, features_context):
    df_Xy = copy.deepcopy(df_Xy[time_series+features_exogenous+features_context].dropna())
    days = sorted(list(set([i[:10] for i in df_Xy.index.values])))
    Xnames = [f+'-T'+str(ix)for f in features_exogenous for ix in np.arange(96)] + features_context
    
    X = []
    list_y=[]
    for d in tqdm(days,desc='Days loop'):
        ex = df_Xy.loc[d+' 00:00:00': d+ ' 23:45:00'][features_exogenous].values.T.flatten()
        co = df_Xy.loc[[d+' 00:00:00']][features_context].values.flatten()
        X.append(np.concatenate([ex, co]))
        y = []
        for s in time_series:
            y.append(df_Xy.loc[d+' 00:00:00': d+ ' 23:45:00'][s].values)
        list_y.append(y)
        
    return np.array(X), np.swapaxes(np.array(list_y),0,1), Xnames, days

In [3]:
AT=5
def mse(obs, pred):
    return ((pred - obs) ** 2).mean()

def rmse(obs, pred):
    return np.sqrt(mse(obs, pred))

def mae(obs, pred):
    return np.absolute(pred - obs).mean()

def mape_at(obs, pred):
    mask = obs >= AT
    return ((np.absolute(pred[mask] - obs[mask]) / obs[mask]).mean())*100

In [27]:
def fit_predict(X, y, cv, param_grid, scaler_choice_X, scaler_choice_y):
    
    pred_train_array = []
    pred_val_array = []
    ytrain_array = []
    yval_array = []
    
    for ix_train,ix_val in cv.split(X):
        Xtrain,Xval = X[ix_train], X[ix_val]
        ytrain,yval = y[ix_train], y[ix_val]

        scalerX = None
        scalery = None
        if scaler_choice_X == 'minmax':
            scalerX = MinMaxScaler(feature_range=(0, 1))
        elif scaler_choice_X == 'standard':
            scalerX = StandardScaler()
        if scalerX != None:
            Xtrain = scalerX.fit_transform(Xtrain)
            Xval = scalerX.transform(Xval)

        if scaler_choice_y == 'minmax':
            scalery = MinMaxScaler(feature_range=(0, 1))
        elif scaler_choice_y == 'standard':
            scalery = StandardScaler()
        if scalery != None:
            ytrain = scalery.fit_transform(ytrain)
            yval = scalery.transform(yval)

        keys, values = zip(*param_grid.items())
        all_params = [dict(zip(keys, v)) for v in itertools.product(*values)]
        pred_train_params = []
        pred_val_params = []
        for p in all_params:
            rf = RandomForestRegressor(**p, verbose=0)
            rf.fit(Xtrain,ytrain)
            pred_train = rf.predict(Xtrain)
            pred_val = rf.predict(Xval)
            pred_train_params.append(pred_train)
            pred_val_params.append(pred_val)

        pred_train_array.append(pred_train_params)
        pred_val_array.append(pred_val_params)
        ytrain_array.append(ytrain)
        yval_array.append(yval)

    pred_train_array = np.array(pred_train_array)
    pred_val_array = np.array(pred_val_array)
    ytrain_array = np.array(ytrain_array)
    yval_array = np.array(yval_array) 
    
    return pred_train_array, pred_val_array, ytrain_array, yval_array


def optimize(X, y_list, param_kfold, time_series):

    cv = KFold(**param_kfold)
    
    # loop over y_list
    pred_train_all = []
    pred_val_all = []
    ytrain_all = []
    yval_all = []

    for y, ts in tqdm(zip(y_list,time_series),desc='Optimization time_series'):
        pred_train_array, pred_val_array, ytrain_array, yval_array = fit_predict(X, y, cv, param_grid, scaler_choice_X,
                                                                                 scaler_choice_y)
        pred_train_all.append(pred_train_array)
        pred_val_all.append(pred_val_array)
        ytrain_all.append(ytrain_array)
        yval_all.append(yval_array)

    pred_train_all = np.array(pred_train_all)
    pred_val_all = np.array(pred_val_all)

    ytrain_all = np.array(ytrain_all)
    yval_all = np.array(yval_all)

    # Errors calculus
    pred_train = np.swapaxes(pred_train_all, 0, 2)
    obs_train = np.swapaxes(ytrain_all, 0, 1)
    pred_val = np.swapaxes(pred_val_all, 0, 2)
    obs_val = np.swapaxes(yval_all, 0, 1)

    errors_function = [rmse, mse, mae, mape_at]
    errors_name = ['rmse', 'mse', 'mae', 'mape_at']
    grid_search_dict={'train':{}, 'val':{}}
    for ef,en in zip(errors_function, errors_name):

        grid_search_dict['train'][en]={}
        grid_search_dict['train'][en]['error'] = np.array([np.array([ef(np.concatenate(pred_train[ixp][ixcv]), np.concatenate(obs_train[ixcv])) 
                            for ixcv in range(pred_train.shape[1])]) for ixp in range(pred_train.shape[0])])

        grid_search_dict['train'][en]['mean'] = grid_search_dict['train'][en]['error'].mean(axis=1) 
        grid_search_dict['train'][en]['std'] = grid_search_dict['train'][en]['error'].std(axis=1) 

        grid_search_dict['val'][en]={}
        grid_search_dict['val'][en]['error'] = np.array([np.array([ef(np.concatenate(pred_val[ixp][ixcv]), np.concatenate(obs_val[ixcv])) 
                            for ixcv in range(pred_val.shape[1])]) for ixp in range(pred_val.shape[0])])

        grid_search_dict['val'][en]['mean'] = grid_search_dict['val'][en]['error'].mean(axis=1) 
        grid_search_dict['val'][en]['std'] = grid_search_dict['val'][en]['error'].std(axis=1) 

    return grid_search_dict


def pred_list_to_dataframe(pred_list, time_series, days):
    data = [j for i in [build_timestamp_list(d+' 00:00:00', d+ ' 23:45:00') for d in days] for j in i]
    df = pd.DataFrame(data=data, columns=['Datetime'])
    for ix, ts in enumerate(time_series):
        df[ts] = pred_list[ix].reshape(pred_list[ix].shape[0]*pred_list[ix].shape[1])
    return df

In [5]:
# Data

observation_data_path = ['/home/toque/data2/montreal/stm/data/valid_metro_15min_2015_2016_2017_sumpass_nodayfree.csv']
exogenous_data_path = ['/home/toque/data2/montreal/events/data/clean/events_2015_2018_end_event_stopid.csv',
                       '/home/toque/data2/montreal/events/data/clean/events_2015_2018_start_event_stopid.csv',
                       '/home/toque/data2/montreal/events/data/clean/events_2015_2018_period_event_stopid.csv',
                       '/home/toque/data2/weather/predicted_weather/predicted_weather_2015_2017_included_perday_pm.csv'
                      ]
context_data_path = ['/home/toque/data2/date/2013-01-01-2019-01-01_new.csv']

df_observation = read_csv_list(observation_data_path)
df_exogenous = read_csv_list(exogenous_data_path)
df_context = read_csv_list(context_data_path)

# fill timestamps not available with 0 to have 96 timestamps per day
days = sorted(list(set([i[:10] for i in df_observation['Datetime'].values])))
timestamp_list = [j for i in [build_timestamp_list(d+' 00:00:00', d+' 23:45:00', time_step_second=15*60) for d in days] for j in i]
df_date = pd.DataFrame(data = timestamp_list, columns = ['Datetime']).set_index('Datetime')
df_observation = df_date.join(df_observation.set_index('Datetime')).fillna(0).reset_index()

In [3]:
time_series = ['11', '32', '34', '15', '44', '65', '31', '33', '35', '47', '13',
       '14', '1', '9', '5', '18', '36', '24', '68', '43', '8', '64', '10',
       '55', '3', '49', '51', '2', '19', '56', '7', '6', '4', '48', '66',
       '25', '23', '28', '39', '54', '60', '27', '20', '46', '12', '21',
       '62', '52', '41', '50', '30', '16', '37', '40', '26', '67', '57',
       '61', '42', '45', '38', '29', '58', '63', '22', '59', '53', '17']

features_exogenous = ['5-end_event', '11-end_event', '12-end_event', '13-end_event',
       '15-end_event', '16-end_event', '23-end_event', '24-end_event',
       '31-end_event', '32-end_event', '35-end_event', '43-end_event',
       '45-end_event', '61-end_event', '68-end_event', '5-start_event',
       '11-start_event', '12-start_event', '13-start_event',
       '15-start_event', '16-start_event', '23-start_event',
       '24-start_event', '31-start_event', '32-start_event',
       '35-start_event', '43-start_event', '45-start_event',
       '61-start_event', '68-start_event', '5-period_event',
       '11-period_event', '12-period_event', '13-period_event',
       '15-period_event', '16-period_event', '23-period_event',
       '24-period_event', '31-period_event', '32-period_event',
       '35-period_event', '43-period_event', '45-period_event',
       '61-period_event', '68-period_event']

features_context = ['Day_id', 'Mois_id','vac_noel_quebec', 'day_off_quebec', '24DEC', '31DEC',
                    'renov_beaubien', 'vac_udem1', 'vac_udem2']

scaler_choice_X = None
scaler_choice_y = None

param_kfold={
    'n_splits': 5,
    'shuffle': True,
    'random_state': 1}

param_grid={
    'n_estimators': [100, 150, 200],
    'max_features': ['auto',None],
    'max_depth': [None],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1,5,10],
    'n_jobs': [6],
    'criterion': ['mse']}

param_kfold={
    'n_splits': 2,
    'shuffle': True,
    'random_state': 1}

param_grid={
    'n_estimators': [10,20],
    'max_features': ['auto'],
    'max_depth': [None],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'n_jobs': [6],
    'criterion': ['mse']}

time_series = ['11', '32', '34', '15']

start_datetime, end_datetime = '2015-01-01 00:00:00', '2016-12-31 23:45:00'

model_name = 'rf_uni_inverted_event15'



# Optimisation

In [11]:
df_Xy = df_observation.set_index('Datetime').join([df_context.set_index('Datetime'), df_exogenous.set_index('Datetime')])

df_Xy_train = df_Xy[start_datetime:end_datetime]
Xtrain, ytrain_list, Xnames, days = create_xy_dataset(df_Xy_train, time_series, features_exogenous, features_context)

Days loop: 100%|██████████| 731/731 [00:01<00:00, 628.77it/s]


In [12]:
grid_search_dict = optimize(Xtrain, ytrain_list, param_kfold, time_series)

Optimization time_series: 4it [00:07,  1.77s/it]


In [15]:
save_pickle('/home/toque/data2/forecast/model/rf_uni_inverted/optimize/'+model_name+'/grid_search_dict.pkl', grid_search_dict)

In [9]:
grid_search_dict[val]

{'train': {'mae': {'error': array([[  6.73780586,   6.85801895,   6.72338605,   6.88054697,
             6.94202613],
          [ 12.51094189,  12.57533687,  12.56324214,  12.58461585,
            12.63133003],
          [ 14.81333577,  14.42503064,  14.82287561,  14.54895842,
            14.55412733],
          [  9.53294771,   9.61047125,   9.59010241,   9.63938784,
             9.63580201],
          [ 12.50589423,  12.57247808,  12.56392821,  12.58660306,
            12.62857325],
          [ 14.83479956,  14.43130091,  14.81689795,  14.5450778 ,
            14.57753745],
          [ 11.09703458,  11.21036976,  11.18257938,  11.23899798,
            11.21706199],
          [ 12.50365623,  12.57753594,  12.56594295,  12.58825079,
            12.63617389],
          [ 14.83235865,  14.44833073,  14.82415991,  14.53424399,
            14.54928177],
          [  6.72737512,   6.83776948,   6.70436076,   6.86020071,
             6.92524893],
          [ 12.49951296,  12.56709474,  12.55

# Get best params and learn with best params

In [10]:
model_name = 'mt_rf_uni_inverted_events15'
param_grid={
    'n_estimators': [100, 150, 200],
    'max_features': ['auto',None],
    'max_depth': [None],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1,5,10],
    'n_jobs': [6],
    'criterion': ['mse']}

grid_search_dict = load_pickle('/home/toque/data2/forecast/model/rf_uni_inverted/optimize/'+model_name+'/grid_search_dict.pkl')

best_arg = grid_search_dict['val']['rmse']['mean'].argmin()
keys, values = zip(*param_grid.items())
all_params = [dict(zip(keys, v)) for v in itertools.product(*values)]
best_params = all_params[best_arg]


In [11]:
best_params


{'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'n_estimators': 100,
 'n_jobs': 6}

In [20]:
df_Xy = df_observation.set_index('Datetime').join([df_context.set_index('Datetime'), df_exogenous.set_index('Datetime')])
df_Xy_train = df_Xy[start_datetime:end_datetime]
Xtrain, ytrain_list, Xnames, days = create_xy_dataset(df_Xy_train, time_series, features_exogenous, features_context)

rf_list = []
for ytrain in tqdm(ytrain_list):
    rf = RandomForestRegressor(**best_params, verbose=0)
    rf.fit(Xtrain,ytrain)
    rf_list.append(rf)
    
# Save models
#save_pickle('/home/toque/data2/forecast/model/rf_uni_inverted/optimize/'+model_name+'/list_rf_uni_inverted.pkl', rf_list)   

Days loop: 100%|██████████| 731/731 [00:01<00:00, 627.99it/s]
100%|██████████| 4/4 [00:01<00:00,  3.13it/s]


# Predict

In [24]:
start_datetime, end_datetime = '2015-01-01 00:00:00', '2017-12-31 23:45:00'
df_Xy_test = df_Xy[start_datetime:end_datetime]
Xtest, ytest_list, Xnames, days_test = create_xy_dataset(df_Xy_test, time_series, features_exogenous, features_context)

Days loop: 100%|██████████| 1093/1093 [00:01<00:00, 632.12it/s]


In [32]:
path_directory_to_save = '/home/toque/data2/forecast/model/rf_uni_inverted/prediction/'+model_name+'/'
pred_list = []
for rf in tqdm(rf_list):
    pred_list.append(rf.predict(Xtest))
pred_list = np.array(pred_list)


df_res = pred_list_to_dataframe(pred_list, time_series, days_test)

if not os.path.exists(path_directory_to_save):
    os.makedirs(path_directory_to_save)

df_res.to_csv(path_directory_to_save + start_datetime[:10] + "_" + end_datetime[:10] + '.csv', index=False)


100%|██████████| 4/4 [00:00<00:00,  9.05it/s]


# Draft

In [130]:
p= {'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 200,
 'n_jobs': 6}
rf_list_15stations = []
for ytrain in tqdm(ytrain_list):
    rf = RandomForestRegressor(**p, verbose=0)
    rf.fit(Xtrain,ytrain)
    rf_list_15stations.append(rf)


100%|██████████| 68/68 [02:08<00:00,  1.89s/it]


In [139]:
start_datetime, end_datetime = '2015-01-01 00:00:00', '2017-12-31 23:45:00'
df_Xy_pred = df_Xy[start_datetime:end_datetime]
Xpred, ypred_list, Xnames, days_pred = create_xy_dataset(df_Xy_pred, time_series, features_exogenous, features_context)

pred_list = []
for rf in tqdm(rf_list_15stations):
    pred_list.append(rf.predict(Xpred))
pred_list = np.array(pred_list)

print(mape_at(ypred_list, pred_list))
print(rmse(ypred_list, pred_list))
print(mse(ypred_list, pred_list))
print(mae(ypred_list, pred_list))

df = pred_list_to_dataframe(pred_list, time_series, days_pred)

df.to_csv('/home/toque/data2/forecast/model/rf_inverted/prediction/rf_inverted_15stationsexo_contextcal_withoutoptim6min/2015-01-01_2016-12-31.csv',index = False)

Days loop: 100%|██████████| 1093/1093 [00:09<00:00, 118.02it/s]
100%|██████████| 68/68 [00:14<00:00,  4.82it/s]


17.6100480166
29.609964402
876.749991889
12.6160008791


# 10 stations

In [134]:
features_exogenous = [ '12-start_event',
  '13-start_event', '15-start_event', '16-start_event',
  '23-start_event', '24-start_event', '31-start_event',
  '32-start_event', '45-start_event', '61-start_event', '12-end_event', '13-end_event', '15-end_event',
  '16-end_event', '23-end_event', '24-end_event', '31-end_event', '32-end_event', '45-end_event',
  '61-end_event', '12-period_event', '13-period_event', '15-period_event',
  '16-period_event', '23-period_event', '24-period_event',
  '31-period_event', '32-period_event', '45-period_event', '61-period_event']


df_Xy = df_observation.set_index('Datetime').join([df_context.set_index('Datetime'), df_exogenous.set_index('Datetime')])

start_datetime, end_datetime = '2015-01-01 00:00:00', '2016-12-31 23:45:00'
df_Xy_train = df_Xy[start_datetime:end_datetime]
Xtrain, ytrain_list, Xnames, days = create_xy_dataset(df_Xy_train, time_series, features_exogenous, features_context)

rf_list_10stations = []
for ytrain in tqdm(ytrain_list):
    rf = RandomForestRegressor(**p, verbose=0)
    rf.fit(Xtrain,ytrain)
    rf_list_10stations.append(rf)


Days loop: 100%|██████████| 731/731 [00:06<00:00, 116.31it/s]
100%|██████████| 68/68 [01:42<00:00,  1.51s/it]


In [135]:
start_datetime, end_datetime = '2015-01-01 00:00:00', '2017-12-31 23:45:00'
df_Xy_pred = df_Xy[start_datetime:end_datetime]
Xpred, ypred_list, Xnames, days_pred = create_xy_dataset(df_Xy_pred, time_series, features_exogenous, features_context)

Days loop: 100%|██████████| 1093/1093 [00:09<00:00, 117.12it/s]


In [136]:
pred_list = []
for rf in tqdm(rf_list_10stations):
    pred_list.append(rf.predict(Xpred))
pred_list = np.array(pred_list)

print(mape_at(ypred_list, pred_list))
print(rmse(ypred_list, pred_list))
print(mse(ypred_list, pred_list))
print(mae(ypred_list, pred_list))

df = pred_list_to_dataframe(pred_list, time_series, days_pred)

df.to_csv('/home/toque/data2/forecast/model/rf_inverted/prediction/rf_inverted_10stationsexo_contextcal_withoutoptim6min/2015-01-01_2016-12-31.csv',index = False)

100%|██████████| 68/68 [00:14<00:00,  4.74it/s]


17.6349150232
29.6660906757
880.076935981
12.6231035849


# 0 stations

In [141]:
features_exogenous = []


df_Xy = df_observation.set_index('Datetime').join([df_context.set_index('Datetime'), df_exogenous.set_index('Datetime')])

start_datetime, end_datetime = '2015-01-01 00:00:00', '2016-12-31 23:45:00'
df_Xy_train = df_Xy[start_datetime:end_datetime]
Xtrain, ytrain_list, Xnames, days = create_xy_dataset(df_Xy_train, time_series, features_exogenous, features_context)

rf_list_0stations = []
for ytrain in tqdm(ytrain_list):
    rf = RandomForestRegressor(**p, verbose=0)
    rf.fit(Xtrain,ytrain)
    rf_list_0stations.append(rf)


Days loop: 100%|██████████| 731/731 [00:06<00:00, 115.23it/s]
100%|██████████| 68/68 [00:22<00:00,  2.98it/s]


In [142]:
start_datetime, end_datetime = '2015-01-01 00:00:00', '2017-12-31 23:45:00'
df_Xy_pred = df_Xy[start_datetime:end_datetime]
Xpred, ypred_list, Xnames, days_pred = create_xy_dataset(df_Xy_pred, time_series, features_exogenous, features_context)

Days loop: 100%|██████████| 1093/1093 [00:09<00:00, 115.44it/s]


In [145]:
pred_list = []
for rf in tqdm(rf_list_0stations):
    pred_list.append(rf.predict(Xpred))
pred_list = np.array(pred_list)

print(mape_at(ypred_list, pred_list))
print(rmse(ypred_list, pred_list))
print(mse(ypred_list, pred_list))
print(mae(ypred_list, pred_list))

df = pred_list_to_dataframe(pred_list, time_series, days_pred)

df.to_csv('/home/toque/data2/forecast/model/rf_inverted/prediction/rf_inverted_0stationsexo_contextcal_withoutoptim6min/2015-01-01_2016-12-31.csv',index = False)

100%|██████████| 68/68 [00:13<00:00,  4.90it/s]


19.0824746327
34.2750358755
1174.77808426
13.6830421636


# Meteo day

In [152]:
features_context = ['Day_id', 'Mois_id','vac_noel_quebec', 'day_off_quebec', '24DEC', '31DEC',
                    'renov_beaubien', 'vac_udem1', 'vac_udem2', 'Temperature_min_celcius', 'Temperature_max_celcius',
                    'Humidex_celcius', 'Windchill_celcius', 'Probability', 'Water_height_mm', 'Snow_height_cm']
features_exogenous = []

df_Xy = df_observation.set_index('Datetime').join([df_context.set_index('Datetime'), df_exogenous.set_index('Datetime')])

start_datetime, end_datetime = '2015-01-01 00:00:00', '2016-12-31 23:45:00'
df_Xy_train = df_Xy[start_datetime:end_datetime]
Xtrain, ytrain_list, Xnames, days = create_xy_dataset(df_Xy_train, time_series, features_exogenous, features_context)

rf_list_meteo = []
for ytrain in tqdm(ytrain_list):
    rf = RandomForestRegressor(**p, verbose=0)
    rf.fit(Xtrain,ytrain)
    rf_list_meteo.append(rf)


Days loop: 100%|██████████| 731/731 [00:06<00:00, 106.66it/s]
100%|██████████| 68/68 [00:31<00:00,  2.14it/s]


In [153]:
start_datetime, end_datetime = '2015-01-01 00:00:00', '2017-12-31 23:45:00'
df_Xy_pred = df_Xy[start_datetime:end_datetime]
Xpred, ypred_list, Xnames, days_pred = create_xy_dataset(df_Xy_pred, time_series, features_exogenous, features_context)

Days loop: 100%|██████████| 1027/1027 [00:09<00:00, 104.33it/s]


In [156]:
pred_list = []
for rf in tqdm(rf_list_meteo):
    pred_list.append(rf.predict(Xpred))
pred_list = np.array(pred_list)

print(mape_at(ypred_list, pred_list))
print(rmse(ypred_list, pred_list))
print(mse(ypred_list, pred_list))
print(mae(ypred_list, pred_list))

df = pred_list_to_dataframe(pred_list, time_series, days_pred)

df.to_csv('/home/toque/data2/forecast/model/rf_inverted/prediction/rf_inverted_0stationsexo_contextcalmeteoday_withoutoptim6min/2015-01-01_2016-12-31.csv',index = False)

100%|██████████| 68/68 [00:14<00:00,  4.78it/s]


18.2560931804
31.0098156654
961.608667603
12.7690460991


# Meteo day + event

In [157]:
features_context = ['Day_id', 'Mois_id','vac_noel_quebec', 'day_off_quebec', '24DEC', '31DEC',
                    'renov_beaubien', 'vac_udem1', 'vac_udem2', 'Temperature_min_celcius', 'Temperature_max_celcius',
                    'Humidex_celcius', 'Windchill_celcius', 'Probability', 'Water_height_mm', 'Snow_height_cm']
features_exogenous = ['5-end_event', '11-end_event', '12-end_event', '13-end_event',
       '15-end_event', '16-end_event', '23-end_event', '24-end_event',
       '31-end_event', '32-end_event', '35-end_event', '43-end_event',
       '45-end_event', '61-end_event', '68-end_event', '5-start_event',
       '11-start_event', '12-start_event', '13-start_event',
       '15-start_event', '16-start_event', '23-start_event',
       '24-start_event', '31-start_event', '32-start_event',
       '35-start_event', '43-start_event', '45-start_event',
       '61-start_event', '68-start_event', '5-period_event',
       '11-period_event', '12-period_event', '13-period_event',
       '15-period_event', '16-period_event', '23-period_event',
       '24-period_event', '31-period_event', '32-period_event',
       '35-period_event', '43-period_event', '45-period_event',
       '61-period_event', '68-period_event']

df_Xy = df_observation.set_index('Datetime').join([df_context.set_index('Datetime'), df_exogenous.set_index('Datetime')])

start_datetime, end_datetime = '2015-01-01 00:00:00', '2016-12-31 23:45:00'
df_Xy_train = df_Xy[start_datetime:end_datetime]
Xtrain, ytrain_list, Xnames, days = create_xy_dataset(df_Xy_train, time_series, features_exogenous, features_context)

rf_list_meteo15stations = []
for ytrain in tqdm(ytrain_list):
    rf = RandomForestRegressor(**p, verbose=0)
    rf.fit(Xtrain,ytrain)
    rf_list_meteo15stations.append(rf)


Days loop: 100%|██████████| 731/731 [00:06<00:00, 105.55it/s]
100%|██████████| 68/68 [02:17<00:00,  2.02s/it]


In [158]:
start_datetime, end_datetime = '2015-01-01 00:00:00', '2017-12-31 23:45:00'
df_Xy_pred = df_Xy[start_datetime:end_datetime]
Xpred, ypred_list, Xnames, days_pred = create_xy_dataset(df_Xy_pred, time_series, features_exogenous, features_context)

Days loop: 100%|██████████| 1027/1027 [00:09<00:00, 107.15it/s]


In [159]:
pred_list = []
for rf in tqdm(rf_list_meteo15stations):
    pred_list.append(rf.predict(Xpred))
pred_list = np.array(pred_list)

print(mape_at(ypred_list, pred_list))
print(rmse(ypred_list, pred_list))
print(mse(ypred_list, pred_list))
print(mae(ypred_list, pred_list))

df = pred_list_to_dataframe(pred_list, time_series, days_pred)

df.to_csv('/home/toque/data2/forecast/model/rf_inverted/prediction/rf_inverted_15stationsexo_contextcalmeteoday_withoutoptim6min/2015-01-01_2016-12-31.csv',index = False)

100%|██████████| 68/68 [00:14<00:00,  4.63it/s]


17.2203841311
28.0771396727
788.3257722
12.1563958957


# Learn with best params/features

# Predict

# Prediction -> csv

In [None]:
def pred_list_to_dataframe(pred_list, time_series, days):
    data = [j for i in [build_timestamp_list(d+' 00:00:00', d+ ' 23:45:00') for d in days] for j in i]
    df = pd.DataFrame(data=data, columns=['Datetime'])
    for ix, ts in enumerate(time_series):
        df[ts] = pred_list[ix].reshape(pred_list[ix].shape[0]*pred_list[ix].shape[1])
    return df