# RF UNI - multi/uni -> uni (one model per time series)

In [1]:
import sys
sys.path.insert(0, '../../utils/')
from utils import *
from pylab import *
from utils_date import *
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

from sklearn.metrics.scorer import make_scorer

from tqdm import tqdm
import itertools

In [39]:
def _make_scaler(choice):
    """Return a fresh scaler for ``choice`` ('minmax' or 'standard'), else None."""
    if choice == 'minmax':
        return MinMaxScaler(feature_range=(0, 1))
    if choice == 'standard':
        return StandardScaler()
    return None


def fit_predict(X, y, cv, param_grid, scaler_choice_X, scaler_choice_y):
    """Fit one random forest per (fold, parameter combination) and predict.

    Parameters
    ----------
    X : array indexable with the index arrays produced by ``cv.split``.
    y : target array indexable the same way as ``X``.
    cv : object exposing ``split(X)`` and yielding ``(ix_train, ix_val)``.
    param_grid : dict mapping a ``RandomForestRegressor`` keyword to the list
        of values to try; the full Cartesian product is evaluated.
    scaler_choice_X, scaler_choice_y : 'minmax', 'standard' or anything else
        (e.g. ``None``) for no scaling of the features / of the target.

    Returns
    -------
    pred_train_array, pred_val_array : arrays indexed ``[fold][combination]``
        holding the predictions on the training / validation part.
    ytrain_array, yval_array : arrays indexed ``[fold]`` holding the
        (possibly scaled) observed targets.

    Notes
    -----
    When the target is scaled, predictions and returned targets are both in
    the scaled space (no inverse transform is applied).
    """
    # The grid does not depend on the fold, so expand it once up front.
    keys, values = zip(*param_grid.items())
    all_params = [dict(zip(keys, v)) for v in itertools.product(*values)]

    pred_train_array = []
    pred_val_array = []
    ytrain_array = []
    yval_array = []

    for ix_train, ix_val in cv.split(X):
        Xtrain, Xval = X[ix_train], X[ix_val]
        ytrain, yval = y[ix_train], y[ix_val]

        # Scalers are fitted on the training part only and then applied to
        # the validation part, so no validation statistics leak in.
        scalerX = _make_scaler(scaler_choice_X)
        if scalerX is not None:
            Xtrain = scalerX.fit_transform(Xtrain)
            Xval = scalerX.transform(Xval)

        scalery = _make_scaler(scaler_choice_y)
        if scalery is not None:
            if np.ndim(ytrain) == 1:
                # sklearn scalers only accept 2-D input: scale the target as
                # a single column and flatten it back afterwards.
                ytrain = scalery.fit_transform(ytrain.reshape(-1, 1)).ravel()
                yval = scalery.transform(yval.reshape(-1, 1)).ravel()
            else:
                ytrain = scalery.fit_transform(ytrain)
                yval = scalery.transform(yval)

        pred_train_params = []
        pred_val_params = []
        for p in all_params:
            rf = RandomForestRegressor(**p, verbose=1)
            rf.fit(Xtrain, ytrain)
            pred_train_params.append(rf.predict(Xtrain))
            pred_val_params.append(rf.predict(Xval))

        pred_train_array.append(pred_train_params)
        pred_val_array.append(pred_val_params)
        ytrain_array.append(ytrain)
        yval_array.append(yval)

    # NOTE(review): when the folds differ in size these nested lists are
    # ragged; recent NumPy releases refuse to build ragged arrays implicitly
    # -- confirm against the NumPy version in use.
    pred_train_array = np.array(pred_train_array)
    pred_val_array = np.array(pred_val_array)
    ytrain_array = np.array(ytrain_array)
    yval_array = np.array(yval_array)

    return pred_train_array, pred_val_array, ytrain_array, yval_array


def _error_table(error_fn, pred, obs):
    """Evaluate ``error_fn`` for every (parameter combination, fold) pair.

    ``pred`` is indexed ``[combination][fold][series]`` and ``obs`` is indexed
    ``[fold][series]``; the series of one fold are concatenated so that a
    single score is produced per (combination, fold).
    """
    return np.array([
        np.array([error_fn(np.concatenate(pred[ixp][ixcv]), np.concatenate(obs[ixcv]))
                  for ixcv in range(pred.shape[1])])
        for ixp in range(pred.shape[0])
    ])


def optimize(df_Xy, param_kfold, time_series, list_features, features_window, tminus, tplus):
    """Cross-validate every parameter combination for each candidate feature set.

    One model is fitted per time series (see ``fit_predict``); errors are then
    computed on the predictions of all series pooled together.

    Parameters
    ----------
    df_Xy : DataFrame holding the target columns and the feature columns.
    param_kfold : keyword arguments forwarded to ``KFold``.
    time_series : list of target column names.
    list_features : list of candidate feature sets (each a list of columns).
    features_window : feature names to expand over a time window, or ``None``.
        Only the ones present in the current feature set are used.
    tminus, tplus : window bounds forwarded to ``window_Xy``.

    Returns
    -------
    dict mapping ``tuple(features)`` to
    ``{'train'|'val': {metric: {'error', 'mean', 'std'}}}`` where ``error``
    has one row per parameter combination and one column per fold, and
    ``mean`` / ``std`` are taken over the folds.

    Notes
    -----
    ``param_grid``, ``scaler_choice_X`` and ``scaler_choice_y`` are read from
    the module namespace, as are ``window_Xy`` and the metric functions
    ``rmse``, ``mse``, ``mae`` and ``mape_at`` (not defined in this file;
    presumably provided by the star-imported ``utils`` module).
    """
    cv = KFold(**param_kfold)

    errors_function = [rmse, mse, mae, mape_at]
    errors_name = ['rmse', 'mse', 'mae', 'mape_at']

    grid_search_dict = {}
    for features in list_features:

        # create X and y_list
        if features_window is None:
            features_window_tmp = []
        else:
            features_window_tmp = [fw for fw in features_window if fw in features]

        if features_window_tmp:
            X, y_list, _ = window_Xy(df_Xy, time_series, features, features_window_tmp, tminus, tplus)
            # uni -> .T (iterate one target series at a time)
            y_list = y_list.T
        else:
            X = df_Xy[features].values
            y_list = df_Xy[time_series].values.T

        # loop over y_list: one independent grid evaluation per time series
        pred_train_all = []
        pred_val_all = []
        ytrain_all = []
        yval_all = []
        for y, ts in zip(y_list, time_series):
            print("Time series {}".format(ts))
            pred_train_array, pred_val_array, ytrain_array, yval_array = fit_predict(
                X, y, cv, param_grid, scaler_choice_X, scaler_choice_y)
            pred_train_all.append(pred_train_array)
            pred_val_all.append(pred_val_array)
            ytrain_all.append(ytrain_array)
            yval_all.append(yval_array)

        # Reorder the axes from [series][fold][combination] to
        # [combination][fold][series] (predictions) and from [series][fold]
        # to [fold][series] (observations).
        pred_train = np.swapaxes(np.array(pred_train_all), 0, 2)
        obs_train = np.swapaxes(np.array(ytrain_all), 0, 1)
        pred_val = np.swapaxes(np.array(pred_val_all), 0, 2)
        obs_val = np.swapaxes(np.array(yval_all), 0, 1)

        # Errors calculus
        errors_all = {'train': {}, 'val': {}}
        for ef, en in zip(errors_function, errors_name):
            for split, pred, obs in (('train', pred_train, obs_train),
                                     ('val', pred_val, obs_val)):
                error = _error_table(ef, pred, obs)
                errors_all[split][en] = {
                    'error': error,
                    'mean': error.mean(axis=1),
                    'std': error.std(axis=1),
                }

        grid_search_dict[tuple(features)] = errors_all

    return grid_search_dict

In [40]:
# Data
# Absolute paths to the input CSV files: one observation file and four
# exogenous files (three event files plus one calendar/date file).
observation_data_path = ['/home/toque/data2/montreal/stm/data/valid_metro_15min_2015_2016_2017_sumpass_nodayfree.csv']
exogenous_data_path = ['/home/toque/data2/montreal/events/data/clean/events_2015_2018_end_event_stopid.csv',
                       '/home/toque/data2/montreal/events/data/clean/events_2015_2018_start_event_stopid.csv',
                       '/home/toque/data2/montreal/events/data/clean/events_2015_2018_period_event_stopid.csv',
                       '/home/toque/data2/date/2013-01-01-2019-01-01_new.csv']

# read_csv_list is not defined in this file (presumably provided by the
# star-imported utils module); both results are later indexed by 'Datetime'.
df_observation = read_csv_list(observation_data_path)
df_exogenous = read_csv_list(exogenous_data_path)

In [41]:
# Target series used for the hyper-parameter search.
time_series = ['11']

# Event features are named '<stop id>-<event kind>'; the lists below spell out
# the stops and kinds once and the names are generated from them, grouped by
# kind (all 'end' events, then all 'start' events, then all 'period' events).
_event_stops = ['5', '11', '12', '13', '15', '16', '23', '24', '31', '32',
                '35', '43', '45', '61', '68']
_event_kinds = ['end_event', 'start_event', 'period_event']

# Features that are expanded over the [t - tminus, t + tplus] window.
features_window = [stop + '-' + kind for kind in _event_kinds for stop in _event_stops]

# Calendar features, used as they are (no windowing).
_calendar_features = ['hms_int_15min', 'Day_id', 'Mois_id', 'vac_noel_quebec',
                      'day_off_quebec', '24DEC', '31DEC', 'renov_beaubien',
                      'vac_udem1', 'vac_udem2']

# Candidate feature sets; a single one here: calendar + every event feature.
list_features = [_calendar_features + features_window]

tminus = 10
tplus = 2

# Keyword arguments for KFold.
param_kfold = dict(n_splits=5, shuffle=True, random_state=0)

# Grid explored for RandomForestRegressor. The key order matters: parameter
# combinations are enumerated in this order and later looked up by position.
param_grid = dict(
    n_estimators=[100, 150, 200],
    max_features=['auto', None],
    max_depth=[None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 5, 10],
    n_jobs=[6],
    criterion=['mse'],
)

# Disabled settings, kept for reference:
#scoring = "neg_mean_squared_error"
#scaler_choice = "standard"
scaler_choice_X = None
scaler_choice_y = None

# Training period (inclusive bounds of the Datetime slice).
start_date, end_date = '2015-01-01 00:00:00', '2016-12-31 23:45:00'

model_name = 'mt_rf_uni_events15_optim1stop'

# Optimisation

In [43]:
# Union of the feature names over all candidate feature sets (set() removes
# duplicates, so the resulting order is arbitrary).
all_features = list(set([j for i in list_features for j in i]))
# Join observations with exogenous data on 'Datetime', keep the rows between
# start_date and end_date, keep only target + feature columns, then drop
# every row containing a missing value.
df_Xy_train = df_observation.set_index('Datetime').join(df_exogenous.set_index('Datetime'))[start_date:end_date][time_series+all_features].dropna()

In [44]:
# Run the cross-validated grid evaluation; the result maps each feature set
# (as a tuple) to its train/val error tables.
grid_search_dict = optimize(df_Xy_train, param_kfold, time_series, list_features, features_window, tminus, tplus)

Time series 11


[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.4s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   16.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.9s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   15.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.2s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.7s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: 

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   13.8s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.4s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   13.7s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.6s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   13.9s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_j

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.8s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.3s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.3s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   13.8s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.2s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: 

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.9s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.9s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.4s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_j

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.3s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   13.7s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.4s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   20.9s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: 

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.4s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.5s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_j

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.6s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.6s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   24.4s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.3s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: 

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.4s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   13.7s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_j

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.9s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.5s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.4s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.4s
[Parallel(n_jobs=6)]: 

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.8s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   17.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    7.3s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   27.7s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.3s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    7.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   26.3s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: 

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.2s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   20.9s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.3s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   20.9s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_j

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.6s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.7s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.6s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.8s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_j

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.5s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   24.8s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.2s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   23.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.9s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   22.4s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: 

[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.3s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.5s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_j

[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.5s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.5s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.5s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_j

In [45]:
# Persist the error tables under the model's optimisation directory.
# save_pickle is not defined in this file (presumably from the utils module).
save_pickle('/home/toque/data2/forecast/model/rf_uni/optimize/'+model_name+'/grid_search_dict.pkl', grid_search_dict)

# Get best features and params

In [81]:
def get_best_features_params(grid_search_dict, param_grid):
    """Pick the feature set and parameter combination with the lowest validation RMSE.

    ``grid_search_dict`` maps a feature set to its error tables (read at
    ``['val']['rmse']['mean']``, one value per parameter combination) and
    ``param_grid`` is the grid those combinations were generated from, in
    the same enumeration order.

    Returns ``(best_features, best_params, best_score)``.
    """
    names, choices = zip(*param_grid.items())
    combinations = [dict(zip(names, combo)) for combo in itertools.product(*choices)]

    # Best combination of each feature set: (features, params, mean val RMSE).
    candidates = []
    for feature_set, errors in grid_search_dict.items():
        mean_rmse = errors['val']['rmse']['mean']
        candidates.append((feature_set, combinations[mean_rmse.argmin()], mean_rmse.min()))

    # Then the best feature set among those per-set winners.
    winner = candidates[np.argmin([score for _, _, score in candidates])]
    best_features, best_params, best_score = winner
    return best_features, best_params, best_score

# `features` is one of grid_search_dict's keys, i.e. a tuple of column names.
features, best_params, best_score = get_best_features_params(grid_search_dict, param_grid)


# Learn with best features and best params

In [90]:
# Every time series gets its own model, trained with the feature set and
# parameters selected on the optimisation series.
time_series = ['11', '32', '34', '15', '44', '65', '31', '33', '35', '47', '13',
       '14', '1', '9', '5', '18', '36', '24', '68', '43', '8', '64', '10',
       '55', '3', '49', '51', '2', '19', '56', '7', '6', '4', '48', '66',
       '25', '23', '28', '39', '54', '60', '27', '20', '46', '12', '21',
       '62', '52', '41', '50', '30', '16', '37', '40', '26', '67', '57',
       '61', '42', '45', '38', '29', '58', '63', '22', '59', '53', '17']

all_features = list(set([j for i in list_features for j in i]))
df_Xy_train = df_observation.set_index('Datetime').join(df_exogenous.set_index('Datetime'))[start_date:end_date][time_series+all_features].dropna()

# create X train and ytrain_list
if features_window is not None:
    features_window_tmp = [fw for fw in features_window if fw in features]
else:
    features_window_tmp = []

if features_window_tmp:
    Xtrain, ytrain_list, fnames = window_Xy(df_Xy_train, time_series, features, features_window_tmp, tminus, tplus)
    # uni -> .T (one row per time series)
    ytrain_list = ytrain_list.T
else:
    # `features` is a tuple (a key of grid_search_dict): convert it to a list
    # so pandas selects several columns instead of looking up one tuple label.
    # The frame is df_Xy_train (the name df_Xy does not exist at this level).
    Xtrain = df_Xy_train[list(features)].values
    ytrain_list = df_Xy_train[time_series].values.T
    fnames = features


# One forest per time series, kept in the same order as `time_series`.
rf_list = []
for ytrain in tqdm(ytrain_list):
    rf = RandomForestRegressor(**best_params, verbose=0)
    rf.fit(Xtrain,ytrain)
    rf_list.append(rf)




  0%|          | 0/68 [00:00<?, ?it/s][A[A

  1%|▏         | 1/68 [00:39<43:45, 39.18s/it][A[A

  3%|▎         | 2/68 [01:19<43:55, 39.93s/it][A[A

  4%|▍         | 3/68 [01:58<42:54, 39.61s/it][A[A

  6%|▌         | 4/68 [02:44<43:46, 41.03s/it][A[A

  7%|▋         | 5/68 [03:23<42:50, 40.80s/it][A[A

  9%|▉         | 6/68 [04:09<42:56, 41.55s/it][A[A

 10%|█         | 7/68 [04:53<42:41, 41.99s/it][A[A

 12%|█▏        | 8/68 [05:34<41:52, 41.87s/it][A[A

 13%|█▎        | 9/68 [06:15<40:58, 41.67s/it][A[A

 15%|█▍        | 10/68 [07:00<40:40, 42.07s/it][A[A

 16%|█▌        | 11/68 [07:41<39:49, 41.93s/it][A[A

 18%|█▊        | 12/68 [08:21<39:01, 41.81s/it][A[A

 19%|█▉        | 13/68 [09:07<38:35, 42.10s/it][A[A

 21%|██        | 14/68 [09:49<37:54, 42.13s/it][A[A

 22%|██▏       | 15/68 [10:31<37:12, 42.12s/it][A[A

 24%|██▎       | 16/68 [11:13<36:27, 42.08s/it][A[A

 25%|██▌       | 17/68 [11:56<35:50, 42.17s/it][A[A

 26%|██▋       | 18/68 [12

# Predict

In [107]:
# Prediction period (inclusive bounds of the Datetime slice).
start_date, end_date = '2015-01-01 00:00:00', '2017-12-31 23:45:00'
all_features = list(set([j for i in list_features for j in i]))
df_Xy_test = df_observation.set_index('Datetime').join(df_exogenous.set_index('Datetime'))[start_date:end_date][time_series+all_features].dropna()


# create X test and ytest_list
if features_window is not None:
    features_window_tmp = [fw for fw in features_window if fw in features]
else:
    features_window_tmp = []

if features_window_tmp:
    Xtest, ytest_list, fnames = window_Xy(df_Xy_test, time_series, features, features_window_tmp, tminus, tplus)
    # uni -> .T (one row per time series)
    ytest_list = ytest_list.T
else:
    # `features` is a tuple (a key of grid_search_dict): convert it to a list
    # so pandas selects several columns instead of looking up one tuple label.
    # The frame is df_Xy_test (the name df_Xy does not exist at this level).
    Xtest = df_Xy_test[list(features)].values
    ytest_list = df_Xy_test[time_series].values.T
    fnames = features

In [108]:
# One row of predictions per fitted model, in the order of rf_list.
pred_list = np.array([rf.predict(Xtest) for rf in tqdm(rf_list)])



  0%|          | 0/68 [00:00<?, ?it/s][A[A

  1%|▏         | 1/68 [00:00<00:42,  1.59it/s][A[A

  3%|▎         | 2/68 [00:01<00:42,  1.56it/s][A[A

  4%|▍         | 3/68 [00:01<00:39,  1.66it/s][A[A

  6%|▌         | 4/68 [00:02<00:39,  1.60it/s][A[A

  7%|▋         | 5/68 [00:03<00:39,  1.59it/s][A[A

  9%|▉         | 6/68 [00:03<00:39,  1.59it/s][A[A

 10%|█         | 7/68 [00:04<00:39,  1.56it/s][A[A

 12%|█▏        | 8/68 [00:05<00:39,  1.53it/s][A[A

 13%|█▎        | 9/68 [00:05<00:38,  1.54it/s][A[A

 15%|█▍        | 10/68 [00:06<00:38,  1.52it/s][A[A

 16%|█▌        | 11/68 [00:07<00:36,  1.55it/s][A[A

 18%|█▊        | 12/68 [00:07<00:35,  1.57it/s][A[A

 19%|█▉        | 13/68 [00:08<00:34,  1.59it/s][A[A

 21%|██        | 14/68 [00:08<00:33,  1.61it/s][A[A

 22%|██▏       | 15/68 [00:09<00:32,  1.63it/s][A[A

 24%|██▎       | 16/68 [00:09<00:32,  1.62it/s][A[A

 25%|██▌       | 17/68 [00:10<00:31,  1.61it/s][A[A

 26%|██▋       | 18/68 [00

17.0316535679
33.0286715794
1090.8931463
14.3119228257


In [110]:
# Timestamps of the predicted rows: the first `tminus` and the last `tplus`
# rows of the test frame are left out.
# NOTE(review): presumably this matches the rows kept by window_Xy; if Xtest
# came from the non-windowed branch the lengths would not line up -- confirm.
df = pd.DataFrame(data=df_Xy_test.index.values[tminus:len(df_Xy_test) - tplus], columns=['Datetime'])
# One column per time series, named after the series.
for ts,v in zip(time_series,pred_list):
    df[ts]=v
    
# The file name is built from the date part (first 10 characters) of the
# period bounds.
df.to_csv('/home/toque/data2/forecast/model/rf_uni/prediction/'+model_name+'/'+start_date[:10]+'_'+end_date[:10]+'.csv', index=False)
df


Unnamed: 0,Datetime,11,32,34,15,44,65,31,33,35,...,42,45,38,29,58,63,22,59,53,17
0,2015-01-01 05:45:00,61.923944,8.775826,29.922784,25.525856,15.355233,27.439986,26.308037,13.512738,12.866027,...,4.726763,45.287006,7.536326,14.368911,6.578090,11.523668,15.494552,5.323174,2.005022,2.450304
1,2015-01-01 06:00:00,47.000164,7.924903,29.684050,25.147570,59.099736,36.067336,16.823116,11.989556,16.948302,...,4.893160,15.914826,7.336157,10.785186,6.312588,10.391931,14.709074,3.962473,3.078765,2.512865
2,2015-01-01 06:15:00,47.966581,6.660579,30.086506,26.793933,32.939961,45.968428,10.657908,7.557528,10.269573,...,13.726796,15.896153,8.835896,10.763464,4.506249,10.544954,20.932698,4.273030,4.764031,4.940472
3,2015-01-01 06:30:00,44.450519,4.894296,22.074043,31.011007,37.451956,37.337226,14.069823,10.589667,10.846148,...,11.991128,12.737962,9.020750,12.950552,4.248411,11.118688,20.257836,3.851543,4.854179,4.590426
4,2015-01-01 06:45:00,45.940519,4.839319,24.676983,30.793091,38.902581,45.067439,12.540754,10.420890,14.433438,...,11.991128,11.905056,8.634745,12.879322,9.752594,10.571087,19.648640,3.860602,4.170473,3.829400
5,2015-01-01 07:00:00,50.171692,5.865915,29.940498,30.793091,48.909575,37.912393,27.466741,12.137276,19.838808,...,14.584512,11.447780,18.735923,14.957227,7.253035,9.610824,6.772112,6.775138,3.219155,5.564296
6,2015-01-01 07:15:00,49.298746,16.757233,49.457200,24.816068,29.670911,33.478150,27.380309,11.734555,22.127928,...,7.508618,11.002034,7.087087,10.939907,11.878749,9.493746,6.642316,7.676214,3.275821,9.415154
7,2015-01-01 07:30:00,49.195717,14.755979,23.602845,20.246687,27.766415,33.396194,17.552502,9.894200,19.698177,...,6.754808,8.635978,7.214246,11.594960,4.357811,6.701580,6.202830,3.960384,1.640086,7.353542
8,2015-01-01 07:45:00,47.027180,14.755979,22.441034,21.187238,27.711497,34.349469,17.795041,9.695983,19.698177,...,6.686245,9.293151,7.141693,11.565151,4.584598,6.701580,6.061209,5.910468,1.765304,6.755990
9,2015-01-01 08:00:00,56.344763,14.728865,23.115948,20.187352,26.921834,34.664867,18.053973,9.735911,20.906314,...,8.659251,6.802994,7.607120,7.982753,4.664662,8.437260,5.849592,5.575444,1.856470,4.833600


# Draft

In [334]:
# Optimization 
def optimize_old(df_Xy, time_series, features, features_window_tmp, tminus, tplus):
    """Draft optimisation based on ``GridSearchCV`` (one search per series).

    For every feature set of the module-level ``list_features`` and every
    time series, a grid search over ``param_grid`` is run, optionally with a
    scaler in front of the forest.

    Parameters
    ----------
    df_Xy : DataFrame holding the target columns and the feature columns.
    time_series : list of target column names.
    features, features_window_tmp : accepted for compatibility only; both are
        overwritten inside the loop over ``list_features``.
    tminus, tplus : window bounds forwarded to ``window_Xy``.

    Returns
    -------
    dict mapping ``tuple(features)`` to ``{series name: fitted GridSearchCV}``.

    Notes
    -----
    ``list_features``, ``features_window``, ``scaler_choice``, ``scoring``,
    ``param_grid`` and ``param_kfold`` are read from the module namespace.
    When a scaler is used the searched parameters are addressed through the
    pipeline, so the entries of ``cv_results_['params']`` carry a ``model__``
    prefix.
    """
    grid_search_features = {}

    for features in list_features:

        grid_search_model = {}

        if features_window is None:
            features_window_tmp = []
        else:
            features_window_tmp = [fw for fw in features_window if fw in features]

        if features_window_tmp:
            X, list_y, _ = window_Xy(df_Xy, time_series, features, features_window_tmp, tminus, tplus)
            # uni -> .T (iterate one target series at a time)
            list_y = list_y.T
        else:
            X = df_Xy[features].values
            list_y = df_Xy[time_series].values.T

        for y, ts in zip(list_y, time_series):
            scaler = None
            if scaler_choice == 'minmax':
                scaler = MinMaxScaler(feature_range=(0, 1))
            elif scaler_choice == 'standard':
                scaler = StandardScaler()

            rf = RandomForestRegressor()
            if scaler is not None:
                estimator = Pipeline([('scale', scaler), ('model', rf)])
                # Parameters of a pipeline step are named '<step>__<param>';
                # the bare forest names would be rejected by the Pipeline.
                search_grid = {'model__' + name: choices for name, choices in param_grid.items()}
            else:
                estimator = rf
                search_grid = param_grid

            cv = KFold(**param_kfold)
            grid_search = GridSearchCV(estimator, param_grid=search_grid, n_jobs=1, cv=cv, verbose=0, scoring=scoring)
            grid_search.fit(X, y)

            # Drop the estimators, presumably so the stored search object
            # takes less disk space; only cv_results_ is read afterwards.
            grid_search.estimator = None
            grid_search.best_estimator_ = None
            grid_search_model[ts] = grid_search

        grid_search_features[tuple(features)] = grid_search_model

    return grid_search_features

In [335]:
# NOTE(review): the arguments match optimize_old's signature, not the current
# optimize(df_Xy, param_kfold, ...) -- presumably a stale draft cell run
# before the function was renamed; confirm before re-running.
grid_search_features = optimize(df_Xy, time_series, features, features_window_tmp, tminus, tplus)

In [332]:
# Bare expressions: no effect other than notebook display.
list_features
time_series

def get_best_features_params_old(gs, time_series, list_features):
    """Select the best feature set and parameters from ``GridSearchCV`` results.

    Parameters
    ----------
    gs : dict mapping ``tuple(features)`` to ``{series name: search}`` where
        each ``search`` exposes ``cv_results_`` with the keys ``'params'``
        and ``'mean_test_score'``.
    time_series : list of series names to average over.
    list_features : list of candidate feature sets.

    Returns
    -------
    (features, params) : the feature set with the highest score averaged over
        series and parameter combinations, and, for that feature set, the
        parameter combination with the highest score averaged over series.

    The intermediate score tables and the two selected indices are printed.
    """
    # Every search shares the same grid, so read the combinations from the
    # first one (taken from the `gs` argument, not from a notebook global).
    list_params = gs[tuple(list_features[0])][time_series[0]].cv_results_['params']

    # scores[feature set][series][combination], computed once.
    scores = np.array([[gs[tuple(f)][ts].cv_results_['mean_test_score'] for ts in time_series]
                       for f in list_features])
    scores_by_params = scores.mean(axis=1)              # average over series
    scores_features = scores_by_params.mean(axis=1)     # then over combinations

    ix_features = scores_features.argmax()
    ix_params = scores_by_params[ix_features].argmax()

    print(scores_by_params)
    print(scores_features)
    print(ix_features)
    print(ix_params)
    return list_features[ix_features], list_params[ix_params]

    
# NOTE(review): three arguments are passed, which matches
# get_best_features_params_old(gs, time_series, list_features) rather than the
# two-argument get_best_features_params -- presumably a stale draft cell.
get_best_features_params(grid_search_features,time_series, list_features )



[[-6239.30744012 -5951.99741519 -6085.89564472 -5924.4081123 ]
 [-6043.92665866 -5783.99847991 -5911.82148858 -5799.61592499]]
[-6050.40215308 -5884.84063803]
1
1


(['hms_int_15min',
  'Day_id',
  'Mois_id',
  'vac_noel_quebec',
  'day_off_quebec',
  '24DEC',
  '31DEC',
  'renov_beaubien',
  'vac_udem1',
  'vac_udem2'],
 {'criterion': 'mse',
  'max_depth': None,
  'max_features': 'auto',
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 100,
  'n_jobs': 6})