# RF ST InUNI OutUNI - uni -> uni (one model per time series)

In [1]:
import sys
sys.path.insert(0, '../../utils/')
from utils import *
from pylab import *
from utils_date import *
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

from sklearn.metrics.scorer import make_scorer

from tqdm import tqdm
import itertools

In [3]:
def _build_scaler(choice):
    """Return a fresh scaler for ``choice`` ('minmax' or 'standard'), else ``None``."""
    if choice == 'minmax':
        return MinMaxScaler(feature_range=(0, 1))
    if choice == 'standard':
        return StandardScaler()
    return None


def fit_predict(X, y, cv, param_grid, scaler_choice_X, scaler_choice_y):
    """Fit one random forest per hyper-parameter combination on every CV fold.

    Parameters
    ----------
    X : array indexed by the row indices yielded by ``cv.split(X)``.
    y : target array aligned with the rows of ``X``.
    cv : splitter whose ``split(X)`` yields ``(train_indices, val_indices)``.
    param_grid : dict mapping a ``RandomForestRegressor`` keyword to the list
        of values to try; the full Cartesian product is evaluated.
    scaler_choice_X, scaler_choice_y : ``'minmax'``, ``'standard'`` or any
        other value (e.g. ``None``) for no scaling. Scalers are fitted on the
        training part of each fold only.

    Returns
    -------
    (pred_train_array, pred_val_array, ytrain_array, yval_array)
        ``np.array`` conversions of the nested lists
        ``[fold][param combination] -> predictions`` and ``[fold] -> targets``.
        When ``y`` is scaled, targets and predictions are both returned in the
        scaled space (no inverse transform is applied).
    """
    # The grid does not depend on the fold, so expand it once up front.
    keys, values = zip(*param_grid.items())
    all_params = [dict(zip(keys, v)) for v in itertools.product(*values)]

    pred_train_array = []
    pred_val_array = []
    ytrain_array = []
    yval_array = []

    for ix_train, ix_val in cv.split(X):
        Xtrain, Xval = X[ix_train], X[ix_val]
        ytrain, yval = y[ix_train], y[ix_val]

        scalerX = _build_scaler(scaler_choice_X)
        if scalerX is not None:
            Xtrain = scalerX.fit_transform(Xtrain)
            Xval = scalerX.transform(Xval)

        scalery = _build_scaler(scaler_choice_y)
        if scalery is not None:
            if np.ndim(ytrain) == 1:
                # scikit-learn scalers reject 1-D input: scale the target as a
                # single column and flatten it back afterwards.
                ytrain = scalery.fit_transform(np.reshape(ytrain, (-1, 1))).ravel()
                yval = scalery.transform(np.reshape(yval, (-1, 1))).ravel()
            else:
                ytrain = scalery.fit_transform(ytrain)
                yval = scalery.transform(yval)

        pred_train_params = []
        pred_val_params = []
        for p in all_params:
            rf = RandomForestRegressor(**p, verbose=1)
            rf.fit(Xtrain, ytrain)
            pred_train_params.append(rf.predict(Xtrain))
            pred_val_params.append(rf.predict(Xval))

        pred_train_array.append(pred_train_params)
        pred_val_array.append(pred_val_params)
        ytrain_array.append(ytrain)
        yval_array.append(yval)

    pred_train_array = np.array(pred_train_array)
    pred_val_array = np.array(pred_val_array)
    ytrain_array = np.array(ytrain_array)
    yval_array = np.array(yval_array)

    return pred_train_array, pred_val_array, ytrain_array, yval_array


def _cv_errors(error_fn, pred, obs):
    """Score every (parameter combination, fold) pair with ``error_fn``.

    ``pred`` is indexed ``[param][fold][series]`` and ``obs`` ``[fold][series]``;
    the per-series vectors of a fold are concatenated so that a single error
    is computed over all time series of that fold.
    Returns an array of shape (n_param_combinations, n_folds).
    """
    return np.array([
        np.array([error_fn(np.concatenate(pred[ixp][ixcv]), np.concatenate(obs[ixcv]))
                  for ixcv in range(pred.shape[1])])
        for ixp in range(pred.shape[0])
    ])


def optimize(df_Xy, param_kfold, time_series, list_features, features_window, tminus, tplus):
    """Cross-validated grid search, one model per time series, per feature set.

    Parameters
    ----------
    df_Xy : DataFrame holding the feature columns and the ``time_series`` columns.
    param_kfold : keyword arguments forwarded to ``KFold``.
    time_series : list of target column names (one model is fitted per column).
    list_features : iterable of candidate feature sets (each a list of columns).
    features_window : feature names to expand with ``window_Xy``, or ``None``
        to use the raw columns only.
    tminus, tplus : window bounds forwarded to ``window_Xy``.

    Module-level names read by this function: ``param_grid``,
    ``scaler_choice_X`` and ``scaler_choice_y`` (forwarded to ``fit_predict``),
    plus ``window_Xy``, ``rmse``, ``mse``, ``mae`` and ``mape_at``
    (presumably provided by the ``utils`` star-import - confirm).

    Returns
    -------
    dict
        ``tuple(features) -> {'train'|'val': {metric: {'error', 'mean', 'std'}}}``
        where ``'error'`` has shape (n_param_combinations, n_folds) and
        ``'mean'`` / ``'std'`` aggregate it over the folds.
    """
    cv = KFold(**param_kfold)
    error_functions = (('rmse', rmse), ('mse', mse), ('mae', mae), ('mape_at', mape_at))

    grid_search_dict = {}
    for features in list_features:

        # Build X and the list of per-series targets. Windowing only applies
        # when at least one windowed feature belongs to this feature set.
        windowed = []
        if features_window is not None:
            windowed = [fw for fw in features_window if fw in features]

        if windowed:
            X, y_list, _ = window_Xy(df_Xy, time_series, features, windowed, tminus, tplus)
            # uni -> one row per time series
            y_list = y_list.T
        else:
            X = df_Xy[features].values
            y_list = df_Xy[time_series].values.T

        # One independent grid search per time series.
        pred_train_all = []
        pred_val_all = []
        ytrain_all = []
        yval_all = []
        for y, ts in zip(y_list, time_series):
            print("Time series {}".format(ts))
            pred_train_array, pred_val_array, ytrain_array, yval_array = fit_predict(
                X, y, cv, param_grid, scaler_choice_X, scaler_choice_y)
            pred_train_all.append(pred_train_array)
            pred_val_all.append(pred_val_array)
            ytrain_all.append(ytrain_array)
            yval_all.append(yval_array)

        # Reorder [series][fold][param] -> [param][fold][series] for the
        # predictions and [series][fold] -> [fold][series] for the targets.
        pred_train = np.swapaxes(np.array(pred_train_all), 0, 2)
        obs_train = np.swapaxes(np.array(ytrain_all), 0, 1)
        pred_val = np.swapaxes(np.array(pred_val_all), 0, 2)
        obs_val = np.swapaxes(np.array(yval_all), 0, 1)

        errors_all = {'train': {}, 'val': {}}
        for en, ef in error_functions:
            for split, pred, obs in (('train', pred_train, obs_train),
                                     ('val', pred_val, obs_val)):
                fold_errors = _cv_errors(ef, pred, obs)
                errors_all[split][en] = {
                    'error': fold_errors,
                    'mean': fold_errors.mean(axis=1),
                    'std': fold_errors.std(axis=1),
                }

        grid_search_dict[tuple(features)] = errors_all

    return grid_search_dict

In [2]:
# Data: CSV sources for the observed series and for the context features.
observation_data_path = [
    '/home/toque/data2/montreal/stm/data/valid_metro_15min_2015_2016_2017_sumpass_nodayfree.csv',
]
context_data_path = [
    '/home/toque/data2/montreal/events/data/clean/events_2015_2018_end_event_stopid.csv',
    '/home/toque/data2/montreal/events/data/clean/events_2015_2018_start_event_stopid.csv',
    '/home/toque/data2/montreal/events/data/clean/events_2015_2018_period_event_stopid.csv',
    '/home/toque/data2/date/2013-01-01-2019-01-01_new.csv',
]

df_observation = read_csv_list(observation_data_path)

# Rebuild the full 15-minute grid (96 timestamps per observed day) and fill
# the timestamps missing from the observations with 0.
days = sorted({stamp[:10] for stamp in df_observation['Datetime'].values})
timestamp_list = list(itertools.chain.from_iterable(
    build_timestamp_list(d+' 00:00:00', d+' 23:45:00', time_step_second=15*60)
    for d in days
))
df_date = pd.DataFrame(data=timestamp_list, columns=['Datetime']).set_index('Datetime')
observation_by_time = df_observation.set_index('Datetime')
df_observation = df_date.join(observation_by_time).fillna(0).reset_index()
del observation_by_time

df_context = read_csv_list(context_data_path)

In [3]:
# Target columns of the observation frame; one model is trained per entry.
time_series = ['11', '32', '34', '15', '44', '65', '31', '33', '35', '47', '13',
       '14', '1', '9', '5', '18', '36', '24', '68', '43', '8', '64', '10',
       '55', '3', '49', '51', '2', '19', '56', '7', '6', '4', '48', '66',
       '25', '23', '28', '39', '54', '60', '27', '20', '46', '12', '21',
       '62', '52', '41', '50', '30', '16', '37', '40', '26', '67', '57',
       '61', '42', '45', '38', '29', '58', '63', '22', '59', '53', '17']

# Context features read at the target timestamp itself.
features_t0 = ['hms_int_15min','Day_id', 'Mois_id','vac_noel_quebec', 'day_off_quebec', '24DEC', '31DEC',
                 'renov_beaubien', 'vac_udem1', 'vac_udem2']

# Event features expanded over the whole day of the target timestamp
# (see createXy_InUni_OutUni_lag_t0_day).
features_day = ['5-end_event', '11-end_event', '12-end_event', '13-end_event',
       '15-end_event', '16-end_event', '23-end_event', '24-end_event',
       '31-end_event', '32-end_event', '35-end_event', '43-end_event',
       '45-end_event', '61-end_event', '68-end_event', '5-start_event',
       '11-start_event', '12-start_event', '13-start_event',
       '15-start_event', '16-start_event', '23-start_event',
       '24-start_event', '31-start_event', '32-start_event',
       '35-start_event', '43-start_event', '45-start_event',
       '61-start_event', '68-start_event', '5-period_event',
       '11-period_event', '12-period_event', '13-period_event',
       '15-period_event', '16-period_event', '23-period_event',
       '24-period_event', '31-period_event', '32-period_event',
       '35-period_event', '43-period_event', '45-period_event',
       '61-period_event', '68-period_event']

# Overrides the list above: the daily event features are disabled for this run.
features_day = []

# Number of previous observations of a series used as inputs for that series.
lag = 8

# Keyword arguments for sklearn KFold (used by optimize).
param_kfold={
    'n_splits': 5,
    'shuffle': True,
    'random_state': 0}

# Hyper-parameter grid; the Cartesian product of the lists is evaluated.
# Single-valued entries (max_depth, n_jobs, criterion) are fixed settings.
param_grid={
    'n_estimators': [100, 150, 200],
    'max_features': ['auto',None],
    'max_depth': [None],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1,5,10],
    'n_jobs': [6],
    'criterion': ['mse']}

#scoring = "neg_mean_squared_error"
#scaler_choice = "standard"
# None disables scaling of the inputs / targets in fit_predict.
scaler_choice_X = None
scaler_choice_y = None

# Inclusive bounds of the training period (label slice on the Datetime index).
start_date = '2015-01-01 00:00:00'
end_date = '2016-12-31 23:45:00'


# Used as a sub-directory name when saving results.
model_name = 'st_rf_InUni_OutUni'

In [7]:
def createXy_InUni_OutUni_lag_t0_day(dfXy, time_series, features_t0, features_day, lag):
    """Build one (X, y) pair per time series from lagged values and context.

    For every row from position ``lag`` onwards, the inputs of a series are:
    the ``features_t0`` values of that row, the ``lag`` previous values of the
    series itself (oldest first), and the ``features_day`` columns over the
    whole day of the row (00:00:00 to 23:45:00), flattened feature by feature.

    Parameters
    ----------
    dfXy : DataFrame whose index holds timestamp strings starting with
        ``YYYY-MM-DD`` and whose columns include ``time_series``,
        ``features_t0`` and ``features_day``.
    time_series : list of target column names.
    features_t0 : list of column names read at the target row.
    features_day : list of column names expanded over the target day.
    lag : number of previous observations used as inputs.

    Returns
    -------
    (X_list, y_list, Xnames_list, datetime_list) as numpy arrays, where
    ``X_list[k]`` / ``y_list[k]`` / ``Xnames_list[k]`` belong to
    ``time_series[k]`` and ``datetime_list`` holds the index values of the
    target rows.
    """
    index_values = dfXy.index.values
    datetime_list = index_values[lag:]

    x_t0 = np.array(dfXy[features_t0].values[lag:])

    # Feature names: t0 features, then T-lag ... T-1, then 96 slots per daily feature.
    lag_suffixes = [str(ix) for ix in np.arange(1, lag+1)[::-1]]
    day_names = [f+'-T'+str(ix) for f in features_day for ix in np.arange(96)]
    Xnames_list = [features_t0 + [fl+'-T-'+suffix for suffix in lag_suffixes] + day_names
                   for fl in time_series]

    y_list = [dfXy[ts].values[lag:] for ts in time_series]

    # Column selections are loop-invariant: take them once instead of copying
    # the frame for every target row.
    df_series = dfXy[time_series]
    df_day = dfXy[features_day]

    # Every timestamp of a given day shares the same daily vector.
    day_cache = {}

    x_day_list = []
    x_lag_list = []
    for ix, dt in tqdm(enumerate(datetime_list), total=len(datetime_list)):
        day = dt[:10]
        if day not in day_cache:
            day_cache[day] = df_day.loc[day + ' 00:00:00' : day + ' 23:45:00'].values.T.flatten()
        x_day_list.append(day_cache[day])
        # Rows ix .. ix+lag-1 are the `lag` observations preceding target row ix+lag.
        x_lag_list.append(df_series.loc[index_values[ix]:index_values[ix+lag-1]].values.T.flatten())

    x_lag_list = np.array(x_lag_list)
    x_lag_list = x_lag_list.reshape(x_lag_list.shape[0], len(time_series), lag)
    x_day_list = np.array(x_day_list)

    # Each series gets its own lag block; t0 and daily blocks are shared.
    X_list = [np.concatenate([x_t0, x_lag, x_day_list], axis=1)
              for x_lag in np.swapaxes(x_lag_list, 0, 1)]

    return np.array(X_list), np.array(y_list), np.array(Xnames_list), np.array(datetime_list)

# Optimisation

In [43]:
# Union of every feature name appearing in any candidate feature set, then the
# training frame restricted to [start_date, end_date] with incomplete rows dropped.
# NOTE(review): `list_features` and `df_exogenous` are not defined in the cells
# visible here (the data cell builds `df_context`) - confirm before re-running.
all_features = list(set([j for i in list_features for j in i]))
df_Xy_train = df_observation.set_index('Datetime').join(df_exogenous.set_index('Datetime'))[start_date:end_date][time_series+all_features].dropna()

In [44]:
# Run the cross-validated grid search over every candidate feature set.
# NOTE(review): `features_window`, `tminus` and `tplus` are not defined in the
# cells visible here - confirm where they come from.
grid_search_dict = optimize(df_Xy_train, param_kfold, time_series, list_features, features_window, tminus, tplus)

Time series 11


[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.4s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   16.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.9s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   15.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.2s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.7s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: 

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   13.8s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.4s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   13.7s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.6s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   13.9s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_j

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.8s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.3s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.3s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   13.8s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.2s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: 

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.9s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.9s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.4s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_j

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.3s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   13.7s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.4s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   20.9s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: 

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.4s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.5s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_j

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.6s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.6s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   24.4s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.3s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: 

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.4s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   13.7s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_j

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.9s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.5s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.4s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.4s
[Parallel(n_jobs=6)]: 

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.8s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   17.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    7.3s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   27.7s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.3s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    7.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   26.3s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: 

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   14.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.2s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   20.9s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.3s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   20.9s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_j

[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.6s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.7s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.6s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.8s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_j

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.5s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   24.8s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.2s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   23.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.9s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   22.4s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: 

[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.3s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.5s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.0s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_j

[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.5s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.5s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:   21.5s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 150 out of 150 | elapsed:    0.1s finished
[Parallel(n_j

In [45]:
# Persist the grid-search errors (`save_pickle` is presumably provided by the
# utils star-import - confirm).
save_pickle('/home/toque/data2/forecast/model/rf_uni/optimize/'+model_name+'/grid_search_dict.pkl', grid_search_dict)

# Get best features and params

In [81]:
def get_best_features_params(grid_search_dict, param_grid):
    """Pick the feature set and parameter combination with the lowest mean validation RMSE.

    ``grid_search_dict`` maps a feature set to its error dictionary, whose
    ``['val']['rmse']['mean']`` entry holds one score per parameter
    combination, in the order of the Cartesian product of ``param_grid``.

    Returns ``(best_features, best_params, best_score)``.
    """
    names, choices = zip(*param_grid.items())
    combinations = [dict(zip(names, combo)) for combo in itertools.product(*choices)]

    feature_sets = []
    winning_params = []
    winning_scores = []
    for feature_set, errors in grid_search_dict.items():
        mean_rmse = errors['val']['rmse']['mean']
        feature_sets.append(feature_set)
        # Best parameter combination for this feature set.
        winning_params.append(combinations[mean_rmse.argmin()])
        winning_scores.append(mean_rmse.min())

    # Best feature set overall (first one wins on ties).
    winner = np.array(winning_scores).argmin()
    return feature_sets[winner], winning_params[winner], winning_scores[winner]

# Select the winning feature set / hyper-parameters from the grid search.
# NOTE(review): `best_params` is replaced by a hard-coded dict in the training
# cell further down, so the value computed here is not the one used for the final fit.
features, best_params, best_score = get_best_features_params(grid_search_dict, param_grid)


# Learn with best features and best params

In [8]:
# Target columns (same list as in the configuration cell, repeated so this
# section can be run on its own).
time_series = ['11', '32', '34', '15', '44', '65', '31', '33', '35', '47', '13',
       '14', '1', '9', '5', '18', '36', '24', '68', '43', '8', '64', '10',
       '55', '3', '49', '51', '2', '19', '56', '7', '6', '4', '48', '66',
       '25', '23', '28', '39', '54', '60', '27', '20', '46', '12', '21',
       '62', '52', '41', '50', '30', '16', '37', '40', '26', '67', '57',
       '61', '42', '45', '38', '29', '58', '63', '22', '59', '53', '17']

# Join observations with context on Datetime, keep [start_date, end_date] and
# the needed columns, drop incomplete rows, then build one (X, y) per series.
dfXy_train = df_observation.set_index('Datetime').join(df_context.set_index('Datetime'))[start_date:end_date][time_series+features_day+features_t0].dropna()
X_list, y_list, Xnames_list, datetime_list = createXy_InUni_OutUni_lag_t0_day(dfXy_train, time_series, features_t0, features_day, lag)



100%|██████████| 70168/70168 [14:18<00:00, 81.73it/s]


In [9]:
# Hard-coded hyper-parameters for the final fit; this rebinds `best_params`
# and therefore discards any value returned by get_best_features_params.
# No random_state is given, so the fitted forests differ from run to run.
best_params = {'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'n_estimators': 100,
 'n_jobs': 5}

# One forest per time series, stored in the same order as `time_series`.
rf_list = []
for Xtrain, ytrain in tqdm(list(zip(X_list, y_list))):
    rf = RandomForestRegressor(**best_params, verbose=0)
    rf.fit(Xtrain, ytrain)
    rf_list.append(rf)


100%|██████████| 68/68 [05:44<00:00,  5.06s/it]


In [10]:
rf_list[0]

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=5, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=5,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

# Predict

In [None]:
# Prediction period; rebinds the global start_date / end_date.
# NOTE(review): the range starts on 2015-01-01 and therefore includes the
# 2015-2016 training period - confirm this overlap is intended.
start_date, end_date = '2015-01-01 00:00:00', '2017-12-31 23:45:00'
dfXy_test = df_observation.set_index('Datetime').join(df_context.set_index('Datetime'))[start_date:end_date][time_series+features_day+features_t0].dropna()
# Xnames_list and datetime_list are overwritten with the test-period values.
Xtest_list, ytest_list, Xnames_list, datetime_list = createXy_InUni_OutUni_lag_t0_day(dfXy_test, time_series, features_t0, features_day, lag)

 33%|███▎      | 34608/104920 [09:51<20:00, 58.55it/s]

In [18]:
# Predict each series with its own forest; Xtest_list and rf_list are paired
# positionally, so row k of pred_list belongs to time_series[k].
pred_list = []
for Xtest, rf in tqdm(list(zip(Xtest_list, rf_list))):
    pred_list.append(rf.predict(Xtest))
pred_list = np.array(pred_list)

100%|██████████| 68/68 [00:23<00:00,  2.93it/s]


In [22]:
# Write the predictions as one CSV: a Datetime column plus one column per series.
path_directory_to_save = '/home/toque/data2/forecast/model/rf_st_InUni_OutUni/prediction/'+model_name+'/'

df = pd.DataFrame(data=datetime_list, columns=['Datetime'])
for ts,v in zip(time_series,pred_list):
    df[ts]=v

# exist_ok=True replaces the check-then-create pattern, which can fail if the
# directory appears between the check and the creation.
os.makedirs(path_directory_to_save, exist_ok=True)
# File name encodes the predicted period: <start>_<end>.csv
df.to_csv(path_directory_to_save + start_date[:10]+'_'+end_date[:10]+'.csv', index=False)
df


Unnamed: 0,Datetime,11,32,34,15,44,65,31,33,35,...,42,45,38,29,58,63,22,59,53,17
0,2015-01-01 02:00:00,0.011717,0.000000,0.000000,0.004338,0.000000,0.063127,0.015795,0.000000,0.000000,...,0.000000,0.000000,0.001111,0.011046,0.029004,0.000000,0.054071,0.000000,0.030327,0.037960
1,2015-01-01 02:15:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.016528,0.002857,0.000000,0.000000,...,0.000000,0.000000,0.001765,0.000000,0.005000,0.000000,0.023237,0.000000,0.000000,0.000000
2,2015-01-01 02:30:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.017611,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.024857,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,2015-01-01 02:45:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,2015-01-01 03:00:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,2015-01-01 03:15:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,2015-01-01 03:30:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,2015-01-01 03:45:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,2015-01-01 04:00:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,2015-01-01 04:15:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [20]:
df

Unnamed: 0,Datetime,11,32,34,15,44,65,31,33,35,...,42,45,38,29,58,63,22,59,53,17
0,2015-01-01 02:00:00,0.011717,0.000000,0.000000,0.004338,0.000000,0.063127,0.015795,0.000000,0.000000,...,0.000000,0.000000,0.001111,0.011046,0.029004,0.000000,0.054071,0.000000,0.030327,0.037960
1,2015-01-01 02:15:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.016528,0.002857,0.000000,0.000000,...,0.000000,0.000000,0.001765,0.000000,0.005000,0.000000,0.023237,0.000000,0.000000,0.000000
2,2015-01-01 02:30:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.017611,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.024857,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,2015-01-01 02:45:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,2015-01-01 03:00:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,2015-01-01 03:15:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,2015-01-01 03:30:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,2015-01-01 03:45:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,2015-01-01 04:00:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,2015-01-01 04:15:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


# Draft

In [334]:
# Optimization 
def optimize_old(df_Xy, time_series, features, features_window_tmp, tminus, tplus):
    """Grid-search a random forest for every feature set and every time series.

    Legacy draft kept for reference. For each feature set in the notebook-level
    ``list_features`` and each target in ``time_series`` a ``GridSearchCV`` is
    fitted, stripped of its estimators and stored.

    The function reads these names from the enclosing notebook scope rather
    than from its arguments: ``list_features``, ``features_window``,
    ``scaler_choice``, ``param_kfold``, ``param_grid``, ``scoring`` and the
    helper ``window_Xy``.

    Parameters
    ----------
    df_Xy
        Table indexed as ``df_Xy[columns].values`` (presumably a pandas
        DataFrame holding both features and targets -- confirm with caller).
    time_series
        Names of the target columns; one model is searched per name.
    features
        Not read. The original body rebound this name to each entry of the
        global ``list_features``; it is kept only so existing calls still work.
    features_window_tmp
        Not read. The windowed feature names are recomputed per feature set
        from the global ``features_window``; kept for signature compatibility.
    tminus, tplus
        Forwarded unchanged to ``window_Xy``.

    Returns
    -------
    dict
        ``{tuple(feature_set): {time_series_name: fitted GridSearchCV}}`` with
        ``estimator`` and ``best_estimator_`` set to ``None``.
    """
    grid_search_features = {}

    for feature_set in list_features:
        grid_search_model = {}

        # Features of this set that must be expanded into a time window.
        windowed = []
        if features_window is not None:
            windowed = [fw for fw in features_window if fw in feature_set]

        if windowed:
            X, list_y, _ = window_Xy(df_Xy, time_series, feature_set, windowed, tminus, tplus)
            # uni -> one row of targets per time series
            list_y = list_y.T
        else:
            X = df_Xy[feature_set].values
            list_y = df_Xy[time_series].values.T

        for y, ts in zip(list_y, time_series):
            if scaler_choice == 'minmax':
                scaler = MinMaxScaler(feature_range=(0, 1))
            elif scaler_choice == 'standard':
                scaler = StandardScaler()
            else:
                scaler = None

            rf = RandomForestRegressor()
            # NOTE(review): the same ``param_grid`` is handed to the bare forest
            # and to the pipeline; a scikit-learn Pipeline addresses step
            # parameters as ``model__<name>`` -- confirm the grid matches the
            # active ``scaler_choice``.
            if scaler is None:
                estimator = rf
            else:
                estimator = Pipeline([('scale', scaler), ('model', rf)])

            cv = KFold(**param_kfold)
            grid_search = GridSearchCV(estimator, param_grid=param_grid, n_jobs=1, cv=cv, verbose=0, scoring=scoring)
            grid_search.fit(X, y)

            # Drop the estimator references so the stored search object takes
            # less space (original intent: "save disk space").
            grid_search.estimator = None
            grid_search.best_estimator_ = None
            grid_search_model[ts] = grid_search

        grid_search_features[tuple(feature_set)] = grid_search_model

    return grid_search_features

In [335]:
# Run the grid search and keep the result for the cells below.
# NOTE(review): this calls `optimize`, not the `optimize_old` draft defined in the previous cell.
grid_search_features = optimize(df_Xy, time_series, features, features_window_tmp, tminus, tplus)

In [332]:
# Bare name lookups, presumably left over from interactively inspecting these
# two variables; they are not the last expression of the cell.
list_features
time_series

def get_best_features_params_old(gs, time_series, list_features):
    """Pick the best feature set, then the best parameter set for it.

    Parameters
    ----------
    gs
        Nested mapping ``gs[tuple(feature_set)][time_series_name]`` whose
        values expose ``cv_results_['mean_test_score']`` and
        ``cv_results_['params']``.
    time_series
        Names of the time series to average over.
    list_features
        Candidate feature sets (each one convertible with ``tuple``).

    Returns
    -------
    tuple
        ``(best_feature_set, best_params)``: the entry of ``list_features``
        with the highest score averaged over time series and parameter sets,
        and the parameter dict with the highest score averaged over time
        series for that feature set.

    Notes
    -----
    The parameter list is read from the first feature set / first time series,
    so every search in ``gs`` is assumed to share the same parameter grid in
    the same order. Diagnostic score tables and the selected indices are
    printed.
    """
    # Read the parameter list from the argument, not from a notebook global.
    list_params = gs[tuple(list_features[0])][time_series[0]].cv_results_['params']

    # scores[feature_set, time_series, params]; built once and reused.
    scores = np.array([
        [gs[tuple(f)][ts].cv_results_['mean_test_score'] for ts in time_series]
        for f in list_features
    ])
    scores_per_params = scores.mean(axis=1)            # average over time series
    scores_features = scores_per_params.mean(axis=1)   # then over parameter sets

    ix_features = scores_features.argmax()
    ix_params = scores_per_params[ix_features].argmax()

    print(scores_per_params)
    print(scores_features)
    print(ix_features)
    print(ix_params)
    return list_features[ix_features], list_params[ix_params]

    
# NOTE(review): this calls `get_best_features_params`, not the `_old` draft defined
# just above -- presumably defined elsewhere in the notebook.
get_best_features_params(grid_search_features,time_series, list_features )



[[-6239.30744012 -5951.99741519 -6085.89564472 -5924.4081123 ]
 [-6043.92665866 -5783.99847991 -5911.82148858 -5799.61592499]]
[-6050.40215308 -5884.84063803]
1
1


(['hms_int_15min',
  'Day_id',
  'Mois_id',
  'vac_noel_quebec',
  'day_off_quebec',
  '24DEC',
  '31DEC',
  'renov_beaubien',
  'vac_udem1',
  'vac_udem2'],
 {'criterion': 'mse',
  'max_depth': None,
  'max_features': 'auto',
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 100,
  'n_jobs': 6})