In [1]:
import sys

sys.path.insert(0, '../')
from model.ha.ha_model import Ha_model
from utils.utils import *


# FIT

In [88]:

# Smoke test of Rf_model.optimize: every hyper-parameter in the grid has exactly one
# candidate, so the configuration selected by the search must reproduce the grid.

# Local import: the notebook's first import cell only brings in Ha_model, so on a fresh
# kernel this cell would otherwise depend on a later import cell having been run first.
from model.rf.rf_model import Rf_model

observation_data_path = ['../data/observation_file_2017-01-01_2017-06-30_included_test.csv']
exogenous_data_path = ['../data/date_file_2013_2018_included_test.csv']
# Two candidate feature sets; optimize() receives one design matrix per set.
features_list = [['Day_id', 'Month_id', 'School_holidays_france_zoneC', 'Extra_day_off_france',
                  'Holidays_france', 'hour_minute_second_numerical'], ['hour_minute_second_numerical']]
time_series = ['71634', '71650', '71442', '71654', '71743', '71328',
               '71305', '71517', '71284', '415852', '71404', '71298', '73630',
               '71318', '71348', '71379', '71647', '71663', '71673', '71485',
               '71222', '71297', '71347', '71100', '71133', '71217', '73696',
               '73689', '71407', '73616', '70537', '70636', '70375', '71351',
               '71977', '70452', '72031', '72013', '70645', '71253', '71363',
               '70596', '72430', '71201', '72460', '70488', '71076', '70604',
               '73695', '70143', '70248', '71001', '73615']

model_name = 'rf_model_test'
start_date = '2017-01-01 00:00:00'
end_date = '2017-01-05 00:00:00'
path_to_save = '../data/model/test/rf/'
param_kfold = {'n_splits': 2, 'shuffle': True, 'random_state': 0}
# NOTE(review): 'mse' and max_features='auto' are no longer accepted by recent
# scikit-learn releases - confirm against the version this project pins.
param_grid = {'n_estimators': [1], 'max_features': ['auto'], 'max_depth': [None], 'min_samples_split': [5],
              'min_samples_leaf': [5], 'n_jobs': [6], 'criterion': ['mse']}
scoring = "neg_mean_squared_error"
scaler_choice = "standard"

# Observations joined with the exogenous features on 'Datetime', cut to the test window.
df_Xy = read_csv_list(observation_data_path).set_index('Datetime').join(
    read_csv_list(exogenous_data_path).set_index('Datetime'))[start_date:end_date]

X_list = [df_Xy[features].values for features in features_list]
y = df_Xy[time_series].values

my_model = Rf_model(model_name, start_date, end_date, features_list, time_series, observation_data_path,
                    exogenous_data_path, scaler_choice)
grid_search_dict = my_model.optimize(X_list, y, param_grid, param_kfold, scoring)

# One (features, best_params, best_score) triple per feature set. The scoring is a
# negated error, so after the ascending sort the last entry is the best configuration.
best_conf = [(features, search.best_params_, search.best_score_)
             for features, search in grid_search_dict.items()]
best_conf.sort(key=lambda x: x[2])
features = list(best_conf[-1][0])
best_params = best_conf[-1][1]

# With a single candidate per hyper-parameter the winner must equal the grid.
assert best_params['n_estimators'] == 1
assert best_params['max_features'] == 'auto'
assert best_params['min_samples_leaf'] == 5
assert best_params['n_jobs'] == 6
assert best_params['min_samples_split'] == 5
assert best_params['criterion'] == 'mse'
assert best_params['max_depth'] is None




Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s finished


In [94]:
# Repeatability check: refit the same random-forest configuration ten times on the
# January 2017 window, verify the metadata recorded on the model after each fit and
# print the feature importances (in percent) so the run-to-run spread can be eyeballed.

# Loop-invariant configuration, defined once instead of on every iteration.
observation_data_path = ['../data/observation_file_2017-01-01_2017-06-30_included_test.csv']
exogenous_data_path = ['../data/date_file_2013_2018_included_test.csv']
features = ['Day_id', 'Month_id', 'School_holidays_france_zoneC', 'Extra_day_off_france',
            'Holidays_france', 'hour_minute_second_numerical']
time_series = ['71634', '71650', '71442', '71654', '71743', '71328',
               '71305', '71517', '71284', '415852', '71404', '71298', '73630',
               '71318', '71348', '71379', '71647', '71663', '71673', '71485',
               '71222', '71297', '71347', '71100', '71133', '71217', '73696',
               '73689', '71407', '73616', '70537', '70636', '70375', '71351',
               '71977', '70452', '72031', '72013', '70645', '71253', '71363',
               '70596', '72430', '71201', '72460', '70488', '71076', '70604',
               '73695', '70143', '70248', '71001', '73615']

model_name = 'rf_model_test'
start_date = '2017-01-01 00:00:00'
end_date = '2017-01-31 00:00:00'
path_to_save = '../data/model/test/rf/'
scaler_choice = 'standard'
best_params = {'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 3,
               'min_samples_split': 5, 'n_estimators': 30, 'n_jobs': 6}

# The CSV reads and the join yield the same frame on every pass, so do them once.
df_observation = read_csv_list(observation_data_path)
df_exogenous = read_csv_list(exogenous_data_path)
df_Xy = df_observation.set_index('Datetime').join(df_exogenous.set_index("Datetime"))[start_date:end_date]

for i in range(10):
    # Fresh arrays on every run, so a fit that touched its inputs could not leak into
    # the next repetition.
    X = df_Xy[features].values
    y = df_Xy[time_series].values

    my_model = Rf_model(model_name, start_date, end_date, [features], time_series, observation_data_path,
                        exogenous_data_path, scaler_choice)
    my_model.infos['features'] = features
    my_model.infos['best_params'] = best_params

    rf = my_model.fit(X, y, my_model.infos['best_params'])

    # Importances as percentages rounded to two decimals, keyed by feature name.
    feature_importances = dict(zip(features, np.round(rf.feature_importances_ * 100, 2).tolist()))
    my_model.infos["feature_importances"] = feature_importances

    # The model metadata must mirror what was passed in / stored above.
    assert my_model.infos['name'] == model_name
    assert my_model.infos['features'] == features
    assert my_model.infos['time_series'] == time_series
    assert my_model.infos['start_date'] == start_date
    assert my_model.infos['end_date'] == end_date
    assert my_model.infos['exogenous_data_path'] == exogenous_data_path
    assert my_model.infos['observation_data_path'] == observation_data_path
    assert my_model.infos['best_params'] == best_params
    assert my_model.infos['feature_importances'] == feature_importances
    print(feature_importances)



{'Month_id': 0.0, 'Holidays_france': 0.07, 'Extra_day_off_france': 0.0, 'Day_id': 15.68, 'hour_minute_second_numerical': 84.15, 'School_holidays_france_zoneC': 0.1}




{'Month_id': 0.0, 'Holidays_france': 0.07, 'Extra_day_off_france': 0.0, 'Day_id': 16.27, 'hour_minute_second_numerical': 83.56, 'School_holidays_france_zoneC': 0.1}




{'Month_id': 0.0, 'Holidays_france': 0.06, 'Extra_day_off_france': 0.0, 'Day_id': 16.04, 'hour_minute_second_numerical': 83.78, 'School_holidays_france_zoneC': 0.12}




{'Month_id': 0.0, 'Holidays_france': 0.08, 'Extra_day_off_france': 0.0, 'Day_id': 15.93, 'hour_minute_second_numerical': 83.88, 'School_holidays_france_zoneC': 0.11}




{'Month_id': 0.0, 'Holidays_france': 0.06, 'Extra_day_off_france': 0.0, 'Day_id': 16.0, 'hour_minute_second_numerical': 83.84, 'School_holidays_france_zoneC': 0.1}




{'Month_id': 0.0, 'Holidays_france': 0.06, 'Extra_day_off_france': 0.0, 'Day_id': 15.74, 'hour_minute_second_numerical': 84.09, 'School_holidays_france_zoneC': 0.11}




{'Month_id': 0.0, 'Holidays_france': 0.08, 'Extra_day_off_france': 0.0, 'Day_id': 15.83, 'hour_minute_second_numerical': 83.99, 'School_holidays_france_zoneC': 0.1}




{'Month_id': 0.0, 'Holidays_france': 0.07, 'Extra_day_off_france': 0.0, 'Day_id': 15.72, 'hour_minute_second_numerical': 84.09, 'School_holidays_france_zoneC': 0.12}




{'Month_id': 0.0, 'Holidays_france': 0.06, 'Extra_day_off_france': 0.0, 'Day_id': 15.83, 'hour_minute_second_numerical': 84.0, 'School_holidays_france_zoneC': 0.1}




{'Month_id': 0.0, 'Holidays_france': 0.07, 'Extra_day_off_france': 0.0, 'Day_id': 15.84, 'hour_minute_second_numerical': 83.91, 'School_holidays_france_zoneC': 0.18}


In [128]:
# In-sample sanity check: a 100-tree random forest fitted on the selected window must
# have a lower mean absolute error than predicting each series' mean over that window.
observation_data_path = ['../data/observation_file_2017-01-01_2017-06-30_included_test.csv']
exogenous_data_path = ['../data/date_file_2013_2018_included_test.csv']
features = [
    'Day_id', 'Month_id', 'School_holidays_france_zoneC', 'Extra_day_off_france',
    'Holidays_france', 'hour_minute_second_numerical',
]
time_series = [
    '71634', '71650', '71442', '71654', '71743', '71328',
    '71305', '71517', '71284', '415852', '71404', '71298', '73630',
    '71318', '71348', '71379', '71647', '71663', '71673', '71485',
    '71222', '71297', '71347', '71100', '71133', '71217', '73696',
    '73689', '71407', '73616', '70537', '70636', '70375', '71351',
    '71977', '70452', '72031', '72013', '70645', '71253', '71363',
    '70596', '72430', '71201', '72460', '70488', '71076', '70604',
    '73695', '70143', '70248', '71001', '73615',
]

model_name = 'rf_model_test'
start_date = '2017-01-01 00:00:00'
# NOTE(review): the observation file name only advertises data up to 2017-06-30, so
# this end date presumably just means "everything available" - confirm.
end_date = '2017-12-31 00:00:00'
path_to_save = '../data/model/test/rf/'
scaler_choice = 'standard'
best_params = {
    'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 3,
    'min_samples_split': 5, 'n_estimators': 100, 'n_jobs': 6,
}

# Observations joined with the exogenous features on 'Datetime', cut to the window.
df_observation = read_csv_list(observation_data_path)
df_exogenous = read_csv_list(exogenous_data_path)
observation_indexed = df_observation.set_index('Datetime')
exogenous_indexed = df_exogenous.set_index("Datetime")
df_Xy = observation_indexed.join(exogenous_indexed)[start_date:end_date]

X = df_Xy[features].values
y = df_Xy[time_series].values

my_model = Rf_model(
    model_name, start_date, end_date, [features], time_series,
    observation_data_path, exogenous_data_path, scaler_choice,
)
my_model.infos['features'] = features
my_model.infos['best_params'] = best_params

rf = my_model.fit(X, y, my_model.infos['best_params'])

# Importances as percentages rounded to two decimals, keyed by feature name.
importance_pct = np.round(rf.feature_importances_ * 100, 2).tolist()
feature_importances = dict(zip(features, importance_pct))
my_model.infos["feature_importances"] = feature_importances

# Predict on the same rows the model was fitted on and build the per-series mean baseline.
X = df_Xy[my_model.infos['features']].values
observed_window = df_Xy[time_series][start_date:end_date].values
obs = np.around(observed_window, decimals=2)
pred = my_model.predict(rf, X)
pred_mean = np.around(observed_window.mean(axis=0), decimals=2)

mae_model = np.abs(pred - obs).mean()
mae_baseline = np.abs(pred_mean - obs).mean()
assert mae_model < mae_baseline




(4344, 53)
(4344, 53)
(53,)
28.2373201132 437.337070781




array([  471.20107235,   283.09032627,   650.92547703,   568.6844622 ,
        1118.22422162,   476.9383416 ,   270.39956395,  5904.97384939,
         176.9107872 ,   912.70058285,   514.95485748,   400.57844309,
         174.24660566,   787.62918596,   259.00252224,   977.25324949,
         593.25532205,   731.56095532,  1237.70067127,   953.96042086,
         487.08857873,   568.8620866 ,  1317.16638919,   314.91609894,
         252.70662581,   122.71399033,   587.91614105,   546.1985232 ,
         223.49385803,   223.71725808,   147.40170686,   178.52578142,
         244.63717555,   282.70121317,   227.38505785,   422.43652495,
         277.21809597,   575.10599701,   192.56151105,   109.49925147,
         316.21384761,   214.84150329,   347.49604604,   146.44409512,
         736.76453874,   299.92767219,   329.84403027,   277.26230442,
         399.73900953,   662.39369385,   220.52883508,   322.42134093,
         175.04278375])

In [105]:
# Compares two entries of the model's 'dict_pred_mean' lookup with the raw observations
# of the first two hourly timestamps of 2017 (both sides rounded to two decimals) and
# prints whether every value matches.
# NOTE(review): the recorded run of this cell raised KeyError: 'dict_pred_mean', so
# `my_model` presumably no longer was the HA model when it was executed - rebuild or
# reload the HA model before running this cell (TODO confirm).
# NOTE(review): the tuple keys presumably follow the order of `features`
# (Day_id, Month_id, ..., hour) - verify against the HA model implementation.
obs1 = np.around(df_observation.set_index('Datetime').loc['2017-01-01 00:00:00'].values, decimals=2) 
res1 = np.around(my_model.infos['dict_pred_mean'][(6,1,1,0,1,0)].astype(float), decimals=2)
obs2 = np.around(df_observation.set_index('Datetime').loc['2017-01-01 01:00:00'].values, decimals=2) 
res2 = np.around(my_model.infos['dict_pred_mean'][(6,1,1,0,1,1)].astype(float), decimals=2)

# True when no element differs between observation and stored value.
print((obs1!=res1).sum() == 0)
print((obs2!=res2).sum() == 0)

KeyError: 'dict_pred_mean'

In [99]:
# Inspect the vector stored under one key of the model's 'dict_pred_mean' lookup.
# Requires a model whose infos carry 'dict_pred_mean' (hidden kernel state).
my_model.infos['dict_pred_mean'][(6,1,1,0,1,0)]

array([22.4593, 7.807, 10.7536, 22.8844, 146.96439999999996, 124.2087,
       24.178, 109.446, 10.6672, 80.2373, 9.8027, 90.0448, 16.8435, 57.816,
       8.630400000000002, 68.0279, 17.4, 28.216000000000005, 77.0412,
       15.7896, 0.5281999999999999, 43.9401, 153.6652, 18.9126, 38.7772,
       15.806400000000002, 14.336199999999998, 13.468800000000002,
       12.275200000000002, 10.2363, 4.7817, 9.7092, 5.659200000000001,
       18.1482, 10.251, 0.0, 10.8222, 27.966, 9.5118, 11.6332, 12.6984,
       12.0736, 32.7104, 10.2176, 39.0145, 7.3299, 22.1378, 9.0321,
       11.7978, 14.704199999999998, 4.83, 15.6462, 3.8402], dtype=object)

In [106]:
# List which metadata entries are currently recorded on the model.
my_model.infos.keys()

dict_keys(['dict_pred_mean', 'dict_pred_median', 'features', 'exogenous_data_path', 'start_date', 'observation_data_path', 'name', 'time_series', 'end_date'])

In [108]:
# Expected settings of the historical-average (HA) test model: calendar features,
# the 53 station series and a one-hour window at the very start of 2017.
features = [
    'Day_id',
    'Month_id',
    'School_holidays_france_zoneC',
    'Extra_day_off_france',
    'Holidays_france',
    'hour_minute_second_numerical',
]
time_series = [
    '71634', '71650', '71442', '71654', '71743', '71328',
    '71305', '71517', '71284', '415852', '71404', '71298', '73630',
    '71318', '71348', '71379', '71647', '71663', '71673', '71485',
    '71222', '71297', '71347', '71100', '71133', '71217', '73696',
    '73689', '71407', '73616', '70537', '70636', '70375', '71351',
    '71977', '70452', '72031', '72013', '70645', '71253', '71363',
    '70596', '72430', '71201', '72460', '70488', '71076', '70604',
    '73695', '70143', '70248', '71001', '73615',
]

model_name = 'ha_model_test'
start_date = '2017-01-01 00:00:00'
end_date = '2017-01-01 01:00:00'
# NOTE(review): the cell that reloads this model reads from '../data/model/test/ha/' -
# confirm which of the two directories is intended.
path_to_save = '../data/model/ha/'

In [112]:
# Eyeball check of the model metadata against the expected settings.
# Only the last expression of a cell is displayed: the four `==` comparisons below are
# evaluated and their results discarded, so a mismatch would go unnoticed here.
my_model.infos['features'] == features
my_model.infos['time_series'] == time_series
my_model.infos['start_date'] == start_date
my_model.infos['end_date'] == end_date
# Displayed as the cell output.
my_model.infos['exogenous_data_path']

['../data/date_file_2013_2018_included_test.csv']

In [91]:
# Show the observations between the first two hourly timestamps of 2017.
# NOTE(review): the rendered output has 'Datetime' as the index, so `df_observation`
# had presumably been re-indexed when this ran; other cells keep 'Datetime' as a
# column and call set_index themselves - confirm which state this cell expects.
df_observation['2017-01-01 00:00:00':'2017-01-01 01:00:00']

Unnamed: 0_level_0,71634,71650,71442,71654,71743,71328,71305,71517,71284,415852,...,71201,72460,70488,71076,70604,73695,70143,70248,71001,73615
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01 00:00:00,22.4593,7.807,10.7536,22.8844,146.9644,124.2087,24.178,109.446,10.6672,80.2373,...,10.2176,39.0145,7.3299,22.1378,9.0321,11.7978,14.7042,4.83,15.6462,3.8402
2017-01-01 01:00:00,1.8891,1.477,2.068,3.2692,21.7932,9.6558,2.4178,12.3395,0.904,8.866,...,2.48,3.297,1.383,2.0267,1.0626,0.8427,1.7505,1.155,2.4612,0.633


In [None]:
# NOTE(review): never-executed cell holding a bare, presumably unfinished name; `dic`
# is not assigned in any visible cell - candidate for deletion.
dic

In [18]:
# Lookup table from station id (column 'ID_REFA_LDA') to station name ('NOM_GARE').
station_table = pd.read_csv('../data/station_information.csv')
id_name_pairs = station_table[['ID_REFA_LDA', 'NOM_GARE']].values.tolist()
dict_id_name = dict(id_name_pairs)

In [30]:
# Look up the station name of one id (integer key).
# NOTE(review): the recorded output shows '?' where an accented letter is expected, so
# the accent is presumably lost when the CSV is read - check the file's encoding.
dict_id_name[71517]

'LA D?FENSE-GRANDE ARCHE'

In [7]:
# Reload the pickled HA test model and predict the first two hourly timestamps of 2017
# with both of its aggregation choices.
model_name = 'ha_model_test'
path_to_save = '../data/model/test/ha/'
# NOTE(review): load_pickle presumably unpickles the file - only load model files that
# come from a trusted source.
load_model = load_pickle(path_to_save + model_name + '.pkl')
exogenous_data_path = load_model.infos['exogenous_data_path']
list_date = ['2017-01-01 00:00:00', '2017-01-01 01:00:00']

# Label-based row selection with .loc; the former `.ix` indexer is deprecated and no
# longer exists in pandas >= 1.0.
X = read_csv_list(exogenous_data_path).set_index('Datetime').loc[list_date][load_model.infos['features']].values
pred_mean = load_model.predict(X, choice='mean')
pred_median = load_model.predict(X, choice='median')

100%|██████████| 2/2 [00:00<00:00, 11008.67it/s]
100%|██████████| 2/2 [00:00<00:00, 12985.46it/s]


In [8]:
# Mean-based prediction for the first requested timestamp.
pred_mean[0]

array([  22.4593,    7.807 ,   10.7536,   22.8844,  146.9644,  124.2087,
         24.178 ,  109.446 ,   10.6672,   80.2373,    9.8027,   90.0448,
         16.8435,   57.816 ,    8.6304,   68.0279,   17.4   ,   28.216 ,
         77.0412,   15.7896,    0.5282,   43.9401,  153.6652,   18.9126,
         38.7772,   15.8064,   14.3362,   13.4688,   12.2752,   10.2363,
          4.7817,    9.7092,    5.6592,   18.1482,   10.251 ,    0.    ,
         10.8222,   27.966 ,    9.5118,   11.6332,   12.6984,   12.0736,
         32.7104,   10.2176,   39.0145,    7.3299,   22.1378,    9.0321,
         11.7978,   14.7042,    4.83  ,   15.6462,    3.8402])

# RF

In [1]:
import unittest

import sys

sys.path.insert(0, '../')
from model.rf.rf_model import Rf_model
from utils.utils import *

In [60]:
# End-to-end random-forest workflow on the January 2017 window: grid-search the
# hyper-parameters for each candidate feature set, keep the best-scoring configuration,
# refit the model with it and record the resulting feature importances.
observation_data_path = ['../data/observation_file_2017-01-01_2017-06-30_included_test.csv']
exogenous_data_path = ['../data/date_file_2013_2018_included_test.csv']
# Two candidate feature sets; optimize() receives one design matrix per set.
features_list = [['Day_id', 'Month_id', 'School_holidays_france_zoneC', 'Extra_day_off_france',
                  'Holidays_france', 'hour_minute_second_numerical'], ['hour_minute_second_numerical']]
time_series = ['71634', '71650', '71442', '71654', '71743', '71328',
               '71305', '71517', '71284', '415852', '71404', '71298', '73630',
               '71318', '71348', '71379', '71647', '71663', '71673', '71485',
               '71222', '71297', '71347', '71100', '71133', '71217', '73696',
               '73689', '71407', '73616', '70537', '70636', '70375', '71351',
               '71977', '70452', '72031', '72013', '70645', '71253', '71363',
               '70596', '72430', '71201', '72460', '70488', '71076', '70604',
               '73695', '70143', '70248', '71001', '73615']

model_name = 'rf_model_test'
start_date = '2017-01-01 00:00:00'
end_date = '2017-01-31 00:00:00'
path_to_save = '../data/model/test/rf/'
param_kfold = {'n_splits': 2, 'shuffle': True, 'random_state': 0}
# NOTE(review): 'mse' and max_features='auto' are no longer accepted by recent
# scikit-learn releases - confirm against the version this project pins.
param_grid = {'n_estimators': [10, 30], 'max_features': ['auto'], 'max_depth': [None], 'min_samples_split': [5],
              'min_samples_leaf': [3, 5], 'n_jobs': [6], 'criterion': ['mse']}
scoring = "neg_mean_squared_error"
scaler_choice = "standard"

path_directory_to_save = path_to_save + model_name + '/'

# Read each CSV once and join once; the frame serves both the search and the final fit.
df_observation = read_csv_list(observation_data_path)
df_exogenous = read_csv_list(exogenous_data_path)
df_Xy = df_observation.set_index('Datetime').join(df_exogenous.set_index("Datetime"))[start_date:end_date]

X_list = [df_Xy[features].values for features in features_list]
y = df_Xy[time_series].values

my_model = Rf_model(model_name, start_date, end_date, features_list, time_series, observation_data_path,
                    exogenous_data_path, scaler_choice)

grid_search_dict = my_model.optimize(X_list, y, param_grid, param_kfold, scoring)

# One (features, best_params, best_score) triple per feature set. The scoring is a
# negated error, so after the ascending sort the last entry is the best configuration.
best_conf = [(features, search.best_params_, search.best_score_)
             for features, search in grid_search_dict.items()]
best_conf.sort(key=lambda x: x[2])
features = list(best_conf[-1][0])
best_params = best_conf[-1][1]
my_model.infos['features'] = features
my_model.infos['best_params'] = best_params

# Fresh arrays for the final fit, restricted to the winning feature set.
X = df_Xy[features].values
y = df_Xy[time_series].values

rf = my_model.fit(X, y, my_model.infos['best_params'])

# Importances as percentages rounded to two decimals, keyed by feature name, and
# recorded on the model like the other fit cells of this notebook do.
feature_importances = dict(zip(features, np.round(rf.feature_importances_ * 100, 2).tolist()))
my_model.infos["feature_importances"] = feature_importances




Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    2.7s finished


Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    2.7s finished


In [63]:
# Reload the pickled RF test model, predict the first two hourly timestamps of 2017 and
# prepare rounded predictions / observations for comparison.
model_name = 'rf_model_test'
path_to_save = '../data/model/test/rf/'
# NOTE(review): load_pickle presumably unpickles the file - only load model files that
# come from a trusted source.
load_model = load_pickle(path_to_save + model_name + '/rf_model_test.pkl')
exogenous_data_path = load_model.infos['exogenous_data_path']
list_date = ['2017-01-01 00:00:00', '2017-01-01 01:00:00']

observation_data_path = ['../data/observation_file_2017-01-01_2017-06-30_included_test.csv']
df_observation = read_csv_list(observation_data_path)

# NOTE(review): the recorded run of this cell failed with KeyError: 'features'; the
# dump of load_model.infos in the following cell only shows 'features_list', so the
# pickle was presumably written before the selected features were stored - re-save the
# model after fitting (TODO confirm).
# Label-based row selection with .loc; the former `.ix` indexer is deprecated and no
# longer exists in pandas >= 1.0.
X = read_csv_list(exogenous_data_path).set_index('Datetime').loc[list_date][load_model.infos['features']].values
# NOTE(review): predict(X, choice=...) mirrors the HA model cell, whereas the RF fit
# cell of this notebook calls my_model.predict(rf, X) - confirm the RF signature.
pred_mean = load_model.predict(X, choice='mean')
pred_median = load_model.predict(X, choice='median')

res1 = np.around(pred_mean[0], decimals=2)
res2 = np.around(pred_mean[1], decimals=2)
res1_ = np.around(pred_median[0], decimals=2)
res2_ = np.around(pred_median[1], decimals=2)

# Index the observations once and reuse the frame for both timestamps.
observation_by_time = df_observation.set_index('Datetime')
obs1 = np.around(observation_by_time.loc['2017-01-01 00:00:00'].values, decimals=2)
obs2 = np.around(observation_by_time.loc['2017-01-01 01:00:00'].values, decimals=2)


KeyError: 'features'

In [65]:
# Dump the reloaded model's metadata to see which keys the pickle actually carries.
load_model.infos

{'end_date': '2017-01-31 00:00:00',
 'exogenous_data_path': ['../data/date_file_2013_2018_included_test.csv'],
 'features_list': [['Day_id',
   'Month_id',
   'School_holidays_france_zoneC',
   'Extra_day_off_france',
   'Holidays_france',
   'hour_minute_second_numerical'],
  ['hour_minute_second_numerical']],
 'name': 'rf_model_test',
 'observation_data_path': ['../data/observation_file_2017-01-01_2017-06-30_included_test.csv'],
 'scaler_choice': 'standard',
 'start_date': '2017-01-01 00:00:00',
 'time_series': ['71634',
  '71650',
  '71442',
  '71654',
  '71743',
  '71328',
  '71305',
  '71517',
  '71284',
  '415852',
  '71404',
  '71298',
  '73630',
  '71318',
  '71348',
  '71379',
  '71647',
  '71663',
  '71673',
  '71485',
  '71222',
  '71297',
  '71347',
  '71100',
  '71133',
  '71217',
  '73696',
  '73689',
  '71407',
  '73616',
  '70537',
  '70636',
  '70375',
  '71351',
  '71977',
  '70452',
  '72031',
  '72013',
  '70645',
  '71253',
  '71363',
  '70596',
  '72430',
  '71201