# Importation

In [1]:
from sklearn.metrics import make_scorer,SCORERS,mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression,SGDRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from scipy import stats
import pandas as pd
import numpy as np

# Load dataframe

In [2]:
data = pd.read_csv("predictions_vinci")
data.head()

Unnamed: 0.1,Unnamed: 0,yesterday,prediction,conf_low,conf_high,true,conf_std,Date,perf_pred,perf_true,perf_low,perf_high,garch_pred,vol_variation,prediction_cnn
0,0,101.311661,100.229275,98.853678,101.604872,100.998656,0.701848,2016-09-29,-0.010684,-0.00309,-0.024262,0.002894,21.300782,-0.004677,0.000385
1,1,100.998656,102.686427,101.292483,104.08037,101.535259,0.711209,2016-09-30,0.016711,0.005313,0.002909,0.030512,21.186977,-0.005343,0.000222
2,2,101.535259,101.803834,100.37707,103.230599,101.445808,0.727954,2016-10-03,0.002645,-0.000881,-0.011407,0.016697,21.075146,-0.005278,0.000415
3,3,101.445808,101.608547,100.183373,103.033721,101.997306,0.727143,2016-10-04,0.001604,0.005436,-0.012444,0.015653,21.142841,0.003212,9.5e-05
4,4,101.997306,101.715455,100.285934,103.144976,100.298096,0.729361,2016-10-05,-0.002763,-0.016659,-0.016779,0.011252,21.256323,0.005367,0.00028


In [3]:
data.shape

(1131, 15)

In [4]:
feature_to_remove = ["Unnamed: 0","yesterday","prediction","conf_low","conf_high","true","Date","vol_variation","garch_pred"]

In [5]:
data = data.drop(columns=feature_to_remove).loc[:800]

In [6]:
data.head()

Unnamed: 0,conf_std,perf_pred,perf_true,perf_low,perf_high,prediction_cnn
0,0.701848,-0.010684,-0.00309,-0.024262,0.002894,0.000385
1,0.711209,0.016711,0.005313,0.002909,0.030512,0.000222
2,0.727954,0.002645,-0.000881,-0.011407,0.016697,0.000415
3,0.727143,0.001604,0.005436,-0.012444,0.015653,9.5e-05
4,0.729361,-0.002763,-0.016659,-0.016779,0.011252,0.00028


In [7]:
X = data.copy().drop(columns="perf_true")
y= data["perf_true"]

In [8]:
mae_arima = mean_absolute_error(data.perf_true,data.perf_pred)
mae_arima

0.00521128782478184

# GridSearch SVM

In [9]:
SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [10]:

# Instanciate model
model_svm = Pipeline([("scale",StandardScaler()),("svm",SVR())])

# Hyperparameter search space
search_space = {
    'svm__kernel': ["linear", "poly", "rbf", "sigmoid"],
    'svm__C': stats.uniform(0.01, 1000),
    'svm__gamma': stats.loguniform(0.001,10),
    'svm__coef0': stats.uniform(-5,5),
}

# Instanciate Random Search
search_svm = RandomizedSearchCV(
    model_svm, search_space,
    n_jobs=-1, scoring='neg_mean_absolute_error', cv=5, n_iter=1, verbose=0)


search_svm.fit(X,y)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('scale',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('svm',
                                              SVR(C=1.0, cache_size=200,
                                                  coef0=0.0, degree=3,
                                                  epsilon=0.1, gamma='scale',
                                                  kernel='rbf', max_iter=-1,
                                                  shrinking=True, tol=0.001,
                                                  verbose=False))],
                                      verbose=False),
                   iid='deprecated', n_iter=1, n_jobs=-1,
                

In [11]:
search_svm.best_estimator_

Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svm',
                 SVR(C=756.4761383275566, cache_size=200,
                     coef0=-4.983802447756276, degree=3, epsilon=0.1,
                     gamma=0.001159700612336527, kernel='linear', max_iter=-1,
                     shrinking=True, tol=0.001, verbose=False))],
         verbose=False)

In [12]:
search_svm.best_score_

-0.011892484654970713

# GridSearch KNN

In [13]:
model_knn = Pipeline([("scale",StandardScaler()),("knn",KNeighborsRegressor())])

param_grid_knn =  {'knn__n_neighbors': range(2,50)}
search_knn = GridSearchCV(model_knn, param_grid=param_grid_knn, 
                          cv=5, n_jobs=-1, verbose=0, scoring='neg_mean_absolute_error')
search_knn.fit(X,y)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scale',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('knn',
                                        KNeighborsRegressor(algorithm='auto',
                                                            leaf_size=30,
                                                            metric='minkowski',
                                                            metric_params=None,
                                                            n_jobs=None,
                                                            n_neighbors=5, p=2,
                                                            weights='uniform'))],
                                verbose=False),
             iid='d

In [14]:
search_knn.best_estimator_

Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('knn',
                 KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                     metric='minkowski', metric_params=None,
                                     n_jobs=None, n_neighbors=37, p=2,
                                     weights='uniform'))],
         verbose=False)

In [15]:
search_knn.best_score_

-0.00549091438848859

# GridSearch Ridge

In [16]:
model_ridge = Pipeline([("scale",StandardScaler()),("ridge",Ridge())])
param_grid_ridge =  {'ridge__alpha': np.linspace(0.0001,2,num=1000)}
search_ridge = GridSearchCV(model_ridge, param_grid=param_grid_ridge, 
                              cv=5, n_jobs=-1, verbose=0, scoring='neg_mean_absolute_error')
search_ridge.fit(X,y)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scale',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('ridge',
                                        Ridge(alpha=1.0, copy_X=True,
                                              fit_intercept=True, max_iter=None,
                                              normalize=False,
                                              random_state=None, solver='auto',
                                              tol=0.001))],
                                verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'ridge__alpha': array([1.0000...
       1.96196386e+00, 1.96396577e+00, 1.96596767e+00, 1.96796957e+00,
       1.96997147e+00, 1.97197337e+00, 

In [17]:
search_ridge.best_estimator_

Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ridge',
                 Ridge(alpha=2.0, copy_X=True, fit_intercept=True,
                       max_iter=None, normalize=False, random_state=None,
                       solver='auto', tol=0.001))],
         verbose=False)

In [18]:
search_ridge.best_score_

-0.0053650466667143

# GridSearch Lasso

In [19]:
model_lasso = Pipeline([("scale",StandardScaler()),("lasso",Lasso())])
param_grid_lasso =  {'lasso__alpha': np.linspace(0.0001,2,num=1000)}
search_lasso = GridSearchCV(model_lasso, param_grid=param_grid_lasso, 
                              cv=5, n_jobs=-1, verbose=0, scoring='neg_mean_absolute_error')
search_lasso.fit(X,y)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scale',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('lasso',
                                        Lasso(alpha=1.0, copy_X=True,
                                              fit_intercept=True, max_iter=1000,
                                              normalize=False, positive=False,
                                              precompute=False,
                                              random_state=None,
                                              selection='cyclic', tol=0.0001,
                                              warm_start=False))],
                                verbose=False),
             iid='depreca...
       1.96196386e+00, 1.963965

In [20]:
search_lasso.best_estimator_

Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('lasso',
                 Lasso(alpha=0.0001, copy_X=True, fit_intercept=True,
                       max_iter=1000, normalize=False, positive=False,
                       precompute=False, random_state=None, selection='cyclic',
                       tol=0.0001, warm_start=False))],
         verbose=False)

In [21]:
search_lasso.best_score_

-0.005317165860904019

# GridSearch  SGD

In [22]:
model_sgd = Pipeline([("scale",StandardScaler()),("sgd",SGDRegressor())]) 
param_grid_sgd =  {'sgd__alpha': np.linspace(0.0001,2,num=100),
                  'sgd__penalty': ["l2", "l1", "elasticnet"],
                  "sgd__l1_ratio" :np.linspace(0,1,num=10) }
search_sgd = GridSearchCV(model_sgd, param_grid=param_grid_sgd, 
                              cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_absolute_error')
search_sgd.fit(X,y)

Fitting 5 folds for each of 3000 candidates, totalling 15000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 668 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 4540 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 11036 tasks      | elapsed:   51.7s
[Parallel(n_jobs=-1)]: Done 15000 out of 15000 | elapsed:  1.2min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scale',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('sgd',
                                        SGDRegressor(alpha=0.0001,
                                                     average=False,
                                                     early_stopping=False,
                                                     epsilon=0.1, eta0=0.01,
                                                     fit_intercept=True,
                                                     l1_ratio=0.15,
                                                     learning_rate='invscaling',
                                                     loss='squared_loss',
                               

In [23]:
search_sgd.best_estimator_

Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('sgd',
                 SGDRegressor(alpha=0.0001, average=False, early_stopping=False,
                              epsilon=0.1, eta0=0.01, fit_intercept=True,
                              l1_ratio=0.6666666666666666,
                              learning_rate='invscaling', loss='squared_loss',
                              max_iter=1000, n_iter_no_change=5, penalty='l1',
                              power_t=0.25, random_state=None, shuffle=True,
                              tol=0.001, validation_fraction=0.1, verbose=0,
                              warm_start=False))],
         verbose=False)

In [24]:
search_sgd.best_score_

-0.005276649979264778

# GridSearch Descision Tree

In [25]:
model_des_tree = Pipeline([("scale",StandardScaler()),("desTree",DecisionTreeRegressor())]) 
param_grid_des_tree =  {'desTree__max_depth': range(2,100),
                  "desTree__min_samples_leaf" :range(2,100) }
search_des_tree = GridSearchCV(model_des_tree, param_grid=param_grid_des_tree, 
                              cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_absolute_error')
search_des_tree.fit(X,y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 9604 candidates, totalling 48020 fits


[Parallel(n_jobs=-1)]: Done 700 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 4572 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 11068 tasks      | elapsed:   51.6s
[Parallel(n_jobs=-1)]: Done 20124 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 31804 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 46044 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 48020 out of 48020 | elapsed:  3.7min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scale',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('desTree',
                                        DecisionTreeRegressor(ccp_alpha=0.0,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features=None,
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split=None,
                                    

In [26]:
search_des_tree.best_estimator_

Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('desTree',
                 DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
                                       max_depth=5, max_features=None,
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=52, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       presort='deprecated', random_state=None,
                                       splitter='best'))],
         verbose=False)

In [27]:
search_des_tree.best_score_

-0.005599471501856607

# GridSearch Random forest

In [None]:
model_rand_tree =  Pipeline([("scale",StandardScaler()),("forest",RandomForestRegressor(criterion="mae"))]) 
param_grid_rand_tree =  {"forest__n_estimators": [10,50, 100, 300, 500, 1000],
                         'forest__max_depth': np.linspace(5,500,10),
                          "forest__min_samples_leaf" :[5] }
search_rand_tree = GridSearchCV(model_rand_tree, param_grid=param_grid_rand_tree, 
                              cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_absolute_error')
search_rand_tree.fit(X,y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.2min


In [None]:
search_rand_tree.best_estimator_

In [None]:
search_rand_tree.best_score_

# GridSearch Gradient Boost

In [32]:
model_gboost = Pipeline([("scale",StandardScaler()),("gboost",GradientBoostingRegressor(criterion="mae"))]) 
param_grid_gboost =  {'gboost__n_estimators': [10,50, 100, 300, 500, 1000],
                      "gboost__learning_rate": [0.0001,0.001,0.003,0.01,0.03,0.1,0.3,1]}
search_gboost = GridSearchCV(model_gboost, param_grid=param_grid_gboost, 
                              cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_absolute_error')
search_gboost.fit(X,y)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 15.8min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scale',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('gboost',
                                        GradientBoostingRegressor(alpha=0.9,
                                                                  ccp_alpha=0.0,
                                                                  criterion='mae',
                                                                  init=None,
                                                                  learning_rate=0.1,
                                                                  loss='ls',
                                                                  max_depth=3,
                                                    

In [33]:
search_gboost.best_estimator_

Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('gboost',
                 GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                           criterion='mae', init=None,
                                           learning_rate=0.01, loss='ls',
                                           max_depth=3, max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=2,
                                           min_weight_fraction_leaf=0.0,
                                           n_estimators=300,
                                           n_iter_no_change=None,
                               

In [34]:
search_gboost.best_score_

-0.005408516785120936

# GridSearch Ada Boost

In [35]:
model_aboost = Pipeline([("scale",StandardScaler()),("aboost",AdaBoostRegressor())])
param_aboost =  {'aboost__n_estimators': [10,50, 100, 300, 500, 1000],
                      "aboost__learning_rate": [0.0001,0.001,0.003,0.01,0.03,0.1,0.3,1],
                      "aboost__loss":['linear', 'square', 'exponential']}
search_aboost = GridSearchCV(model_aboost, param_grid=param_aboost, 
                              cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_absolute_error')
search_aboost.fit(X,y)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 373 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 656 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  5.9min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scale',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('aboost',
                                        AdaBoostRegressor(base_estimator=None,
                                                          learning_rate=1.0,
                                                          loss='linear',
                                                          n_estimators=50,
                                                          random_state=None))],
                                verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'aboost__learning_rate': [0.0001, 0.001, 0.003, 0.01,
                                                   0.03, 0

In [None]:
search_aboost.best_estimator_

In [None]:
search_aboost.best_score_

# Stacking 

In [38]:
gboost = search_gboost.best_estimator_
ridge = search_ridge.best_estimator_
lasso = search_lasso.best_estimator_
svm = search_svm.best_estimator_
adaboost = search_aboost.best_estimator_
forest = search_rand_tree.best_estimator_
desTree = search_des_tree.best_estimator_
sgd = search_sgd.best_estimator_
knn = search_knn.best_estimator_




model_stacking = VotingRegressor(
    estimators = [("gboost", gboost),("adaboost", adaboost),("ridge", ridge),("svm", svm),
                 ("lasso", lasso),("forest", forest),("desTree", desTree),("sgd", sgd),("knn", knn)],
    weights = [1,1,1,-1,1,1,1,1,1], # to equally weight the two models
    n_jobs=-1
)

score = cross_val_score(model_stacking, X, y, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
print(score.std())
score.mean()

0.00047571785599926606


-0.0055317570269267004