In [1]:
import numpy as np
from winton_script_ver1 import load_data, get_xy, get_cols, pre_process, hyper_parameter_search
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor # used for transforming the target variable
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
np.random.seed(100)

In [2]:
train, test = load_data()

Done Loading data ...


In [3]:
xtrain, ytrain = get_xy(train)

In [4]:
y1 = ytrain['Ret_PlusOne'] # D + 1 returns
y2 = ytrain['Ret_PlusTwo'] # D + 2 returns
cols = get_cols(xtrain)
xtrain = xtrain.iloc[:, cols: ]

25 columns contained the search word Feature 


In [5]:
xtrain1, xtest1, ytrain1, ytest1 = pre_process(xtrain, y1)

# Prediction D + 1

$R^2(y, \hat{y}) = 1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2}$



$explained\_{}variance(y, \hat{y}) = 1 - \frac{Var\{ y - \hat{y}\}}{Var\{y\}}$

In [6]:
from scipy.stats import randint as sp_randint

In [7]:
param_dist = {"max_depth": sp_randint(2, 10), 
              "max_features": ['auto', 'sqrt', 'log2', None], 
              "min_samples_split": sp_randint(2, 50), 
              'min_samples_leaf': sp_randint(2, 500),
              "n_estimators": sp_randint(200, 300), 
              'loss': ['huber', 'lad', 'lad'], 
              'learning_rate': [0.1, 0.2, 0.3, 0.0001], 
              'n_iter_no_change': sp_randint(1, 50)}

train_samples = 20000
test_samples = 5000
n_iter = 50
model = GradientBoostingRegressor

In [8]:
reg, params = hyper_parameter_search(xtrain1, 
                                     ytrain1, 
                                     xtest1, 
                                     ytest1, 
                                     model=model,
                                     param_dist=param_dist,
                                     n_iter_search=n_iter,
                                     train_samples=train_samples,
                                     test_samples=test_samples,
                                     cv=5)

model used: GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   42.4s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 13

Done with training.
R2 score train set: 0.3581 score test set: 0.0577
Mean-squared error train set: 0.0004 test set: 0.0006
Explained variance train set: 0.3581 test set: 0.058
Root mean squared error train set 0.0195 test set: 0.0236
..................................................
Best parameters:
{'learning_rate': 0.2, 'loss': 'huber', 'max_depth': 8, 'max_features': None, 'min_samples_leaf': 47, 'min_samples_split': 17, 'n_estimators': 281, 'n_iter_no_change': 44}


# Prediction D + 2

In [9]:
xtrain2, xtest2, ytrain2, ytest2 = pre_process(xtrain, y2)

In [10]:
reg, params = hyper_parameter_search(xtrain2, 
                                     ytrain2,
                                     xtest2, 
                                     ytest2,
                                     model=model,
                                     param_dist=param_dist,
                                     n_iter_search=n_iter,
                                     train_samples=train_samples,
                                     test_samples=test_samples,
                                     cv=5)

model used: GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 12

Done with training.
R2 score train set: 0.3094 score test set: 0.0398
Mean-squared error train set: 0.0004 test set: 0.0006
Explained variance train set: 0.3095 test set: 0.0399
Root mean squared error train set 0.0201 test set: 0.0238
..................................................
Best parameters:
{'learning_rate': 0.3, 'loss': 'lad', 'max_depth': 8, 'max_features': None, 'min_samples_leaf': 19, 'min_samples_split': 4, 'n_estimators': 255, 'n_iter_no_change': 20}
