# Tuning GBM training parameters

This notebook analyzes the impact of the following GBM constraints and settings:
- Maximum tree depth.
- Minimum number of samples per leaf.
- Minimum number of samples required to split a node.
- Different error criteria: MSE, Friedman MSE, and MAE.
- Different numbers of boosted trees.
- Different losses: Least Squares, Least Absolute Deviation, Huber and Quantile.

In [1]:
import logging
import imp
from dateutil.relativedelta import relativedelta
from collections import OrderedDict
import sys

In [2]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
sys.path.append('../..')

from helpers.dataset import read_quote_dataset, preprocess_quotes
from helpers.backtest import train_model_and_backtest_regressor, get_backtest_performance_metrics
from helpers.visualization import plot_return

In [4]:
# Configure the logging module for the Jupyter notebook.
# The module is reloaded before calling basicConfig so the configuration
# below starts from a freshly initialised logging module (presumably because
# the notebook environment has already set up logging handlers - confirm).
# `importlib.reload` replaces `imp.reload`: the `imp` module is deprecated
# and was removed in Python 3.12.
import importlib

importlib.reload(logging)
logging_format = '%(asctime)s - %(levelname)s - %(process)s - %(message)s'
logging.basicConfig(level=logging.DEBUG, format=logging_format)

# Disable backtesting logs below WARNING level
logging.getLogger('helpers.backtest').setLevel(level=logging.WARNING)

In [5]:
# Relative path of the gzipped CSV quote dataset that is passed to
# read_quote_dataset.
PARAM_DATASET = '../../../data/SPY_postprocess_adj.csv.gz'

In [6]:
# Load the quote dataset located at PARAM_DATASET.
df = read_quote_dataset(PARAM_DATASET)

In [7]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,date,open,high,low,close,close_adj,volume,open_adj,low_adj,high_adj,...,ratio_close_adj_000_close_adj_005_norm,ratio_close_adj_000_close_adj_020_norm,ratio_close_adj_000_ema_005_norm,ratio_close_adj_000_ema_010_norm,ratio_close_adj_000_ema_020_norm,ratio_close_adj_000_ema_050_norm,ratio_close_adj_000_sma_005_norm,ratio_close_adj_000_sma_010_norm,ratio_close_adj_000_sma_020_norm,ratio_close_adj_000_sma_050_norm
0,2000-01-03,148.25,148.25,143.875,145.4375,101.425385,8164300,103.38677,100.335727,103.38677,...,,,,,,,,,,
1,2000-01-04,143.531204,144.0625,139.640594,139.75,97.459068,8089800,100.09601,97.38277,100.466526,...,,,,,,,,,,
2,2000-01-05,139.9375,141.531204,137.25,140.0,97.633377,12177900,97.589791,95.715579,98.70121,...,,,,,,,,,,
3,2000-01-06,139.625,141.5,137.75,137.75,96.064301,6227200,97.371891,96.064301,98.679482,...,,,0.48663,,,,,,,
4,2000-01-07,140.3125,145.75,140.0625,145.75,101.643333,8066500,97.851322,97.676977,101.643333,...,,,0.815422,,,,0.740588,,,


In [8]:
# Columns to shift and the shift horizons (in rows) handed to
# preprocess_quotes.
vars_to_shift = [
    'close_adj',
    'close_adj_norm',
    'close_adj_std',
]
shift_periods = [1, 5, 10, 20]

# Columns for which returns are computed, and the return horizons.
vars_for_return = ['close_adj']
return_periods = [1, 5, 10, 20]

In [9]:
# Add the shifted columns and the return columns to the dataset.
# shift_date=True presumably also produces the `date_shift_<n>` columns that
# the backtests use as `col_date_shift` - confirm against helpers.dataset.
df = preprocess_quotes(
    df,
    vars_to_shift=vars_to_shift,
    shift_periods=shift_periods,
    vars_for_return=vars_for_return,
    return_periods=return_periods,
    shift_date=True,
)

In [10]:
# Sanity check: show each date and adjusted close next to its 1- and
# 5-period shifted date, shifted price and return.
preview_columns = [
    'date', 'close_adj',
    'date_shift_1', 'close_adj_shift_1', 'close_adj_ret_1',
    'date_shift_5', 'close_adj_shift_5', 'close_adj_ret_5',
]
df[preview_columns].head(10)

Unnamed: 0,date,close_adj,date_shift_1,close_adj_shift_1,close_adj_ret_1,date_shift_5,close_adj_shift_5,close_adj_ret_5
0,2000-01-03,101.425385,2000-01-04,97.459068,-0.039106,2000-01-10,101.992004,0.005587
1,2000-01-04,97.459068,2000-01-05,97.633377,0.001789,2000-01-11,100.771645,0.033989
2,2000-01-05,97.633377,2000-01-06,96.064301,-0.016071,2000-01-12,99.76915,0.021875
3,2000-01-06,96.064301,2000-01-07,101.643333,0.058076,2000-01-13,101.120308,0.052631
4,2000-01-07,101.643333,2000-01-10,101.992004,0.00343,2000-01-14,102.493233,0.008362
5,2000-01-10,101.992004,2000-01-11,100.771645,-0.011965,2000-01-18,101.686958,-0.002991
6,2000-01-11,100.771645,2000-01-12,99.76915,-0.009948,2000-01-19,102.51506,0.017301
7,2000-01-12,99.76915,2000-01-13,101.120308,0.013543,2000-01-20,100.945953,0.011795
8,2000-01-13,101.120308,2000-01-14,102.493233,0.013577,2000-01-21,100.727989,-0.00388
9,2000-01-14,102.493233,2000-01-18,101.686958,-0.007867,2000-01-24,97.873047,-0.045078


# Processing all the input variables

In the data processing notebook, all the historical variables were computed. Let's review them.

In [11]:
# Candidate input columns: adjusted-price or volume based columns, excluding
# the shifted, standardised (`_std`), normalised (`_norm`) and return columns.
x_vars_all = [
    varname
    for varname in df.columns
    if ('_adj' in varname or 'volume' in varname)
    and not any(
        marker in varname
        for marker in ('_shift_', '_std', '_norm', '_ret_')
    )
]

In [12]:
# x_vars_all holds the model inputs, i.e. the independent variables (the
# dependent variable is the `y_var` used in the backtests), so the message
# says "independent" rather than "dependent".
logging.info('There are in total %d independent variables', len(x_vars_all))

2019-06-24 15:23:16,966 - INFO - 24085 - There are in total 53 dependent variables


In [13]:
# Display the full list of candidate input columns.
x_vars_all

['close_adj',
 'volume',
 'open_adj',
 'low_adj',
 'high_adj',
 'slope_close_adj_005',
 'slope_volume_005',
 'slope_close_adj_010',
 'slope_volume_010',
 'slope_close_adj_020',
 'slope_volume_020',
 'slope_close_adj_050',
 'slope_volume_050',
 'sma_close_adj_005',
 'sma_volume_005',
 'ema_close_adj_005',
 'ema_volume_005',
 'sma_close_adj_010',
 'sma_volume_010',
 'ema_close_adj_010',
 'ema_volume_010',
 'sma_close_adj_020',
 'sma_volume_020',
 'ema_close_adj_020',
 'ema_volume_020',
 'sma_close_adj_050',
 'sma_volume_050',
 'ema_close_adj_050',
 'ema_volume_050',
 'lag_close_adj_001',
 'lag_volume_001',
 'lag_close_adj_005',
 'lag_volume_005',
 'lag_close_adj_010',
 'lag_volume_010',
 'lag_close_adj_020',
 'lag_volume_020',
 'lag_close_adj_060',
 'lag_volume_060',
 'ratio_volume_000_sma_005',
 'ratio_volume_000_sma_020',
 'ratio_volume_000_ema_050',
 'ratio_close_adj_000_close_adj_001',
 'ratio_close_adj_000_close_adj_005',
 'ratio_close_adj_000_close_adj_020',
 'ratio_close_adj_000_e

Divide them into categories

In [14]:
# Split the candidate inputs into families according to their name prefix.
x_vars_slope = [var for var in x_vars_all if var.startswith('slope_')]
x_vars_sma = [var for var in x_vars_all if var.startswith('sma_')]
x_vars_ema = [var for var in x_vars_all if var.startswith('ema_')]
x_vars_lagged = [var for var in x_vars_all if var.startswith('lag_')]
x_vars_ratio_close_adj = [
    var for var in x_vars_all if var.startswith('ratio_close_adj_')
]
x_vars_ratio_volume = [
    var for var in x_vars_all if var.startswith('ratio_volume_')
]

# Run backtests with different sets of GBM parameters

Define the inputs

In [15]:
# Named input sets to backtest. Insertion order is preserved because it is
# the order in which the backtest loops iterate over the sets.
# "olh" stands for the adjusted open, low and high prices; the "c" and "v"
# suffixes add the adjusted close and the volume.
_olh_columns = ['open_adj', 'low_adj', 'high_adj']

x_var_set = OrderedDict([
    ('all', x_vars_all),
    ('close_adj', ['close_adj']),
    ('olh', list(_olh_columns)),
    ('olhc', _olh_columns + ['close_adj']),
    ('olhv', _olh_columns + ['volume']),
    ('olhcv', _olh_columns + ['close_adj', 'volume']),
    ('volume', ['volume']),
    # ('slope', x_vars_slope),
    ('sma', x_vars_sma),
    ('ema', x_vars_ema),
    ('lagged', x_vars_lagged),
    # ('ratio_closed_adj', x_vars_ratio_close_adj),
    # ('ratio_vol', x_vars_ratio_volume),
    ('olh_sma', _olh_columns + x_vars_sma),
    ('olh_ema', _olh_columns + x_vars_ema),
    ('olh_slope', _olh_columns + x_vars_slope),
])

### Try minimum samples per leaf

In [16]:
# Parameter grid for this section: the library defaults plus three
# `min_samples_leaf` constraints. Every entry pins random_state to 100.
# The 'defalt' key spelling is kept on purpose: the key becomes part of the
# result names ('<params>-<inputs>') built by the backtest loop.
#
# Fix: the 'min_samp_leaf-20' entry used to be configured with
# min_samples_leaf=10, silently duplicating the '-10' experiment. Key and
# value are now generated from the same number so they cannot drift apart.
#
# NOTE(review): the backtest cell passes
# train_history_period=relativedelta(months=1, days=1); if that is the
# training window, it holds only about 22 daily rows, so leaf sizes of 20 or
# 50 would leave the trees almost or completely unable to split - confirm.
model_params = OrderedDict([('defalt', {'random_state': 100})])
model_params.update(
    ('min_samp_leaf-%d' % min_leaf,
     {'random_state': 100, 'min_samples_leaf': min_leaf})
    for min_leaf in (10, 20, 50)
)

In [17]:
%%time
# Backtest every (GBM parameter set, input set) combination and collect, per
# combination, the return series and the performance metrics.
backtest_summaries_list = []
performance_track = OrderedDict()

# Settings shared by every backtest of the grid. The target is the adjusted
# close shifted by one period; the buy/sell columns presumably define a trade
# bought at the current close and sold at the next one - confirm against
# helpers.backtest.
y_var = 'close_adj_shift_1'
buy_price_col = 'close_adj'
sell_price_col = 'close_adj_shift_1'
model_class = GradientBoostingRegressor

for model_params_name, current_model_params in model_params.items():
    for inputs_name, x_var_current in x_var_set.items():
        x_vars = x_var_current
        name = '%s-%s' % (model_params_name, inputs_name)

        # logging.info('Backtesting with %s - %s', model_params_name, inputs_name)
        df_backtest = train_model_and_backtest_regressor(
            df,
            x_vars=x_vars,
            y_var=y_var,
            buy_price_col=buy_price_col,
            sell_price_col=sell_price_col,
            model_class=model_class,
            model_params=current_model_params,
            backtest_start='2000-06-01',
            backtest_end='2018-12-31',
            model_update_frequency='M',
            train_history_period=relativedelta(months=1, days=1),
            col_date_shift='date_shift_1',
        )
        performance_track[name] = df_backtest[['date', 'ret']]

        backtest_summary = get_backtest_performance_metrics(
            df_backtest.ret,
            df_backtest.benchmark_ret,
            with_benchmark=True,
            with_delta=True,
        )
        backtest_summary_no_benchmark = backtest_summary['main']
        backtest_summary_no_benchmark.name = name
        backtest_summaries_list.append(backtest_summary_no_benchmark)

# The benchmark metrics and returns are read from the variables left over
# from the last loop iteration (presumably the benchmark is identical for
# every run, since all runs share the data and the date range - confirm).
backtest_summaries_list.append(backtest_summary.benchmark)
performance_track['benchmark'] = df_backtest[['date', 'benchmark_ret']].rename(
    columns={'benchmark_ret': 'ret'}
)

CPU times: user 8min 40s, sys: 547 ms, total: 8min 41s
Wall time: 8min 46s


In [18]:
# One row per backtest (plus the benchmark), ranked by total return; show at
# most the 30 best.
summary_table = pd.concat(backtest_summaries_list, axis=1).T
summary_table.sort_values('return', ascending=False).head(30)

Unnamed: 0,alpha,beta,cagr,max_drawdown,return,sharpe,var,volatility
defalt-olh_sma,0.07758844,0.288603,0.081842,-0.371374,3.303236,0.507701,-0.01745,0.19056
defalt-olh,0.0725428,0.2792,0.075729,-0.348481,2.873806,0.478037,-0.017411,0.190524
defalt-olh_ema,0.06950265,0.316354,0.075134,-0.293878,2.834283,0.475849,-0.017332,0.190195
defalt-olhv,0.06713122,0.302398,0.071588,-0.323638,2.606296,0.458247,-0.017282,0.190303
defalt-ema,0.06201691,0.314121,0.066958,-0.290552,2.32793,0.435661,-0.017362,0.190216
defalt-olhc,0.0545754,0.291177,0.057425,-0.339725,1.817529,0.388185,-0.01745,0.190387
benchmark,-3.681556e-16,1.0,0.049527,-0.551894,1.451705,0.349346,-0.019044,0.190026
defalt-sma,0.04335895,0.27159,0.044273,-0.328392,1.233732,0.322438,-0.017859,0.190388
defalt-lagged,0.03411949,0.325692,0.038403,-0.405632,1.011927,0.293021,-0.017621,0.190227
defalt-olhcv,0.03001081,0.31166,0.033185,-0.444309,0.832423,0.266577,-0.017597,0.190191


According to the tests, constraining the number of samples per leaf doesn't help. The default parameters with the OLH and SMA inputs give the best model.

### Try minimum samples to split a node

In [19]:
# Parameter grid for this section: the library defaults plus three
# `min_samples_split` constraints. Every entry pins random_state to 100.
# The 'defalt' key spelling is kept on purpose: the key becomes part of the
# result names ('<params>-<inputs>') built by the backtest loop.
model_params = OrderedDict([('defalt', {'random_state': 100})])
model_params.update(
    ('min_samp_split-%d' % min_split,
     {'random_state': 100, 'min_samples_split': min_split})
    for min_split in (10, 50, 100)
)

In [20]:
%%time
# Backtest every (GBM parameter set, input set) combination and collect, per
# combination, the return series and the performance metrics.
backtest_summaries_list = []
performance_track = OrderedDict()

# Settings shared by every backtest of the grid. The target is the adjusted
# close shifted by one period; the buy/sell columns presumably define a trade
# bought at the current close and sold at the next one - confirm against
# helpers.backtest.
y_var = 'close_adj_shift_1'
buy_price_col = 'close_adj'
sell_price_col = 'close_adj_shift_1'
model_class = GradientBoostingRegressor

for model_params_name, current_model_params in model_params.items():
    for inputs_name, x_var_current in x_var_set.items():
        x_vars = x_var_current
        name = '%s-%s' % (model_params_name, inputs_name)

        logging.info('Backtesting with %s - %s', model_params_name, inputs_name)
        df_backtest = train_model_and_backtest_regressor(
            df,
            x_vars=x_vars,
            y_var=y_var,
            buy_price_col=buy_price_col,
            sell_price_col=sell_price_col,
            model_class=model_class,
            model_params=current_model_params,
            backtest_start='2000-06-01',
            backtest_end='2018-12-31',
            model_update_frequency='M',
            train_history_period=relativedelta(months=1, days=1),
            col_date_shift='date_shift_1',
        )
        performance_track[name] = df_backtest[['date', 'ret']]

        backtest_summary = get_backtest_performance_metrics(
            df_backtest.ret,
            df_backtest.benchmark_ret,
            with_benchmark=True,
            with_delta=True,
        )
        backtest_summary_no_benchmark = backtest_summary['main']
        backtest_summary_no_benchmark.name = name
        backtest_summaries_list.append(backtest_summary_no_benchmark)

# The benchmark metrics and returns are read from the variables left over
# from the last loop iteration (presumably the benchmark is identical for
# every run, since all runs share the data and the date range - confirm).
backtest_summaries_list.append(backtest_summary.benchmark)
performance_track['benchmark'] = df_backtest[['date', 'benchmark_ret']].rename(
    columns={'benchmark_ret': 'ret'}
)

2019-06-24 15:40:59,732 - INFO - 24085 - Backtesting with defalt - all
2019-06-24 15:41:16,713 - INFO - 24085 - Backtesting with defalt - close_adj
2019-06-24 15:41:28,555 - INFO - 24085 - Backtesting with defalt - olh
2019-06-24 15:41:40,612 - INFO - 24085 - Backtesting with defalt - olhc
2019-06-24 15:41:52,735 - INFO - 24085 - Backtesting with defalt - olhv
2019-06-24 15:42:05,250 - INFO - 24085 - Backtesting with defalt - olhcv
2019-06-24 15:42:15,989 - INFO - 24085 - Backtesting with defalt - volume
2019-06-24 15:42:27,326 - INFO - 24085 - Backtesting with defalt - sma
2019-06-24 15:42:39,832 - INFO - 24085 - Backtesting with defalt - ema
2019-06-24 15:42:49,031 - INFO - 24085 - Backtesting with defalt - lagged
2019-06-24 15:42:58,611 - INFO - 24085 - Backtesting with defalt - olh_sma
2019-06-24 15:43:07,947 - INFO - 24085 - Backtesting with defalt - olh_ema
2019-06-24 15:43:17,250 - INFO - 24085 - Backtesting with defalt - olh_slope
2019-06-24 15:43:28,151 - INFO - 24085 - Backte

CPU times: user 9min 25s, sys: 584 ms, total: 9min 25s
Wall time: 9min 27s


In [21]:
# One row per backtest (plus the benchmark), ranked by total return; show at
# most the 30 best.
summary_table = pd.concat(backtest_summaries_list, axis=1).T
summary_table.sort_values('return', ascending=False).head(30)

Unnamed: 0,alpha,beta,cagr,max_drawdown,return,sharpe,var,volatility
min_samp_split-10-olh,0.08355586,0.296302,0.088879,-0.343385,3.853263,0.541974,-0.017168,0.190463
min_samp_split-10-olh_ema,0.07619025,0.316687,0.082373,-0.290552,3.342542,0.511165,-0.017376,0.19018
defalt-olh_sma,0.07758844,0.288603,0.081842,-0.371374,3.303236,0.507701,-0.01745,0.19056
min_samp_split-10-olh_sma,0.07549319,0.255526,0.077221,-0.315306,2.974702,0.485571,-0.017229,0.190407
defalt-olh,0.0725428,0.2792,0.075729,-0.348481,2.873806,0.478037,-0.017411,0.190524
defalt-olh_ema,0.06950265,0.316354,0.075134,-0.293878,2.834283,0.475849,-0.017332,0.190195
defalt-olhv,0.06713122,0.302398,0.071588,-0.323638,2.606296,0.458247,-0.017282,0.190303
min_samp_split-10-olhv,0.06193384,0.340978,0.068771,-0.442934,2.434451,0.444547,-0.017132,0.190238
min_samp_split-10-olhc,0.06508305,0.280084,0.067807,-0.340293,2.377454,0.439525,-0.017588,0.190379
defalt-ema,0.06201691,0.314121,0.066958,-0.290552,2.32793,0.435661,-0.017362,0.190216


`min_samples_split` could help. While the best default model (`defalt-olh_sma`) has almost the same CAGR as `min_samples_split` equal to 10 with the OLH and EMA inputs (`min_samp_split-10-olh_ema`), the max drawdown was reduced from 37% to 29%.

### Try different error criteria

In [22]:
# Parameter grid for this section: the library defaults plus the two
# explicitly requested split criteria. Every entry pins random_state to 100.
# The 'defalt' key spelling is kept on purpose: the key becomes part of the
# result names ('<params>-<inputs>') built by the backtest loop.
# NOTE(review): newer scikit-learn releases renamed or removed the 'mse' and
# 'mae' criterion names - confirm against the installed version before
# re-running.
model_params = OrderedDict([
    ('defalt', {'random_state': 100}),
    ('mse', {'random_state': 100, 'criterion': 'mse'}),
    ('mae', {'random_state': 100, 'criterion': 'mae'}),
])

In [23]:
%%time
# Backtest every (GBM parameter set, input set) combination and collect, per
# combination, the return series and the performance metrics.
backtest_summaries_list = []
performance_track = OrderedDict()

# Settings shared by every backtest of the grid. The target is the adjusted
# close shifted by one period; the buy/sell columns presumably define a trade
# bought at the current close and sold at the next one - confirm against
# helpers.backtest.
y_var = 'close_adj_shift_1'
buy_price_col = 'close_adj'
sell_price_col = 'close_adj_shift_1'
model_class = GradientBoostingRegressor

for model_params_name, current_model_params in model_params.items():
    for inputs_name, x_var_current in x_var_set.items():
        x_vars = x_var_current
        name = '%s-%s' % (model_params_name, inputs_name)

        logging.info('Backtesting with %s - %s', model_params_name, inputs_name)
        df_backtest = train_model_and_backtest_regressor(
            df,
            x_vars=x_vars,
            y_var=y_var,
            buy_price_col=buy_price_col,
            sell_price_col=sell_price_col,
            model_class=model_class,
            model_params=current_model_params,
            backtest_start='2000-06-01',
            backtest_end='2018-12-31',
            model_update_frequency='M',
            train_history_period=relativedelta(months=1, days=1),
            col_date_shift='date_shift_1',
        )
        performance_track[name] = df_backtest[['date', 'ret']]

        backtest_summary = get_backtest_performance_metrics(
            df_backtest.ret,
            df_backtest.benchmark_ret,
            with_benchmark=True,
            with_delta=True,
        )
        backtest_summary_no_benchmark = backtest_summary['main']
        backtest_summary_no_benchmark.name = name
        backtest_summaries_list.append(backtest_summary_no_benchmark)

# The benchmark metrics and returns are read from the variables left over
# from the last loop iteration (presumably the benchmark is identical for
# every run, since all runs share the data and the date range - confirm).
backtest_summaries_list.append(backtest_summary.benchmark)
performance_track['benchmark'] = df_backtest[['date', 'benchmark_ret']].rename(
    columns={'benchmark_ret': 'ret'}
)

2019-06-24 15:50:27,246 - INFO - 24085 - Backtesting with defalt - all
2019-06-24 15:50:44,714 - INFO - 24085 - Backtesting with defalt - close_adj
2019-06-24 15:50:57,080 - INFO - 24085 - Backtesting with defalt - olh
2019-06-24 15:51:06,196 - INFO - 24085 - Backtesting with defalt - olhc
2019-06-24 15:51:14,691 - INFO - 24085 - Backtesting with defalt - olhv
2019-06-24 15:51:23,291 - INFO - 24085 - Backtesting with defalt - olhcv
2019-06-24 15:51:32,001 - INFO - 24085 - Backtesting with defalt - volume
2019-06-24 15:51:40,182 - INFO - 24085 - Backtesting with defalt - sma
2019-06-24 15:51:48,902 - INFO - 24085 - Backtesting with defalt - ema
2019-06-24 15:51:57,775 - INFO - 24085 - Backtesting with defalt - lagged
2019-06-24 15:52:06,875 - INFO - 24085 - Backtesting with defalt - olh_sma
2019-06-24 15:52:18,621 - INFO - 24085 - Backtesting with defalt - olh_ema
2019-06-24 15:52:31,543 - INFO - 24085 - Backtesting with defalt - olh_slope
2019-06-24 15:52:44,514 - INFO - 24085 - Backte

CPU times: user 9min 17s, sys: 376 ms, total: 9min 17s
Wall time: 9min 18s


In [24]:
# One row per backtest (plus the benchmark), ranked by total return; show at
# most the 30 best.
summary_table = pd.concat(backtest_summaries_list, axis=1).T
summary_table.sort_values('return', ascending=False).head(30)

Unnamed: 0,alpha,beta,cagr,max_drawdown,return,sharpe,var,volatility
mae-olhv,0.07793318,0.332855,0.08542,-0.380868,3.575052,0.525783,-0.017168,0.190249
mae-olh_ema,0.07570419,0.340944,0.083589,-0.290552,3.433972,0.517037,-0.017376,0.190195
defalt-olh_sma,0.07758844,0.288603,0.081842,-0.371374,3.303236,0.507701,-0.01745,0.19056
mse-olh_sma,0.07758844,0.288603,0.081842,-0.371374,3.303236,0.507701,-0.01745,0.19056
defalt-olh,0.0725428,0.2792,0.075729,-0.348481,2.873806,0.478037,-0.017411,0.190524
mse-olh,0.0725428,0.2792,0.075729,-0.348481,2.873806,0.478037,-0.017411,0.190524
mse-olh_ema,0.06950265,0.316354,0.075134,-0.293878,2.834283,0.475849,-0.017332,0.190195
defalt-olh_ema,0.06950265,0.316354,0.075134,-0.293878,2.834283,0.475849,-0.017332,0.190195
mae-ema,0.06747369,0.321744,0.073337,-0.289942,2.717069,0.46696,-0.017132,0.190236
defalt-olhv,0.06713122,0.302398,0.071588,-0.323638,2.606296,0.458247,-0.017282,0.190303


All three criteria appear in the top 3: MSE, Friedman MSE (the scikit-learn default, i.e. the `defalt` rows) and MAE. The three criteria have to be tested in more depth.

### Try the most important parameter: the depth

In [25]:
# Parameter grid for this section: the library defaults plus every
# `max_depth` from 3 to 10 inclusive. Every entry pins random_state to 100.
# The 'defalt' key spelling is kept on purpose: the key becomes part of the
# result names ('<params>-<inputs>') built by the backtest loop.
model_params = OrderedDict([('defalt', {'random_state': 100})])
model_params.update(
    ('depth%d' % depth, {'random_state': 100, 'max_depth': depth})
    for depth in range(3, 11)
)

In [26]:
%%time
# Backtest every (GBM parameter set, input set) combination and collect, per
# combination, the return series and the performance metrics.
backtest_summaries_list = []
performance_track = OrderedDict()

# Settings shared by every backtest of the grid. The target is the adjusted
# close shifted by one period; the buy/sell columns presumably define a trade
# bought at the current close and sold at the next one - confirm against
# helpers.backtest.
y_var = 'close_adj_shift_1'
buy_price_col = 'close_adj'
sell_price_col = 'close_adj_shift_1'
model_class = GradientBoostingRegressor

for model_params_name, current_model_params in model_params.items():
    for inputs_name, x_var_current in x_var_set.items():
        x_vars = x_var_current
        name = '%s-%s' % (model_params_name, inputs_name)

        logging.info('Backtesting with %s - %s', model_params_name, inputs_name)
        df_backtest = train_model_and_backtest_regressor(
            df,
            x_vars=x_vars,
            y_var=y_var,
            buy_price_col=buy_price_col,
            sell_price_col=sell_price_col,
            model_class=model_class,
            model_params=current_model_params,
            backtest_start='2000-06-01',
            backtest_end='2018-12-31',
            model_update_frequency='M',
            train_history_period=relativedelta(months=1, days=1),
            col_date_shift='date_shift_1',
        )
        performance_track[name] = df_backtest[['date', 'ret']]

        backtest_summary = get_backtest_performance_metrics(
            df_backtest.ret,
            df_backtest.benchmark_ret,
            with_benchmark=True,
            with_delta=True,
        )
        backtest_summary_no_benchmark = backtest_summary['main']
        backtest_summary_no_benchmark.name = name
        backtest_summaries_list.append(backtest_summary_no_benchmark)

# The benchmark metrics and returns are read from the variables left over
# from the last loop iteration (presumably the benchmark is identical for
# every run, since all runs share the data and the date range - confirm).
backtest_summaries_list.append(backtest_summary.benchmark)
performance_track['benchmark'] = df_backtest[['date', 'benchmark_ret']].rename(
    columns={'benchmark_ret': 'ret'}
)

2019-06-24 15:59:46,174 - INFO - 24085 - Backtesting with defalt - all
2019-06-24 16:00:03,308 - INFO - 24085 - Backtesting with defalt - close_adj
2019-06-24 16:00:15,713 - INFO - 24085 - Backtesting with defalt - olh
2019-06-24 16:00:28,122 - INFO - 24085 - Backtesting with defalt - olhc
2019-06-24 16:00:40,523 - INFO - 24085 - Backtesting with defalt - olhv
2019-06-24 16:00:53,414 - INFO - 24085 - Backtesting with defalt - olhcv
2019-06-24 16:01:06,208 - INFO - 24085 - Backtesting with defalt - volume
2019-06-24 16:01:18,167 - INFO - 24085 - Backtesting with defalt - sma
2019-06-24 16:01:31,393 - INFO - 24085 - Backtesting with defalt - ema
2019-06-24 16:01:45,051 - INFO - 24085 - Backtesting with defalt - lagged
2019-06-24 16:01:58,950 - INFO - 24085 - Backtesting with defalt - olh_sma
2019-06-24 16:02:12,576 - INFO - 24085 - Backtesting with defalt - olh_ema
2019-06-24 16:02:26,340 - INFO - 24085 - Backtesting with defalt - olh_slope
2019-06-24 16:02:39,948 - INFO - 24085 - Backte

2019-06-24 16:24:47,746 - INFO - 24085 - Backtesting with depth10 - ema
2019-06-24 16:25:00,954 - INFO - 24085 - Backtesting with depth10 - lagged
2019-06-24 16:25:14,492 - INFO - 24085 - Backtesting with depth10 - olh_sma
2019-06-24 16:25:27,281 - INFO - 24085 - Backtesting with depth10 - olh_ema
2019-06-24 16:25:41,159 - INFO - 24085 - Backtesting with depth10 - olh_slope


CPU times: user 26min 6s, sys: 1.22 s, total: 26min 7s
Wall time: 26min 8s


In [27]:
# One row per backtest (plus the benchmark), ranked by total return; show at
# most the 30 best.
summary_table = pd.concat(backtest_summaries_list, axis=1).T
summary_table.sort_values('return', ascending=False).head(30)

Unnamed: 0,alpha,beta,cagr,max_drawdown,return,sharpe,var,volatility
depth4-olh_ema,0.106635,0.313632,0.1156,-0.290552,6.609563,0.670151,-0.01702,0.190189
depth6-olh,0.107879,0.272252,0.113898,-0.345964,6.396992,0.660939,-0.017109,0.190565
depth7-olh,0.105171,0.271602,0.110838,-0.374404,6.028975,0.646482,-0.017127,0.190572
depth10-olh,0.102063,0.267706,0.107105,-0.405297,5.603511,0.628829,-0.017145,0.190568
depth8-olh,0.101043,0.275589,0.106556,-0.377599,5.543035,0.626247,-0.017168,0.190561
depth9-olh,0.100908,0.267986,0.105849,-0.405297,5.465818,0.622861,-0.017145,0.19057
depth6-olh_ema,0.088652,0.339619,0.097612,-0.271001,4.628503,0.584695,-0.017256,0.19018
depth5-olh_ema,0.087599,0.344127,0.096786,-0.305589,4.550448,0.58074,-0.017376,0.190178
depth5-olh,0.091194,0.276604,0.095789,-0.358915,4.457511,0.574998,-0.017256,0.190533
depth4-olhv,0.088746,0.300363,0.094852,-0.330352,4.371593,0.571151,-0.017202,0.190292


Several depths give good returns. The default is now at position #30. While the default depth has a CAGR near 7.5%, a CAGR of 11% was reached by changing the depth.

### Try different kind of losses

In [28]:
# Parameter grid for this section: the library defaults plus three
# alternative loss functions. Every entry pins random_state to 100.
# The 'defalt' key spelling is kept on purpose: the key becomes part of the
# result names ('<params>-<inputs>') built by the backtest loop.
# NOTE(review): no `alpha` is passed, so 'huber' and 'quantile' use the
# library default (0.9 according to the scikit-learn documentation, i.e. the
# quantile loss targets the 90th percentile) - confirm this is intended.
# NOTE(review): newer scikit-learn releases renamed the 'lad' loss - confirm
# against the installed version before re-running.
model_params = OrderedDict([
    ('defalt', {'random_state': 100}),
    ('lad', {'random_state': 100, 'loss': 'lad'}),
    ('huber', {'random_state': 100, 'loss': 'huber'}),
    ('quant', {'random_state': 100, 'loss': 'quantile'}),
])

In [29]:
%%time
# Settings shared by every run of this sweep (they do not depend on the
# model configuration or on the feature set being evaluated).
y_var = 'close_adj_shift_1'
buy_price_col = 'close_adj'
sell_price_col = 'close_adj_shift_1'
model_class = GradientBoostingRegressor

backtest_summaries_list = []       # one metrics Series per run, plus the benchmark
performance_track = OrderedDict()  # run label -> DataFrame with 'date' and 'ret'

# Run one backtest for every (model configuration, feature set) pair.
for model_params_name, current_model_params in model_params.items():
    for inputs_name, x_vars in x_var_set.items():
        logging.info('Backtesting with %s - %s', model_params_name, inputs_name)
        df_backtest = train_model_and_backtest_regressor(
            df,
            x_vars=x_vars,
            y_var=y_var,
            buy_price_col=buy_price_col,
            sell_price_col=sell_price_col,
            model_class=model_class,
            model_params=current_model_params,
            backtest_start='2000-06-01',
            backtest_end='2018-12-31',
            model_update_frequency='M',
            train_history_period=relativedelta(months=1, days=1),
            col_date_shift='date_shift_1',
        )

        # Label of the run: "<model configuration>-<feature set>".
        run_label = '{}-{}'.format(model_params_name, inputs_name)
        performance_track[run_label] = df_backtest[['date', 'ret']]

        backtest_summary = get_backtest_performance_metrics(
            df_backtest.ret,
            df_backtest.benchmark_ret,
            with_benchmark=True,
            with_delta=True,
        )
        # Keep only the 'main' metrics and tag them with the run label so the
        # label becomes the row name once the summaries are concatenated.
        main_metrics = backtest_summary['main']
        main_metrics.name = run_label
        backtest_summaries_list.append(main_metrics)

# The benchmark metrics and returns are taken from the last backtest that ran.
backtest_summaries_list.append(backtest_summary.benchmark)
performance_track['benchmark'] = (
    df_backtest[['date', 'benchmark_ret']].rename(columns={'benchmark_ret': 'ret'})
)

2019-06-24 18:12:18,490 - INFO - 24085 - Backtesting with defalt - all
2019-06-24 18:12:30,315 - INFO - 24085 - Backtesting with defalt - close_adj
2019-06-24 18:12:40,808 - INFO - 24085 - Backtesting with defalt - olh
2019-06-24 18:12:48,609 - INFO - 24085 - Backtesting with defalt - olhc
2019-06-24 18:12:56,961 - INFO - 24085 - Backtesting with defalt - olhv
2019-06-24 18:13:05,254 - INFO - 24085 - Backtesting with defalt - olhcv
2019-06-24 18:13:17,626 - INFO - 24085 - Backtesting with defalt - volume
2019-06-24 18:13:28,239 - INFO - 24085 - Backtesting with defalt - sma
2019-06-24 18:13:40,453 - INFO - 24085 - Backtesting with defalt - ema
2019-06-24 18:13:51,898 - INFO - 24085 - Backtesting with defalt - lagged
2019-06-24 18:14:04,022 - INFO - 24085 - Backtesting with defalt - olh_sma
2019-06-24 18:14:17,196 - INFO - 24085 - Backtesting with defalt - olh_ema
2019-06-24 18:14:30,951 - INFO - 24085 - Backtesting with defalt - olh_slope
2019-06-24 18:14:43,816 - INFO - 24085 - Backte

CPU times: user 24min 28s, sys: 919 ms, total: 24min 29s
Wall time: 24min 44s


In [30]:
# Stack the collected metric Series as rows (one row per entry) and show the
# first 30 rows after sorting by 'return', highest first.
(
    pd.concat(backtest_summaries_list, axis=1)
    .T
    .sort_values('return', ascending=False)
    .head(30)
)

Unnamed: 0,alpha,beta,cagr,max_drawdown,return,sharpe,var,volatility
huber-olh_ema,0.07721875,0.347044,0.08567,-0.340293,3.594639,0.527103,-0.017411,0.190204
lad-ema,0.07646537,0.320055,0.082911,-0.316074,3.382762,0.513717,-0.017168,0.190206
defalt-olh_sma,0.07758844,0.288603,0.081842,-0.371374,3.303236,0.507701,-0.01745,0.19056
huber-olh_sma,0.07202875,0.290556,0.075992,-0.383324,2.891427,0.47949,-0.01745,0.190447
defalt-olh,0.0725428,0.2792,0.075729,-0.348481,2.873806,0.478037,-0.017411,0.190524
defalt-olh_ema,0.06950265,0.316354,0.075134,-0.293878,2.834283,0.475849,-0.017332,0.190195
quant-olhv,0.04961196,0.577815,0.072417,-0.392214,2.658454,0.462642,-0.01823,0.190147
defalt-olhv,0.06713122,0.302398,0.071588,-0.323638,2.606296,0.458247,-0.017282,0.190303
lad-olh_ema,0.06544231,0.30539,0.069996,-0.355756,2.508227,0.450566,-0.017202,0.19024
quant-olhcv,0.04524817,0.592408,0.068785,-0.468951,2.435259,0.444842,-0.0183,0.190124


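Using different losses brings no meaningful improvement: the best alternative loss (Huber, CAGR ≈ 8.6%) is only marginally above the best default run (CAGR ≈ 8.2%).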

### Try different learning rates

In [32]:
# Learning-rate sweep: a baseline entry that only pins the seed, followed by
# one entry per `learning_rate` value. Every entry uses random_state=100.
# The dictionary keys become the prefix of each run's label in the backtest
# cell, so their spelling (including 'defalt') is kept exactly as it appears
# in the recorded results.
# The other sweeps (depth, leaf/split sizes, criterion, loss, number of
# estimators) are deliberately left out of this run.
model_params = OrderedDict(
    [('defalt', {'random_state': 100})]
    + [
        (label, {'random_state': 100, 'learning_rate': rate})
        for label, rate in (
            ('lr.01', .01),
            ('lr.05', .05),
            ('lr.001', .001),
        )
    ]
)

In [33]:
%%time
# Settings shared by every run of this sweep (they do not depend on the
# model configuration or on the feature set being evaluated).
y_var = 'close_adj_shift_1'
buy_price_col = 'close_adj'
sell_price_col = 'close_adj_shift_1'
model_class = GradientBoostingRegressor

backtest_summaries_list = []       # one metrics Series per run, plus the benchmark
performance_track = OrderedDict()  # run label -> DataFrame with 'date' and 'ret'

# Run one backtest for every (model configuration, feature set) pair.
for model_params_name, current_model_params in model_params.items():
    for inputs_name, x_vars in x_var_set.items():
        logging.info('Backtesting with %s - %s', model_params_name, inputs_name)
        df_backtest = train_model_and_backtest_regressor(
            df,
            x_vars=x_vars,
            y_var=y_var,
            buy_price_col=buy_price_col,
            sell_price_col=sell_price_col,
            model_class=model_class,
            model_params=current_model_params,
            backtest_start='2000-06-01',
            backtest_end='2018-12-31',
            model_update_frequency='M',
            train_history_period=relativedelta(months=1, days=1),
            col_date_shift='date_shift_1',
        )

        # Label of the run: "<model configuration>-<feature set>".
        run_label = '{}-{}'.format(model_params_name, inputs_name)
        performance_track[run_label] = df_backtest[['date', 'ret']]

        backtest_summary = get_backtest_performance_metrics(
            df_backtest.ret,
            df_backtest.benchmark_ret,
            with_benchmark=True,
            with_delta=True,
        )
        # Keep only the 'main' metrics and tag them with the run label so the
        # label becomes the row name once the summaries are concatenated.
        main_metrics = backtest_summary['main']
        main_metrics.name = run_label
        backtest_summaries_list.append(main_metrics)

# The benchmark metrics and returns are taken from the last backtest that ran.
backtest_summaries_list.append(backtest_summary.benchmark)
performance_track['benchmark'] = (
    df_backtest[['date', 'benchmark_ret']].rename(columns={'benchmark_ret': 'ret'})
)

2019-06-24 18:40:50,580 - INFO - 24085 - Backtesting with defalt - all
2019-06-24 18:41:05,907 - INFO - 24085 - Backtesting with defalt - close_adj
2019-06-24 18:41:15,634 - INFO - 24085 - Backtesting with defalt - olh
2019-06-24 18:41:26,918 - INFO - 24085 - Backtesting with defalt - olhc
2019-06-24 18:41:37,283 - INFO - 24085 - Backtesting with defalt - olhv
2019-06-24 18:41:48,355 - INFO - 24085 - Backtesting with defalt - olhcv
2019-06-24 18:41:56,648 - INFO - 24085 - Backtesting with defalt - volume
2019-06-24 18:42:04,794 - INFO - 24085 - Backtesting with defalt - sma
2019-06-24 18:42:13,499 - INFO - 24085 - Backtesting with defalt - ema
2019-06-24 18:42:23,749 - INFO - 24085 - Backtesting with defalt - lagged
2019-06-24 18:42:36,890 - INFO - 24085 - Backtesting with defalt - olh_sma
2019-06-24 18:42:50,665 - INFO - 24085 - Backtesting with defalt - olh_ema
2019-06-24 18:43:04,145 - INFO - 24085 - Backtesting with defalt - olh_slope
2019-06-24 18:43:16,267 - INFO - 24085 - Backte

CPU times: user 9min 11s, sys: 664 ms, total: 9min 11s
Wall time: 9min 15s


In [34]:
# Stack the collected metric Series as rows (one row per entry) and show the
# first 30 rows after sorting by 'return', highest first.
(
    pd.concat(backtest_summaries_list, axis=1)
    .T
    .sort_values('return', ascending=False)
    .head(30)
)

Unnamed: 0,alpha,beta,cagr,max_drawdown,return,sharpe,var,volatility
lr.05-olh_ema,0.08321806,0.320332,0.090265,-0.290552,3.969101,0.549221,-0.017256,0.190239
defalt-olh_sma,0.07758844,0.288603,0.081842,-0.371374,3.303236,0.507701,-0.01745,0.19056
defalt-olh,0.0725428,0.2792,0.075729,-0.348481,2.873806,0.478037,-0.017411,0.190524
defalt-olh_ema,0.06950265,0.316354,0.075134,-0.293878,2.834283,0.475849,-0.017332,0.190195
defalt-olhv,0.06713122,0.302398,0.071588,-0.323638,2.606296,0.458247,-0.017282,0.190303
lr.05-olh,0.06774922,0.293134,0.071577,-0.343385,2.605618,0.457766,-0.017588,0.19051
lr.05-olhv,0.06631626,0.303935,0.070823,-0.323638,2.558855,0.454466,-0.01724,0.190318
lr.05-ema,0.0641417,0.321857,0.069774,-0.290552,2.494705,0.449446,-0.017168,0.190252
lr.05-olh_sma,0.06574601,0.274561,0.068119,-0.366621,2.395768,0.440909,-0.017597,0.190454
defalt-ema,0.06201691,0.314121,0.066958,-0.290552,2.32793,0.435661,-0.017362,0.190216


Reducing the learning rate (to 0.05) produces a slightly higher CAGR (about 9.0% versus 8.2% for the best default run) and also a smaller max drawdown.

### Try different numbers of boosted trees

In [35]:
# Tree-count sweep: a baseline entry that only pins the seed, followed by one
# entry per `n_estimators` value. Every entry uses random_state=100.
# The dictionary keys become the prefix of each run's label in the backtest
# cell, so their spelling (including 'defalt') is kept exactly as it appears
# in the recorded results.
# The other sweeps (depth, leaf/split sizes, criterion, loss, learning rate)
# are deliberately left out of this run.
model_params = OrderedDict(
    [('defalt', {'random_state': 100})]
    + [
        ('n%d' % tree_count, {'random_state': 100, 'n_estimators': tree_count})
        for tree_count in (10, 50, 200)
    ]
)

In [36]:
%%time
# Settings shared by every run of this sweep (they do not depend on the
# model configuration or on the feature set being evaluated).
y_var = 'close_adj_shift_1'
buy_price_col = 'close_adj'
sell_price_col = 'close_adj_shift_1'
model_class = GradientBoostingRegressor

backtest_summaries_list = []       # one metrics Series per run, plus the benchmark
performance_track = OrderedDict()  # run label -> DataFrame with 'date' and 'ret'

# Run one backtest for every (model configuration, feature set) pair.
for model_params_name, current_model_params in model_params.items():
    for inputs_name, x_vars in x_var_set.items():
        logging.info('Backtesting with %s - %s', model_params_name, inputs_name)
        df_backtest = train_model_and_backtest_regressor(
            df,
            x_vars=x_vars,
            y_var=y_var,
            buy_price_col=buy_price_col,
            sell_price_col=sell_price_col,
            model_class=model_class,
            model_params=current_model_params,
            backtest_start='2000-06-01',
            backtest_end='2018-12-31',
            model_update_frequency='M',
            train_history_period=relativedelta(months=1, days=1),
            col_date_shift='date_shift_1',
        )

        # Label of the run: "<model configuration>-<feature set>".
        run_label = '{}-{}'.format(model_params_name, inputs_name)
        performance_track[run_label] = df_backtest[['date', 'ret']]

        backtest_summary = get_backtest_performance_metrics(
            df_backtest.ret,
            df_backtest.benchmark_ret,
            with_benchmark=True,
            with_delta=True,
        )
        # Keep only the 'main' metrics and tag them with the run label so the
        # label becomes the row name once the summaries are concatenated.
        main_metrics = backtest_summary['main']
        main_metrics.name = run_label
        backtest_summaries_list.append(main_metrics)

# The benchmark metrics and returns are taken from the last backtest that ran.
backtest_summaries_list.append(backtest_summary.benchmark)
performance_track['benchmark'] = (
    df_backtest[['date', 'benchmark_ret']].rename(columns={'benchmark_ret': 'ret'})
)

2019-06-24 18:50:06,249 - INFO - 24085 - Backtesting with defalt - all
2019-06-24 18:50:17,766 - INFO - 24085 - Backtesting with defalt - close_adj
2019-06-24 18:50:26,069 - INFO - 24085 - Backtesting with defalt - olh
2019-06-24 18:50:37,782 - INFO - 24085 - Backtesting with defalt - olhc
2019-06-24 18:50:49,970 - INFO - 24085 - Backtesting with defalt - olhv
2019-06-24 18:51:01,200 - INFO - 24085 - Backtesting with defalt - olhcv
2019-06-24 18:51:13,565 - INFO - 24085 - Backtesting with defalt - volume
2019-06-24 18:51:24,323 - INFO - 24085 - Backtesting with defalt - sma
2019-06-24 18:51:35,243 - INFO - 24085 - Backtesting with defalt - ema
2019-06-24 18:51:45,987 - INFO - 24085 - Backtesting with defalt - lagged
2019-06-24 18:51:56,707 - INFO - 24085 - Backtesting with defalt - olh_sma
2019-06-24 18:52:08,960 - INFO - 24085 - Backtesting with defalt - olh_ema
2019-06-24 18:52:22,039 - INFO - 24085 - Backtesting with defalt - olh_slope
2019-06-24 18:52:32,948 - INFO - 24085 - Backte

CPU times: user 9min 36s, sys: 564 ms, total: 9min 36s
Wall time: 9min 39s


In [37]:
# Stack the collected metric Series as rows (one row per entry) and show the
# first 30 rows after sorting by 'return', highest first.
(
    pd.concat(backtest_summaries_list, axis=1)
    .T
    .sort_values('return', ascending=False)
    .head(30)
)

Unnamed: 0,alpha,beta,cagr,max_drawdown,return,sharpe,var,volatility
defalt-olh_sma,0.07758844,0.288603,0.081842,-0.371374,3.303236,0.507701,-0.01745,0.19056
n50-olh_sma,0.07750058,0.289197,0.08179,-0.369175,3.299369,0.507447,-0.01745,0.19056
n50-olhv,0.07507809,0.298191,0.079833,-0.323638,3.15733,0.498458,-0.017202,0.190334
n50-olh,0.07346,0.286649,0.077248,-0.345447,2.97657,0.485438,-0.017375,0.190527
n50-olh_ema,0.07064586,0.315675,0.076316,-0.293878,2.913196,0.481626,-0.017332,0.190193
defalt-olh,0.0725428,0.2792,0.075729,-0.348481,2.873806,0.478037,-0.017411,0.190524
defalt-olh_ema,0.06950265,0.316354,0.075134,-0.293878,2.834283,0.475849,-0.017332,0.190195
n200-olh_ema,0.06950265,0.316354,0.075134,-0.293878,2.834283,0.475849,-0.017332,0.190195
n200-olh,0.07173647,0.278134,0.074786,-0.372331,2.811288,0.473434,-0.01745,0.190524
n200-olh_sma,0.0717694,0.273647,0.074504,-0.366621,2.792806,0.472156,-0.017515,0.190478


There is no improvement from adding more boosted trees. Using 50 or 100 trees seems to be enough.

**Partial conclusion:** So far, GBM is the model with the highest CAGR when it is trained with only 1 month of history.