# Tuning Random Forest training parameters

This notebook analyzes the impact of the following random forest constraints:
- max depth
- minimum number of samples per leaf
- minimum number of samples to split a node
- different error criteria: MSE, Friedman MSE, and MAE
- different numbers of parallel trees

In [1]:
import logging
import imp
from dateutil.relativedelta import relativedelta
from collections import OrderedDict
import sys

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Add the directory two levels up to the module search path so that the
# `helpers` imports below can be resolved (presumably that is where the
# project's `helpers` package lives - verify against the repository layout).
sys.path.append('../..')

# Project helpers used in this notebook: dataset loading/preprocessing,
# backtesting and plotting.
from helpers.dataset import read_quote_dataset, preprocess_quotes
from helpers.backtest import train_model_and_backtest_regressor, get_backtest_performance_metrics
from helpers.visualization import plot_return

In [4]:
# Configure the logging module for the Jupyter notebook.
# The module is reloaded before calling `basicConfig`, presumably to drop any
# handlers already installed in the running kernel so the configuration below
# takes effect.
# `importlib.reload` is used instead of `imp.reload`: the `imp` module is
# deprecated and no longer exists in Python 3.12+.
import importlib

importlib.reload(logging)
logging_format = '%(asctime)s - %(levelname)s - %(process)s - %(message)s'
logging.basicConfig(level=logging.DEBUG, format=logging_format)

# Silence the backtesting helper: only WARNING and above are emitted by it.
logging.getLogger('helpers.backtest').setLevel(level=logging.WARNING)

In [5]:
# Relative path of the gzip-compressed CSV with the SPY quotes that is loaded below.
PARAM_DATASET = '../../../data/SPY_postprocess_adj.csv.gz'

In [6]:
# Load the quote dataset from PARAM_DATASET with the project helper.
df = read_quote_dataset(PARAM_DATASET)

In [7]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,date,open,high,low,close,close_adj,volume,open_adj,low_adj,high_adj,...,ratio_close_adj_000_close_adj_005_norm,ratio_close_adj_000_close_adj_020_norm,ratio_close_adj_000_ema_005_norm,ratio_close_adj_000_ema_010_norm,ratio_close_adj_000_ema_020_norm,ratio_close_adj_000_ema_050_norm,ratio_close_adj_000_sma_005_norm,ratio_close_adj_000_sma_010_norm,ratio_close_adj_000_sma_020_norm,ratio_close_adj_000_sma_050_norm
0,2000-01-03,148.25,148.25,143.875,145.4375,101.425385,8164300,103.38677,100.335727,103.38677,...,,,,,,,,,,
1,2000-01-04,143.531204,144.0625,139.640594,139.75,97.459068,8089800,100.09601,97.38277,100.466526,...,,,,,,,,,,
2,2000-01-05,139.9375,141.531204,137.25,140.0,97.633377,12177900,97.589791,95.715579,98.70121,...,,,,,,,,,,
3,2000-01-06,139.625,141.5,137.75,137.75,96.064301,6227200,97.371891,96.064301,98.679482,...,,,0.48663,,,,,,,
4,2000-01-07,140.3125,145.75,140.0625,145.75,101.643333,8066500,97.851322,97.676977,101.643333,...,,,0.815422,,,,0.740588,,,


In [8]:
# Settings handed to `preprocess_quotes`: which columns get shifted copies and
# which get returns, and over which horizons (in rows).
shift_periods = [1, 5, 10, 20]
return_periods = list(shift_periods)  # same horizons, kept as an independent list
vars_for_return = ['close_adj']
vars_to_shift = vars_for_return + ['close_adj_norm', 'close_adj_std']

In [9]:
# Add the shifted columns, the shifted dates and the returns to the dataset,
# using the settings chosen in the previous cell.
df = preprocess_quotes(
    df,
    vars_to_shift=vars_to_shift,
    shift_periods=shift_periods,
    vars_for_return=vars_for_return,
    return_periods=return_periods,
    shift_date=True,
)

In [10]:
# Eyeball the 1- and 5-period shifted dates, shifted prices and returns next to
# the original date and price for the first ten rows.
df[[
    'date', 'close_adj',
    'date_shift_1', 'close_adj_shift_1', 'close_adj_ret_1',
    'date_shift_5', 'close_adj_shift_5', 'close_adj_ret_5',
]].head(10)

Unnamed: 0,date,close_adj,date_shift_1,close_adj_shift_1,close_adj_ret_1,date_shift_5,close_adj_shift_5,close_adj_ret_5
0,2000-01-03,101.425385,2000-01-04,97.459068,-0.039106,2000-01-10,101.992004,0.005587
1,2000-01-04,97.459068,2000-01-05,97.633377,0.001789,2000-01-11,100.771645,0.033989
2,2000-01-05,97.633377,2000-01-06,96.064301,-0.016071,2000-01-12,99.76915,0.021875
3,2000-01-06,96.064301,2000-01-07,101.643333,0.058076,2000-01-13,101.120308,0.052631
4,2000-01-07,101.643333,2000-01-10,101.992004,0.00343,2000-01-14,102.493233,0.008362
5,2000-01-10,101.992004,2000-01-11,100.771645,-0.011965,2000-01-18,101.686958,-0.002991
6,2000-01-11,100.771645,2000-01-12,99.76915,-0.009948,2000-01-19,102.51506,0.017301
7,2000-01-12,99.76915,2000-01-13,101.120308,0.013543,2000-01-20,100.945953,0.011795
8,2000-01-13,101.120308,2000-01-14,102.493233,0.013577,2000-01-21,100.727989,-0.00388
9,2000-01-14,102.493233,2000-01-18,101.686958,-0.007867,2000-01-24,97.873047,-0.045078


# Processing all the input variables

In the data processing notebook, all the historical variables were computed. Let's review them.

In [11]:
# Candidate input variables: every adjusted-price or volume column, except the
# shifted, standardised, normalised and return columns.
x_vars_all = [
    column for column in df.columns
    if ('_adj' in column or 'volume' in column)
    and not any(marker in column for marker in ('_shift_', '_std', '_norm', '_ret_'))
]

In [12]:
# Report how many candidate input variables were selected.
# NOTE(review): `x_vars_all` is later passed as model inputs (`x_vars`), so
# "independent variables" is probably what the message means - confirm before rewording.
logging.info('There are in total %d dependent variables', len(x_vars_all))

2019-06-23 19:21:11,095 - INFO - 8190 - There are in total 53 dependent variables


In [13]:
# Display the names of the selected input variables.
x_vars_all

['close_adj',
 'volume',
 'open_adj',
 'low_adj',
 'high_adj',
 'slope_close_adj_005',
 'slope_volume_005',
 'slope_close_adj_010',
 'slope_volume_010',
 'slope_close_adj_020',
 'slope_volume_020',
 'slope_close_adj_050',
 'slope_volume_050',
 'sma_close_adj_005',
 'sma_volume_005',
 'ema_close_adj_005',
 'ema_volume_005',
 'sma_close_adj_010',
 'sma_volume_010',
 'ema_close_adj_010',
 'ema_volume_010',
 'sma_close_adj_020',
 'sma_volume_020',
 'ema_close_adj_020',
 'ema_volume_020',
 'sma_close_adj_050',
 'sma_volume_050',
 'ema_close_adj_050',
 'ema_volume_050',
 'lag_close_adj_001',
 'lag_volume_001',
 'lag_close_adj_005',
 'lag_volume_005',
 'lag_close_adj_010',
 'lag_volume_010',
 'lag_close_adj_020',
 'lag_volume_020',
 'lag_close_adj_060',
 'lag_volume_060',
 'ratio_volume_000_sma_005',
 'ratio_volume_000_sma_020',
 'ratio_volume_000_ema_050',
 'ratio_close_adj_000_close_adj_001',
 'ratio_close_adj_000_close_adj_005',
 'ratio_close_adj_000_close_adj_020',
 'ratio_close_adj_000_e

Divide them by category

In [14]:
# Split the input variables into families according to their name prefix.
x_vars_slope = [name for name in x_vars_all if name.startswith('slope_')]
x_vars_sma = [name for name in x_vars_all if name.startswith('sma_')]
x_vars_ema = [name for name in x_vars_all if name.startswith('ema_')]
x_vars_lagged = [name for name in x_vars_all if name.startswith('lag_')]
x_vars_ratio_close_adj = [name for name in x_vars_all if name.startswith('ratio_close_adj_')]
x_vars_ratio_volume = [name for name in x_vars_all if name.startswith('ratio_volume_')]

# Run backtests with different sets of random forest parameters

Define the inputs

In [19]:
# Named sets of input variables to backtest. Insertion order is the order in
# which the sets are evaluated; the commented entries are switched off.
x_var_set = OrderedDict([
    ('all', x_vars_all),
    ('close_adj', ['close_adj']),
    ('olh', ['open_adj', 'low_adj', 'high_adj']),
    ('olhc', ['open_adj', 'low_adj', 'high_adj', 'close_adj']),
    ('olhv', ['open_adj', 'low_adj', 'high_adj', 'volume']),
    ('olhcv', ['open_adj', 'low_adj', 'high_adj', 'close_adj', 'volume']),
    ('volume', ['volume']),
    # ('slope', x_vars_slope),
    ('sma', x_vars_sma),
    ('ema', x_vars_ema),
    ('lagged', x_vars_lagged),
    # ('ratio_closed_adj', x_vars_ratio_close_adj),
    # ('ratio_vol', x_vars_ratio_volume),
    ('olh_sma', ['open_adj', 'low_adj', 'high_adj'] + x_vars_sma),
    ('olh_ema', ['open_adj', 'low_adj', 'high_adj'] + x_vars_ema),
    ('olh_slope', ['open_adj', 'low_adj', 'high_adj'] + x_vars_slope),
])

### Try min sample per leaf

In [20]:
# Parameter sets for the `min_samples_leaf` sweep, keyed by the label that is
# used to name each backtest result.
model_params = OrderedDict()

# Baseline: 10 trees and a fixed seed, no other constraint set explicitly.
# The 'defalt' spelling is kept as is because the key is part of the result names.
model_params['defalt'] = {'n_estimators': 10, 'random_state': 100}

# One candidate per leaf size. Label and value are derived from the same number,
# which fixes the copy/paste slip where 'min_samp_leaf-20' was configured with
# `min_samples_leaf=10` and therefore duplicated the 'min_samp_leaf-10' run.
# Leaf sizes 100, 200, 500 and 1000 are deliberately not part of this run.
model_params.update(
    ('min_samp_leaf-%d' % leaf_size,
     {'random_state': 100, 'n_estimators': 10, 'min_samples_leaf': leaf_size})
    for leaf_size in (10, 20, 50)
)

# Switched off in this cell: the max_depth (3..10) and min_samples_split
# (10..1000) sweeps and the 'friedman_mse' / 'mae' criterion variants.

In [21]:
%%time
# Run one backtest per (model parameter set, input variable set) pair and
# collect, for every run, its performance summary and its return series.

# Settings shared by every run of the sweep (hoisted out of the loops).
# `close_adj_shift_1` is both the regression target and the sell-price column;
# `close_adj` is the buy-price column.
y_var = 'close_adj_shift_1'
buy_price_col = 'close_adj'
sell_price_col = 'close_adj_shift_1'
model_class = RandomForestRegressor

backtest_summaries_list = []
performance_track = OrderedDict()
# Reset the per-run results: in a notebook these names may still hold values
# from a previously executed cell, which must not leak into this sweep.
df_backtest = None
backtest_summary = None

for model_params_name, current_model_params in model_params.items():
    for inputs_name, x_vars in x_var_set.items():
        # logging.info('Backtesting with %s - %s', model_params_name, inputs_name)
        df_backtest = train_model_and_backtest_regressor(
            df, x_vars=x_vars, y_var=y_var,
            buy_price_col=buy_price_col, sell_price_col=sell_price_col,
            model_class=model_class, model_params=current_model_params,
            backtest_start='2000-06-01', backtest_end='2018-12-31',
            model_update_frequency='M', train_history_period=relativedelta(months=1, days=1),
            col_date_shift='date_shift_1'
        )
        name = '%s-%s' % (model_params_name, inputs_name)
        performance_track[name] = df_backtest[['date', 'ret']]
        backtest_summary = get_backtest_performance_metrics(
            df_backtest.ret, df_backtest.benchmark_ret,
            with_benchmark=True, with_delta=True)
        # Keep only the strategy column and label it with the run name.
        backtest_summary_no_benchmark = backtest_summary['main']
        backtest_summary_no_benchmark.name = name
        backtest_summaries_list.append(backtest_summary_no_benchmark)

# Add the benchmark once, taken from the last run (presumably identical for all
# runs because they share the same backtest window - TODO confirm). Skipped when
# the sweep was empty, instead of failing or reusing stale results.
if backtest_summary is not None:
    backtest_summaries_list.append(backtest_summary.benchmark)
    performance_track['benchmark'] = df_backtest[['date', 'benchmark_ret']].rename({'benchmark_ret': 'ret'}, axis=1)

CPU times: user 2min 43s, sys: 227 ms, total: 2min 43s
Wall time: 2min 43s


In [22]:
# One row per backtest, best total return first, at most 30 rows shown.
(
    pd.concat(backtest_summaries_list, axis=1)
    .T
    .sort_values('return', ascending=False)
    .head(30)
)

Unnamed: 0,alpha,beta,cagr,max_drawdown,return,sharpe,var,volatility
defalt-olh,0.07777427,0.306185,0.083312,-0.381271,3.41302,0.51502,-0.017202,0.190479
defalt-sma,0.06151127,0.265823,0.062991,-0.383837,2.105755,0.4157,-0.017515,0.190421
defalt-olh_ema,0.05060987,0.303266,0.054098,-0.296095,1.657511,0.371909,-0.01745,0.190214
defalt-olhcv,0.04756235,0.346221,0.053894,-0.393737,1.647997,0.370959,-0.017597,0.190172
defalt-olhv,0.04763374,0.344222,0.05383,-0.440767,1.645012,0.370654,-0.01745,0.190164
benchmark,-3.681556e-16,1.0,0.049527,-0.551894,1.451705,0.349346,-0.019044,0.190026
defalt-ema,0.04330849,0.321117,0.047672,-0.322004,1.372525,0.339788,-0.017588,0.190195
defalt-olhc,0.03127856,0.31663,0.034839,-0.492169,0.887614,0.275023,-0.017722,0.190158
defalt-olh_sma,0.02865874,0.312087,0.031816,-0.331957,0.787899,0.259567,-0.017588,0.190227
defalt-close_adj,0.01007786,0.264984,0.009662,-0.471655,0.195288,0.145478,-0.018061,0.190193


According to the tests, constraining the number of samples per leaf doesn't help. The default parameters with OLH quotes are still the best model.

### Try min sample split

In [23]:
# Parameter sets for the `min_samples_split` sweep, keyed by the label that is
# used to name each backtest result. The baseline comes first.
model_params = OrderedDict(
    [('defalt', {'n_estimators': 10, 'random_state': 100})]
)
# One candidate per split threshold; 200, 500 and 1000 are not part of this run.
model_params.update(
    ('min_samp_split-%d' % split_size,
     {'random_state': 100, 'n_estimators': 10, 'min_samples_split': split_size})
    for split_size in (10, 50, 100)
)

# Switched off in this cell: the max_depth (3..10) and min_samples_leaf
# (10..1000) sweeps and the 'friedman_mse' / 'mae' criterion variants.

In [24]:
%%time
# Run one backtest per (model parameter set, input variable set) pair and
# collect, for every run, its performance summary and its return series.

# Settings shared by every run of the sweep (hoisted out of the loops).
# `close_adj_shift_1` is both the regression target and the sell-price column;
# `close_adj` is the buy-price column.
y_var = 'close_adj_shift_1'
buy_price_col = 'close_adj'
sell_price_col = 'close_adj_shift_1'
model_class = RandomForestRegressor

backtest_summaries_list = []
performance_track = OrderedDict()
# Reset the per-run results: in a notebook these names may still hold values
# from a previously executed cell, which must not leak into this sweep.
df_backtest = None
backtest_summary = None

for model_params_name, current_model_params in model_params.items():
    for inputs_name, x_vars in x_var_set.items():
        logging.info('Backtesting with %s - %s', model_params_name, inputs_name)
        df_backtest = train_model_and_backtest_regressor(
            df, x_vars=x_vars, y_var=y_var,
            buy_price_col=buy_price_col, sell_price_col=sell_price_col,
            model_class=model_class, model_params=current_model_params,
            backtest_start='2000-06-01', backtest_end='2018-12-31',
            model_update_frequency='M', train_history_period=relativedelta(months=1, days=1),
            col_date_shift='date_shift_1'
        )
        name = '%s-%s' % (model_params_name, inputs_name)
        performance_track[name] = df_backtest[['date', 'ret']]
        backtest_summary = get_backtest_performance_metrics(
            df_backtest.ret, df_backtest.benchmark_ret,
            with_benchmark=True, with_delta=True)
        # Keep only the strategy column and label it with the run name.
        backtest_summary_no_benchmark = backtest_summary['main']
        backtest_summary_no_benchmark.name = name
        backtest_summaries_list.append(backtest_summary_no_benchmark)

# Add the benchmark once, taken from the last run (presumably identical for all
# runs because they share the same backtest window - TODO confirm). Skipped when
# the sweep was empty, instead of failing or reusing stale results.
if backtest_summary is not None:
    backtest_summaries_list.append(backtest_summary.benchmark)
    performance_track['benchmark'] = df_backtest[['date', 'benchmark_ret']].rename({'benchmark_ret': 'ret'}, axis=1)

2019-06-23 19:36:15,744 - INFO - 8190 - Backtesting with defalt - all
2019-06-23 19:36:18,951 - INFO - 8190 - Backtesting with defalt - close_adj
2019-06-23 19:36:21,794 - INFO - 8190 - Backtesting with defalt - olh
2019-06-23 19:36:24,771 - INFO - 8190 - Backtesting with defalt - olhc
2019-06-23 19:36:27,728 - INFO - 8190 - Backtesting with defalt - olhv
2019-06-23 19:36:30,826 - INFO - 8190 - Backtesting with defalt - olhcv
2019-06-23 19:36:34,440 - INFO - 8190 - Backtesting with defalt - volume
2019-06-23 19:36:37,458 - INFO - 8190 - Backtesting with defalt - sma
2019-06-23 19:36:40,334 - INFO - 8190 - Backtesting with defalt - ema
2019-06-23 19:36:43,469 - INFO - 8190 - Backtesting with defalt - lagged
2019-06-23 19:36:46,387 - INFO - 8190 - Backtesting with defalt - olh_sma
2019-06-23 19:36:49,274 - INFO - 8190 - Backtesting with defalt - olh_ema
2019-06-23 19:36:52,263 - INFO - 8190 - Backtesting with defalt - olh_slope
2019-06-23 19:36:55,165 - INFO - 8190 - Backtesting with min

CPU times: user 2min 41s, sys: 292 ms, total: 2min 41s
Wall time: 2min 41s


In [25]:
# One row per backtest, best total return first, at most 30 rows shown.
(
    pd.concat(backtest_summaries_list, axis=1)
    .T
    .sort_values('return', ascending=False)
    .head(30)
)

Unnamed: 0,alpha,beta,cagr,max_drawdown,return,sharpe,var,volatility
defalt-olh,0.07777427,0.306185,0.083312,-0.381271,3.41302,0.51502,-0.017202,0.190479
defalt-sma,0.06151127,0.265823,0.062991,-0.383837,2.105755,0.4157,-0.017515,0.190421
defalt-olh_ema,0.05060987,0.303266,0.054098,-0.296095,1.657511,0.371909,-0.01745,0.190214
defalt-olhcv,0.04756235,0.346221,0.053894,-0.393737,1.647997,0.370959,-0.017597,0.190172
defalt-olhv,0.04763374,0.344222,0.05383,-0.440767,1.645012,0.370654,-0.01745,0.190164
benchmark,-3.681556e-16,1.0,0.049527,-0.551894,1.451705,0.349346,-0.019044,0.190026
defalt-ema,0.04330849,0.321117,0.047672,-0.322004,1.372525,0.339788,-0.017588,0.190195
min_samp_split-10-olhv,0.0353818,0.328121,0.039884,-0.368624,1.065843,0.300555,-0.01745,0.190195
min_samp_split-10-olhc,0.03148797,0.323168,0.035501,-0.368624,0.910151,0.278322,-0.017597,0.190216
defalt-olhc,0.03127856,0.31663,0.034839,-0.492169,0.887614,0.275023,-0.017722,0.190158


`min_samples_split` does not seem to help either.

### Try different error criterions

In [26]:
# Parameter sets for the split-criterion comparison, keyed by the label that is
# used to name each backtest result. The baseline (library-default criterion)
# comes first.
model_params = OrderedDict(
    [('defalt', {'n_estimators': 10, 'random_state': 100})]
)
# NOTE(review): recent scikit-learn releases renamed the 'mae' criterion to
# 'absolute_error' - confirm against the installed version before re-running.
model_params.update(
    (criterion_name,
     {'random_state': 100, 'n_estimators': 10, 'criterion': criterion_name})
    for criterion_name in ('friedman_mse', 'mae')
)

# Switched off in this cell: the max_depth (3..10), min_samples_leaf (10..1000)
# and min_samples_split (10..1000) sweeps.

In [28]:
%%time
# Run one backtest per (model parameter set, input variable set) pair and
# collect, for every run, its performance summary and its return series.

# Settings shared by every run of the sweep (hoisted out of the loops).
# `close_adj_shift_1` is both the regression target and the sell-price column;
# `close_adj` is the buy-price column.
y_var = 'close_adj_shift_1'
buy_price_col = 'close_adj'
sell_price_col = 'close_adj_shift_1'
model_class = RandomForestRegressor

backtest_summaries_list = []
performance_track = OrderedDict()
# Reset the per-run results: in a notebook these names may still hold values
# from a previously executed cell, which must not leak into this sweep.
df_backtest = None
backtest_summary = None

for model_params_name, current_model_params in model_params.items():
    for inputs_name, x_vars in x_var_set.items():
        logging.info('Backtesting with %s - %s', model_params_name, inputs_name)
        df_backtest = train_model_and_backtest_regressor(
            df, x_vars=x_vars, y_var=y_var,
            buy_price_col=buy_price_col, sell_price_col=sell_price_col,
            model_class=model_class, model_params=current_model_params,
            backtest_start='2000-06-01', backtest_end='2018-12-31',
            model_update_frequency='M', train_history_period=relativedelta(months=1, days=1),
            col_date_shift='date_shift_1'
        )
        name = '%s-%s' % (model_params_name, inputs_name)
        performance_track[name] = df_backtest[['date', 'ret']]
        backtest_summary = get_backtest_performance_metrics(
            df_backtest.ret, df_backtest.benchmark_ret,
            with_benchmark=True, with_delta=True)
        # Keep only the strategy column and label it with the run name.
        backtest_summary_no_benchmark = backtest_summary['main']
        backtest_summary_no_benchmark.name = name
        backtest_summaries_list.append(backtest_summary_no_benchmark)

# Add the benchmark once, taken from the last run (presumably identical for all
# runs because they share the same backtest window - TODO confirm). Skipped when
# the sweep was empty, instead of failing or reusing stale results.
if backtest_summary is not None:
    backtest_summaries_list.append(backtest_summary.benchmark)
    performance_track['benchmark'] = df_backtest[['date', 'benchmark_ret']].rename({'benchmark_ret': 'ret'}, axis=1)

2019-06-23 19:41:51,499 - INFO - 8190 - Backtesting with defalt - all
2019-06-23 19:41:54,780 - INFO - 8190 - Backtesting with defalt - close_adj
2019-06-23 19:41:57,846 - INFO - 8190 - Backtesting with defalt - olh
2019-06-23 19:42:00,939 - INFO - 8190 - Backtesting with defalt - olhc
2019-06-23 19:42:04,559 - INFO - 8190 - Backtesting with defalt - olhv
2019-06-23 19:42:08,126 - INFO - 8190 - Backtesting with defalt - olhcv
2019-06-23 19:42:11,603 - INFO - 8190 - Backtesting with defalt - volume
2019-06-23 19:42:14,916 - INFO - 8190 - Backtesting with defalt - sma
2019-06-23 19:42:18,125 - INFO - 8190 - Backtesting with defalt - ema
2019-06-23 19:42:21,313 - INFO - 8190 - Backtesting with defalt - lagged
2019-06-23 19:42:24,455 - INFO - 8190 - Backtesting with defalt - olh_sma
2019-06-23 19:42:27,620 - INFO - 8190 - Backtesting with defalt - olh_ema
2019-06-23 19:42:30,891 - INFO - 8190 - Backtesting with defalt - olh_slope
2019-06-23 19:42:34,081 - INFO - 8190 - Backtesting with fri

CPU times: user 2min 7s, sys: 168 ms, total: 2min 7s
Wall time: 2min 7s


In [29]:
# One row per backtest, best total return first, at most 30 rows shown.
(
    pd.concat(backtest_summaries_list, axis=1)
    .T
    .sort_values('return', ascending=False)
    .head(30)
)

Unnamed: 0,alpha,beta,cagr,max_drawdown,return,sharpe,var,volatility
friedman_mse-olh,0.08059738,0.306582,0.086403,-0.344321,3.652542,0.529977,-0.017256,0.190479
defalt-olh,0.07777427,0.306185,0.083312,-0.381271,3.41302,0.51502,-0.017202,0.190479
mae-olhv,0.06819692,0.350583,0.076171,-0.37873,2.903417,0.480804,-0.017202,0.190245
mae-olh_ema,0.05883727,0.318802,0.0639,-0.350207,2.155386,0.420536,-0.017376,0.190236
friedman_mse-olh_ema,0.05898965,0.306731,0.063209,-0.294884,2.117567,0.417086,-0.017333,0.190253
defalt-sma,0.06151127,0.265823,0.062991,-0.383837,2.105755,0.4157,-0.017515,0.190421
mae-olh,0.05791268,0.308074,0.062143,-0.426838,2.060077,0.411361,-0.017411,0.1905
friedman_mse-olhv,0.05425485,0.343742,0.060796,-0.329102,1.988881,0.405293,-0.017375,0.190169
mae-olh_sma,0.05482867,0.283557,0.057155,-0.359319,1.80422,0.386773,-0.017515,0.190429
friedman_mse-ema,0.05012877,0.317155,0.05456,-0.328484,1.679247,0.374172,-0.017362,0.190242


All three criteria appear in the top 3: MSE, Friedman MSE and MAE. All three need to be tested in more depth.

### Try the most important parameter: the depth

In [30]:
# Parameter sets for the `max_depth` sweep, keyed by the label that is used to
# name each backtest result. The baseline (no explicit depth limit) comes first.
model_params = OrderedDict(
    [('defalt', {'n_estimators': 10, 'random_state': 100})]
)
# One candidate per maximum depth, from 3 to 10 inclusive.
model_params.update(
    ('depth%d' % depth,
     {'random_state': 100, 'n_estimators': 10, 'max_depth': depth})
    for depth in range(3, 11)
)

# Switched off in this cell: the min_samples_leaf (10..1000) and
# min_samples_split (10..1000) sweeps and the 'friedman_mse' / 'mae'
# criterion variants.

In [32]:
%%time
# Run one backtest per (model parameter set, input variable set) pair and
# collect, for every run, its performance summary and its return series.

# Settings shared by every run of the sweep (hoisted out of the loops).
# `close_adj_shift_1` is both the regression target and the sell-price column;
# `close_adj` is the buy-price column.
y_var = 'close_adj_shift_1'
buy_price_col = 'close_adj'
sell_price_col = 'close_adj_shift_1'
model_class = RandomForestRegressor

backtest_summaries_list = []
performance_track = OrderedDict()
# Reset the per-run results: in a notebook these names may still hold values
# from a previously executed cell, which must not leak into this sweep.
df_backtest = None
backtest_summary = None

for model_params_name, current_model_params in model_params.items():
    for inputs_name, x_vars in x_var_set.items():
        logging.info('Backtesting with %s - %s', model_params_name, inputs_name)
        df_backtest = train_model_and_backtest_regressor(
            df, x_vars=x_vars, y_var=y_var,
            buy_price_col=buy_price_col, sell_price_col=sell_price_col,
            model_class=model_class, model_params=current_model_params,
            backtest_start='2000-06-01', backtest_end='2018-12-31',
            model_update_frequency='M', train_history_period=relativedelta(months=1, days=1),
            col_date_shift='date_shift_1'
        )
        name = '%s-%s' % (model_params_name, inputs_name)
        performance_track[name] = df_backtest[['date', 'ret']]
        backtest_summary = get_backtest_performance_metrics(
            df_backtest.ret, df_backtest.benchmark_ret,
            with_benchmark=True, with_delta=True)
        # Keep only the strategy column and label it with the run name.
        backtest_summary_no_benchmark = backtest_summary['main']
        backtest_summary_no_benchmark.name = name
        backtest_summaries_list.append(backtest_summary_no_benchmark)

# Add the benchmark once, taken from the last run (presumably identical for all
# runs because they share the same backtest window - TODO confirm). Skipped when
# the sweep was empty, instead of failing or reusing stale results.
if backtest_summary is not None:
    backtest_summaries_list.append(backtest_summary.benchmark)
    performance_track['benchmark'] = df_backtest[['date', 'benchmark_ret']].rename({'benchmark_ret': 'ret'}, axis=1)

2019-06-23 19:45:57,901 - INFO - 8190 - Backtesting with defalt - all
2019-06-23 19:46:01,222 - INFO - 8190 - Backtesting with defalt - close_adj
2019-06-23 19:46:04,490 - INFO - 8190 - Backtesting with defalt - olh
2019-06-23 19:46:07,770 - INFO - 8190 - Backtesting with defalt - olhc
2019-06-23 19:46:10,976 - INFO - 8190 - Backtesting with defalt - olhv
2019-06-23 19:46:14,246 - INFO - 8190 - Backtesting with defalt - olhcv
2019-06-23 19:46:17,507 - INFO - 8190 - Backtesting with defalt - volume
2019-06-23 19:46:20,685 - INFO - 8190 - Backtesting with defalt - sma
2019-06-23 19:46:23,846 - INFO - 8190 - Backtesting with defalt - ema
2019-06-23 19:46:27,031 - INFO - 8190 - Backtesting with defalt - lagged
2019-06-23 19:46:30,257 - INFO - 8190 - Backtesting with defalt - olh_sma
2019-06-23 19:46:33,516 - INFO - 8190 - Backtesting with defalt - olh_ema
2019-06-23 19:46:36,720 - INFO - 8190 - Backtesting with defalt - olh_slope
2019-06-23 19:46:39,905 - INFO - 8190 - Backtesting with dep

2019-06-23 19:51:54,781 - INFO - 8190 - Backtesting with depth10 - olh_sma
2019-06-23 19:51:57,777 - INFO - 8190 - Backtesting with depth10 - olh_ema
2019-06-23 19:52:00,706 - INFO - 8190 - Backtesting with depth10 - olh_slope


CPU times: user 6min 5s, sys: 496 ms, total: 6min 5s
Wall time: 6min 5s


In [34]:
# One row per backtest, best total return first, at most 30 rows shown.
(
    pd.concat(backtest_summaries_list, axis=1)
    .T
    .sort_values('return', ascending=False)
    .head(30)
)

Unnamed: 0,alpha,beta,cagr,max_drawdown,return,sharpe,var,volatility
defalt-olh,0.077774,0.306185,0.083312,-0.381271,3.41302,0.51502,-0.017202,0.190479
depth8-olh,0.077774,0.306185,0.083312,-0.381271,3.41302,0.51502,-0.017202,0.190479
depth10-olh,0.077774,0.306185,0.083312,-0.381271,3.41302,0.51502,-0.017202,0.190479
depth9-olh,0.077774,0.306185,0.083312,-0.381271,3.41302,0.51502,-0.017202,0.190479
depth6-olh,0.07774,0.306567,0.083303,-0.392694,3.412288,0.514973,-0.017202,0.190479
depth7-olh,0.068023,0.331444,0.074616,-0.381271,2.800141,0.473153,-0.017256,0.190269
depth4-olh,0.062309,0.330746,0.068443,-0.39124,2.414959,0.442846,-0.017375,0.190282
depth5-olh,0.058726,0.332907,0.064774,-0.409587,2.203838,0.424751,-0.017333,0.19029
depth6-sma,0.062703,0.270413,0.064582,-0.383837,2.19314,0.42354,-0.017425,0.190429
depth5-sma,0.061891,0.269759,0.063672,-0.383837,2.142864,0.419047,-0.017515,0.190429


Several depths yield good returns. Nevertheless, the best performance still comes from the default, with no maximum depth. This is probably because the dataset does not have many historical inputs.

### Try the amount of parallel trees

In [35]:
# Parameter sets for the number-of-trees sweep, keyed by the label that is used
# to name each backtest result. The 10-tree baseline comes first.
model_params = OrderedDict(
    [('defalt', {'n_estimators': 10, 'random_state': 100})]
)
# Same configuration as the baseline, with a larger forest each time.
model_params.update(
    ('defalt-n%d' % n_trees, {'n_estimators': n_trees, 'random_state': 100})
    for n_trees in (20, 50, 100)
)

# Switched off in this cell: the max_depth (3..10), min_samples_leaf (10..1000)
# and min_samples_split (10..1000) sweeps and the 'friedman_mse' / 'mae'
# criterion variants.

In [36]:
%%time
# Settings that are identical for every backtest of the sweep.
y_var = 'close_adj_shift_1'
buy_price_col = 'close_adj'
sell_price_col = 'close_adj_shift_1'
model_class = RandomForestRegressor

# Collected results: one metrics entry per run (plus the benchmark at the end)
# and, per run name, the frame holding its 'date' and 'ret' columns.
backtest_summaries_list = []
performance_track = OrderedDict()

# Backtest every model configuration against every input-variable set.
for model_params_name, current_model_params in model_params.items():
    for inputs_name, x_vars in x_var_set.items():
        # Label used for this run in both result collections.
        name = '{}-{}'.format(model_params_name, inputs_name)
        logging.info('Backtesting with %s - %s', model_params_name, inputs_name)

        df_backtest = train_model_and_backtest_regressor(
            df,
            x_vars=x_vars,
            y_var=y_var,
            buy_price_col=buy_price_col,
            sell_price_col=sell_price_col,
            model_class=model_class,
            model_params=current_model_params,
            backtest_start='2000-06-01',
            backtest_end='2018-12-31',
            model_update_frequency='M',
            train_history_period=relativedelta(months=1, days=1),
            col_date_shift='date_shift_1',
        )
        performance_track[name] = df_backtest[['date', 'ret']]

        backtest_summary = get_backtest_performance_metrics(
            df_backtest['ret'],
            df_backtest['benchmark_ret'],
            with_benchmark=True,
            with_delta=True,
        )
        # Keep only the strategy's own metrics for this run, tagged with its label.
        strategy_summary = backtest_summary['main']
        strategy_summary.name = name
        backtest_summaries_list.append(strategy_summary)

# The benchmark entry is taken from the last run executed above.
# NOTE(review): this presumes the benchmark is the same for every run and that
# the loops ran at least once -- confirm against helpers.backtest.
backtest_summaries_list.append(backtest_summary.benchmark)
performance_track['benchmark'] = (
    df_backtest[['date', 'benchmark_ret']].rename(columns={'benchmark_ret': 'ret'})
)

2019-06-23 20:09:45,630 - INFO - 8190 - Backtesting with defalt - all
2019-06-23 20:09:48,858 - INFO - 8190 - Backtesting with defalt - close_adj
2019-06-23 20:09:51,719 - INFO - 8190 - Backtesting with defalt - olh
2019-06-23 20:09:54,891 - INFO - 8190 - Backtesting with defalt - olhc
2019-06-23 20:09:58,073 - INFO - 8190 - Backtesting with defalt - olhv
2019-06-23 20:10:01,112 - INFO - 8190 - Backtesting with defalt - olhcv
2019-06-23 20:10:04,102 - INFO - 8190 - Backtesting with defalt - volume
2019-06-23 20:10:06,985 - INFO - 8190 - Backtesting with defalt - sma
2019-06-23 20:10:10,022 - INFO - 8190 - Backtesting with defalt - ema
2019-06-23 20:10:12,931 - INFO - 8190 - Backtesting with defalt - lagged
2019-06-23 20:10:15,842 - INFO - 8190 - Backtesting with defalt - olh_sma
2019-06-23 20:10:18,762 - INFO - 8190 - Backtesting with defalt - olh_ema
2019-06-23 20:10:21,669 - INFO - 8190 - Backtesting with defalt - olh_slope
2019-06-23 20:10:24,587 - INFO - 8190 - Backtesting with def

CPU times: user 5min 16s, sys: 260 ms, total: 5min 16s
Wall time: 5min 16s


In [37]:
# Put every collected summary on its own row, then display the 30 rows with
# the highest 'return' value.
summary_table = pd.concat(backtest_summaries_list, axis=1).T
summary_table.sort_values('return', ascending=False).head(30)

Unnamed: 0,alpha,beta,cagr,max_drawdown,return,sharpe,var,volatility
defalt-olh,0.07777427,0.306185,0.083312,-0.381271,3.41302,0.51502,-0.017202,0.190479
defalt-n100-olh,0.07606789,0.279493,0.079557,-0.393737,3.137696,0.496981,-0.017256,0.190394
defalt-n20-olh,0.06889861,0.284408,0.072189,-0.381271,2.644048,0.460811,-0.017256,0.190488
defalt-n100-sma,0.06835473,0.259655,0.069849,-0.376199,2.499309,0.449411,-0.017425,0.190454
defalt-n100-olh_sma,0.06756302,0.268586,0.069639,-0.357648,2.486533,0.448428,-0.017362,0.190428
defalt-n50-sma,0.06814312,0.259269,0.069597,-0.352038,2.483998,0.448197,-0.017515,0.19044
defalt-n50-olh,0.0652181,0.293832,0.068919,-0.392694,2.443302,0.44478,-0.017411,0.190485
defalt-n50-olh_sma,0.06511565,0.270771,0.067179,-0.379794,2.340785,0.436347,-0.017376,0.190423
defalt-n100-olhv,0.05755828,0.349596,0.064713,-0.387281,2.200439,0.424524,-0.017375,0.190251
defalt-sma,0.06151127,0.265823,0.062991,-0.383837,2.105755,0.4157,-0.017515,0.190421


The default with 10 parallel trees is still the best option.

So far, the default configuration has stayed at the top of the list. It would now be worth combining all the parameters together. Based on the experience from the previous notebook, the parameter combination will be done in the next notebook, together with different historical window lengths. Adding more history adds more data to the dataset, and some parameters could then produce better performance than the model with 1 month of history.