# Decision Tree graphical visualization

This notebook graphically represents the best decision tree built so far.

In [1]:
import imp
import importlib
import logging
import sys
from collections import OrderedDict

from dateutil.relativedelta import relativedelta

In [40]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz

In [21]:
# Add the directory two levels up to the module search path; the `helpers`
# imports in this cell rely on it (presumably that is where the package lives).
# Guarded so that re-running the cell does not pile up duplicate entries.
if '../..' not in sys.path:
    sys.path.append('../..')

from helpers.dataset import read_quote_dataset, preprocess_quotes
from helpers.backtest import train_model_and_backtest_regressor, get_backtest_performance_metrics
from helpers.visualization import plot_return
from helpers.machine_learning import train_model, get_trailing_df

In [4]:
# Configure the logging module for the Jupyter notebook.
# The module is reloaded before basicConfig() is called (presumably so that the
# configuration takes effect even when the root logger was already set up).
# importlib.reload replaces imp.reload: the imp module is deprecated and was
# removed in Python 3.12.
importlib.reload(logging)
logging_format = '%(asctime)s - %(levelname)s - %(process)s - %(message)s'
logging.basicConfig(level=logging.DEBUG, format=logging_format)

# Silence backtesting log records below WARNING
logging.getLogger('helpers.backtest').setLevel(level=logging.WARNING)

In [5]:
# Relative path to the quotes dataset read below
# (a gzip-compressed CSV, judging by the file extension).
PARAM_DATASET = '../../../data/SPY_postprocess_adj.csv.gz'

In [6]:
# Load the quotes dataset into `df` through the project helper.
df = read_quote_dataset(PARAM_DATASET)

In [7]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,date,open,high,low,close,close_adj,volume,open_adj,low_adj,high_adj,...,ratio_close_adj_000_close_adj_005_norm,ratio_close_adj_000_close_adj_020_norm,ratio_close_adj_000_ema_005_norm,ratio_close_adj_000_ema_010_norm,ratio_close_adj_000_ema_020_norm,ratio_close_adj_000_ema_050_norm,ratio_close_adj_000_sma_005_norm,ratio_close_adj_000_sma_010_norm,ratio_close_adj_000_sma_020_norm,ratio_close_adj_000_sma_050_norm
0,2000-01-03,148.25,148.25,143.875,145.4375,101.425385,8164300,103.38677,100.335727,103.38677,...,,,,,,,,,,
1,2000-01-04,143.531204,144.0625,139.640594,139.75,97.459068,8089800,100.09601,97.38277,100.466526,...,,,,,,,,,,
2,2000-01-05,139.9375,141.531204,137.25,140.0,97.633377,12177900,97.589791,95.715579,98.70121,...,,,,,,,,,,
3,2000-01-06,139.625,141.5,137.75,137.75,96.064301,6227200,97.371891,96.064301,98.679482,...,,,0.48663,,,,,,,
4,2000-01-07,140.3125,145.75,140.0625,145.75,101.643333,8066500,97.851322,97.676977,101.643333,...,,,0.815422,,,,0.740588,,,


In [8]:
# Columns and period sizes passed to preprocess_quotes as
# vars_to_shift / shift_periods.
shift_periods = [1, 5, 10, 20]
vars_to_shift = [
    'close_adj',
    'close_adj_norm',
    'close_adj_std',
]

# Columns and period sizes passed to preprocess_quotes as
# vars_for_return / return_periods.
return_periods = [1, 5, 10, 20]
vars_for_return = ['close_adj']

In [9]:
# Run the project preprocessing helper with the shift/return configuration.
# NOTE(review): `df` is overwritten with the result, so re-running this cell on
# its own feeds already-processed data back into the helper; re-run the load
# cell first.
df = preprocess_quotes(
    df,
    vars_to_shift=vars_to_shift,
    shift_periods=shift_periods,
    vars_for_return=vars_for_return,
    return_periods=return_periods,
    shift_date=True,
)

In [10]:
# Show each row next to its 1- and 5-period shifted date, price and return columns.
df[[
    'date',
    'close_adj',
    'date_shift_1',
    'close_adj_shift_1',
    'close_adj_ret_1',
    'date_shift_5',
    'close_adj_shift_5',
    'close_adj_ret_5',
]].head(10)

Unnamed: 0,date,close_adj,date_shift_1,close_adj_shift_1,close_adj_ret_1,date_shift_5,close_adj_shift_5,close_adj_ret_5
0,2000-01-03,101.425385,2000-01-04,97.459068,-0.039106,2000-01-10,101.992004,0.005587
1,2000-01-04,97.459068,2000-01-05,97.633377,0.001789,2000-01-11,100.771645,0.033989
2,2000-01-05,97.633377,2000-01-06,96.064301,-0.016071,2000-01-12,99.76915,0.021875
3,2000-01-06,96.064301,2000-01-07,101.643333,0.058076,2000-01-13,101.120308,0.052631
4,2000-01-07,101.643333,2000-01-10,101.992004,0.00343,2000-01-14,102.493233,0.008362
5,2000-01-10,101.992004,2000-01-11,100.771645,-0.011965,2000-01-18,101.686958,-0.002991
6,2000-01-11,100.771645,2000-01-12,99.76915,-0.009948,2000-01-19,102.51506,0.017301
7,2000-01-12,99.76915,2000-01-13,101.120308,0.013543,2000-01-20,100.945953,0.011795
8,2000-01-13,101.120308,2000-01-14,102.493233,0.013577,2000-01-21,100.727989,-0.00388
9,2000-01-14,102.493233,2000-01-18,101.686958,-0.007867,2000-01-24,97.873047,-0.045078


# Processing all the input variables

All the historical variables were computed in the data processing notebook. Let's review them.

In [11]:
# Keep the columns whose name contains '_adj' or 'volume', leaving out every
# column whose name carries one of the excluded markers
# (shifted, std, normalized or return columns).
x_vars_all = [
    varname
    for varname in df.columns
    if ('_adj' in varname or 'volume' in varname)
    and not any(
        marker in varname
        for marker in ('_shift_', '_std', '_norm', '_ret_')
    )
]

In [12]:
# Report how many variables were selected into x_vars_all.
# NOTE(review): the message says "dependent" variables, but x_vars_all is named
# as a list of x (input) variables, so "independent" may have been intended. Confirm.
logging.info('There are in total %d dependent variables', len(x_vars_all))

2019-06-11 20:51:18,227 - INFO - 11698 - There are in total 53 dependent variables


Divide them into categories

In [13]:
# Split the selected variables into groups according to the prefix of their name.
x_vars_slope = [var for var in x_vars_all if var.startswith('slope_')]
x_vars_sma = [var for var in x_vars_all if var.startswith('sma_')]
x_vars_ema = [var for var in x_vars_all if var.startswith('ema_')]
x_vars_lagged = [var for var in x_vars_all if var.startswith('lag_')]
x_vars_ratio_close_adj = [
    var for var in x_vars_all if var.startswith('ratio_close_adj_')
]
x_vars_ratio_volume = [
    var for var in x_vars_all if var.startswith('ratio_volume_')
]

# Train the model

In the backtest, the model is retrained monthly, which means that a new decision tree is built each month. This example builds a model with data up to 2018-12-31, to be used in January 2019.

In [23]:
# Input columns and target column handed to train_model.
x_vars = [
    'open_adj',
    'low_adj',
    'high_adj',
]
y_var = 'close_adj_shift_1'

# Price columns for the buy and sell sides (named after their backtest role).
buy_price_col = 'close_adj'
sell_price_col = 'close_adj_shift_1'

# Estimator class and its parameters (presumably used as constructor keyword
# arguments by train_model; confirm in helpers.machine_learning).
model_class = DecisionTreeRegressor
model_params = dict(random_state=100, max_depth=9)

# Trailing training window: length, reference date and shifted-date column.
train_history_period = relativedelta(years=6)
ref_date = pd.to_datetime('2018-12-31')
date_shift_col = 'date_shift_1'

In [26]:
# Build the training set from the trailing history window relative to ref_date.
df_train = get_trailing_df(
    df,
    ref_date,
    train_history_period,
    date_col='date',
    date_shift_col=date_shift_col,
)

Get the training date ranges

In [33]:
# Earliest and latest dates present in the training set.
df_train['date'].min(), df_train['date'].max()

(Timestamp('2012-12-31 00:00:00'), Timestamp('2018-12-27 00:00:00'))

Train the model

In [34]:
# Train the model on the training window through the project helper, using the
# selected inputs, target, estimator class and parameters.
model = train_model(df_train, x_vars, y_var, model_class, model_params)

Save the tree representation in Graphviz `.dot` format

In [41]:
# Write the fitted tree to a .dot file, with node ids shown and nodes filled.
filename = 'cart_tree.dot'
export_graphviz(
    model,
    out_file=filename,
    feature_names=x_vars,
    node_ids=True,
    filled=True,
)

Export to JPG and PDF (the Graphviz `dot` command has to be installed locally)

In [43]:
# Render the .dot file as a JPG image (requires the `dot` command).
! dot -Tjpg cart_tree.dot -o cart_tree.jpg

dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.602645 to fit


In [44]:
# Render the .dot file as a PDF document (requires the `dot` command).
! dot -Tpdf cart_tree.dot -o cart_tree.pdf

The links below show the CART tree trained with data up to 2018-12-31, to be used in January 2019. It has a maximum depth of 9, so the reader has to zoom in to see the content of the nodes.

https://raw.githubusercontent.com/gmoncarz/machine_learning_tour/master/notebooks/07_decision_tree/regressor/cart_tree.jpg

https://raw.githubusercontent.com/gmoncarz/machine_learning_tour/master/notebooks/07_decision_tree/regressor/cart_tree.pdf

https://docs.google.com/viewer?url=https://raw.githubusercontent.com/gmoncarz/machine_learning_tour/master/notebooks/07_decision_tree/regressor/cart_tree.pdf