# Decision Trees & Random Forests

In [1]:
import math
import numpy as np
import pandas as pd
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
plt.style.use('seaborn-whitegrid')

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Step1: Get the Data

In [12]:
df = pd.read_csv('FeatureEngineering.csv')
df.head()

Unnamed: 0,Date,f1_day,f1_weekday,f1_month,AAPL_label,MSFT_label,INTC_label,ORCL_label,CSCO_label,IBM_label,...,AAPL_cat_ret,MSFT_cat_ret,INTC_cat_ret,ORCL_cat_ret,CSCO_cat_ret,IBM_cat_ret,NVDA_cat_ret,TXN_cat_ret,QCOM_cat_ret,ADBE_cat_ret
0,2006-06-28,28,2,6,b. -2.5% to -1.5%,e. 0.5% to 1.5%,g. 2.5% to inf,e. 0.5% to 1.5%,d. -0.5% to 0.5%,d. -0.5% to 0.5%,...,DUDD,UUDD,UUUD,UUDD,DUDU,DUDD,DUDD,DUDD,DUDU,UUDU
1,2006-06-29,29,3,6,g. 2.5% to inf,e. 0.5% to 1.5%,g. 2.5% to inf,e. 0.5% to 1.5%,g. 2.5% to inf,e. 0.5% to 1.5%,...,DUDD,UUDD,UUUD,UUUD,UDUD,UDUD,UDUD,UDDD,DDUD,DUUD
2,2006-06-30,30,4,6,a. -inf to -2.5%,c. -1.5% to -0.5%,b. -2.5% to -1.5%,b. -2.5% to -1.5%,b. -2.5% to -1.5%,c. -1.5% to -0.5%,...,DUDD,UUUD,UDUD,DUUD,DDDD,DDDD,UDUD,UDDD,UDUD,DUUD
3,2006-07-03,3,0,7,e. 0.5% to 1.5%,f. 1.5% to 2.5%,f. 1.5% to 2.5%,f. 1.5% to 2.5%,f. 1.5% to 2.5%,f. 1.5% to 2.5%,...,DUUD,UUUD,UUUD,UUUD,UDUD,UDUD,UDUD,UDUD,DDUD,DUDD
4,2006-07-05,5,2,7,b. -2.5% to -1.5%,c. -1.5% to -0.5%,a. -inf to -2.5%,b. -2.5% to -1.5%,b. -2.5% to -1.5%,d. -0.5% to 0.5%,...,DDDD,UUUD,UDUD,UUUD,UDUD,UDUD,UDUD,UDUD,DDUD,DUUD


# Step 2: Model Ready Data

In [13]:
#Investing in Apple
df.drop(['MSFT_label','INTC_label','ORCL_label','CSCO_label','IBM_label','NVDA_label','TXN_label','QCOM_label','ADBE_label',
         'AAPL','MSFT','INTC','ORCL','CSCO','IBM','NVDA','TXN','QCOM','ADBE',
         'MSFT_ret','INTC_ret','ORCL_ret','CSCO_ret','IBM_ret','NVDA_ret','TXN_ret','QCOM_ret','ADBE_ret'],axis=1,inplace=True)

df.set_index('Date', inplace=True)
df.head()

Unnamed: 0_level_0,f1_day,f1_weekday,f1_month,AAPL_label,AAPL_ret,AAPL_r1,MSFT_r1,INTC_r1,ORCL_r1,CSCO_r1,...,AAPL_cat_ret,MSFT_cat_ret,INTC_cat_ret,ORCL_cat_ret,CSCO_cat_ret,IBM_cat_ret,NVDA_cat_ret,TXN_cat_ret,QCOM_cat_ret,ADBE_cat_ret
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2006-06-28,28,2,6,b. -2.5% to -1.5%,-0.024858,-0.026801,0.001751,-0.012662,-0.011656,-0.015424,...,DUDD,UUDD,UUUD,UUDD,DUDU,DUDD,DUDD,DUDD,DUDU,UUDU
2006-06-29,29,3,6,g. 2.5% to inf,0.05132,-0.024858,0.013038,0.033237,0.005502,0.004137,...,DUDD,UUDD,UUUD,UUUD,UDUD,UDUD,UDUD,UDDD,DDUD,DUUD
2006-06-30,30,4,6,a. -inf to -2.5%,-0.029252,0.05132,0.013305,0.034753,0.010914,0.026981,...,DUDD,UUUD,UDUD,DUUD,DDDD,DDDD,UDUD,UDDD,UDUD,DUUD
2006-07-03,3,0,7,e. 0.5% to 1.5%,0.011804,-0.029252,-0.007278,-0.016697,-0.017106,-0.01927,...,DUUD,UUUD,UUUD,UUUD,UDUD,UDUD,UDUD,UDUD,DDUD,DUDD
2006-07-05,5,2,7,b. -2.5% to -1.5%,-0.016503,0.011804,0.017022,0.01877,0.021844,0.023781,...,DDDD,UUUD,UDUD,UUUD,UDUD,UDUD,UDUD,UDUD,DDUD,DUUD


In [14]:
cat_feats = ['AAPL_cat_ret','MSFT_cat_ret','INTC_cat_ret','ORCL_cat_ret','CSCO_cat_ret',
             'IBM_cat_ret','NVDA_cat_ret','TXN_cat_ret','QCOM_cat_ret','ADBE_cat_ret']
final_data = pd.get_dummies(df,columns=cat_feats,drop_first=True)

# Step 3: Train Test Split

In [25]:
x = final_data.drop(['AAPL_label','AAPL_ret'],axis=1)
y = final_data['AAPL_label']

x_train = x.loc['2009-01-01':'2016-12-31']
x_test  = x.loc['2014-01-01':'2016-12-31']
y_train = y.loc['2009-01-01':'2016-12-31']
y_test  = y.loc['2014-01-01':'2016-12-31']

x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2014 entries, 2009-01-02 to 2016-12-30
Columns: 203 entries, f1_day to ADBE_cat_ret_UUUU
dtypes: float64(50), int64(3), uint8(150)
memory usage: 1.1+ MB


# Step 4: Building the Model (Decision Trees)

In [26]:
dtree = DecisionTreeClassifier()
dtree.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

# Step 5: Predictions and Evaluation

In [27]:
y_pred = dtree.predict(x_test)

print('classification_report')
print(classification_report(y_test,y_pred))
print('\n')
print('confusion_matrix')
print(confusion_matrix(y_test,y_pred))

classification_report
                   precision    recall  f1-score   support

 a. -inf to -2.5%       1.00      1.00      1.00        35
b. -2.5% to -1.5%       1.00      1.00      1.00        51
c. -1.5% to -0.5%       1.00      1.00      1.00       143
 d. -0.5% to 0.5%       1.00      1.00      1.00       253
  e. 0.5% to 1.5%       1.00      1.00      1.00       181
  f. 1.5% to 2.5%       1.00      1.00      1.00        60
   g. 2.5% to inf       1.00      1.00      1.00        33

      avg / total       1.00      1.00      1.00       756



confusion_matrix
[[ 35   0   0   0   0   0   0]
 [  0  51   0   0   0   0   0]
 [  0   0 143   0   0   0   0]
 [  0   0   0 253   0   0   0]
 [  0   0   0   0 181   0   0]
 [  0   0   0   0   0  60   0]
 [  0   0   0   0   0   0  33]]


# Step 6: Building the Model (Random Forests)

In [28]:
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# Step 7: Predictions and Evaluation

In [29]:
y_pred = rfc.predict(x_test)

print('classification_report')
print(classification_report(y_test,y_pred))
print('\n')
print('confusion_matrix')
print(confusion_matrix(y_test,y_pred))

classification_report
                   precision    recall  f1-score   support

 a. -inf to -2.5%       1.00      1.00      1.00        35
b. -2.5% to -1.5%       1.00      1.00      1.00        51
c. -1.5% to -0.5%       1.00      1.00      1.00       143
 d. -0.5% to 0.5%       1.00      1.00      1.00       253
  e. 0.5% to 1.5%       1.00      1.00      1.00       181
  f. 1.5% to 2.5%       1.00      1.00      1.00        60
   g. 2.5% to inf       1.00      1.00      1.00        33

      avg / total       1.00      1.00      1.00       756



confusion_matrix
[[ 35   0   0   0   0   0   0]
 [  0  51   0   0   0   0   0]
 [  0   0 143   0   0   0   0]
 [  0   0   0 253   0   0   0]
 [  0   0   0   0 181   0   0]
 [  0   0   0   0   0  60   0]
 [  0   0   0   0   0   0  33]]
