<a href="https://colab.research.google.com/github/monsin/analystmyc/blob/master/O'Reilly_Class_Gradient_Boosting_Machines_for_Classification_and_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries and Market Data

In [None]:
#Import Python Libraries
import numpy as np
import pandas as pd
from datetime import datetime

import pandas_datareader.data as pdr
import fix_yahoo_finance as yf
yf.pdr_override()

import matplotlib.pyplot as plt
plt.style.use('seaborn')

In [None]:
#Import data
start = datetime(2020, 1, 1)
end = datetime(2021, 3, 26)

stock = pdr.get_data_yahoo('AAPL', start, end) #Apple Inc. stock
market = pdr.get_data_yahoo('SPY', start, end) #S&P 500 index
vix = pdr.get_data_yahoo('^VIX', start, end)   #Volatility index
dxy = pdr.get_data_yahoo('UUP', start, end)    #Dollar index
junk = pdr.get_data_yahoo('JNK', start, end)   #Junk bond index

#Design Model

In [None]:
#Create target dataframe
target = pd.DataFrame()
#Use adjusted closing prices instead of closing prices to adjust for corporate actions such as dividends, splits and mergers
target['return'] = (stock['Open']-stock['Adj Close'].shift(1))/stock['Adj Close'].shift(1) #Returns based on buying on the close the day before and selling on the open the day after
target = target.dropna() #get rid of the NaNs
target['direction'] = np.where(target['return'] > 0, 1, -1) #Overnight direction of the stock
target.head()

Unnamed: 0_level_0,return,direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-03,-0.000619,-1
2020-01-06,-0.002219,-1
2020-01-07,0.010278,1
2020-01-08,0.005979,1
2020-01-09,0.023637,1


In [None]:
#Create features dataframe
features = pd.DataFrame()
features['market'] = market['Adj Close'].pct_change(1)*100
features['vix'] = vix['Adj Close'].diff() #VIX is measured in percentage terms
features['dxy'] = dxy['Adj Close'].pct_change(1)*100
features['junk'] = junk['Adj Close'].pct_change(1)*100
features = features.dropna()
features.tail()

Unnamed: 0_level_0,market,vix,dxy,junk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-03-19,-0.185029,-0.629999,0.040375,0.467028
2021-03-22,0.798497,-2.070002,-0.20178,0.269617
2021-03-23,-0.78708,1.42,0.606547,0.055638
2021-03-24,-0.508347,0.900002,0.241166,0.111199
2021-03-25,0.562558,-1.390002,0.28067,0.083314


In [None]:
lastknown = features[-1:] #Values of features from the last trading session
features = features[:-1] #Subtracts last row from the features matrix so that it aligns with labels vector
features.tail()

Unnamed: 0_level_0,market,vix,dxy,junk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-03-18,-1.454969,2.35,0.609257,-0.723292
2021-03-19,-0.185029,-0.629999,0.040375,0.467028
2021-03-22,0.798497,-2.070002,-0.20178,0.269617
2021-03-23,-0.78708,1.42,0.606547,0.055638
2021-03-24,-0.508347,0.900002,0.241166,0.111199


#Gradient Boosting Classifier for Overnight Direction

In [None]:
#Get rid of return column for classifiers
targetclass = target.drop(axis=1, columns='return')
targetclass = targetclass[1:] #Removes the first row of labels since we are correlating today's features with tomorrow's opening values
targetclass.head()                                                                        

Unnamed: 0_level_0,direction
Date,Unnamed: 1_level_1
2020-01-06,-1
2020-01-07,1
2020-01-08,1
2020-01-09,1
2020-01-10,1


In [None]:
#Get rid of direction column for regressors
targetvalue = target.drop(axis=1, columns='direction')
targetvalue = targetvalue[1:] #Removes the first row of labels since we are correlating today's features with tomorrow's opening values
targetvalue.head()                                                                        


Unnamed: 0_level_0,return
Date,Unnamed: 1_level_1
2020-01-06,-0.002219
2020-01-07,0.010278
2020-01-08,0.005979
2020-01-09,0.023637
2020-01-10,0.013308


#Train, Test and Regularize Gradient Boosting Classifier

In [None]:
#Train and test classifier using Gini impurity performance metric
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

classifier = GradientBoostingClassifier(random_state=1, learning_rate=0.01, max_depth=3)
targetclass = np.ravel(targetclass) 
features_train, features_test, targetclass_train, targetclass_test = train_test_split(features, targetclass, test_size = 0.25, random_state=0)
classifier.fit(features_train, targetclass_train)
print("Training score:", classifier.score(features_train, targetclass_train))
print("Testing score:", classifier.score(features_test, targetclass_test))

Training score: 0.7445887445887446
Testing score: 0.7272727272727273


In [None]:
print("Tomorrow's change:", classifier.predict(lastknown))
print("Probability of change", classifier.predict_proba(lastknown))
lastknown

Tomorrow's change: [1]
Probability of change [[0.22704361 0.77295639]]


Unnamed: 0_level_0,market,vix,dxy,junk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-03-25,0.562558,-1.390002,0.28067,0.083314


In [None]:
#Inferring the importance of each feature
print(features.columns)
print(classifier.feature_importances_)

Index(['market', 'vix', 'dxy', 'junk'], dtype='object')
[0.36784929 0.27733588 0.20556345 0.14925138]


#Gradient Boosting Regressor for Overnight Value Changes

#Train and Test GBRT Model

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

regressor = GradientBoostingRegressor(random_state=1)
targetvalue = np.ravel(targetvalue) #Need to covert column vector into a 1-d array
features_train, features_test, targetvalue_train, targetvalue_test = train_test_split(features, targetvalue, test_size = 0.25, random_state=0)
regressor.fit(features_train, targetvalue_train)
print("Training score:", regressor.score(features_train, targetvalue_train))
print("Testing score:", regressor.score(features_test, targetvalue_test)) #Note that scikit-learn library makes the default mean squared error(mse) test score negative so that it is maximized instead of minimized
print("Tomorrow's value change:", regressor.predict(lastknown))

Training score: 0.8830184687976247
Testing score: -0.7429081624781926
Tomorrow's value change: [0.01252763]


#Regularize and Test GBRT Model

In [None]:
regressor_mae = GradientBoostingRegressor(criterion="mae", random_state=1, learning_rate=0.4, max_depth=5, n_estimators=200, min_samples_split=5, min_samples_leaf=10) #Reduce learning rate (between 0 and 1) to avoid overfitting
features_train, features_test, targetvalue_train, targetvalue_test = train_test_split(features, targetvalue, test_size = 0.25, random_state=1)
regressor_mae.fit(features_train, targetvalue_train)
print("Training score:", regressor_mae.score(features_train, targetvalue_train))
print("Testing score:", regressor_mae.score(features_test, targetvalue_test)) #Note that scikit-learn library makes the mean absolute error(mae) test score negative so that it is maximized instead of minimized
print("Tomorrow's value change:", regressor_mae.predict(lastknown))

Training score: 0.6218608963692029
Testing score: -0.6403926722787197
Tomorrow's value change: [0.01989613]


In [None]:
#Inferring the importance of each feature
print(features.columns)
print(regressor.feature_importances_)
print(regressor_mae.feature_importances_)

Index(['market', 'vix', 'dxy', 'junk'], dtype='object')
[0.31848034 0.18498444 0.288058   0.20847723]
[0.23429362 0.24786429 0.25505223 0.26278986]
