<a href="https://colab.research.google.com/github/datametal/ML-Algorithmic-Trading/blob/main/O'Reilly_Class_Logistic_Regression_and_Regularization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import Libraries

In [None]:
import numpy as np
import pandas as pd
import pandas_datareader.data as pdr
from datetime import datetime
import matplotlib.pyplot as plt

#Import Data

In [None]:
start = datetime(1982, 1, 1)
end = datetime(2024, 1, 1)

recession = pdr.DataReader('USREC', 'fred', start, end) #NBER business cycle classification
yield_curve = pdr.DataReader('T10Y3MM', 'fred', start, end) #Difference between the 3 month and 10 year treasury yields
unemployment = pdr.DataReader('UNRATE', 'fred', start, end) #Unemployment rate
industrial_capacity = pdr.DataReader('TCU', 'fred', start, end) #Total industrial capacity utilization

# Build and Train Model

In [None]:
#Create features dataframe
features = pd.DataFrame()
features['curve'] = yield_curve['T10Y3MM'].diff() #Difference between the yields of the 3 month bill and the 10 year note
features['unemployment'] = unemployment['UNRATE'].diff()
features['industrial'] = industrial_capacity['TCU'].diff()
features = features.dropna()
features.head()

Unnamed: 0_level_0,curve,unemployment,industrial
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1982-02-01,-1.52,0.3,1.3711
1982-03-01,0.4,0.1,-0.6647
1982-04-01,-0.02,0.3,-0.8298
1982-05-01,0.38,0.1,-0.6301
1982-06-01,0.31,0.2,-0.3086


In [None]:
testsample = features[-12:] # Our out-of-sample test data
features = features[:-12] # Now need to get rid of those rows
features.tail()

Unnamed: 0_level_0,curve,unemployment,industrial
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-08-01,-0.42,0.1,-0.02
2022-09-01,0.12,-0.1,0.1251
2022-10-01,-0.19,0.1,-0.1992
2022-11-01,-0.54,0.0,-0.3751
2022-12-01,-0.31,-0.1,-1.3431


In [None]:
#Create classification target dataframe
target = recession[1:] #Feature matrix does not have the first month and needs to be aligned with predictors
target.tail()

Unnamed: 0_level_0,USREC
DATE,Unnamed: 1_level_1
2023-08-01,0
2023-09-01,0
2023-10-01,0
2023-11-01,0
2023-12-01,0


In [None]:
#Percentage of time the US economy was in recession since 1982
round(target['USREC'].sum()/target['USREC'].count()*100, 2)

9.15

In [None]:
target = target[:-12] #Remove for prediction purposes

In [None]:
#Create logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

#Need to covert column vector into a 1-d Numpy array
target = np.ravel(target)

classifier = LogisticRegression(random_state = 0)

scaler = StandardScaler() #Scales input data so that it has a mean of zero and variance of one
features_standardized = scaler.fit_transform(features) #Every feature now has zero mean and unit variance

#Split dataset into train and test subsets. Test size is 25% of the total dataset
from sklearn.model_selection import train_test_split

features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.25, random_state=0, shuffle=False)
classifier.fit(features_train, target_train)
print("Model coefficients:", classifier.coef_)


Model coefficients: [[ 1.40704779  3.37577269 -2.16175724]]


#Evaluate and Predict

In [None]:
print("Training score:", classifier.score(features_train, target_train))
print("Testing score:", classifier.score(features_test, target_test))

Training score: 0.907608695652174
Testing score: 0.991869918699187


In [None]:
testsample #out-of-sample test data

Unnamed: 0_level_0,curve,unemployment,industrial
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-01-01,-0.42,-0.1,0.7179
2023-02-01,0.12,0.2,-0.092
2023-03-01,-0.16,-0.1,-0.0336
2023-04-01,-0.41,-0.1,0.2782
2023-05-01,-0.13,0.3,-0.2756
2023-06-01,0.07,-0.1,-0.5783
2023-07-01,0.08,-0.1,0.5938
2023-08-01,0.2,0.3,-0.0499
2023-09-01,0.21,0.0,-0.0143
2023-10-01,0.38,0.0,-0.7566


In [None]:
#Use out-of-sample data to predict recession

print("Recession class prediction based on test data:", classifier.predict(testsample))
print("Probability of recession class based on test data", classifier.predict_proba(testsample))

Recession class prediction based on test data: [0 0 0 0 0 0 0 0 0 0 0 0]
Probability of recession class based on test data [[0.99412671 0.00587329]
 [0.83314812 0.16685188]
 [0.95855489 0.04144511]
 [0.98473567 0.01526433]
 [0.77301088 0.22698912]
 [0.83752496 0.16247504]
 [0.98462703 0.01537297]
 [0.77712219 0.22287781]
 [0.91089351 0.08910649]
 [0.61791865 0.38208135]
 [0.95578666 0.04421334]
 [0.95812793 0.04187207]]


#Ridge Regression/L2 Regularization

In [None]:
 #Regularize logistic regression model with C hyperparameter. Reducing C increases regularization since it is the reciprocal of alpha.
regularized_classifier2 = LogisticRegression(penalty='l2', C=1, random_state = 0) #L1 penalty is Lasso regression and L2 penalty is ridge regression

regularized_classifier2.fit(features_train, target_train)
print("Model coefficients:", regularized_classifier2.coef_)

Model coefficients: [[ 1.40704779  3.37577269 -2.16175724]]


In [None]:
print("Training score:", regularized_classifier2.score(features_train, target_train))
print("Testing score:", regularized_classifier2.score(features_test, target_test))

Training score: 0.907608695652174
Testing score: 0.991869918699187


#Lasso Regression/L1 Regularization

In [None]:
#Regularize logistic regression model with C hyperparameter. Reducing C increases regularization since it is the reciprocal of alpha.
regularized_classifier1 = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, random_state = 0) #L1 penalty is Lasso regression and is not supported by the default solver

regularized_classifier1.fit(features_train, target_train)
print("Model coefficients:", regularized_classifier1.coef_)

Model coefficients: [[ 0.          0.         -1.29725608]]


In [None]:
print("Training score:", regularized_classifier1.score(features_train, target_train))
print("Testing score:", regularized_classifier1.score(features_test, target_test))

Training score: 0.8858695652173914
Testing score: 0.991869918699187


# Model Evaluation

In [None]:
#Use K-fold cross validation (default folds = 5, default scoring metric = accuracy)
from sklearn.model_selection import cross_val_score

score = cross_val_score(classifier, features_standardized, target)
print(score)
print('Mean score and standard deviation of score', score.mean(), score.std())

[0.92929293 0.89795918 0.89795918 0.93877551 0.97959184]
Mean score and standard deviation of score 0.9287157287157287 0.030271573652951023


In [None]:
#Use accuracy = (TP + TN)/(TP+TN+FP+FN) as the evaluation metric
cross_val_score(classifier, features_standardized, target, scoring = "accuracy")

array([0.92929293, 0.89795918, 0.89795918, 0.93877551, 0.97959184])

In [None]:
#Use precision = TP/(TP+FP) as the evaluation metric
cross_val_score(classifier, features_standardized, target, scoring = "precision")

array([1.        , 0.33333333, 0.33333333, 1.        , 0.88888889])

In [None]:
#Use Sensitivity = TP/(TP + FN) as the evaluation metric
cross_val_score(classifier, features_standardized, target, scoring = "recall")

array([0.3       , 0.11111111, 0.11111111, 0.33333333, 0.88888889])

In [None]:
#Use F measure = 2*(precision*recall/(precision+recall)) as the evaluation metric
cross_val_score(classifier, features_standardized, target, scoring = "f1")

array([0.46153846, 0.16666667, 0.16666667, 0.5       , 0.88888889])