In [49]:
# Random Forest


In [50]:
# Imports for the Random Forest examples below (classification on the Test 2 biopsy data, regression on the white-wine quality data)


import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm

import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

In [51]:
# Classification task
#
# Read the Stata file into `df`; the cells below use its `biopsy` column as the
# outcome and the remaining listed columns as predictors.
df = pd.read_stata(
    filepath_or_buffer="/Users/benya/OneDrive/Desktop/BADM453/Test_2.dta"
)
# Bare last expression -> rich (truncated) display of the frame.
df

Unnamed: 0,age,numberofsexualpartners,firstsexualintercourse,numofpregnancies,smokes,smokesyears,smokespacksyear,hormonalcontraceptives,hormonalcontraceptivesyears,iud,...,stdsnumber,stdsnumberofdiagnosis,dxcancer,dxcin,dxhpv,dx,hinselmann,schiller,citology,biopsy
0,18,4,15,1,0,0.0,0.0,0,0.00,0,...,0,0,0,0,0,0,0,0,0,0
1,15,1,14,1,0,0.0,0.0,0,0.00,0,...,0,0,0,0,0,0,0,0,0,0
2,52,5,16,4,1,37.0,37.0,1,3.00,0,...,0,0,1,0,1,0,0,0,0,0
3,46,3,21,4,0,0.0,0.0,1,15.00,0,...,0,0,0,0,0,0,0,0,0,0
4,42,3,23,2,0,0.0,0.0,0,0.00,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,34,3,18,0,0,0.0,0.0,0,0.00,0,...,0,0,0,0,0,0,0,0,0,0
664,32,2,19,1,0,0.0,0.0,1,8.00,0,...,0,0,0,0,0,0,0,0,0,0
665,25,2,17,0,0,0.0,0.0,1,0.08,0,...,0,0,0,0,0,0,0,0,1,0
666,33,2,24,2,0,0.0,0.0,1,0.08,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
# Baseline Logit from Test 2
#
# Reference model: a plain logistic regression on the same features and the same
# train/validation split that the Random Forest cells use.

# Outcome
y = df['biopsy']

# Predictors
X = df.loc[:, [
    'age', 'numberofsexualpartners', 'firstsexualintercourse', 'numofpregnancies',
    'smokes', 'smokesyears', 'smokespacksyear',
    'hormonalcontraceptives', 'hormonalcontraceptivesyears',
    'iud', 'iudyears',
    'stds', 'stdsnumber', 'stdsnumberofdiagnosis',
    'dxcancer', 'dxcin', 'dxhpv', 'dx',
    'hinselmann', 'schiller', 'citology',
]]

# 70/30 random split; the fixed seed keeps the validation rows identical across cells.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# fit() returns the estimator itself, so construction and fitting can be chained.
logit = LogisticRegression(max_iter=500).fit(X_train, y_train)

y_pred = logit.predict(X_val)

# Confusion matrix first, then the per-class precision/recall/F1 table.
print(
    confusion_matrix(y_val, y_pred),
    classification_report(y_val, y_pred),
    sep='\n',
)

[[177  10]
 [  7   7]]
              precision    recall  f1-score   support

           0       0.96      0.95      0.95       187
           1       0.41      0.50      0.45        14

    accuracy                           0.92       201
   macro avg       0.69      0.72      0.70       201
weighted avg       0.92      0.92      0.92       201



In [53]:
# Let's take a look at the main parameters behind RF

# CLASSIFIER -- categorical outcome task
# REGRESSOR -- numerical outcome task

# n_estimators -- The number of trees in the forest (integer, default 100)
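# max_depth -- The maximum depth (number of levels) of each tree (integer, default=None: nodes are expanded until all leaves are pure or contain fewer than min_samples_split samples)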
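# max_features -- The number of features to consider when looking for the best split (None -- all available features; an integer count; a float fraction; "sqrt"; "log2"). In current scikit-learn the default is "sqrt" for the classifier and 1.0 (all features) for the regressor; the older "auto" option has been removed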


In [54]:
# Random Forest (default hyperparameters)
#
# Fits a RandomForestClassifier with library-default settings and scores it on
# the held-out validation rows.

# Outcome and predictors (same columns as the baseline logit).
y = df['biopsy']
X = df[[
    'age', 'numberofsexualpartners', 'firstsexualintercourse', 'numofpregnancies',
    'smokes', 'smokesyears', 'smokespacksyear',
    'hormonalcontraceptives', 'hormonalcontraceptivesyears',
    'iud', 'iudyears',
    'stds', 'stdsnumber', 'stdsnumberofdiagnosis',
    'dxcancer', 'dxcin', 'dxhpv', 'dx',
    'hinselmann', 'schiller', 'citology',
]]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state = 0) # Consistency of initial sampling

# Seed the forest itself too. Without random_state the bootstrap samples and
# per-split feature draws change on every run, so the metrics printed below
# would not be reproducible. (The recorded output came from an unseeded run,
# so the numbers may shift slightly on re-execution.)
rfc = RandomForestClassifier(random_state = 0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_val)

print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[178   9]
 [  9   5]]
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       187
           1       0.36      0.36      0.36        14

    accuracy                           0.91       201
   macro avg       0.65      0.65      0.65       201
weighted avg       0.91      0.91      0.91       201



In [55]:
# Random Forest (parameters)
#
# Same data and split as the default forest, but with hand-picked
# max_depth / max_features / n_estimators.

# Outcome
y = df['biopsy']

# Predictors
X = df.loc[:, [
    'age', 'numberofsexualpartners', 'firstsexualintercourse', 'numofpregnancies',
    'smokes', 'smokesyears', 'smokespacksyear',
    'hormonalcontraceptives', 'hormonalcontraceptivesyears',
    'iud', 'iudyears',
    'stds', 'stdsnumber', 'stdsnumberofdiagnosis',
    'dxcancer', 'dxcin', 'dxhpv', 'dx',
    'hinselmann', 'schiller', 'citology',
]]

# Random 70/30 split, seeded so it matches the other cells.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# random_state on the forest fixes the tree sampling; fit() returns the fitted
# estimator, so it is chained onto the constructor.
rfc = RandomForestClassifier(
    n_estimators=20,
    max_depth=10,
    max_features=15,
    random_state=0,
).fit(X_train, y_train)
y_pred = rfc.predict(X_val)

print(
    confusion_matrix(y_val, y_pred),
    classification_report(y_val, y_pred),
    sep='\n',
)

[[176  11]
 [  7   7]]
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       187
           1       0.39      0.50      0.44        14

    accuracy                           0.91       201
   macro avg       0.68      0.72      0.69       201
weighted avg       0.92      0.91      0.92       201



In [56]:
# Random Forest (Grid Search)
#
# Searches over max_depth / max_features / n_estimators with 3-fold
# cross-validation on the training split and reports the winning combination.

y = df['biopsy']
X = df[[
    'age', 'numberofsexualpartners', 'firstsexualintercourse', 'numofpregnancies',
    'smokes', 'smokesyears', 'smokespacksyear',
    'hormonalcontraceptives', 'hormonalcontraceptivesyears',
    'iud', 'iudyears',
    'stds', 'stdsnumber', 'stdsnumberofdiagnosis',
    'dxcancer', 'dxcin', 'dxhpv', 'dx',
    'hinselmann', 'schiller', 'citology',
]]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state = 0)

# Define parameter grid (every combination of these values is tried)
param_grid = [{'max_depth': [2, 4, 6, 8, 10, 15, 20], 'max_features': [5, 10, 15], 'n_estimators': [10, 20, 50, 200]}]

# Instantiate model of interest (note I am using a different name than `rfc`)
rf = RandomForestClassifier(random_state = 0)

# Fit grid search on the model and parameter grid.
# Candidates are compared on cross-validated performance (cv=3 folds); no
# `scoring` is given, so the estimator's own score method is used.
grid_search = GridSearchCV(rf, param_grid, cv=3)

grid_search.fit(X_train, y_train)

# Show the "best" parameters given the parameter grid.
# best_params_ is a plain dict of the winning values. The repr of
# best_estimator_ can omit any parameter that happens to equal its default,
# so it is not a reliable way to read the chosen settings.
grid_search.best_params_

In [57]:
# Let's now see how the model is going to perform with these parameters
#
# The hyperparameters below are typed in by hand (presumably copied from the
# grid-search result above -- re-check them if the grid or data changes).

# Outcome
y = df['biopsy']

# Predictors
X = df.loc[:, [
    'age', 'numberofsexualpartners', 'firstsexualintercourse', 'numofpregnancies',
    'smokes', 'smokesyears', 'smokespacksyear',
    'hormonalcontraceptives', 'hormonalcontraceptivesyears',
    'iud', 'iudyears',
    'stds', 'stdsnumber', 'stdsnumberofdiagnosis',
    'dxcancer', 'dxcin', 'dxhpv', 'dx',
    'hinselmann', 'schiller', 'citology',
]]

# Same seeded 70/30 split as every other classification cell.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Build and fit in one expression (fit() hands back the fitted estimator).
rfc = RandomForestClassifier(
    n_estimators=20,
    max_depth=2,
    max_features=15,
    random_state=0,
).fit(X_train, y_train)
y_pred = rfc.predict(X_val)

print(
    confusion_matrix(y_val, y_pred),
    classification_report(y_val, y_pred),
    sep='\n',
)

[[178   9]
 [  5   9]]
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       187
           1       0.50      0.64      0.56        14

    accuracy                           0.93       201
   macro avg       0.74      0.80      0.76       201
weighted avg       0.94      0.93      0.93       201



In [58]:
# Regression task

In [59]:
# Read the white-wine quality file into `dataset`; the file is semicolon-delimited.
dataset = pd.read_csv(
    filepath_or_buffer='/Users/benya/OneDrive/Desktop/BADM453/winequality-white.csv',
    delimiter=';',
)
# Bare last expression -> rich (truncated) display of the frame.
dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [60]:
# Baseline RF regressor model
#
# Default-hyperparameter forest predicting `quality` from the eleven
# physico-chemical columns; reports RMSE on the held-out 30%.

# Predictors
X = dataset.loc[:, [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide',
    'density', 'pH', 'sulphates', 'alcohol',
]]
# Outcome
y = dataset['quality']

# test_size = 0.3 means 30% of the rows are held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Seeded forest; fit() returns the fitted estimator, so it is chained.
rfr = RandomForestRegressor(random_state=0).fit(X_train, y_train)
y_pred = rfr.predict(X_test)

# RMSE = square root of the mean squared error on the test rows.
print(
    'Root Mean Squared Error:',
    np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
)

Root Mean Squared Error: 0.6595198212016248


In [61]:
# RF with parametrization
#
# Grid search over max_depth / max_features / n_estimators for the regressor,
# using 3-fold cross-validation on the training split.

X = dataset[[
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide',
    'density', 'pH', 'sulphates', 'alcohol',
]]
y = dataset['quality']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Every combination of these values is evaluated.
param_grid = [{'max_depth': [2, 4, 6], 'max_features': [3, 6, 9, 11], 'n_estimators': [10, 20, 50, 100]}]

rf = RandomForestRegressor(random_state = 0)

# No `scoring` is given, so candidates are ranked by the estimator's own score method.
grid_search = GridSearchCV(rf, param_grid, cv=3)

grid_search.fit(X_train, y_train)

# Show the winning combination as a plain dict. The repr of best_estimator_
# can omit parameters that equal their defaults, and this grid includes
# n_estimators=100 (the default), so best_params_ is the reliable readout.
grid_search.best_params_

In [62]:
# Let's use the parameters in the model
#
# The hyperparameters are typed in by hand (presumably taken from the grid
# search above -- verify after any re-run).
# NOTE(review): the recorded RMSE for this model (0.738) is higher than the
# default-forest baseline (0.660); the grid only explored max_depth <= 6, so
# the "best" grid point is still a shallower forest than the default.

# Predictors
X = dataset.loc[:, [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide',
    'density', 'pH', 'sulphates', 'alcohol',
]]
# Outcome
y = dataset['quality']

# test_size = 0.3 means 30% of the rows are held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Construct and fit in one step (fit() returns the fitted estimator).
rfr = RandomForestRegressor(
    n_estimators=50,
    max_depth=6,
    max_features=11,
    random_state=0,
).fit(X_train, y_train)
y_pred = rfr.predict(X_test)

# RMSE on the held-out rows.
print(
    'Root Mean Squared Error:',
    np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
)

Root Mean Squared Error: 0.7382324685492256
