# Random Forest Modeling

In [3]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_score, accuracy_score, make_scorer

import warnings
warnings.filterwarnings("ignore")

# 1.1 Load Data

In [4]:
# non-engineered data
df = pd.read_csv("train.csv")

# engineered data
df_eng = pd.read_csv("train_engineered.csv")

df.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,80.111572,33.942432,85.101608,46.169139,125.593624,100.292107,Spondylolisthesis
1,63.404481,14.115327,48.136806,49.289153,111.916008,31.784495,Spondylolisthesis
2,52.419385,19.011561,35.87266,33.407825,116.559771,1.694705,Hernia
3,91.468741,24.508177,84.620272,66.960564,117.307897,52.623047,Spondylolisthesis
4,44.36249,8.945435,46.902096,35.417055,129.220682,4.994195,Normal


In [5]:
df_eng.head()

Unnamed: 0,lumbar_lordosis_angle,pelvic_radius,class__Hernia,class__Normal,class__Spondylolisthesis
0,85.101608,125.593624,0,0,1
1,48.136806,111.916007,0,0,1
2,35.87266,116.559771,1,0,0
3,84.620272,117.307897,0,0,1
4,46.902096,129.220682,0,1,0


# 1.2 One Hot Encode Target Variable

In [6]:
# ohe non-engineered data
dummies = pd.get_dummies(df["class"], prefix="class_")
df.drop("class", axis=1, inplace=True)
df = pd.concat([df, dummies], axis=1)

df.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class__Hernia,class__Normal,class__Spondylolisthesis
0,80.111572,33.942432,85.101608,46.169139,125.593624,100.292107,0,0,1
1,63.404481,14.115327,48.136806,49.289153,111.916008,31.784495,0,0,1
2,52.419385,19.011561,35.87266,33.407825,116.559771,1.694705,1,0,0
3,91.468741,24.508177,84.620272,66.960564,117.307897,52.623047,0,0,1
4,44.36249,8.945435,46.902096,35.417055,129.220682,4.994195,0,1,0


# 2.1 Preprocessing

Here I will seperate the independent and dependent variables. I will also scale the independent variables.

In [7]:
# seperate independent and dependent variables

targets = [col for col in df if col.startswith("class")]

# non-engineered data
X = df.drop(targets, axis=1)
y_hernia = df["class__Hernia"]
y_spond = df["class__Spondylolisthesis"]

# engineered data
Xe = df_eng.drop(targets, axis=1)
ye_hernia = df_eng["class__Hernia"]
ye_spond = df_eng["class__Spondylolisthesis"]

X.shape, y_hernia.shape, y_spond.shape, Xe.shape, ye_hernia.shape, ye_spond.shape

((279, 6), (279,), (279,), (279, 2), (279,), (279,))

In [8]:
# scale data
sc = StandardScaler()

# non-engineered data
X = sc.fit_transform(X)

# engineered data
Xe = sc.fit_transform(Xe)

# 3.1 Baseline Models

Here I will look at the baseline performance of the default LogisticRegression model on the data sets. I will use cross validation.

In [9]:
# instantiate the default logistic regression model
clf = RandomForestClassifier()

In [10]:
# create a function to print desired scores
scorers = [make_scorer(g) for g in [recall_score, precision_score, accuracy_score]]

def find_scores(classifier, X, y, scorers):
    for scorer, score_type in zip(scorers, ["recall", "precision", "accuracy"]):
        print(f"\n\n{score_type}")
        results = cross_validate(clf, X, y, scoring=scorer, cv=4)["test_score"]
        print(results)
        print("avg:", sum(results)/4)
        print("var:", np.array(results).var())

# run function on non-engineered data pertaining to hernias
find_scores(clf, X, y_hernia, scorers)



recall
[0.5        0.35714286 0.61538462 0.46153846]
avg: 0.4835164835164836
var: 0.008528559352735178


precision
[0.72727273 0.8        0.63636364 0.6       ]
avg: 0.6909090909090909
var: 0.006115702479338847


accuracy
[0.85915493 0.85714286 0.84057971 0.88405797]
avg: 0.8602338669699355
var: 0.0002411480654189667


In [11]:
# find scores for non-engineered data pertaining to Spondylolisthesis
find_scores(clf, X, y_spond, scorers)



recall
[0.97058824 0.94117647 0.97058824 0.96969697]
avg: 0.9630124777183601
var: 0.00015906946152306344


precision
[0.94117647 1.         0.97142857 0.96969697]
avg: 0.9705755029284441
var: 0.00043290091923984007


accuracy
[0.94285714 0.95714286 0.98571429 0.95652174]
avg: 0.9605590062111802
var: 0.00024352841325566213


In [12]:
# find scores for engineered data pertaining to hernia
find_scores(clf, Xe, ye_hernia, scorers)



recall
[0.5        0.57142857 0.15384615 0.38461538]
avg: 0.4024725274725275
var: 0.025047925975123776


precision
[0.55555556 0.25       0.42857143 0.45454545]
avg: 0.4221681096681097
var: 0.01213084091533767


accuracy
[0.83098592 0.74285714 0.82608696 0.82608696]
avg: 0.8065042428483947
var: 0.0013543177455990838


In [13]:
# find scores for engineered data pertaining to Spondylolisthesis
find_scores(clf, Xe, ye_spond, scorers)



recall
[0.67647059 0.70588235 0.82352941 0.87878788]
avg: 0.7711675579322639
var: 0.006888392814588154


precision
[0.82142857 0.8        0.82857143 0.83870968]
avg: 0.8221774193548388
var: 0.000201649403257661


accuracy
[0.75714286 0.8        0.82857143 0.84057971]
avg: 0.8065734989648032
var: 0.0010317326363437629


Overall the non-engineered data seemed to work the best so far.

# 4.1 Hyper paramerer tuning

Now I will tune the hyper parameters for the models. From now on I am only concerned with the non-engineered data.

In [14]:
# explore parameter tuning for finding hernias
params = {"max_depth":[2,3,4,5,None],
         "n_estimators":[50, 100, 200]}
gscv = GridSearchCV(clf, params, cv=4)
gscv.fit(X, y_hernia)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [16]:
print(gscv.best_estimator_.get_params()["max_depth"])
print(gscv.best_estimator_.get_params()["n_estimators"])

None
50


In [17]:
# baseline scores
find_scores(clf, X, y_hernia, scorers)



recall
[0.35714286 0.42857143 0.61538462 0.61538462]
avg: 0.5041208791208791
var: 0.013017374109407079


precision
[0.75       0.83333333 0.85714286 0.7       ]
avg: 0.7851190476190477
var: 0.003997661564625852


accuracy
[0.90140845 0.88571429 0.86956522 0.84057971]
avg: 0.8743169159886857
var: 0.0005061572469024458


In [18]:
# after tuning
find_scores(gscv.best_estimator_, X, y_hernia, scorers)



recall
[0.5        0.5        0.38461538 0.46153846]
avg: 0.46153846153846156
var: 0.0022189349112426027


precision
[0.6        0.875      0.58333333 0.63636364]
avg: 0.6736742424242425
var: 0.013878378960055096


accuracy
[0.85915493 0.88571429 0.82608696 0.85507246]
avg: 0.8565071588954014
var: 0.00044687850229050183


In [19]:
# explore parameter tuning for finding hernias
gscv2 = GridSearchCV(clf, params, cv=4)
gscv2.fit(X, y_spond)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [21]:
print(gscv2.best_estimator_.get_params()["max_depth"])
print(gscv2.best_estimator_.get_params()["n_estimators"])

5
200


In [22]:
# baseline scores
find_scores(clf, X, y_spond, scorers)



recall
[0.97058824 0.97058824 0.88235294 0.93939394]
avg: 0.9407308377896613
var: 0.0012981736204447755


precision
[0.97058824 1.         0.96969697 0.94117647]
avg: 0.9703654188948307
var: 0.00043267489300046727


accuracy
[0.94285714 0.98571429 0.98571429 0.94202899]
avg: 0.9640786749482402
var: 0.00046818538379435126


In [23]:
# after tuning
find_scores(gscv2.best_estimator_, X, y_spond, scorers)



recall
[0.94117647 0.97058824 1.         0.96969697]
avg: 0.9703654188948306
var: 0.00043267489300046727


precision
[0.96969697 1.         0.97142857 0.93939394]
avg: 0.9701298701298702
var: 0.00045969903112760173


accuracy
[0.95714286 0.97142857 0.98571429 0.95652174]
avg: 0.9677018633540373
var: 0.00014370587554492502
