# Decision Tree Modeling

In [10]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score, accuracy_score, make_scorer

import warnings
warnings.filterwarnings("ignore")

# 1.1 Load Data

In [11]:
# non-engineered data
df = pd.read_csv("train.csv")

# engineered data
df_eng = pd.read_csv("train_engineered.csv")

df.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,80.111572,33.942432,85.101608,46.169139,125.593624,100.292107,Spondylolisthesis
1,63.404481,14.115327,48.136806,49.289153,111.916008,31.784495,Spondylolisthesis
2,52.419385,19.011561,35.87266,33.407825,116.559771,1.694705,Hernia
3,91.468741,24.508177,84.620272,66.960564,117.307897,52.623047,Spondylolisthesis
4,44.36249,8.945435,46.902096,35.417055,129.220682,4.994195,Normal


In [12]:
df_eng.head()

Unnamed: 0,lumbar_lordosis_angle,pelvic_radius,class__Hernia,class__Normal,class__Spondylolisthesis
0,85.101608,125.593624,0,0,1
1,48.136806,111.916007,0,0,1
2,35.87266,116.559771,1,0,0
3,84.620272,117.307897,0,0,1
4,46.902096,129.220682,0,1,0


# 1.2 One Hot Encode Target Variable

In [13]:
# ohe non-engineered data
dummies = pd.get_dummies(df["class"], prefix="class_")
df.drop("class", axis=1, inplace=True)
df = pd.concat([df, dummies], axis=1)

df.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class__Hernia,class__Normal,class__Spondylolisthesis
0,80.111572,33.942432,85.101608,46.169139,125.593624,100.292107,0,0,1
1,63.404481,14.115327,48.136806,49.289153,111.916008,31.784495,0,0,1
2,52.419385,19.011561,35.87266,33.407825,116.559771,1.694705,1,0,0
3,91.468741,24.508177,84.620272,66.960564,117.307897,52.623047,0,0,1
4,44.36249,8.945435,46.902096,35.417055,129.220682,4.994195,0,1,0


# 2.1 Preprocessing

Here I will seperate the independent and dependent variables. I will also scale the independent variables.

In [14]:
# seperate independent and dependent variables

targets = [col for col in df if col.startswith("class")]

# non-engineered data
X = df.drop(targets, axis=1)
y_hernia = df["class__Hernia"]
y_spond = df["class__Spondylolisthesis"]

# engineered data
Xe = df_eng.drop(targets, axis=1)
ye_hernia = df_eng["class__Hernia"]
ye_spond = df_eng["class__Spondylolisthesis"]

X.shape, y_hernia.shape, y_spond.shape, Xe.shape, ye_hernia.shape, ye_spond.shape

((279, 6), (279,), (279,), (279, 2), (279,), (279,))

In [15]:
# scale data
sc = StandardScaler()

# non-engineered data
X = sc.fit_transform(X)

# engineered data
Xe = sc.fit_transform(Xe)

# 3.1 Baseline Models

Here I will look at the baseline performance of the default LogisticRegression model on the data sets. I will use cross validation.

In [16]:
# instantiate the default logistic regression model
clf = DecisionTreeClassifier()

In [17]:
# create a function to print desired scores
scorers = [make_scorer(g) for g in [recall_score, precision_score, accuracy_score]]

def find_scores(classifier, X, y, scorers):
    for scorer, score_type in zip(scorers, ["recall", "precision", "accuracy"]):
        print(f"\n\n{score_type}")
        results = cross_validate(clf, X, y, scoring=scorer, cv=4)["test_score"]
        print(results)
        print("avg:", sum(results)/4)
        print("var:", np.array(results).var())

# run function on non-engineered data pertaining to hernias
find_scores(clf, X, y_hernia, scorers)



recall
[0.5        0.57142857 0.61538462 0.61538462]
avg: 0.5755494505494505
var: 0.0022245954594855705


precision
[0.57142857 0.625      0.57142857 0.57142857]
avg: 0.5848214285714286
var: 0.0005381058673469394


accuracy
[0.83098592 0.87142857 0.84057971 0.84057971]
avg: 0.8458934768028461
var: 0.00023268716848561053


In [18]:
# find scores for non-engineered data pertaining to Spondylolisthesis
find_scores(clf, X, y_spond, scorers)



recall
[0.94117647 0.91176471 0.97058824 0.93939394]
avg: 0.9407308377896614
var: 0.00043312171733058823


precision
[0.91428571 0.96666667 0.97142857 0.96875   ]
avg: 0.9552827380952381
var: 0.0005631012170493202


accuracy
[0.92857143 0.92857143 0.94285714 0.95652174]
avg: 0.9391304347826086
var: 0.00013483276108174828


In [19]:
# find scores for engineered data pertaining to hernia
find_scores(clf, Xe, ye_hernia, scorers)



recall
[0.42857143 0.42857143 0.30769231 0.15384615]
avg: 0.3296703296703296
var: 0.012740007245501747


precision
[0.46153846 0.35294118 0.44444444 0.25      ]
avg: 0.37723102061337355
var: 0.0071007795183084645


accuracy
[0.76056338 0.72857143 0.7826087  0.75362319]
avg: 0.7563416732277723
var: 0.0003715889813734043


In [20]:
# find scores for engineered data pertaining to Spondylolisthesis
find_scores(clf, Xe, ye_spond, scorers)



recall
[0.70588235 0.67647059 0.82352941 0.78787879]
avg: 0.7484402852049912
var: 0.003546146825283342


precision
[0.82142857 0.71428571 0.71794872 0.8125    ]
avg: 0.7665407509157509
var: 0.0025541749845278336


accuracy
[0.74285714 0.71428571 0.77142857 0.8115942 ]
avg: 0.7600414078674949
var: 0.0012940601571441441


Overall the non-engineered data seemed to work the best so far.

# 4.1 Hyper paramerer tuning

Now I will tune the hyper parameters for the models. From now on I am only concerned with the non-engineered data.

In [21]:
# explore parameter tuning for finding hernias
params = {"max_depth":[2,3,4,5,None]}
gscv = GridSearchCV(clf, params, cv=4)
gscv.fit(X, y_hernia)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [2, 3, 4, 5, None]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
 

In [25]:
print(gscv.best_estimator_.get_params()["max_depth"])

None


In [26]:
# baseline scores
find_scores(clf, X, y_hernia, scorers)



recall
[0.5        0.5        0.53846154 0.61538462]
avg: 0.5384615384615384
var: 0.002218934911242605


precision
[0.53846154 0.6        0.5        0.6       ]
avg: 0.5596153846153846
var: 0.0018158284023668632


accuracy
[0.8028169  0.8        0.85507246 0.84057971]
avg: 0.8246172688303737
var: 0.0005658960939535454


In [27]:
# after tuning
find_scores(gscv.best_estimator_, X, y_hernia, scorers)



recall
[0.5        0.42857143 0.53846154 0.61538462]
avg: 0.5206043956043956
var: 0.004549193937930205


precision
[0.53846154 0.54545455 0.58333333 0.6       ]
avg: 0.5668123543123543
var: 0.0006585718331784773


accuracy
[0.81690141 0.85714286 0.8115942  0.86956522]
avg: 0.838800921470854
var: 0.0006256656784172885


In [28]:
# explore parameter tuning for finding hernias
gscv2 = GridSearchCV(clf, params, cv=4)
gscv2.fit(X, y_spond)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [2, 3, 4, 5, None]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
 

In [29]:
print(gscv2.best_estimator_.get_params()["max_depth"])

2


In [30]:
# baseline scores
find_scores(clf, X, y_spond, scorers)



recall
[0.94117647 0.91176471 0.97058824 0.93939394]
avg: 0.9407308377896614
var: 0.00043312171733058823


precision
[0.94117647 0.96774194 0.94117647 0.96875   ]
avg: 0.9547112191650854
var: 0.00018331644329713082


accuracy
[0.94285714 0.91428571 0.95714286 0.95652174]
avg: 0.9427018633540374
var: 0.00030175822692025863


In [31]:
# after tuning
find_scores(gscv2.best_estimator_, X, y_spond, scorers)



recall
[0.94117647 0.94117647 1.         0.93939394]
avg: 0.9554367201426025
var: 0.000662491540126016


precision
[0.94117647 0.96875    0.93939394 0.96969697]
avg: 0.9547543449197862
var: 0.00020986528140946653


accuracy
[0.94285714 0.94285714 0.92857143 0.94202899]
avg: 0.93907867494824
var: 3.691504957370443e-05
