# Decision Tree Modeling

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# 1 Load Data

In [2]:
# read the engineered training and validation splits (same order: train, val)
train, val = (
    pd.read_csv(csv_path)
    for csv_path in ("train_engineered.csv", "val_engineered.csv")
)

# preview the first rows of the training frame
train.head()

Unnamed: 0,class_p,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,0,0,0,0,1,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
1,1,0,1,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
3,1,0,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
4,0,0,0,0,0,1,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0


# 2 Preprocessing

Since the entire data set here is one-hot encoded, scaling is not necessary (decision trees are also insensitive to feature scaling).

In [3]:
# split x and y variables
# "Unnamed: 0" appears to be the row index that was saved into the CSV rather
# than an engineered feature, so it is removed from the features together with
# the target column. errors="ignore" keeps the split working for a file that
# does not carry the stray index column.
NON_FEATURE_COLS = ["class_p", "Unnamed: 0"]
X_train, y_train = train.drop(columns=NON_FEATURE_COLS, errors="ignore"), train["class_p"]
X_val, y_val = val.drop(columns=NON_FEATURE_COLS, errors="ignore"), val["class_p"]

# 3 Hyperparameter tuning

In [4]:
# Cross-validated search over the tree depth (None = no depth limit).
params = {"max_depth": [None, 2, 3, 4, 5]}
gs = GridSearchCV(
    # fixed seed so the fitted trees, and therefore the selected depth,
    # are the same on every Restart & Run All
    DecisionTreeClassifier(random_state=42),
    params,
    cv=5,  # explicit: the default number of folds differs between scikit-learn releases
)
gs.fit(X_train, y_train)

# depth chosen by the search
gs.best_params_



{'max_depth': None}

In [5]:
# keep the best-scoring estimator found by the grid search for evaluation
# NOTE: `model` is a reference to the object held by `gs`, not a copy
model = gs.best_estimator_

# 4 Evaluation

In [6]:
# Fit on the full training split, then score both splits.
# NOTE(review): with GridSearchCV's default refit behaviour the best estimator
# is presumably already fitted on X_train, making this fit redundant - confirm.
model.fit(X_train, y_train)

# predictions for each split
train_pred, val_pred = model.predict(X_train), model.predict(X_val)

# per-class precision / recall / f1 for each split
print("training results", classification_report(y_train, train_pred), sep="\n")
print("\n\n\n validation results", classification_report(y_val, val_pred), sep="\n")

training results
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2357
           1       1.00      1.00      1.00      2212

    accuracy                           1.00      4569
   macro avg       1.00      1.00      1.00      4569
weighted avg       1.00      1.00      1.00      4569




 validation results
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       811
           1       1.00      1.00      1.00       713

    accuracy                           1.00      1524
   macro avg       1.00      1.00      1.00      1524
weighted avg       1.00      1.00      1.00      1524



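The unrestricted tree fits the training data perfectly, which can be a sign of over-fitting (although the validation scores are equally high here), so I will decrease the max depth parameter to 2 and compare the results.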

In [7]:
# Build a fresh, unfitted tree capped at depth 2 from the current settings
# instead of calling set_params on `model` itself: `model` is the same object
# as gs.best_estimator_, so mutating it in place would also change the
# grid-search result and leave it inconsistent with gs.best_params_.
model = DecisionTreeClassifier(**{**model.get_params(), "max_depth": 2})
model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [8]:
# Fit the current `model` on the training split, then score both splits.
model.fit(X_train, y_train)

# predictions for each split
train_pred, val_pred = model.predict(X_train), model.predict(X_val)

# per-class precision / recall / f1 for each split
print("training results", classification_report(y_train, train_pred), sep="\n")
print("\n\n\n validation results", classification_report(y_val, val_pred), sep="\n")

training results
              precision    recall  f1-score   support

           0       0.98      0.94      0.96      2357
           1       0.93      0.98      0.96      2212

    accuracy                           0.96      4569
   macro avg       0.96      0.96      0.96      4569
weighted avg       0.96      0.96      0.96      4569




 validation results
              precision    recall  f1-score   support

           0       0.98      0.93      0.96       811
           1       0.93      0.98      0.95       713

    accuracy                           0.95      1524
   macro avg       0.95      0.96      0.95      1524
weighted avg       0.96      0.95      0.95      1524

