In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing

In [3]:
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score

In [4]:
df = pd.read_csv('mod_df.csv')
df.shape

(194673, 5)

In [5]:
# The Modeling-Evaluation-Split may have the ratio of 3:7. We retrieve a random selection of rows from the working data.
# We accomplish this first split also using scit-learn train-test-split function
# Notice: For better modeling, take ROADCOND, WEATHER, LIGHTCOND as categorical data and apply One-Hot-Encoding. (But not Junctiontype)
# Define feature set
Feature = df[['JUNCTIONTYPE','WEATHER','ROADCOND','LIGHTCOND']]
X = Feature
X.shape

(194673, 4)

In [6]:
# Define our classifier label
y = df['SEVERITYCODE.1']

In [7]:
# Apply test-train-split in order to get the Modeling-Evaluation split
X_model, X_eval, y_model, y_eval = train_test_split( X, y, test_size=0.3, random_state=4)

In [8]:
X_model.shape

(136271, 4)

In [9]:
# Apply test-train-split in order to get the train and test data
X_train, X_test, y_train, y_test = train_test_split( X_model, y_model, test_size=0.2, random_state=4)

In [10]:
X_train.shape


(109016, 4)

In [14]:
# We normalize the data after the train test split.
X_train = preprocessing.StandardScaler().fit(X_train).transform(X_train)

In [15]:
# We use the training data to optimize K (depth of tree)
Ks = 10
mean_acc_dt = np.zeros((Ks-1))
std_acc_dt = np.zeros((Ks-1))
jac_sc_dt = np.zeros((Ks-1))
f1_sc_dt = np.zeros((Ks-1))

for n in range(1,Ks):
    
    #Train Model and Predict  
    collTree = DecisionTreeClassifier(criterion="entropy", max_depth = n)
    collTree.fit(X_train,y_train)
    yhat_dt=collTree.predict(X_test)
    mean_acc_dt[n-1] = metrics.accuracy_score(y_test, yhat_dt)
    f1_sc_dt[n-1] = f1_score(y_test, yhat_dt, average='weighted')
    jac_sc_dt[n-1] = jaccard_score(y_test, yhat_dt)
    
    std_acc_dt[n-1]=np.std(yhat_dt==y_test)/np.sqrt(yhat_dt.shape[0])

print(mean_acc_dt)
print(f1_sc_dt)
print(jac_sc_dt)


[0.69649606 0.69649606 0.69649606 0.69649606 0.69612915 0.69612915
 0.69609246 0.69605577 0.68189323]
[0.57189258 0.57189258 0.57189258 0.57189258 0.57218934 0.57218934
 0.57210386 0.57215367 0.58065075]
[0.69649606 0.69649606 0.69649606 0.69649606 0.69605109 0.69605109
 0.69602554 0.69597769 0.67878182]


In [20]:
# Using different k does not seems to have a huge impact. We calculate the sum of f1 and jac values to identify the
# best k
result = jac_sc_dt + f1_sc_dt
result

array([1.26838864, 1.26838864, 1.26838864, 1.26838864, 1.26824043,
       1.26824043, 1.2681294 , 1.26813136, 1.25943256])

In [21]:
# We can recognize that for example the 4th value meaning k=4 provides the highest score.
# We will use this to create the optimal model
collTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
collTree.fit(X_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [31]:
# collTree is a optimized Decision Tree model. We will apply it to the Evaluation Data
# Normalize Evaluation Data
X_eval = preprocessing.StandardScaler().fit(X_eval).transform(X_eval)


In [24]:
X_eval[0:5]

array([[-0.81754732,  0.2414237 ,  0.52548927, -0.59230676],
       [ 0.25007932,  1.18498437,  0.52548927, -0.59230676],
       [-0.81754732,  1.18498437,  0.52548927,  0.31793452],
       [ 0.25007932,  1.18498437, -0.56931753,  0.31793452],
       [ 0.25007932, -0.70213697, -0.56931753, -0.59230676]])

In [29]:
# The Predicted labels are calculated:
yhat_test_2 = collTree.predict(X_eval)

In [26]:
# Calculating the F1 Score
f1_sc_test_2 = f1_score(y_eval, yhat_test_2, average='weighted')
f1_sc_test_2

0.5809904188654375

In [27]:
# Calculating the Jaggard Score
jac_sc_test_2 = jaccard_score(y_eval, yhat_test_2)
jac_sc_test_2

0.7034519365775145