## Importing All the Necessary Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_validate
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#For Scoring
from sklearn.metrics import roc_curve, auc, accuracy_score, confusion_matrix, classification_report, roc_auc_score

# For Plotting Trees
from sklearn.tree import plot_tree



### Importing data and One Hot Encoding it

In [52]:
df = pd.read_csv('https://raw.githubusercontent.com/farhansayyed165/Highway-Data/main/Datasets/ML_TRAFFIC_VOILATIONS')

In [53]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [54]:
df.head()

Unnamed: 0,States/UTs,Number of Accidents,Persons Killed,People Injured,Traffic Voilation type,Year
0,Arunachal Pradesh,31.0,17.0,20,OverSpeeding,2017
1,Himachal Pradesh,628.0,256.0,1054,OverSpeeding,2017
2,Manipur,156.0,42.0,280,OverSpeeding,2017
3,Meghalaya,65.0,17.0,33,OverSpeeding,2017
4,Mizoram,16.0,12.0,8,OverSpeeding,2017


In [5]:
# Getting Dummies
endf = pd.get_dummies(df, columns=['States/UTs','Traffic Voilation type','Year'])

In [10]:
endf.head()

Unnamed: 0,Number of Accidents,Persons Killed,People Injured,States/UTs_A & N Islands,States/UTs_Andhra Pradesh,States/UTs_Arunachal Pradesh,States/UTs_Assam,States/UTs_Bihar,States/UTs_Chandigarh,States/UTs_Chhattisgarh,...,States/UTs_West Bengal,Traffic Voilation type_Driving On wrong side,Traffic Voilation type_Drunken Driving,Traffic Voilation type_Jumping red Lights,Traffic Voilation type_OverSpeeding,Traffic Voilation type_Use of Mobile Phones,Year_2017,Year_2018,Year_2019,Year_2020
0,31.0,17.0,20,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
1,628.0,256.0,1054,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,156.0,42.0,280,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
3,65.0,17.0,33,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
4,16.0,12.0,8,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0


All the Features here like State Names (Maharashtra, UP, etc), Voilation Type(Jumping red lights, Overspeeding, etc) and Years(2017-20) have their individual Columns. We can see it below

In [11]:
#Displaying All the Column Names
endf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 718 entries, 0 to 717
Data columns (total 48 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Number of Accidents                           718 non-null    float64
 1   Persons Killed                                718 non-null    float64
 2   People Injured                                718 non-null    int64  
 3   States/UTs_A & N Islands                      718 non-null    uint8  
 4   States/UTs_Andhra Pradesh                     718 non-null    uint8  
 5   States/UTs_Arunachal Pradesh                  718 non-null    uint8  
 6   States/UTs_Assam                              718 non-null    uint8  
 7   States/UTs_Bihar                              718 non-null    uint8  
 8   States/UTs_Chandigarh                         718 non-null    uint8  
 9   States/UTs_Chhattisgarh                       718 non-null    uin

# Splitting The Data into Target and Train Variables

In [14]:
y = endf.filter(like = 'Traffic')
X = endf.drop(y, axis =1)

In [15]:
X.head(3)

Unnamed: 0,Number of Accidents,Persons Killed,People Injured,States/UTs_A & N Islands,States/UTs_Andhra Pradesh,States/UTs_Arunachal Pradesh,States/UTs_Assam,States/UTs_Bihar,States/UTs_Chandigarh,States/UTs_Chhattisgarh,...,States/UTs_Tamil Nadu,States/UTs_Telangana,States/UTs_Tripura,States/UTs_Uttar Pradesh,States/UTs_Uttarakhand,States/UTs_West Bengal,Year_2017,Year_2018,Year_2019,Year_2020
0,31.0,17.0,20,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,628.0,256.0,1054,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,156.0,42.0,280,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [16]:
y.head(3)

Unnamed: 0,Traffic Voilation type_Driving On wrong side,Traffic Voilation type_Drunken Driving,Traffic Voilation type_Jumping red Lights,Traffic Voilation type_OverSpeeding,Traffic Voilation type_Use of Mobile Phones
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0


In [18]:
# Train And Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [20]:
model1 = DecisionTreeClassifier(max_leaf_nodes=6)
model1.fit(X_train, y_train)

DecisionTreeClassifier(max_leaf_nodes=6)

In [23]:
y_train_pred = model1.predict(X_train)
y_pred = model1.predict(X_test)
auc_train = roc_auc_score(y_train, y_train_pred)
auc_test = roc_auc_score(y_test, y_pred)


In [24]:
print("Train:", auc_train, "Test:", auc_test)

Train: 0.549300691718889 Test: 0.5576388888888889


In [33]:
def tree_training(max_leaf, X_train, y_train, X_test, y_test):
    tree = DecisionTreeClassifier(max_leaf_nodes=max_leaf, 
                                  class_weight='balanced')
    tree.fit(X_train, y_train)
    
    y_train_pred = tree.predict(X_train)
    y_pred = tree.predict(X_test)
    
    auc_train = roc_auc_score(y_train, y_train_pred)
    auc_test = roc_auc_score(y_test, y_pred)
    
    print("Node: {}, Train: {}, Valid: {}, Diff: {}\n\n".format(max_leaf, auc_train, auc_test,auc_train-auc_test))

In [34]:
for i in range (2,20):
    tree_training(i, X_train, y_train, X_test, y_test)

Node: 2, Train: 0.5479085803499795, Valid: 0.5534722222222223, Diff: -0.005563641872242786


Node: 3, Train: 0.5479085803499795, Valid: 0.5534722222222223, Diff: -0.005563641872242786


Node: 4, Train: 0.5479085803499795, Valid: 0.5534722222222223, Diff: -0.005563641872242786


Node: 5, Train: 0.5455623739618796, Valid: 0.5576388888888889, Diff: -0.01207651492700934


Node: 6, Train: 0.5484195168190225, Valid: 0.5602704678362573, Diff: -0.011850951017234879


Node: 7, Train: 0.5521578345760318, Valid: 0.5602704678362573, Diff: -0.008112633260225488


Node: 8, Train: 0.5558961523330411, Valid: 0.5602704678362573, Diff: -0.0043743155032162084


Node: 9, Train: 0.5558961523330411, Valid: 0.5602704678362573, Diff: -0.0043743155032162084


Node: 10, Train: 0.5596344700900504, Valid: 0.5602704678362573, Diff: -0.0006359977462069288


Node: 11, Train: 0.5633727878470598, Valid: 0.5602704678362573, Diff: 0.0031023200108024618


Node: 12, Train: 0.5633727878470598, Valid: 0.5602704678362573, Di

In [50]:
model = DecisionTreeClassifier(max_leaf_nodes=14)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_pred = model.predict(X_test)
auc_train = roc_auc_score(y_train, y_train_pred)
auc_test = roc_auc_score(y_test, y_pred)


In [51]:
y_pred

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0,

In [None]:
plt.subplots(1, figsize=(10,10))
plt.title('Receiver Operating Characteristic - DecisionTree')
plt.plot(false_positive_rate1, true_positive_rate1)
plt.plot([0, 1], ls="--")
plt.plot([0, 0], [1, 0] , c=".7"), plt.plot([1, 1] , c=".7")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()