In [None]:
# Add your steps here
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno as msno


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing


# This is for classification
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV


# evaluations
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score,auc,f1_score,precision_score, recall_score
from sklearn.model_selection import KFold, RandomizedSearchCV

from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
print('importing completed')

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

importing completed


### Upload a sample of our capstone project data.

In [None]:
# Load the dataset
data=pd.read_csv('/content/sample_data.csv')

In [None]:
# Checking Dimensions of Data
data.shape

(10994, 33)

In [None]:
#Encoding our categorical columns
catList = data.select_dtypes(include = ["object","bool"]).columns

In [None]:
print (catList)

Index(['Street', 'Side', 'State', 'Wind_Direction', 'Weather_Condition',
       'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
       'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming',
       'Traffic_Signal', 'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight',
       'Nautical_Twilight', 'Astronomical_Twilight'],
      dtype='object')


In [None]:
le = LabelEncoder()

for feat in catList:
    data[feat] = le.fit_transform(data[feat].astype(str))

print (data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10994 entries, 0 to 10993
Data columns (total 33 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Severity               10994 non-null  int64  
 1   Month                  10994 non-null  int64  
 2   Year                   10994 non-null  int64  
 3   Distance(mi)           10994 non-null  float64
 4   Street                 10994 non-null  int64  
 5   Side                   10994 non-null  int64  
 6   State                  10994 non-null  int64  
 7   Temperature(F)         10994 non-null  float64
 8   Wind_Chill(F)          10994 non-null  float64
 9   Humidity(%)            10994 non-null  int64  
 10  Pressure(in)           10994 non-null  float64
 11  Visibility(mi)         10994 non-null  int64  
 12  Wind_Direction         10994 non-null  int64  
 13  Wind_Speed(mph)        10994 non-null  float64
 14  Precipitation(in)      10994 non-null  int64  
 15  We

In [None]:
# X for the training dataset , y for the test dataset 
X = data.drop(columns=['Severity'])
y = pd.DataFrame(data['Severity']) #target class

print('X shape :', {X.shape})
print('y shape :', {y.shape})

X shape : {(10994, 32)}
y shape : {(10994, 1)}


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

In [None]:
y.value_counts()

Severity
2           10173
4             492
3             329
dtype: int64

### Oversampling the target column

In [None]:
from collections import Counter 
print(sorted(Counter(data['Severity']).items()))

[(2, 10173), (3, 329), (4, 492)]


In [None]:
from imblearn import over_sampling
from imblearn.over_sampling import RandomOverSampler
ros= RandomOverSampler()
ros_X_train, ros_y_train = ros.fit_resample(X_train,y_train)

In [None]:
print("Before sampling:-", Counter(y_train))
print("After sampling:-", Counter(ros_y_train))

Before sampling:- Counter({'Severity': 1})
After sampling:- Counter({'Severity': 1})


In [None]:
ros_y_train.value_counts()

Severity
2           8137
3           8137
4           8137
dtype: int64

### Training our best decision tree model, then optimizing it to find its best parameters for the Spark part of the model optimization.

In [None]:
# Decision Tree Classification
dtc = DecisionTreeClassifier(criterion='gini', max_depth=3)
dtc.fit(ros_X_train, ros_y_train)
preds_dtc = dtc.predict(X_test)

In [None]:
# Calculate the accuracies score for Decision Tree Classification 
val_train = round(dtc.score(ros_X_train, ros_y_train),2)*100
val_test = round(dtc.score(X_test, y_test),2)*100

print(f'Training Accuracy: {val_train}%')
print(f'Test Set Accuracy: {val_test}%')

Training Accuracy: 70.0%
Test Set Accuracy: 77.0%


### Hyperparameter Tuning

In [None]:
param_grid6 = {
    "criterion":("gini", "entropy"), 
    "max_depth":(list(range(1, 20)))
}


grid6 = GridSearchCV(dtc, param_grid6, scoring="accuracy", n_jobs=-1, verbose=1, cv=3)
grid6.fit(ros_X_train, ros_y_train)
grid6.best_params_

Fitting 3 folds for each of 38 candidates, totalling 114 fits


{'criterion': 'entropy', 'max_depth': 19}

We can see the 19 max depth is the best for our model

In [None]:
grid6.best_score_

0.9724714268157798