---
# Machine Learning

Now that we have completed our EDA, we can move on to machine learning, where we will try to predict `Severity` from the other features in the dataset.

First, we need to split our data into `train` and `test` datasets.

In [None]:
from sklearn.model_selection import train_test_split

# Every column of ml_df except the target is used as a predictor.
predictors = [col for col in ml_df.columns if col != 'Severity']

# Target and features are kept as DataFrames (not Series) so the variable
# types seen by later cells are unchanged.
y = pd.DataFrame(ml_df['Severity'])
X = pd.DataFrame(ml_df[predictors])

# Split the Dataset into Train (80%) and Test (20%).
# A fixed random_state makes the split, and therefore every accuracy figure
# reported below, reproducible under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seeded because scikit-learn permutes candidate features at every split, so
# ties between equally good splits can otherwise resolve differently per run.
dectree = DecisionTreeClassifier(random_state=42)  # create the decision tree object
dectree.fit(X_train, y_train)

In [None]:
# Imported here because this is the first cell that uses confusion_matrix;
# without it the cell fails on a fresh kernel.
from sklearn.metrics import confusion_matrix

# Predict Response corresponding to Predictors
y_train_pred = dectree.predict(X_train)
y_test_pred = dectree.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Classification Accuracy \t:", dectree.score(X_train, y_train))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Classification Accuracy \t:", dectree.score(X_test, y_test))
print()

# Display names for the classes, in ascending class order.
# NOTE(review): assumes Severity takes exactly the values 1-4 -- confirm.
labels = ['1', '2', '3', '4']

# Plot the Confusion Matrix for Train and Test.
# Passing labels=dectree.classes_ pins the row/column order to the classes the
# model was trained on, so the matrix keeps the same shape even when a split
# happens to contain no sample (true or predicted) of some class.
f, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = [('Train', y_train, y_train_pred), ('Test', y_test, y_test_pred)]
for ax, (title, y_true, y_pred) in zip(axes, panels):
    sb.heatmap(confusion_matrix(y_true, y_pred, labels=dectree.classes_),
               annot=True, fmt=".0f", annot_kws={"size": 18},
               xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

In [None]:
# Plot the trained Decision Tree
from sklearn.tree import plot_tree

# Explicit figure/axes so the tree is drawn on a known target.
f, tree_ax = plt.subplots(figsize=(12, 12))
# `labels` is the list of class display names defined in the evaluation cell
# above. The trailing semicolon stops the notebook from echoing the list of
# annotation objects that plot_tree returns.
plot_tree(dectree, filled=True, rounded=True,
          feature_names=predictors,
          class_names=labels,
          ax=tree_ax);

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded so the bootstrap samples and per-split feature subsets, and hence
# the reported scores, are reproducible between runs.
rf = RandomForestClassifier(random_state=42)
# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning about a column-vector y.
rf.fit(X_train, y_train.values.ravel())

In [None]:
from sklearn.metrics import confusion_matrix

# Predict Response corresponding to Predictors
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Classification Accuracy \t:", rf.score(X_train, y_train))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Classification Accuracy \t:", rf.score(X_test, y_test))
print()

# Display names for the classes, in ascending class order.
# NOTE(review): assumes Severity takes exactly the values 1-4 -- confirm.
labels = ['1', '2', '3', '4']

# Plot the Confusion Matrix for Train and Test.
# labels=rf.classes_ fixes the matrix to the classes the forest was trained
# on, so its shape always matches the tick labels even if a split is missing
# a class.
f, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = [('Train', y_train, y_train_pred), ('Test', y_test, y_test_pred)]
for ax, (title, y_true, y_pred) in zip(axes, panels):
    sb.heatmap(confusion_matrix(y_true, y_pred, labels=rf.classes_),
               annot=True, fmt=".0f", annot_kws={"size": 18},
               xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier with scikit-learn's default settings.
log_model = LogisticRegression()
# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning about a column-vector y.
log_model.fit(X_train, y_train.values.ravel())

In [None]:
from sklearn.metrics import confusion_matrix

# Predict Response corresponding to Predictors
y_train_pred = log_model.predict(X_train)
y_test_pred = log_model.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Classification Accuracy \t:", log_model.score(X_train, y_train))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Classification Accuracy \t:", log_model.score(X_test, y_test))
print()

# Display names for the classes, in ascending class order.
# NOTE(review): assumes Severity takes exactly the values 1-4 -- confirm.
labels = ['1', '2', '3', '4']

# Plot the Confusion Matrix for Train and Test.
# labels=log_model.classes_ fixes the matrix to the classes the model was
# trained on, so its shape always matches the tick labels even if a class is
# never observed or predicted in a split.
f, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = [('Train', y_train, y_train_pred), ('Test', y_test, y_test_pred)]
for ax, (title, y_true, y_pred) in zip(axes, panels):
    sb.heatmap(confusion_matrix(y_true, y_pred, labels=log_model.classes_),
               annot=True, fmt=".0f", annot_kws={"size": 18},
               xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

## Naive Bayes Gaussian Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier with default settings.
nbgauss = GaussianNB()
# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning about a column-vector y.
nbgauss.fit(X_train, y_train.values.ravel())

In [None]:
from sklearn.metrics import confusion_matrix

# Predict Response corresponding to Predictors
y_train_pred = nbgauss.predict(X_train)
y_test_pred = nbgauss.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Classification Accuracy \t:", nbgauss.score(X_train, y_train))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Classification Accuracy \t:", nbgauss.score(X_test, y_test))
print()

# Display names for the classes, in ascending class order.
# NOTE(review): assumes Severity takes exactly the values 1-4 -- confirm.
labels = ['1', '2', '3', '4']

# Plot the Confusion Matrix for Train and Test.
# labels=nbgauss.classes_ fixes the matrix to the classes the model was
# trained on, so its shape always matches the tick labels even if a class is
# never observed or predicted in a split.
f, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = [('Train', y_train, y_train_pred), ('Test', y_test, y_test_pred)]
for ax, (title, y_true, y_pred) in zip(axes, panels):
    sb.heatmap(confusion_matrix(y_true, y_pred, labels=nbgauss.classes_),
               annot=True, fmt=".0f", annot_kws={"size": 18},
               xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

## Support Vector Machines (SVM)

In [None]:
from sklearn import svm

# Create a svm Classifier
# NOTE(review): SVC training cost grows at least quadratically with the number
# of samples; if the training set is large this cell may be very slow --
# confirm the dataset size is appropriate.
clf = svm.SVC(kernel='linear')  # Linear Kernel
# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning about a column-vector y.
clf.fit(X_train, y_train.values.ravel())

In [None]:
from sklearn.metrics import confusion_matrix

# Predict Response corresponding to Predictors
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Classification Accuracy \t:", clf.score(X_train, y_train))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Classification Accuracy \t:", clf.score(X_test, y_test))
print()

# Display names for the classes, in ascending class order.
# NOTE(review): assumes Severity takes exactly the values 1-4 -- confirm.
labels = ['1', '2', '3', '4']

# Plot the Confusion Matrix for Train and Test.
# labels=clf.classes_ fixes the matrix to the classes the SVM was trained on,
# so its shape always matches the tick labels even if a class is never
# observed or predicted in a split.
f, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = [('Train', y_train, y_train_pred), ('Test', y_test, y_test_pred)]
for ax, (title, y_true, y_pred) in zip(axes, panels):
    sb.heatmap(confusion_matrix(y_true, y_pred, labels=clf.classes_),
               annot=True, fmt=".0f", annot_kws={"size": 18},
               xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')