## Random Forest Classifier

Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE, RandomOverSampler

# Show every column when displaying wide DataFrames (None removes the column limit).
# Uses the documented top-level pd.set_option rather than the undocumented pd.pandas alias.
pd.set_option('display.max_columns', None)

In [2]:
# Relative path: it is resolved against the kernel's working directory, so the
# notebook must be started from its own folder (or the path adjusted) —
# otherwise pandas raises FileNotFoundError, as in the recorded output.
crash_19 = pd.read_csv('../data/crash_19.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/crash_19.csv'

Setting column 'RD_NO' as index.

In [None]:
# Guarded so the cell can be re-run safely: once RD_NO has become the index it
# is no longer a column, and calling set_index again would raise KeyError.
# Reassignment is used instead of inplace=True so the step is an explicit rebind.
if 'RD_NO' in crash_19.columns:
    crash_19 = crash_19.set_index('RD_NO')

In [None]:
# Preview the first five rows of the loaded data.
crash_19.head(n=5)

In [None]:
# (number of rows, number of columns)
crash_19.shape

In [None]:
# Relative frequency of each MOST_SEVERE_INJURY value (the column used as the
# model target further down).
(
    crash_19['MOST_SEVERE_INJURY']
    .value_counts(normalize=True)
)

A helper function that plots the confusion matrix as a heatmap and prints the precision, recall, and F1 score for a model's predictions.

In [None]:
def evaluation(y_actual, predicted, labels=('No Injury', 'Injury')):
    """Plot a confusion-matrix heatmap and print a classification report.

    Parameters
    ----------
    y_actual : array-like
        True class labels.
    predicted : array-like
        Predicted class labels, aligned with ``y_actual``.
    labels : sequence of str, default ('No Injury', 'Injury')
        Display names for the classes. They are applied positionally, so they
        must follow the order in which scikit-learn orders the classes (sorted
        class values), with exactly one name per class.

    Returns
    -------
    None
        The heatmap is drawn on a new figure and the report is printed.
    """
    labels = list(labels)
    cnf_matrix = confusion_matrix(y_actual, predicted)

    # Name rows/columns so the heatmap ticks show class names rather than
    # positional integers.
    cnf_frame = pd.DataFrame(cnf_matrix, index=labels, columns=labels)

    # A dedicated figure/axes keeps repeated calls from drawing onto whatever
    # axes happen to be current.
    _, ax = plt.subplots()
    sns.heatmap(cnf_frame, annot=True, cmap='viridis', fmt='d', ax=ax)
    ax.set_ylabel('Actual Label')
    ax.set_xlabel('Predicted Label')

    print(classification_report(y_actual, predicted, target_names=labels))

### Preprocessing

In [None]:
# Target is the MOST_SEVERE_INJURY column; features are every remaining column.
y = crash_19['MOST_SEVERE_INJURY']
X = crash_19.drop('MOST_SEVERE_INJURY', axis='columns')

In [None]:
# Stratified on y so both splits keep the target's class proportions;
# seeded so the split itself is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

### Random Forest 

In [None]:
# Baseline forest on the original (imbalanced) training split.
# Seeded (same seed as the train/test split) so Restart & Run All reproduces
# the same trees and therefore the same scores.
rf = RandomForestClassifier(random_state=42)

rf.fit(X_train, y_train)

# (train accuracy, test accuracy) — a large gap indicates overfitting.
rf.score(X_train, y_train), rf.score(X_test, y_test)

In [None]:
# Class predictions of the baseline forest on the held-out test split.
rf_preds = rf.predict(X_test)

In [None]:
# Confusion-matrix heatmap and classification report for the baseline forest.
evaluation(y_test, rf_preds)

### Near Miss

In [None]:
# NearMiss comes from imblearn.under_sampling. Only the training split is
# resampled; the test split is not passed in, so it keeps its original classes.
nm = NearMiss()

X_train_under, y_train_under = nm.fit_resample(X_train, y_train)

In [None]:
# Class counts after under-sampling, to check the resulting balance.
y_train_under.value_counts()

In [None]:
# Forest trained on the NearMiss under-sampled training data.
# Seeded (same seed as the train/test split) so the scores are reproducible.
rf_under = RandomForestClassifier(random_state=42)

rf_under.fit(X_train_under, y_train_under)

# (accuracy on the resampled training data, accuracy on the untouched test split)
rf_under.score(X_train_under, y_train_under), rf_under.score(X_test, y_test)

In [None]:
# Class predictions of the under-sampled forest on the held-out test split.
rf_under_preds = rf_under.predict(X_test)

In [None]:
# Confusion-matrix heatmap and classification report for the under-sampled forest.
evaluation(y_test, rf_under_preds)

### Random Over-Sampler

In [None]:
# Random over-sampling picks samples at random, so it is seeded to make the
# resampled training set (and every score derived from it) reproducible.
# Only the training split is resampled; the test split is left untouched.
ros = RandomOverSampler(random_state=42)

X_train_over, y_train_over = ros.fit_resample(X_train, y_train)

In [None]:
# Forest trained on the randomly over-sampled training data.
# Seeded (same seed as the train/test split) so the scores are reproducible.
rf_over = RandomForestClassifier(random_state=42)

rf_over.fit(X_train_over, y_train_over)

# (accuracy on the resampled training data, accuracy on the untouched test split)
rf_over.score(X_train_over, y_train_over), rf_over.score(X_test, y_test)

In [None]:
# Class predictions of the over-sampled forest on the held-out test split.
rf_over_preds = rf_over.predict(X_test)

In [None]:
# Confusion-matrix heatmap and classification report for the over-sampled forest.
evaluation(y_test, rf_over_preds)

### SMOTE

In [None]:
# SMOTE generates synthetic samples using randomness, so it is seeded to make
# the resampled training set (and every score derived from it) reproducible.
# Only the training split is resampled; the test split is left untouched.
smo = SMOTE(random_state=42)

X_train_smote, y_train_smote = smo.fit_resample(X_train, y_train)

In [None]:
# Forest trained on the SMOTE-resampled training data.
# Seeded (same seed as the train/test split) so the scores are reproducible.
rf_smote = RandomForestClassifier(random_state=42)

rf_smote.fit(X_train_smote, y_train_smote)

# (accuracy on the resampled training data, accuracy on the untouched test split)
rf_smote.score(X_train_smote, y_train_smote), rf_smote.score(X_test, y_test)

In [None]:
# Class predictions of the SMOTE-trained forest on the held-out test split.
rf_smote_preds = rf_smote.predict(X_test)

In [None]:
# Confusion-matrix heatmap and classification report for the SMOTE-trained forest.
evaluation(y_test, rf_smote_preds)