## Random Forest Classifier

### 1) Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

### 2) Read in Data

In [None]:
## Read in CSV

def _read_trip(csv_path, make, source_file):
    """Load one semicolon-separated trip recording and tag it with its origin.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    make : str
        Value stored in the added "Make" column.
    source_file : int
        Value stored in the added "Source File" column.

    Returns
    -------
    pandas.DataFrame
        The file contents followed by the "Make" and "Source File" columns.
    """
    return pd.read_csv(csv_path, sep=';').assign(**{"Make": make, "Source File": source_file})


# NOTE: the "Opal_Corsa" label spelling is kept unchanged because the
# feature-encoding cell further down matches on this exact string.
opal_corsa_01 = _read_trip("opel_corsa_01.csv", "Opal_Corsa", 1)
opal_corsa_02 = _read_trip("opel_corsa_02.csv", "Opal_Corsa", 2)

peugeot_207_01 = _read_trip("peugeot_207_01.csv", "Peugeot_207", 1)
# Fixed: this load used to re-read peugeot_207_01.csv, which duplicated the
# first Peugeot trip and never loaded the second one.
# Assumes peugeot_207_02.csv sits next to the other three files - TODO confirm.
peugeot_207_02 = _read_trip("peugeot_207_02.csv", "Peugeot_207", 2)

dataset = [opal_corsa_01, opal_corsa_02, peugeot_207_01, peugeot_207_02]

# Stack the four recordings row-wise.
combined = pd.concat(dataset)

# The unnamed leading CSV column is used as the row identifier.
data_pre_cleaning = combined.rename(columns={'Unnamed: 0': 'ID'})

# Persist the merged table so later cells can start from a single file.
data_pre_cleaning.to_csv("pre_cleaning_dataset.csv", index=False)

In [None]:
# Reload the merged table and copy "Make" into a regular column ("Make_f"),
# appended as the last column of the frame.
data = pd.read_csv("pre_cleaning_dataset.csv").assign(Make_f=lambda frame: frame["Make"])

##### Add Index:

In [None]:
# Move the row identifier, car make and source-file number out of the columns
# and into a three-level row index.
# NOTE(review): `data` is overwritten with its own result, so re-running only
# this cell raises a KeyError (the columns are already consumed) - re-run the
# loading cell first.
data = data.set_index(["ID", "Make", "Source File"])

### 3) Data Cleaning

#####  Remove Null values (only 10 of them)

In [None]:
# (rows, columns) of the indexed table before any rows are dropped.
data.shape

In [None]:
#Before removing NAs, check that they don't belong to the "Aggressive" class, since there's a pretty big class imbalance

In [None]:
# Flag every row that contains at least one missing value and display those
# rows so they can be inspected before being dropped.
# (A per-column `data.isna().any()` summary used to be computed here, but its
# result was discarded because it was not the cell's last expression.)
is_na = data.isna()
row_has_NaN = is_na.any(axis=1)
rows_with_NaN = data.loc[row_has_NaN]
rows_with_NaN

In [None]:
# Discard every row that has a missing value in any column; `data` itself is
# left untouched.
data_na_removed = data.dropna(axis=0, how="any")
data_na_removed.shape

##### Feature Selection

Divide into x and y here

In [None]:
# Feature matrix: everything except the three label columns, with the car
# make strings replaced by integer codes (Opal_Corsa -> 0, Peugeot_207 -> 1).
X = (
    data_na_removed
    .drop(columns=["roadSurface", "traffic", "drivingStyle"])
    .replace('Opal_Corsa', 0)
    .replace('Peugeot_207', 1)
)

In [None]:
# Target vector: driving style encoded as 0 (EvenPaceStyle) / 1 (AggressiveStyle).
y = (
    data_na_removed["drivingStyle"]
    .replace('EvenPaceStyle', 0)
    .replace('AggressiveStyle', 1)
)

##### Look At Correlations

In [None]:
# Pairwise correlation matrix of the feature columns.
X.corr()

In [None]:
# Correlation matrix of the features, drawn as an annotated heatmap.
cor = X.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(data=cor, cmap="coolwarm", annot=True)
plt.show()

In [None]:
# Keep only coefficients whose magnitude exceeds 0.8; every other cell shows NaN.
cor[cor.abs() > 0.8]

Pairs with Correlation Higher than 0.8: 
- VehicleSpeedAverage & VehicleSpeedInstantaneous (0.877891)
- VehicleSpeedInstantaneous & MassAirFlow (0.829004)
- VehicleSpeedInstantaneous & EngineRPM (0.858727)
- ManifoldAbsolutePressure & MassAirFlow (0.869857) 
- MassAirFlow & EngineRPM (0.885939)

Check each pair using the SelectKBest Feature to see which is more valuable for prediction:

In [None]:
# Correlated pair: VehicleSpeedAverage vs. VehicleSpeedInstantaneous.
# Fit a k=1 selector on just these two columns; the survivor is the one with
# the higher f_classif score against y.
fs = SelectKBest(k=1, score_func=f_classif)
X_selected = fs.fit(X.loc[:, ["VehicleSpeedAverage", "VehicleSpeedInstantaneous"]], y)
cols = X_selected.get_support(indices=True)
X_new = X.loc[:, ["VehicleSpeedAverage", "VehicleSpeedInstantaneous"]].iloc[:, cols]
X_new.columns

In [None]:
# Correlated pair: MassAirFlow vs. VehicleSpeedInstantaneous.
# Fit a k=1 selector on just these two columns; the survivor is the one with
# the higher f_classif score against y.
fs = SelectKBest(k=1, score_func=f_classif)
X_selected = fs.fit(X.loc[:, ["MassAirFlow", "VehicleSpeedInstantaneous"]], y)
cols = X_selected.get_support(indices=True)
X_new = X.loc[:, ["MassAirFlow", "VehicleSpeedInstantaneous"]].iloc[:, cols]
X_new.columns

In [None]:
# Correlated pair: EngineRPM vs. VehicleSpeedInstantaneous.
# Fit a k=1 selector on just these two columns; the survivor is the one with
# the higher f_classif score against y.
fs = SelectKBest(k=1, score_func=f_classif)
X_selected = fs.fit(X.loc[:, ["EngineRPM", "VehicleSpeedInstantaneous"]], y)
cols = X_selected.get_support(indices=True)
X_new = X.loc[:, ["EngineRPM", "VehicleSpeedInstantaneous"]].iloc[:, cols]
X_new.columns

In [None]:
# Correlated pair: ManifoldAbsolutePressure vs. MassAirFlow.
# Fit a k=1 selector on just these two columns; the survivor is the one with
# the higher f_classif score against y.
fs = SelectKBest(k=1, score_func=f_classif)
X_selected = fs.fit(X.loc[:, ["ManifoldAbsolutePressure", "MassAirFlow"]], y)
cols = X_selected.get_support(indices=True)
X_new = X.loc[:, ["ManifoldAbsolutePressure", "MassAirFlow"]].iloc[:, cols]
X_new.columns

In [None]:
# Correlated pair: EngineRPM vs. MassAirFlow.
# Fit a k=1 selector on just these two columns; the survivor is the one with
# the higher f_classif score against y.
fs = SelectKBest(k=1, score_func=f_classif)
X_selected = fs.fit(X.loc[:, ["EngineRPM", "MassAirFlow"]], y)
cols = X_selected.get_support(indices=True)
X_new = X.loc[:, ["EngineRPM", "MassAirFlow"]].iloc[:, cols]
X_new.columns

Propose Two Datasets: 
- Keep VehicleSpeedInstantaneous, ManifoldAbsolutePressure and remove VehicleSpeedAverage, MassAirFlow, EngineRPM
- Remove VehicleSpeedInstantaneous, MassAirFlow and keep EngineRPM, VehicleSpeedAverage, ManifoldAbsolutePressure

##### Feature Importance using SelectKBest

In [None]:
# Two candidate feature sets, each dropping a different side of the
# highly correlated pairs.
X_1 = X.drop(columns=['VehicleSpeedAverage', 'MassAirFlow', 'EngineRPM'])
X_2 = X.drop(columns=['VehicleSpeedInstantaneous', 'MassAirFlow'])

# --- X_1: keep the 9 columns with the best f_classif score against y ---
fs = SelectKBest(k=9, score_func=f_classif)
X_1_selected = fs.fit(X_1, y)                    # fit() returns the fitted selector
cols = X_1_selected.get_support(indices=True)    # positions of the kept columns
X_1_reduced = X_1.iloc[:, cols]

# --- X_2: same procedure with a fresh selector ---
fs = SelectKBest(k=9, score_func=f_classif)
X_2_selected = fs.fit(X_2, y)
cols = X_2_selected.get_support(indices=True)
X_2_reduced = X_2.iloc[:, cols]


In [None]:
# Names of the columns retained for the first candidate feature set.
X_1_reduced.columns

In [None]:
# Names of the columns retained for the second candidate feature set.
X_2_reduced.columns

##### PCA 

In [None]:
#from sklearn.decomposition import PCA
#scaler = MinMaxScaler()
#X_1_reduced_scaled = scaler.fit_transform(X_1_reduced)
#pca = PCA(n_components=3)
#X_1_pca = pca.fit_transform(X_1_reduced_scaled)
##X_1_pca_values = pd.DataFrame(data = principalComponents)
#
#X_2_reduced_scaled = scaler.fit_transform(X_2_reduced)
#pca = PCA(n_components=3)
#X_2_pca = pca.fit_transform(X_2_reduced_scaled)
##X_2_pca_values = pd.DataFrame(data = principalComponents)

In [None]:
# Min-max scale both reduced feature sets; fit_transform returns NumPy arrays.
# The same scaler object is refit by the second call, so afterwards `scaler`
# only holds the statistics learned from X_2_reduced.
# NOTE(review): the scaler is fitted on all rows, before the train/test split -
# confirm this is acceptable for the evaluation.
scaler = MinMaxScaler()
X_1_reduced_scaled = scaler.fit_transform(X_1_reduced)
X_2_reduced_scaled = scaler.fit_transform(X_2_reduced)

In [None]:
#scaler = MinMaxScaler()
#opal_1 = X_1_reduced[(X_1_reduced["Make_f"] == 0) & (X["source_file"] == 1)].drop("Make_f", axis = 1)
#X_1_reduced_scaled_0_1 = scaler.fit_transform(opal_1)
#opal_2 = X_1_reduced[(X_1_reduced["Make_f"] == 0) & (X["source_file"] == 2)].drop("Make_f", axis = 1)
#X_1_reduced_scaled_0_2 = scaler.fit_transform(opal_2)
#
#X_1_reduced_scaled_0 = np.concatenate((X_1_reduced_scaled_0_1, X_1_reduced_scaled_0_2))
#X_1_reduced_scaled_0 = np.append(X_1_reduced_scaled_0, np.array( X_1_reduced["Make_f"][X_1_reduced["Make_f"] == 0]).reshape(-1,1), axis = 1)
#

In [None]:
#scaler = MinMaxScaler()
#peugeot_1 = X_1_reduced[(X_1_reduced["Make_f"] == 1) & (X["source_file"] == 1)].drop("Make_f", axis = 1)
#X_1_reduced_scaled_1_1 = scaler.fit_transform(peugeot_1)
#peugeot_2 = X_1_reduced[(X_1_reduced["Make_f"] == 1) & (X["source_file"] == 2)].drop("Make_f", axis = 1)
#X_1_reduced_scaled_1_2 = scaler.fit_transform(peugeot_2)
#
#X_1_reduced_scaled_1 = np.concatenate((X_1_reduced_scaled_1_1, X_1_reduced_scaled_1_2))
#X_1_reduced_scaled_1 = np.append(X_1_reduced_scaled_1, np.array( X_1_reduced["Make_f"][X_1_reduced["Make_f"] == 1]).reshape(-1,1), axis = 1)
#

In [None]:
#X_1_reduced_scaled = np.concatenate((X_1_reduced_scaled_0, X_1_reduced_scaled_1))

In [None]:
def runRF(X_train, X_test, y_train, y_test, cv_folds = 5):
    """Grid-search a random forest on the training split and report test results.

    A RandomForestClassifier is tuned with GridSearchCV over `max_features`,
    `max_depth` and `criterion`; the refitted best model then predicts the
    test split. A labelled confusion matrix and the classification report are
    printed.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Labels for the two splits, encoded as 0 (even pace) / 1 (aggressive).
    cv_folds : int, default 5
        Number of cross-validation folds used by the grid search.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # max_depth here is only a starting value; the grid below overrides it.
    clf = RandomForestClassifier(max_depth=5, random_state=45)
    param_grid = {
        # 'auto' was dropped: for classifiers it was an alias of 'sqrt', and
        # scikit-learn >= 1.3 rejects it.
        'max_features': ['sqrt', 'log2'],
        'max_depth': [4, 5, 6, 7, 8],
        'criterion': ['gini', 'entropy'],
    }
    # Honour the caller's fold count (this was hard-coded to 5 before).
    CV_rfc = GridSearchCV(estimator=clf, param_grid=param_grid, cv=cv_folds)
    CV_rfc.fit(X_train, y_train)
    rf_predict = CV_rfc.predict(X_test)

    # Rows are true classes, columns are predicted classes, in label order [0, 1].
    tab = confusion_matrix(y_test, rf_predict, labels=[0, 1])
    confusion = pd.DataFrame(tab, index=['is_even', 'is_aggressive'],
                             columns=['predicted_even', 'predicted_aggressive'])
    print(confusion)
    print(classification_report(y_test, rf_predict))

### Training the model

##### Using the first set of variables X_1

In [None]:
# Hold out 10% of the rows for testing. The split is stratified on y because
# the notebook notes a class imbalance: without it, the small test set may not
# reflect the class proportions of the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X_1_reduced_scaled, y, test_size=0.1, random_state=55, stratify=y
)

In [None]:
# Tune and evaluate the random forest on the X_1 split; prints the confusion
# matrix and classification report for the held-out rows.
runRF(X_train, X_test, y_train, y_test)