In [2]:
#Import all necessary packages to solve these problems

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive



Mounted at /gdrive
/gdrive


In [3]:
#Import Insurance Fraud Training Data and create dataframe for it
TrainingFile = r'/gdrive/My Drive/Projects/InsuranceFraudDetection/FraudDetectionTrain.csv'
TrainingData = pd.read_csv(TrainingFile) 

#Import Insurance Fraud Test Data and create dataframe for it
TestFile = r'/gdrive/My Drive/Projects/InsuranceFraudDetection/FraudDetectionTest.csv'
TestData = pd.read_csv(TestFile)  

print(TrainingData.shape)
print(TestData.shape)

(2999, 32)
(12918, 32)


In [4]:
#Brief overview of datatypes for our training data

TrainingData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   MONTH                 2999 non-null   object
 1   WEEKOFMONTH           2999 non-null   int64 
 2   DAYOFWEEK             2999 non-null   object
 3   MAKE                  2999 non-null   object
 4   ACCIDENTAREA          2999 non-null   object
 5   DAYOFWEEKCLAIMED      2999 non-null   object
 6   MONTHCLAIMED          2999 non-null   object
 7   WEEKOFMONTHCLAIMED    2999 non-null   int64 
 8   SEX                   2999 non-null   object
 9   MARITALSTATUS         2999 non-null   object
 10  AGE                   2999 non-null   int64 
 11  FAULT                 2999 non-null   object
 12  POLICYTYPE            2999 non-null   object
 13  VEHICLECATEGORY       2999 non-null   object
 14  VEHICLEPRICE          2999 non-null   object
 15  REPNUMBER             2999 non-null   

In [5]:
#Using describe function to identify numeric features, See if theres a python function to change features from numeric to categorical

TrainingData.describe()

Unnamed: 0,WEEKOFMONTH,WEEKOFMONTHCLAIMED,AGE,REPNUMBER,DEDUCTIBLE,DRIVERRATING,YEAR
count,2999.0,2999.0,2999.0,2999.0,2999.0,2999.0,2999.0
mean,2.78126,2.671224,40.055352,8.511837,407.302434,2.496832,1995.114038
std,1.286055,1.261614,13.497026,4.601437,41.847258,1.118365,0.606007
min,1.0,1.0,0.0,1.0,300.0,1.0,1994.0
25%,2.0,2.0,31.0,4.0,400.0,1.0,1995.0
50%,3.0,3.0,38.0,9.0,400.0,2.0,1995.0
75%,4.0,4.0,49.0,12.0,400.0,3.0,1995.0
max,5.0,5.0,80.0,16.0,700.0,4.0,1996.0


In [6]:
# Evaluating how balanced our training dataset is

cases=TrainingData["FRAUDFOUND"].value_counts()
non_fraud=np.round(100*cases[0]/(cases[0]+cases[1]),2)
fraud=np.round(100*cases[1]/(cases[0]+cases[1]),2)

print('Percentage of non-fraud cases is', non_fraud, '%')
print('Percentage of fraud cases', fraud, '%')

Percentage of non-fraud cases is 86.7 %
Percentage of fraud cases 13.3 %


In [7]:
# Select features for training dataset
XTrain = TrainingData.copy()
del XTrain['FRAUDFOUND']
print(XTrain.shape)

# Identify target column for training dataset
YTrain = TrainingData[['FRAUDFOUND']].copy()
print(YTrain.shape)

# Select same features as above for test dataset
XTest = TestData.copy()
del XTest['FRAUDFOUND']
print(XTest.shape)

#Identify target column for test dataset
YTest = TestData[['FRAUDFOUND']].copy()
print(YTest.shape)

(2999, 31)
(2999, 1)
(12918, 31)
(12918, 1)


In [8]:
# Identify categorical features to be One Hot Encoded in next step

CatFeatures = ['MONTH', 'DAYOFWEEK', 'MAKE', 'ACCIDENTAREA', 'DAYOFWEEKCLAIMED', 'MONTHCLAIMED', 'SEX', 'MARITALSTATUS', 'FAULT', 'POLICYTYPE', 'VEHICLECATEGORY', 'VEHICLEPRICE', 'DAYS_POLICY_ACCIDENT', 'DAYS_POLICY_CLAIM', 'PASTNUMBEROFCLAIMS', 'AGEOFVEHICLE', 'AGEOFPOLICYHOLDER', 'POLICEREPORTFILED', 'WITNESSPRESENT', 'AGENTTYPE', 'NUMBEROFSUPPLIMENTS', 'ADDRESSCHANGE_CLAIM', 'NUMBEROFCARS', 'BASEPOLICY']


In [9]:
# Perform OHE on Training Data categorical features

OneHot = OneHotEncoder(handle_unknown='ignore',sparse=False)
XCategories = pd.DataFrame(OneHot.fit_transform(XTrain[CatFeatures]),columns=OneHot.get_feature_names(),index=XTrain.index)
XTrain = pd.concat([XTrain,XCategories],axis=1)
XTrain.drop(labels=CatFeatures,axis=1,inplace=True)
XTrain.head(5)

Unnamed: 0,WEEKOFMONTH,WEEKOFMONTHCLAIMED,AGE,REPNUMBER,DEDUCTIBLE,DRIVERRATING,YEAR,x0_Apr,x0_Aug,x0_Dec,x0_Feb,x0_Jan,x0_Jul,x0_Jun,x0_Mar,x0_May,x0_Nov,x0_Oct,x0_Sep,x1_Friday,x1_Monday,x1_Saturday,x1_Sunday,x1_Thursday,x1_Tuesday,x1_Wednesday,x2_Accura,x2_BMW,x2_Chevrolet,x2_Dodge,x2_Ford,x2_Honda,x2_Mazda,x2_Mecedes,x2_Mercury,x2_Nisson,x2_Pontiac,x2_Porche,x2_Saab,x2_Saturn,...,x14_none,x15_2_years,x15_3_years,x15_4_years,x15_5_years,x15_6_years,x15_7_years,x15_more_than_7,x15_new,x16_16_to_17,x16_18_to_20,x16_21_to_25,x16_26_to_30,x16_31_to_35,x16_36_to_40,x16_41_to_50,x16_51_to_65,x16_over_65,x17_No,x17_Yes,x18_No,x18_Yes,x19_External,x19_Internal,x20_1_to_2,x20_3_to_5,x20_more_than_5,x20_none,x21_1_year,x21_2_to_3_years,x21_4_to_8_years,x21_no_change,x21_under_6_months,x22_1-vehicle,x22_2-vehicles,x22_3_to_4,x22_5_to_8,x23_All_Perils,x23_Collision,x23_Liability
0,3,4,21,4,400,4,1994,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,5,1,68,9,400,3,1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,1,50,8,400,2,1994,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1,3,39,1,400,3,1994,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,5,1,43,1,400,4,1994,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
# Perform OHE on Test Data categorical features

XCategories = pd.DataFrame(OneHot.transform(XTest[CatFeatures]),columns=OneHot.get_feature_names(),index=XTest.index)
XTest = pd.concat([XTest,XCategories],axis=1)
XTest.drop(labels=CatFeatures,axis=1,inplace=True)
XTest.head(5)

Unnamed: 0,WEEKOFMONTH,WEEKOFMONTHCLAIMED,AGE,REPNUMBER,DEDUCTIBLE,DRIVERRATING,YEAR,x0_Apr,x0_Aug,x0_Dec,x0_Feb,x0_Jan,x0_Jul,x0_Jun,x0_Mar,x0_May,x0_Nov,x0_Oct,x0_Sep,x1_Friday,x1_Monday,x1_Saturday,x1_Sunday,x1_Thursday,x1_Tuesday,x1_Wednesday,x2_Accura,x2_BMW,x2_Chevrolet,x2_Dodge,x2_Ford,x2_Honda,x2_Mazda,x2_Mecedes,x2_Mercury,x2_Nisson,x2_Pontiac,x2_Porche,x2_Saab,x2_Saturn,...,x14_none,x15_2_years,x15_3_years,x15_4_years,x15_5_years,x15_6_years,x15_7_years,x15_more_than_7,x15_new,x16_16_to_17,x16_18_to_20,x16_21_to_25,x16_26_to_30,x16_31_to_35,x16_36_to_40,x16_41_to_50,x16_51_to_65,x16_over_65,x17_No,x17_Yes,x18_No,x18_Yes,x19_External,x19_Internal,x20_1_to_2,x20_3_to_5,x20_more_than_5,x20_none,x21_1_year,x21_2_to_3_years,x21_4_to_8_years,x21_no_change,x21_under_6_months,x22_1-vehicle,x22_2-vehicles,x22_3_to_4,x22_5_to_8,x23_All_Perils,x23_Collision,x23_Liability
0,3,4,21,4,400,4,1994,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,5,1,68,9,400,3,1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,1,50,8,400,2,1994,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1,3,39,1,400,3,1994,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,5,1,43,1,400,4,1994,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
# Instantiate Decision Tree Model and fit Training Data to it
dt = DecisionTreeClassifier()
dt.fit(XTrain, YTrain)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [12]:
## Instantiate Random Forest Model and fit Training Data to it

rf = RandomForestClassifier()
rf.fit(XTrain, YTrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [13]:
#Set up predictions for decision tree training and test data
X_Prediction_Test = dt.predict(XTest)
X_Prediction_Train = dt.predict(XTrain)

#Evaluate accuracy of decision tree training and test data
print("Accuracy of Training is - ", metrics.accuracy_score(YTrain,X_Prediction_Train))
print("Accuracy of Test is -", metrics.accuracy_score(YTest,X_Prediction_Test))

#Review Confusion Matrix
print("Confusion Matrix - Decision Tree")
print(confusion_matrix(YTest,X_Prediction_Test))
print("Max Depth",dt.get_depth())
print("Leaf",dt.get_n_leaves())
print('Additional important metrics')
print(metrics.classification_report(YTest, X_Prediction_Test))

Accuracy of Training is -  1.0
Accuracy of Test is - 0.8823347267378852
Confusion Matrix - Decision Tree
[[10955  1465]
 [   55   443]]
Max Depth 23
Leaf 287
Additional important metrics
              precision    recall  f1-score   support

          No       1.00      0.88      0.94     12420
         Yes       0.23      0.89      0.37       498

    accuracy                           0.88     12918
   macro avg       0.61      0.89      0.65     12918
weighted avg       0.97      0.88      0.91     12918



In [14]:
#Set up predictions for random forest training and test data
X_Prediction_Test_RF = rf.predict(XTest)
X_Prediction_Train_RF = rf.predict(XTrain)

#Evaluate accuracy of random forest training and test data
print("Accuracy of Training is - ", metrics.accuracy_score(YTrain,X_Prediction_Train_RF))
print("Accuracy of Test is - ", metrics.accuracy_score(YTest,X_Prediction_Test_RF))

#Review confusion matrix
print("Confusion Matrix - Random Forest")
print(confusion_matrix(YTest,X_Prediction_Test_RF))
print('Additional important metrics')
print(metrics.classification_report(YTest, X_Prediction_Test_RF))

Accuracy of Training is -  1.0
Accuracy of Test is -  0.9632296021055891
Confusion Matrix - Random Forest
[[12022   398]
 [   77   421]]
Additional important metrics
              precision    recall  f1-score   support

          No       0.99      0.97      0.98     12420
         Yes       0.51      0.85      0.64       498

    accuracy                           0.96     12918
   macro avg       0.75      0.91      0.81     12918
weighted avg       0.98      0.96      0.97     12918



Decision Tree HyperParameter Tuning


In [92]:
#Perform Random Search for decision tree

import time
start_time = time.time()

print("Random Search - Decision Tree")
param={'max_leaf_nodes' : range(10,100,10),'max_depth': 
            range(4,10,1),'criterion':['gini','entropy']}
dt_random = RandomizedSearchCV(dt,param,n_iter=10,cv=5)
dt_random.fit(XTrain, YTrain)
grid_parameters=dt_random.best_params_
print(grid_parameters)
print("Decision Tree accuracy is --> :{0:6f}".
      format(dt_random.score(XTest,YTest)))

print("--- %s seconds to compute ---" % (time.time() - start_time))

Random Search - Decision Tree
{'max_leaf_nodes': 50, 'max_depth': 4, 'criterion': 'gini'}
Decision Tree accuracy is --> :0.925608
--- 1.07527494430542 seconds to compute ---


In [93]:
#Perform Grid Search for decision tree

import time
start_time = time.time()

print("Decision Tree Grid Search - ")
dt_grid = GridSearchCV(dt,param)
dt_grid.fit(XTrain, YTrain)
grid_parameters1=dt_grid.best_params_
print(grid_parameters1)
print("accuracy Score for Decision Tree -->{0:6f}".
      format(dt_grid.score(XTest,YTest)))

print("--- %s seconds to compute ---" % (time.time() - start_time))

Decision Tree Grid Search - 
{'criterion': 'gini', 'max_depth': 4, 'max_leaf_nodes': 10}
accuracy Score for Decision Tree -->0.929246
--- 10.99505615234375 seconds to compute ---


In [109]:
#Utilizing new parameters in the decision tree
dtRandomSearch = DecisionTreeClassifier(**grid_parameters)
dtGridSearch = DecisionTreeClassifier(**grid_parameters1)

#Fitting random search hypertuned parameters to model
dtRandomSearch.fit(XTrain,YTrain)
dtRandomSearch_Pred = dtRandomSearch.predict(XTest)

#Fitting grid search hypertuned parameters to model
dtGridSearch.fit(XTrain,YTrain)
dtGridSearch_Pred = dtGridSearch.predict(XTest)

In [110]:
# Evaluating accuracy of model using new hypertuned parameters (Random Search)
print("Accuracy of Random Search -", metrics.accuracy_score(YTest,dtRandomSearch_Pred))
print("Confusion Matrix of Random Search Decision Tree Model")
print(confusion_matrix(YTest,dtRandomSearch_Pred))
print('Additional important metrics')
print(metrics.classification_report(YTest, dtRandomSearch_Pred))

#Evaluating Cross Validation Score
CrossValidation_Score = cross_val_score(dtRandomSearch, XTrain, YTrain, cv=5, scoring="balanced_accuracy")
print(CrossValidation_Score)

Accuracy of Random Search - 0.9254528564793312
Confusion Matrix of Random Search Decision Tree Model
[[11813   607]
 [  356   142]]
Additional important metrics
              precision    recall  f1-score   support

          No       0.97      0.95      0.96     12420
         Yes       0.19      0.29      0.23       498

    accuracy                           0.93     12918
   macro avg       0.58      0.62      0.59     12918
weighted avg       0.94      0.93      0.93     12918

[0.77451923 0.75913462 0.57403846 0.49807692 0.58668452]


In [96]:
# Evaluating accuracy of model using new hypertuned parameters (Grid Search)
print("Accuracy of Grid Search -", metrics.accuracy_score(YTest,dtGridSearch_Pred))
print("Confusion Matrix of Grid Search Decision Tree Model")
print(confusion_matrix(YTest,dtGridSearch_Pred))
print('Additional Important Metrics')
print(metrics.classification_report(YTest, dtGridSearch_Pred))

#Evaluating Cross Validation Score
CrossValidation_Score = cross_val_score(dtGridSearch, XTrain, YTrain, cv=5, scoring="balanced_accuracy")
print(CrossValidation_Score)

Accuracy of Grid Search - 0.9292460133147546
Confusion Matrix of Grid Search Decision Tree Model
[[11865   555]
 [  359   139]]
Additional Important Metrics
              precision    recall  f1-score   support

          No       0.97      0.96      0.96     12420
         Yes       0.20      0.28      0.23       498

    accuracy                           0.93     12918
   macro avg       0.59      0.62      0.60     12918
weighted avg       0.94      0.93      0.93     12918

[0.76201923 0.75961538 0.57403846 0.49903846 0.58668452]


Random Forest: Random & Grid Search

In [270]:
#Perform Random Search for Random Forest
import time
start_time = time.time()

print("Random Forest - Random Search")
Random_Param={'n_estimators' : range(8,12,1),'max_depth': 
            range(100, 200, 10), 'max_features':range(10,30,5)}
RF_RandomSearch = RandomizedSearchCV(rf,Random_Param,n_iter=10,cv=5)

#Fitting new parameters to Random Forest Model
RF_RandomSearch.fit(XTrain, YTrain)
grid_parameters=RF_RandomSearch.best_params_
print(grid_parameters)
print((RF_RandomSearch.score(XTest,YTest)))

print("--- %s seconds ---" % (time.time() - start_time))

Random Forest - Random Search
{'n_estimators': 10, 'max_features': 25, 'max_depth': 120}
0.9400836042731073
--- 2.877451181411743 seconds ---


In [271]:
import time
start_time = time.time()

print("GridSearchCV-Random Forest")
rf_grid = GridSearchCV(rf,Random_Param)
rf_grid.fit(XTrain, YTrain)
grid_parameters1=rf_grid.best_params_
print(grid_parameters1)
print("accuracy Score for Decision Tree:{0:6f}".
      format(rf_grid.score(XTest,YTest)))

print("--- %s seconds ---" % (time.time() - start_time))

GridSearchCV-Random Forest
{'max_depth': 100, 'max_features': 25, 'n_estimators': 10}
accuracy Score for Decision Tree:0.935903
--- 36.55627393722534 seconds ---


In [272]:
#Using the parameters obtained from HyperParameterTuning in the RandomForestClassifier 
rfRandomSearch = RandomForestClassifier(**grid_parameters)
rfGridSearch = RandomForestClassifier(**grid_parameters1)

rfRandomSearch.fit(XTrain,YTrain)
rfRandomSearch_Prediction = rfRandomSearch.predict(XTest)
rfGridSearch.fit(XTrain,YTrain)
rfGridSearch_Prediction = rfGridSearch.predict(XTest)

In [273]:
# Accuracy for Random Forest using Random Search CV for Hyperparameter Tuning

print("Test Accuracy:", metrics.accuracy_score(YTest,rfRandomSearch_Prediction))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(YTest,rfRandomSearch_Prediction))
print('Other important metrics')
print(metrics.classification_report(YTest, rfRandomSearch_Prediction))

# Cross Validation evaluation
CrossValidation_Score = cross_val_score(rfRandomSearch, XTrain, YTrain, cv=5, scoring="balanced_accuracy")
print(CrossValidation_Score)

Test Accuracy: 0.9434897042885896
Confusion Matrix for Random Forest:
[[11797   623]
 [  107   391]]
Other important metrics
              precision    recall  f1-score   support

          No       0.99      0.95      0.97     12420
         Yes       0.39      0.79      0.52       498

    accuracy                           0.94     12918
   macro avg       0.69      0.87      0.74     12918
weighted avg       0.97      0.94      0.95     12918

[0.67211538 0.65769231 0.56394231 0.50865385 0.55944499]


In [274]:
# Accuracy for Random Forest using Grid Search for Hyperparameter Tuning

print("Test Accuracy:", metrics.accuracy_score(YTest,rfGridSearch_Prediction))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(YTest,rfGridSearch_Prediction))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(YTest, rfGridSearch_Prediction))

# Cross Validation evaluation
CrossValidation_Score = cross_val_score(rfGridSearch, XTrain, YTrain, cv=5, scoring="balanced_accuracy")
print(CrossValidation_Score)

Test Accuracy: 0.940625483821025
Confusion Matrix for Random Forest:
[[11757   663]
 [  104   394]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          No       0.99      0.95      0.97     12420
         Yes       0.37      0.79      0.51       498

    accuracy                           0.94     12918
   macro avg       0.68      0.87      0.74     12918
weighted avg       0.97      0.94      0.95     12918

[0.69375    0.69759615 0.55336538 0.49519231 0.59549659]
