Initialization

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import datasets
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from imblearn.under_sampling import NearMiss
from sklearn.model_selection import cross_validate
from imblearn.ensemble import EasyEnsembleClassifier
from collections import Counter

# Load the dataset through the Colab upload widget.
# When running outside Colab, read the file from disk instead:
#   data = pd.read_csv('carclaims.csv')
import io

from google.colab import files

uploaded = files.upload()
# The uploaded content is looked up by file name and wrapped in an in-memory
# buffer so pandas can parse it like a regular file.
csv_buffer = io.BytesIO(uploaded['carclaims.csv'])
data = pd.read_csv(csv_buffer)

# Encode the target variable (FraudFound) as 1/0 and separate it from the features.
data['FraudFound'] = data['FraudFound'].replace({'Yes': 1, 'No': 0})
y = data['FraudFound']
data = data.drop(columns=['FraudFound'])

# Columns that are one-hot encoded before modelling.
categorical_columns = [
    'Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 'MonthClaimed',
    'Sex', 'MaritalStatus', 'Fault', 'PolicyType', 'VehicleCategory', 'VehiclePrice',
    'Days:Policy-Accident', 'Days:Policy-Claim', 'PastNumberOfClaims', 'AgeOfVehicle',
    'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType',
    'NumberOfSuppliments', 'AddressChange-Claim', 'NumberOfCars', 'BasePolicy',
]
X = pd.get_dummies(data, columns=categorical_columns)

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

Saving carclaims.csv to carclaims.csv


Pure approach, no class imbalance taken into consideration

In [None]:

# Display labels and the matching estimators; later cells reuse both lists.
names = ['Random Forest', 'Naive Bayes', 'Linear SVM']
classifiers = [
    RandomForestClassifier(n_estimators=100, random_state=0),
    GaussianNB(),
    SVC(kernel='linear'),
]

# Train every classifier on the untouched (imbalanced) training split and
# report its per-class metrics on the held-out test split.
for name, clf in zip(names, classifiers):
    print(name)
    # fit() returns the fitted estimator itself, so `model` is `clf`.
    model = clf.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))


Random Forest
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      2899
           1       1.00      0.02      0.03       185

    accuracy                           0.94      3084
   macro avg       0.97      0.51      0.50      3084
weighted avg       0.94      0.94      0.91      3084

Naive Bayes
              precision    recall  f1-score   support

           0       0.97      0.75      0.85      2899
           1       0.15      0.66      0.24       185

    accuracy                           0.75      3084
   macro avg       0.56      0.71      0.54      3084
weighted avg       0.92      0.75      0.81      3084

Linear SVM
              precision    recall  f1-score   support

           0       0.94      0.99      0.97      2899
           1       0.08      0.01      0.02       185

    accuracy                           0.93      3084
   macro avg       0.51      0.50      0.49      3084
weighted avg       0.89      0.93    

Recall (sensitivity) for the minority class (FraudFound = 1) is very poor for Random Forest (0.02) and Linear SVM (0.01): these models fail to detect the vast majority of fraudulent claims. Naive Bayes does better (0.66), but only at the cost of very low precision.

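Precision for the minority class is likewise poor for Naive Bayes (0.15) and Linear SVM (0.08), indicating that the majority of claims predicted as fraudulent are not actually fraudulent. Random Forest's minority-class precision of 1.00 is not meaningful, since its recall of 0.02 shows that it flags almost no claims at all.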

Because of the low recall and/or precision, the F1-score for the minority class is low for all classifiers.

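The overall accuracy of all classifiers is relatively high (0.75–0.94), which is misleading in this situation due to the unbalanced dataset: always predicting "no fraud" would already be correct for about 94% of the test claims (2899 of 3084).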

All classifiers had low macro-averaged recall and F1-score (and, apart from Random Forest, low macro-averaged precision), indicating poor performance on the minority class, again due to data imbalance.


Synthetic Oversampling

In [None]:

#Synthetic Oversampling
# For every classifier, compare three training setups on the same test split:
#   1. the classifier as configured (baseline),
#   2. the classifier with class_weight='balanced',
#   3. the classifier trained on SMOTE-oversampled data.
from sklearn.base import clone

for name, clf in zip(names, classifiers):
    # 1. Baseline: the classifier alone, wrapped in a pipeline.
    pipeline = make_pipeline(clf)
    pipeline.fit(X_train, y_train)

    #classify and report the results
    print(name)
    print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))

    # 2. Balanced class weights. The weights are set on a clone so that the
    # shared estimator in `classifiers` keeps its original configuration for
    # the SMOTE run below and for any later cell that reuses the list.
    print(name, "with balanced weights")
    if 'class_weight' in clf.get_params():
        balanced_clf = clone(clf).set_params(class_weight='balanced')
        pipeline = make_pipeline(balanced_clf)
        pipeline.fit(X_train, y_train)
        print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))
    else:
        # Estimators without a class_weight parameter (GaussianNB here) cannot
        # be re-weighted this way; say so rather than repeating the baseline.
        print("skipped: %s has no class_weight parameter" % type(clf).__name__)

    # 3. SMOTE: the sampler is a pipeline step, so it resamples the data
    # passed to fit() only; predict() on the test split is not resampled.
    pipeline = make_pipeline(SMOTE(random_state=3, k_neighbors=5),
                             clf)
    pipeline.fit(X_train, y_train)

    print(name, " with SMOTE")
    print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))


Random Forest
                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      1.00      0.02      0.97      0.13      0.02      2899
          1       1.00      0.02      1.00      0.03      0.13      0.01       185

avg / total       0.94      0.94      0.08      0.91      0.13      0.02      3084

Random Forest with balanced weights
                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      1.00      0.02      0.97      0.13      0.02      2899
          1       1.00      0.02      1.00      0.03      0.13      0.01       185

avg / total       0.94      0.94      0.08      0.91      0.13      0.02      3084

Random Forest  with SMOTE
                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      1.00      0.01      0.97      0.10      0.01      2899
          1       1.00      0.01      1.00      0.02      0.10      0.01       185

avg

The metrics here suggest that the Random Forest model without any resampling performs the best. It has the highest weighted-average F1-score of all models, which is 0.91. Although the Naive Bayes model with SMOTE has the greatest recall score for the minority class, it has a lower overall F1-score than the Random Forest model without any resampling strategies. The Linear SVM models underperform the others, with poor recall scores for the minority class.

However, if each model is assessed individually, SMOTE and weight balancing seem to slightly improve several metrics, for instance the average F1-score.

NearMiss


In [None]:
#NearMiss
# Train every classifier on data undersampled by each of the three NearMiss
# variants and report imbalance-aware metrics on the untouched test split.
for name, clf in zip(names, classifiers):
    # NearMiss-1: fit first, then print the header and the report.
    pipeline = make_pipeline(NearMiss(version=1), clf).fit(X_train, y_train)
    print(name, "with near miss 1")
    predicted = pipeline.predict(X_test)
    print(classification_report_imbalanced(y_test, predicted))

    # NearMiss-2: same order as above.
    pipeline = make_pipeline(NearMiss(version=2), clf).fit(X_train, y_train)
    print(name, "with near miss 2")
    predicted = pipeline.predict(X_test)
    print(classification_report_imbalanced(y_test, predicted))

    # NearMiss-3: the header is printed before fitting (see issue in GitHub).
    print(name, "with near miss 3")
    pipeline = make_pipeline(NearMiss(version=3, n_neighbors_ver3=3), clf).fit(X_train, y_train)
    predicted = pipeline.predict(X_test)
    print(classification_report_imbalanced(y_test, predicted))

Random Forest with near miss 1
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.62      0.88      0.76      0.74      0.53      2899
          1       0.13      0.88      0.62      0.22      0.74      0.56       185

avg / total       0.94      0.64      0.86      0.73      0.74      0.53      3084

Random Forest with near miss 2
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.29      0.95      0.45      0.52      0.26      2899
          1       0.08      0.95      0.29      0.15      0.52      0.29       185

avg / total       0.93      0.33      0.91      0.43      0.52      0.26      3084

Random Forest with near miss 3
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.62      0.88      0.76      0.74      0.54      2899
          1       0.13      0.88      0.62      0.23      0.74      0.

According to the results, the Random Forest model with NearMiss 1 seems to be the best performing model. It has the greatest majority-class precision (0.99) and average F1-score (0.73) of any model, and it identifies most of the positive class (fraud recall of 0.88), although its fraud precision of 0.13 shows that this comes with many false positives.

With a precision of 0.95 and an F1-score of 0.83, the Naive Bayes model employing NearMiss 1 also fared rather well. However, its recall score (0.78) is lower than that of the Random Forest model with NearMiss 1 (0.64), suggesting that it may be missing some genuine positives.

NearMiss1 & 3 seem to generally outperform NearMiss2 in several metrics, for instance, the F1 avg score.



EasyEnsemble

In [None]:
    # 10-fold cross-validated comparison of the plain classifiers against
    # EasyEnsemble, scored with accuracy and balanced accuracy.
    scoring = ['accuracy', 'balanced_accuracy']

    # [estimator, display label] pairs, evaluated in this order.
    algs = []
    # Seeded so the forest's scores are reproducible between runs.
    rf = RandomForestClassifier(random_state=0)
    algs.append([rf, "Random Forrest"])
    nb = GaussianNB()
    algs.append([nb, "Naive Bayes"])
    # probability=True is deliberately not set: both scorers only call
    # predict(), while probability estimates cost an additional internal
    # 5-fold cross-validation on every fit.
    svm = SVC(kernel='linear')
    algs.append([svm, "Linear SVM"])
    ee = EasyEnsembleClassifier(random_state=42)
    algs.append([ee, "Easy Ensemble"])

    for estimator, label in algs:
        print("\n" + label)
        scores = cross_validate(estimator, X, y, scoring=scoring, cv=10, return_train_score=False)
        for metric in scoring:
            # cross_validate stores the per-fold test scores under "test_<metric>".
            fold_scores = scores["test_" + metric]
            print("%s: %0.2f (+/- %0.2f)" % (metric, fold_scores.mean(), fold_scores.std()))


Random Forrest
accuracy: 0.93 (+/- 0.02)
balanced_accuracy: 0.51 (+/- 0.02)

Naive Bayes
accuracy: 0.73 (+/- 0.02)
balanced_accuracy: 0.53 (+/- 0.05)

Linear SVM


The Random Forest algorithm achieved a high accuracy of 0.93 with a low standard deviation of 0.02. However, the balanced accuracy is only 0.51 with a low standard deviation of 0.02.

The Naive Bayes algorithm achieved a lower accuracy of 0.73 with a similar standard deviation of 0.02. However, the balanced accuracy is slightly higher at 0.53 with a higher standard deviation of 0.05.

Unfortunately I never produced results for Linear SVM as my longest running session before I got disconnected was 6h 22m (hosted environment), still not sufficient for the results to be produced. A more optimal conversion from categorical to numeric than the one I used would most likely speed up the execution.