<a href="https://colab.research.google.com/github/dassus98/fraud-detection/blob/main/Fraud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing Dependencies & Downloading Data

In [29]:
!pip install scikit-optimize



In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import warnings
import os

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, including deprecation notices from the
# libraries imported above. Consider narrowing it.
warnings.filterwarnings('ignore')

# Apply seaborn's 'dark' style to all figures drawn after this point.
sns.set_style('dark')
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [31]:
# NOTE(review): this import sits outside the main import cell.
import kagglehub

# Download latest version
# The call returns the local directory holding the dataset files. The handle
# carries no version, so a later re-run may resolve to a newer dataset version.
path = kagglehub.dataset_download("shivamb/vehicle-claim-fraud-detection")

# Show where the files were placed so the next cell's read can be checked.
print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/shivamb/vehicle-claim-fraud-detection/versions/1


# Data Preparation

In [32]:
# Load the claims table from the downloaded dataset directory.
# os.path.join picks the right separator for the platform and avoids a doubled
# slash if `path` ever ends with one.
df = pd.read_csv(os.path.join(path, "fraud_oracle.csv"))

In [33]:
# Lift the column display limit so wide frames are shown in full.
# This is a global pandas option and stays in effect for all later cells.
pd.set_option('display.max_columns', None)
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Days_Policy_Accident,Days_Policy_Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,21,Policy Holder,Sport - Liability,Sport,more than 69000,0,1,12,300,1,more than 30,more than 30,none,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,34,Policy Holder,Sport - Collision,Sport,more than 69000,0,2,15,400,4,more than 30,more than 30,none,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,47,Policy Holder,Sport - Collision,Sport,more than 69000,0,3,7,400,3,more than 30,more than 30,1,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,65,Third Party,Sedan - Liability,Sport,20000 to 29000,0,4,4,400,2,more than 30,more than 30,1,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,27,Third Party,Sport - Collision,Sport,more than 69000,0,5,3,400,1,more than 30,more than 30,none,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [34]:
# Print each column's dtype and non-null count, plus the overall frame size.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  FraudFound_P          15420 non-null

In [73]:
# Total number of missing cells across the whole frame (per-column counts summed).
df.isna().sum().sum()

0

In [36]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

0

In [76]:
# Distinct-value count per column.
# NOTE(review): the recorded output shows 8 values for DayOfWeekClaimed and 13
# for MonthClaimed, one more than a week / year has. Presumably a placeholder
# category is present; confirm before encoding these columns.
df.nunique()

Unnamed: 0,0
Month,12
WeekOfMonth,5
DayOfWeek,7
Make,19
AccidentArea,2
DayOfWeekClaimed,8
MonthClaimed,13
WeekOfMonthClaimed,5
Sex,2
MaritalStatus,4


In [37]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows a minimum Age of 0, which looks like
# a stand-in for a missing value rather than a real age. Confirm and handle.
df.describe()

Unnamed: 0,WeekOfMonth,WeekOfMonthClaimed,Age,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Year
count,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0
mean,2.788586,2.693969,39.855707,0.059857,7710.5,8.483268,407.70428,2.487808,1994.866472
std,1.287585,1.259115,13.492377,0.23723,4451.514911,4.599948,43.950998,1.119453,0.803313
min,1.0,1.0,0.0,0.0,1.0,1.0,300.0,1.0,1994.0
25%,2.0,2.0,31.0,0.0,3855.75,5.0,400.0,1.0,1994.0
50%,3.0,3.0,38.0,0.0,7710.5,8.0,400.0,2.0,1995.0
75%,4.0,4.0,48.0,0.0,11565.25,12.0,400.0,3.0,1996.0
max,5.0,5.0,80.0,1.0,15420.0,16.0,700.0,4.0,1996.0


In [38]:
# Use PolicyNumber as the row index.
# The guard keeps this cell re-runnable: once the column has been moved into
# the index, a second set_index('PolicyNumber') would raise KeyError.
# Rebinding `df` instead of using inplace=True avoids mutating in place.
if df.index.name != 'PolicyNumber':
    df = df.set_index('PolicyNumber')

In [39]:
# Show the last rows of the frame.
df.tail()

Unnamed: 0_level_0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,FraudFound_P,RepNumber,Deductible,DriverRating,Days_Policy_Accident,Days_Policy_Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
PolicyNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
15416,Nov,4,Friday,Toyota,Urban,Tuesday,Nov,5,Male,Married,35,Policy Holder,Sedan - Collision,Sedan,20000 to 29000,1,5,400,4,more than 30,more than 30,2 to 4,6 years,31 to 35,No,No,External,none,no change,1 vehicle,1996,Collision
15417,Nov,5,Thursday,Pontiac,Urban,Friday,Dec,1,Male,Married,30,Policy Holder,Sedan - Liability,Sport,30000 to 39000,0,11,400,3,more than 30,more than 30,more than 4,6 years,31 to 35,No,No,External,more than 5,no change,3 to 4,1996,Liability
15418,Nov,5,Thursday,Toyota,Rural,Friday,Dec,1,Male,Single,24,Policy Holder,Sedan - Collision,Sedan,20000 to 29000,1,4,400,4,more than 30,more than 30,more than 4,5 years,26 to 30,No,No,External,1 to 2,no change,1 vehicle,1996,Collision
15419,Dec,1,Monday,Toyota,Urban,Thursday,Dec,2,Female,Married,34,Third Party,Sedan - All Perils,Sedan,20000 to 29000,0,6,400,4,more than 30,more than 30,none,2 years,31 to 35,No,No,External,more than 5,no change,1 vehicle,1996,All Perils
15420,Dec,2,Wednesday,Toyota,Urban,Thursday,Dec,3,Male,Single,21,Policy Holder,Sedan - Collision,Sedan,20000 to 29000,1,3,400,4,more than 30,more than 30,none,5 years,26 to 30,No,No,External,1 to 2,no change,1 vehicle,1996,Collision


# Functions

In [110]:
def plot_binary_cat_feat_dist(df, X, Y, palette):
  """Plot the distribution of column ``X``, overall and split by column ``Y``.

  Draws two vertically stacked histograms (each with a KDE curve) that share
  a y-axis. The top panel shows ``X`` for all rows in a single colour; the
  bottom panel draws the same histogram with ``Y`` as the ``hue``.

  Args:
    df: DataFrame containing the columns named by ``X`` and ``Y``.
    X: Name of the column placed on the x-axis of both panels.
    Y: Name of the column used as ``hue`` in the bottom panel. Two colours
      are supplied for it, so it is presumably binary (confirm with callers).
    palette: Indexable sequence of colours. Entries 5 and 9 are read, so it
      must hold at least 10 colours.

  Returns:
    None. The figure is shown with ``plt.show()``.
  """
  fig, axes = plt.subplots(figsize=(6, 6), sharey=True, nrows=2)
  overall_ax, split_ax = axes.flatten()

  # Colour comes from the `palette` argument. The previous version read a
  # notebook-level `custom_palette` here, which ignored the caller's palette
  # and raised NameError when that global had not been defined yet.
  sns.histplot(data=df, x=X, kde=True, ax=overall_ax, color=palette[9])
  overall_ax.set_xlabel(X)
  overall_ax.set_ylabel('Frequency')

  sns.histplot(data=df, x=X, hue=Y, kde=True, ax=split_ax, palette=[palette[5], palette[9]])
  split_ax.set_xlabel(X)
  split_ax.set_ylabel('Frequency')

  plt.tight_layout()
  plt.show()

In [118]:
def plot_binary_cat_feat_range(df, X, Y, palette):
  """Draw two stacked box plots of column ``Y`` against column ``X``.

  Both panels pass ``X`` as ``x`` and ``Y`` as ``y`` to ``sns.boxplot`` and
  share a y-axis. The top panel uses a single colour; the bottom panel is
  given a two-colour palette.

  Args:
    df: DataFrame containing the columns named by ``X`` and ``Y``.
    X: Name of the column passed as ``x``.
    Y: Name of the column passed as ``y``.
    palette: Indexable sequence of colours. Entries 5 and 9 are read, so it
      must hold at least 10 colours.

  Returns:
    None. The figure is shown with ``plt.show()``.
  """
  fig, axes = plt.subplots(figsize=(6, 6), sharey=True, nrows=2)
  single_ax, paired_ax = axes.flatten()

  # Colour comes from the `palette` argument. The previous version read a
  # notebook-level `custom_palette` here, which ignored the caller's palette
  # and raised NameError when that global had not been defined yet.
  sns.boxplot(data=df, x=X, y=Y, ax=single_ax, color=palette[9])
  single_ax.set_xlabel(X)
  # The y-axis of a box plot carries column Y, not a count, so label it with
  # the column name (it was previously labelled 'Frequency').
  single_ax.set_ylabel(Y)

  # NOTE(review): `palette` is passed without `hue`; recent seaborn releases
  # flag that combination as deprecated. Confirm against the installed version.
  sns.boxplot(data=df, x=X, y=Y, ax=paired_ax, palette=[palette[5], palette[9]])
  paired_ax.set_xlabel(X)
  paired_ax.set_ylabel(Y)

  plt.tight_layout()
  plt.show()

# Exploratory Data Analysis with TensorFlow