# Vehicle Fraud Detection

## Loading Data and Data Exploration

In [50]:
import pandas as pd
import numpy as np

# Load the raw claims table from the CSV file next to the notebook
data = pd.read_csv('carclaims.csv')

# Quick structural overview of the loaded frame
print("Dataset shape:", data.shape)  # (rows, columns)

# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() would append a stray "None" after the report.
print("Data Types:")
data.info()

# Total count of missing cells across the whole frame
print("\nMissing Values\n", data.isnull().sum().sum())

# Share of each 'FraudFound' label (normalize=True gives fractions, not counts)
print("\nClass Distribution", data['FraudFound'].value_counts(normalize=True))


Dataset shape: (15420, 33)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  PolicyNum

## Data Preprocessing

In [51]:
# Peek at the first record of the loaded frame
data.head(n=1)

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability,No


In [52]:
# Work on an independent (deep) copy so the loaded `data` frame is left untouched
data_copy = data.copy(deep=True)

In [53]:
# Show the column labels of the working copy
data_copy.columns

Index(['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea',
       'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Sex',
       'MaritalStatus', 'Age', 'Fault', 'PolicyType', 'VehicleCategory',
       'VehiclePrice', 'PolicyNumber', 'RepNumber', 'Deductible',
       'DriverRating', 'Days:Policy-Accident', 'Days:Policy-Claim',
       'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder',
       'PoliceReportFiled', 'WitnessPresent', 'AgentType',
       'NumberOfSuppliments', 'AddressChange-Claim', 'NumberOfCars', 'Year',
       'BasePolicy', 'FraudFound'],
      dtype='object')

In [54]:
# Put the two age-related columns side by side to check whether they repeat the same information
data_copy.loc[:, ['AgeOfPolicyHolder', 'Age']]

Unnamed: 0,AgeOfPolicyHolder,Age
0,26 to 30,21
1,31 to 35,34
2,41 to 50,47
3,51 to 65,65
4,31 to 35,27
...,...,...
15415,31 to 35,35
15416,31 to 35,30
15417,26 to 30,24
15418,31 to 35,34


In [55]:
# Put the policy-related columns side by side to check whether they repeat the same information
data_copy.loc[:, ['BasePolicy', 'PolicyType', 'VehicleCategory']]

Unnamed: 0,BasePolicy,PolicyType,VehicleCategory
0,Liability,Sport - Liability,Sport
1,Collision,Sport - Collision,Sport
2,Collision,Sport - Collision,Sport
3,Liability,Sedan - Liability,Sport
4,Collision,Sport - Collision,Sport
...,...,...,...
15415,Collision,Sedan - Collision,Sedan
15416,Liability,Sedan - Liability,Sport
15417,Collision,Sedan - Collision,Sedan
15418,All Perils,Sedan - All Perils,Sedan


### Removing redundant data columns
### Separating claim data from fraud status 

In [56]:
# Columns excluded from the feature matrix: the fields treated as redundant,
# plus the 'FraudFound' label itself, which is kept separately as the target.
unwanted_data = [
    'PolicyNumber',
    'BasePolicy',
    'VehicleCategory',
    'AgeOfPolicyHolder',
    'FraudFound',
]

# Feature matrix: the working copy without the columns listed above
X = data_copy.drop(columns=unwanted_data)

# Target: the fraud label for every row, sharing the same row index as X
y = data_copy['FraudFound']