## Model Training

In [1]:
## Basic imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

## Preprocessing imports

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

## Train test split

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

## Modelling 

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC

## Other imports

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

### Import the csv data as pandas dataframe

In [2]:
df = pd.read_csv("./data/cleaned_data.csv")

In [3]:
df.head()

Unnamed: 0,policy_state,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,insured_education_level,insured_relationship,capital-gains,capital-loss,...,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,vehicle_claim,auto_make,auto_year,fraud_reported
0,OH,1000,1406.91,0,466132,MALE,MD,husband,53300,0,...,YES,1,2,YES,71610,6510,52080,Saab,2004,Y
1,IN,2000,1197.22,5000000,468176,MALE,MD,other-relative,0,0,...,?,0,0,?,5070,780,3510,Mercedes,2007,Y
2,OH,2000,1413.14,5000000,430632,FEMALE,PhD,own-child,35100,0,...,NO,2,3,NO,34650,7700,23100,Dodge,2007,N
3,IL,2000,1415.74,6000000,608117,FEMALE,PhD,unmarried,48900,-62400,...,?,1,2,NO,63400,6340,50720,Chevrolet,2014,Y
4,IL,1000,1583.91,6000000,610706,MALE,Associate,unmarried,66000,-46000,...,NO,0,1,NO,6500,1300,4550,Accura,2009,N


In [4]:
## Check for null values

df.isnull().sum()

policy_state                   0
policy_deductable              0
policy_annual_premium          0
umbrella_limit                 0
insured_zip                    0
insured_sex                    0
insured_education_level        0
insured_relationship           0
capital-gains                  0
capital-loss                   0
incident_type                  0
collision_type                 0
incident_severity              0
authorities_contacted          0
incident_state                 0
incident_city                  0
incident_hour_of_the_day       0
number_of_vehicles_involved    0
property_damage                0
bodily_injuries                0
witnesses                      0
police_report_available        0
total_claim_amount             0
injury_claim                   0
vehicle_claim                  0
auto_make                      0
auto_year                      0
fraud_reported                 0
dtype: int64

In [5]:
## Define numerical and categorical columns

numerical_cols = df.select_dtypes(exclude="object").columns

numerical_cols

Index(['policy_deductable', 'policy_annual_premium', 'umbrella_limit',
       'insured_zip', 'capital-gains', 'capital-loss',
       'incident_hour_of_the_day', 'number_of_vehicles_involved',
       'bodily_injuries', 'witnesses', 'total_claim_amount', 'injury_claim',
       'vehicle_claim', 'auto_year'],
      dtype='object')

In [7]:
categorical_cols = df.select_dtypes(include="object").columns

categorical_cols

Index(['policy_state', 'insured_sex', 'insured_education_level',
       'insured_relationship', 'incident_type', 'collision_type',
       'incident_severity', 'authorities_contacted', 'incident_state',
       'incident_city', 'property_damage', 'police_report_available',
       'auto_make', 'fraud_reported'],
      dtype='object')

In [8]:
## Define dependent and independent features

X = df.drop(columns=['fraud_reported'], axis=1)

y = df['fraud_reported']

In [9]:
X

Unnamed: 0,policy_state,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,insured_education_level,insured_relationship,capital-gains,capital-loss,...,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,vehicle_claim,auto_make,auto_year
0,OH,1000,1406.91,0,466132,MALE,MD,husband,53300,0,...,1,YES,1,2,YES,71610,6510,52080,Saab,2004
1,IN,2000,1197.22,5000000,468176,MALE,MD,other-relative,0,0,...,1,?,0,0,?,5070,780,3510,Mercedes,2007
2,OH,2000,1413.14,5000000,430632,FEMALE,PhD,own-child,35100,0,...,3,NO,2,3,NO,34650,7700,23100,Dodge,2007
3,IL,2000,1415.74,6000000,608117,FEMALE,PhD,unmarried,48900,-62400,...,1,?,1,2,NO,63400,6340,50720,Chevrolet,2014
4,IL,1000,1583.91,6000000,610706,MALE,Associate,unmarried,66000,-46000,...,1,NO,0,1,NO,6500,1300,4550,Accura,2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,OH,1000,1310.80,0,431289,FEMALE,Masters,unmarried,0,0,...,1,YES,0,1,?,87200,17440,61040,Honda,2006
996,IL,1000,1436.79,0,608177,FEMALE,PhD,wife,70900,0,...,1,YES,2,3,?,108480,18080,72320,Volkswagen,2015
997,OH,500,1383.49,3000000,442797,FEMALE,Masters,other-relative,35100,0,...,3,?,2,3,YES,67500,7500,52500,Suburu,1996
998,IL,2000,1356.92,5000000,441714,MALE,Associate,wife,0,0,...,1,?,0,1,YES,46980,5220,36540,Audi,1998


In [10]:
y

0      Y
1      Y
2      N
3      Y
4      N
      ..
995    N
996    N
997    N
998    N
999    N
Name: fraud_reported, Length: 1000, dtype: object