In [51]:
import pandas as pd
import numpy as np
import datetime as dt

import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from imblearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# run block of code and catch warnings
import warnings
with warnings.catch_warnings():
	# ignore all caught warnings
	warnings.filterwarnings("ignore")

In [52]:
df = pd.read_csv('insurance_claims.csv')

In [53]:
df.head().T

Unnamed: 0,0,1,2,3,4
months_as_customer,328,228,134,256,228
age,48,42,29,41,44
policy_number,521585,342868,687698,227811,367455
policy_bind_date,2014-10-17,2006-06-27,2000-09-06,1990-05-25,2014-06-06
policy_state,OH,IN,OH,IL,IL
policy_csl,250/500,250/500,100/300,250/500,500/1000
policy_deductable,1000,2000,2000,2000,1000
policy_annual_premium,1406.91,1197.22,1413.14,1415.74,1583.91
umbrella_limit,0,5000000,5000000,6000000,6000000
insured_zip,466132,468176,430632,608117,610706


In [54]:
df = df.rename(columns={'capital-gains': 'capital_gains', 'capital-loss': 'capital_loss'})

In [55]:
df.columns

Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
       'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital_gains', 'capital_loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported', '_c39'],
      dtype='object')

In [56]:
# unique entries. Useful to know the catagorical features
df.nunique()

months_as_customer              391
age                              46
policy_number                  1000
policy_bind_date                951
policy_state                      3
policy_csl                        3
policy_deductable                 3
policy_annual_premium           991
umbrella_limit                   11
insured_zip                     995
insured_sex                       2
insured_education_level           7
insured_occupation               14
insured_hobbies                  20
insured_relationship              6
capital_gains                   338
capital_loss                    354
incident_date                    60
incident_type                     4
collision_type                    4
incident_severity                 4
authorities_contacted             5
incident_state                    7
incident_city                     7
incident_location              1000
incident_hour_of_the_day         24
number_of_vehicles_involved       4
property_damage             

In [57]:
# Total number of missing values
df.isna().sum().sum()

1000

In [58]:
# column with missing values 
df.columns[df.isna().any()]

Index(['_c39'], dtype='object')

In [59]:
# removing column named _c39 as it contains only null values

df = df.drop(['_c39'], axis = 1)

In [60]:
# columns with ? entries
df.columns[(df == '?').any()]

Index(['collision_type', 'property_damage', 'police_report_available'], dtype='object')

In [61]:
# check if there are duplicated entries
df.duplicated(subset=None, keep='first').sum()

0

In [62]:
df['policy_bind_date'] = pd.to_datetime(df['policy_bind_date'])

In [63]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
months_as_customer,1000.0,203.954,115.1132,0.0,115.75,199.5,276.25,479.0
age,1000.0,38.948,9.140287,19.0,32.0,38.0,44.0,64.0
policy_number,1000.0,546238.6,257063.0,100804.0,335980.25,533135.0,759099.75,999435.0
policy_deductable,1000.0,1136.0,611.8647,500.0,500.0,1000.0,2000.0,2000.0
policy_annual_premium,1000.0,1256.406,244.1674,433.33,1089.6075,1257.2,1415.695,2047.59
umbrella_limit,1000.0,1101000.0,2297407.0,-1000000.0,0.0,0.0,0.0,10000000.0
insured_zip,1000.0,501214.5,71701.61,430104.0,448404.5,466445.5,603251.0,620962.0
capital_gains,1000.0,25126.1,27872.19,0.0,0.0,0.0,51025.0,100500.0
capital_loss,1000.0,-26793.7,28104.1,-111100.0,-51500.0,-23250.0,0.0,0.0
incident_hour_of_the_day,1000.0,11.644,6.951373,0.0,6.0,12.0,17.0,23.0


In [64]:
drop_columns = ['policy_state', 'policy_csl', 'incident_date', 'incident_state', 'incident_city', 'incident_location']
df = df.drop(drop_columns, axis = 1)
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,insured_education_level,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,2014-10-17,1000,1406.91,0,466132,MALE,MD,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,2006-06-27,2000,1197.22,5000000,468176,MALE,MD,...,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,2000-09-06,2000,1413.14,5000000,430632,FEMALE,PhD,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,1990-05-25,2000,1415.74,6000000,608117,FEMALE,PhD,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,2014-06-06,1000,1583.91,6000000,610706,MALE,Associate,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


In [65]:
df['fraud_reported'] = df['fraud_reported'].str.replace('Y', '1')
df['fraud_reported'] = df['fraud_reported'].str.replace('N', '0')
df['fraud_reported'] = df['fraud_reported'].astype(int)

In [66]:
df['fraud_reported'].unique()

array([1, 0])

In [67]:
Fraud = df[df['fraud_reported'] == 1]
Valid = df[df['fraud_reported'] == 0]

In [68]:
print('Fraud Report Cases: {}'.format(len(Fraud)))
print('Valid Cases: {}'.format(len(Valid)))

Fraud Report Cases: 247
Valid Cases: 753


In [69]:
import plotly.offline as py
def target_percent():
    trace = go.Pie(labels = ['Valid','Fraud'], values = df['fraud_reported'].value_counts(), 
                   textfont=dict(size=15), opacity = 0.8,
                   marker=dict(colors=['green', 'red'], 
                               line=dict(color='#000000', width=1.5)))


    layout = dict(title =  'Distribution of Target variable')
    fig = dict(data = [trace], layout=layout)
    py.iplot(fig)
target_percent()

In [70]:
#We will visualize the data and see if there is any feature which might influence the claims
def vis_data(df, x, y = 'fraud_reported', graph = 'countplot'):
    if graph == 'hist':
        fig = px.histogram(df, x = x)
        fig.update_layout(title = 'Distribution of {x}'.format(x = x))
        fig.show()
    elif graph == 'bar':
      fig = px.bar(df, x = x, y = y)
      fig.update_layout(title = '{x} vs. {y}'.format(x = x, y = y))
      fig.show()
    elif graph == 'countplot':
      a = df.groupby([x,y]).count()
      a.reset_index(inplace = True)
      no_fraud = a[a['fraud_reported'] == 0]
      yes_fraud = a[a['fraud_reported'] == 1]
      trace1 = go.Bar(x = no_fraud[x], y = no_fraud['policy_number'], name = 'No Fraud')
      trace2 = go.Bar(x = yes_fraud[x], y = yes_fraud['policy_number'], name = 'Fraud')
      fig = go.Figure(data = [trace1, trace2])
      fig.update_layout(title = '{x} vs. {y}'.format(x=x, y = y))
      fig.update_layout(barmode = 'group')
      fig.show()

In [71]:
vis_data(df, 'insured_sex')

In [72]:
vis_data(df, 'insured_education_level')


In [73]:
vis_data(df, 'insured_occupation')

From the data, it looks like people in exec-managerial positions have more number of frauds compared to other occupations.

Sales, tech-support and transport moving also have relatively high cases of fraud.

In [74]:
vis_data(df, 'insured_relationship')

In [75]:
vis_data(df, 'incident_type')

Multi-vehicle and single vehicle collisions have more number of frauds compared to parked and vehicle theft. One of the reasons could be that in a collision, there is high possibility of more damage to car, as well as the passengers and hence the need to file false insurance claims.

In [76]:
vis_data(df, 'collision_type')

In [77]:
vis_data(df, 'incident_severity')

Here, compared to minor damage, total loss and trivial damage, fraudulent claims are highest in major damage.

One reason could be that the high amount of repair cost which will be incurred by the insurer due to major damage.

In [78]:
vis_data(df, 'authorities_contacted')

In [79]:
vis_data(df, 'insured_hobbies')

One thing which is striking in this graph is that people with chess and cross-fit as hobby have extremely high number of fraudulent claims.

We will keep them and rename other values as 'other'

In [80]:
hobbies = df['insured_hobbies'].unique()
for hobby in hobbies:
  if (hobby != 'chess') & (hobby != 'cross-fit'):
    df['insured_hobbies'] = df['insured_hobbies'].str.replace(hobby, 'other')

df['insured_hobbies'].unique()

array(['other', 'chess', 'cross-fit'], dtype=object)

In [81]:
vis_data(df, 'insured_hobbies')

In [82]:
#We will bin the ages and then check the trend for fraud vs. no fraud according to age.
df['age'].describe()

count    1000.000000
mean       38.948000
std         9.140287
min        19.000000
25%        32.000000
50%        38.000000
75%        44.000000
max        64.000000
Name: age, dtype: float64

In [83]:
bin_labels = ['15-20', '21-25', '26-30', '31-35', '36-40', '41-45', '46-50', '51-55', '56-60', '61-65']
bins = [15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65]

df['age_group'] = pd.cut(df['age'], bins = bins, labels = bin_labels, include_lowest = True)
vis_data(df, 'age_group')

In [84]:
df['months_as_customer'].describe()

count    1000.000000
mean      203.954000
std       115.113174
min         0.000000
25%       115.750000
50%       199.500000
75%       276.250000
max       479.000000
Name: months_as_customer, dtype: float64

In [85]:
bin_labels = ['0-50', '51-100', '100-150', '151-200', '201-250', '251-300', '301-350', '351-400', '401-450', '451-500']
bins = [0,50,100,150,200,250,300,350,400,450,500]

df['month_group'] = pd.cut(df['months_as_customer'], bins = bins, labels = bin_labels, include_lowest = True)

In [86]:
vis_data(df, 'month_group')

In [87]:
vis_data(df, 'auto_make')

In [88]:
vis_data(df, 'number_of_vehicles_involved')

In [89]:
vis_data(df, 'witnesses', 'fraud_reported')

In [90]:
vis_data(df, 'bodily_injuries')

In [91]:
vis_data(df, 'total_claim_amount', 'y', 'hist')

In [92]:
vis_data(df, 'incident_hour_of_the_day')

In [93]:
vis_data(df, 'number_of_vehicles_involved')

In [94]:
vis_data(df, 'witnesses')

In [95]:
vis_data(df, 'auto_year')

In [96]:
df['policy_annual_premium'].describe()

count    1000.000000
mean     1256.406150
std       244.167395
min       433.330000
25%      1089.607500
50%      1257.200000
75%      1415.695000
max      2047.590000
Name: policy_annual_premium, dtype: float64

In [97]:
bins = list(np.linspace(0,2500, 6, dtype = int))
bin_labels = ['very low', 'low', 'medium', 'high', 'very high']

df['policy_annual_premium_groups'] = pd.cut(df['policy_annual_premium'], bins = bins, labels=bin_labels)

In [98]:
vis_data(df, 'policy_annual_premium_groups')

In [99]:
df['policy_deductable'].describe()

count    1000.000000
mean     1136.000000
std       611.864673
min       500.000000
25%       500.000000
50%      1000.000000
75%      2000.000000
max      2000.000000
Name: policy_deductable, dtype: float64

In [100]:
bins = list(np.linspace(0,2000, 5, dtype = int))
bin_labels = ['0-500', '501-1000', '1001-1500', '1501-2000']

df['policy_deductable_group'] = pd.cut(df['policy_deductable'], bins = bins, labels = bin_labels)

vis_data(df, 'policy_deductable_group')

In [101]:
vis_data(df, 'property_damage')

In [102]:
vis_data(df, 'police_report_available')

In [103]:
#removing columns for which we created groups
df = df.drop(['age', 'months_as_customer', 'policy_deductable', 'policy_annual_premium'], axis = 1)
df.columns

Index(['policy_number', 'policy_bind_date', 'umbrella_limit', 'insured_zip',
       'insured_sex', 'insured_education_level', 'insured_occupation',
       'insured_hobbies', 'insured_relationship', 'capital_gains',
       'capital_loss', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported', 'age_group', 'month_group',
       'policy_annual_premium_groups', 'policy_deductable_group'],
      dtype='object')

In [104]:
required_columns = ['policy_number', 'insured_sex', 'insured_education_level', 'insured_occupation',
       'insured_hobbies', 'capital_gains', 'capital_loss', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_hour_of_the_day', 'number_of_vehicles_involved',
       'witnesses', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim',
       'fraud_reported', 'age_group',
       'month_group', 'policy_annual_premium_groups']

print(len(required_columns))


22


In [105]:
# checking for multicollinearity
df1 = df[required_columns]
corr = df1.corr()
fig = go.Figure(data = go.Heatmap( z = corr.values, x = list(corr.columns),y = list(corr.index),colorscale = 'Viridis'))
fig.update_layout(title = 'Correlation')
fig.show()

In [106]:
required_columns2 = ['insured_sex', 'insured_occupation',
       'insured_hobbies', 'capital_gains', 'capital_loss', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_hour_of_the_day', 'number_of_vehicles_involved',
       'witnesses', 'total_claim_amount', 'fraud_reported', 'age_group',
       'month_group', 'policy_annual_premium_groups']

print(len(required_columns2))

17


From the correlation matrix, we see there is high correlation between vehicle claim, total_claim_amount, property_claim and injury_claim

The reason for it is that total_claim_amount is the sum of columns vehicle claim,property_claim and injury_claim.

We will remove the other 3 columns and only keep total_claim_amount as it captures the information and removes collinearity.

In [107]:
df2 = df1[required_columns2]
df2.head()

Unnamed: 0,insured_sex,insured_occupation,insured_hobbies,capital_gains,capital_loss,incident_type,collision_type,incident_severity,authorities_contacted,incident_hour_of_the_day,number_of_vehicles_involved,witnesses,total_claim_amount,fraud_reported,age_group,month_group,policy_annual_premium_groups
0,MALE,craft-repair,other,53300,0,Single Vehicle Collision,Side Collision,Major Damage,Police,5,1,2,71610,1,46-50,301-350,medium
1,MALE,machine-op-inspct,other,0,0,Vehicle Theft,?,Minor Damage,Police,8,1,0,5070,1,41-45,201-250,medium
2,FEMALE,sales,other,35100,0,Multi-vehicle Collision,Rear Collision,Minor Damage,Police,7,3,3,34650,0,26-30,100-150,medium
3,FEMALE,armed-forces,other,48900,-62400,Single Vehicle Collision,Front Collision,Major Damage,Police,5,1,2,63400,1,41-45,251-300,medium
4,MALE,sales,other,66000,-46000,Vehicle Theft,?,Minor Damage,,20,1,1,6500,0,41-45,201-250,high


In [56]:
df2.shape

(1000, 17)

In [108]:
num_features = df2._get_numeric_data().columns
num_features

Index(['capital_gains', 'capital_loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'witnesses', 'total_claim_amount',
       'fraud_reported'],
      dtype='object')

In [109]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   insured_sex                   1000 non-null   object  
 1   insured_occupation            1000 non-null   object  
 2   insured_hobbies               1000 non-null   object  
 3   capital_gains                 1000 non-null   int64   
 4   capital_loss                  1000 non-null   int64   
 5   incident_type                 1000 non-null   object  
 6   collision_type                1000 non-null   object  
 7   incident_severity             1000 non-null   object  
 8   authorities_contacted         1000 non-null   object  
 9   incident_hour_of_the_day      1000 non-null   int64   
 10  number_of_vehicles_involved   1000 non-null   int64   
 11  witnesses                     1000 non-null   int64   
 12  total_claim_amount            1000 non-null   int

In [110]:
df2['age_group'] = df2['age_group'].astype(object)
df2['month_group'] = df2['month_group'].astype(object)
df2['policy_annual_premium_groups'] = df2['policy_annual_premium_groups'].astype(object)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [111]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   insured_sex                   1000 non-null   object
 1   insured_occupation            1000 non-null   object
 2   insured_hobbies               1000 non-null   object
 3   capital_gains                 1000 non-null   int64 
 4   capital_loss                  1000 non-null   int64 
 5   incident_type                 1000 non-null   object
 6   collision_type                1000 non-null   object
 7   incident_severity             1000 non-null   object
 8   authorities_contacted         1000 non-null   object
 9   incident_hour_of_the_day      1000 non-null   int64 
 10  number_of_vehicles_involved   1000 non-null   int64 
 11  witnesses                     1000 non-null   int64 
 12  total_claim_amount            1000 non-null   int64 
 13  fraud_reported     

In [112]:
# extracting categorical columns
cat_df = df2.select_dtypes(include = ['object'])
cat_df.columns

Index(['insured_sex', 'insured_occupation', 'insured_hobbies', 'incident_type',
       'collision_type', 'incident_severity', 'authorities_contacted',
       'age_group', 'month_group', 'policy_annual_premium_groups'],
      dtype='object')

In [113]:
cat_df.head()

Unnamed: 0,insured_sex,insured_occupation,insured_hobbies,incident_type,collision_type,incident_severity,authorities_contacted,age_group,month_group,policy_annual_premium_groups
0,MALE,craft-repair,other,Single Vehicle Collision,Side Collision,Major Damage,Police,46-50,301-350,medium
1,MALE,machine-op-inspct,other,Vehicle Theft,?,Minor Damage,Police,41-45,201-250,medium
2,FEMALE,sales,other,Multi-vehicle Collision,Rear Collision,Minor Damage,Police,26-30,100-150,medium
3,FEMALE,armed-forces,other,Single Vehicle Collision,Front Collision,Major Damage,Police,41-45,251-300,medium
4,MALE,sales,other,Vehicle Theft,?,Minor Damage,,41-45,201-250,high


In [114]:
df2[num_features].head()

Unnamed: 0,capital_gains,capital_loss,incident_hour_of_the_day,number_of_vehicles_involved,witnesses,total_claim_amount,fraud_reported
0,53300,0,5,1,2,71610,1
1,0,0,8,1,0,5070,1
2,35100,0,7,3,3,34650,0
3,48900,-62400,5,1,2,63400,1
4,66000,-46000,20,1,1,6500,0


In [115]:
# printing unique values of each column
for col in cat_df.columns:
    print(f"{col}: \n{cat_df[col].unique()}\n")

insured_sex: 
['MALE' 'FEMALE']

insured_occupation: 
['craft-repair' 'machine-op-inspct' 'sales' 'armed-forces' 'tech-support'
 'prof-specialty' 'other-service' 'priv-house-serv' 'exec-managerial'
 'protective-serv' 'transport-moving' 'handlers-cleaners' 'adm-clerical'
 'farming-fishing']

insured_hobbies: 
['other' 'chess' 'cross-fit']

incident_type: 
['Single Vehicle Collision' 'Vehicle Theft' 'Multi-vehicle Collision'
 'Parked Car']

collision_type: 
['Side Collision' '?' 'Rear Collision' 'Front Collision']

incident_severity: 
['Major Damage' 'Minor Damage' 'Total Loss' 'Trivial Damage']

authorities_contacted: 
['Police' 'None' 'Fire' 'Other' 'Ambulance']

age_group: 
['46-50' '41-45' '26-30' '36-40' '31-35' '61-65' '21-25' '56-60' '51-55'
 '15-20']

month_group: 
['301-350' '201-250' '100-150' '251-300' '151-200' '0-50' '401-450'
 '51-100' '451-500' '351-400']

policy_annual_premium_groups: 
['medium' 'high' 'low' 'very high' 'very low']



In [116]:
for col in cat_df.columns:
    print( col,':')
    print(df2[col].value_counts())

insured_sex :
FEMALE    537
MALE      463
Name: insured_sex, dtype: int64
insured_occupation :
machine-op-inspct    93
prof-specialty       85
tech-support         78
sales                76
exec-managerial      76
craft-repair         74
transport-moving     72
other-service        71
priv-house-serv      71
armed-forces         69
adm-clerical         65
protective-serv      63
handlers-cleaners    54
farming-fishing      53
Name: insured_occupation, dtype: int64
insured_hobbies :
other        919
chess         46
cross-fit     35
Name: insured_hobbies, dtype: int64
incident_type :
Multi-vehicle Collision     419
Single Vehicle Collision    403
Vehicle Theft                94
Parked Car                   84
Name: incident_type, dtype: int64
collision_type :
Rear Collision     292
Side Collision     276
Front Collision    254
?                  178
Name: collision_type, dtype: int64
incident_severity :
Minor Damage      354
Total Loss        280
Major Damage      276
Trivial Damage   

In [119]:
# label endcoding for the object datatypes
from sklearn import preprocessing
for col in cat_df.columns:
    if (df2[col].dtype == 'object'):
        le = preprocessing.LabelEncoder()
        le = le.fit(df2[col])
        df2[col] = le.transform(df2[col])
        print('Completed Label encoding on',col)

Completed Label encoding on insured_sex
Completed Label encoding on insured_occupation
Completed Label encoding on insured_hobbies
Completed Label encoding on incident_type
Completed Label encoding on collision_type
Completed Label encoding on incident_severity
Completed Label encoding on authorities_contacted
Completed Label encoding on age_group
Completed Label encoding on month_group
Completed Label encoding on policy_annual_premium_groups




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [120]:
for col in cat_df.columns:
    print( col,':')
    print(df2[col].value_counts())

insured_sex :
0    537
1    463
Name: insured_sex, dtype: int64
insured_occupation :
6     93
9     85
12    78
11    76
3     76
2     74
13    72
8     71
7     71
1     69
0     65
10    63
5     54
4     53
Name: insured_occupation, dtype: int64
insured_hobbies :
2    919
0     46
1     35
Name: insured_hobbies, dtype: int64
incident_type :
0    419
2    403
3     94
1     84
Name: incident_type, dtype: int64
collision_type :
2    292
3    276
1    254
0    178
Name: collision_type, dtype: int64
incident_severity :
1    354
2    280
0    276
3     90
Name: incident_severity, dtype: int64
authorities_contacted :
4    292
1    223
3    198
0    196
2     91
Name: authorities_contacted, dtype: int64
age_group :
4    201
3    195
5    184
2    157
6    109
7     50
8     46
1     38
9     18
0      2
Name: age_group, dtype: int64
month_group :
4    174
1    163
3    153
2    137
9    112
0     91
5     53
7     50
6     41
8     26
Name: month_group, dtype: int64
policy_annual_premium_

In [121]:
# Correlation matrix
corrmat = df2.corr()
fig = go.Figure(data = go.Heatmap( z = corrmat.values, x = list(corrmat.columns),y = list(corrmat.index),colorscale = 'Viridis'))
fig.update_layout(title = 'Correlation')
fig.show()

In [124]:
#Correlation with output variable
cor_target = abs(corrmat["fraud_reported"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0]
relevant_features

insured_sex                     0.030873
insured_occupation              0.001564
insured_hobbies                 0.363613
capital_gains                   0.019173
capital_loss                    0.014863
incident_type                   0.050376
collision_type                  0.110130
incident_severity               0.405988
authorities_contacted           0.045802
incident_hour_of_the_day        0.004316
number_of_vehicles_involved     0.051839
witnesses                       0.049497
total_claim_amount              0.163651
fraud_reported                  1.000000
age_group                       0.022379
month_group                     0.013433
policy_annual_premium_groups    0.019561
Name: fraud_reported, dtype: float64

In [122]:
df2.to_csv(r'finaldataset.csv')

In [123]:
df2.columns

Index(['insured_sex', 'insured_occupation', 'insured_hobbies', 'capital_gains',
       'capital_loss', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'witnesses', 'total_claim_amount',
       'fraud_reported', 'age_group', 'month_group',
       'policy_annual_premium_groups'],
      dtype='object')

In [127]:
#X = new_df[['months_as_customer', 'policy_csl', 'insured_sex','collision_type', 'incident_severity','authorities_contacted', 'incident_state', 'witnesses','injury_claim', 'property_claim','vehicle_claim', 'auto_make',]]
X_df = df2.drop(["fraud_reported"],axis=1)
y_df = df2['fraud_reported']

In [128]:
print(X_df.shape)
print(y_df.shape)

(1000, 16)
(1000,)


In [129]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=24)
X,y = sm.fit_resample(X_df, y_df)

In [130]:
print(X.shape)
print(y.shape)

(1506, 16)
(1506,)


In [131]:
#stadardize data    
from sklearn.preprocessing import StandardScaler
x_scaled = StandardScaler().fit_transform(X)

In [132]:
# splitting data into training set and test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_scaled,y,test_size = 0.20)

In [133]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1204, 16)
(302, 16)
(1204,)
(302,)


In [147]:
lr = LogisticRegression()

lr = LogisticRegression()

lr.fit(X_train, y_train)
preds = lr.predict(X_test)

log_acc=accuracy_score(y_test,lr.predict(X_test))


print("Train Set Accuracy:"+str(accuracy_score(y_train,lr.predict(X_train))*100))
print("Test Set Accuracy:"+str(accuracy_score(y_test,lr.predict(X_test))*100))

Train Set Accuracy:88.53820598006644
Test Set Accuracy:88.0794701986755


In [148]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.91      0.86      0.88       158
           1       0.86      0.90      0.88       144

    accuracy                           0.88       302
   macro avg       0.88      0.88      0.88       302
weighted avg       0.88      0.88      0.88       302



In [149]:
print(confusion_matrix(y_test, preds))

[[136  22]
 [ 14 130]]


In [150]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()

dtc.fit(X_train, y_train)
preds = dtc.predict(X_test)


dtc_acc= accuracy_score(y_test,dtc.predict(X_test))

print("Train Set Accuracy:"+str(accuracy_score(y_train,dtc.predict(X_train))*100))
print("Test Set Accuracy:"+str(accuracy_score(y_test,dtc.predict(X_test))*100))

Train Set Accuracy:100.0
Test Set Accuracy:87.41721854304636


In [151]:
print(confusion_matrix(y_test, preds))

[[136  22]
 [ 16 128]]


In [155]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

rfc.fit(X_train, y_train)
preds = rfc.predict(X_test)


rfc_acc= accuracy_score(y_test,rfc.predict(X_test))

print("Train Set Accuracy:"+str(accuracy_score(y_train,rfc.predict(X_train))*100))
print("Test Set Accuracy:"+str(accuracy_score(y_test,rfc.predict(X_test))*100))

Train Set Accuracy:100.0
Test Set Accuracy:90.06622516556291


In [156]:
print(confusion_matrix(y_test, preds))

[[135  23]
 [  7 137]]


In [157]:
rfpred = pd.DataFrame({
    'Actual' : rfc.predict(X_test),
    'Predicted' : y_test
})
comparison_column = np.where(rfpred["Actual"] == rfpred["Predicted"], True, False)
rfpred[comparison_column].count()

Actual       272
Predicted    272
dtype: int64

In [158]:
from sklearn.svm import SVC


svc = SVC(kernel='linear')
svc.fit(X_train, y_train)

preds = svc.predict(X_test)

svm_acc= accuracy_score(y_test,svc.predict(X_test))


print("Train Set Accuracy:"+str(accuracy_score(y_train,svc.predict(X_train))*100))
print("Test Set Accuracy:"+str(accuracy_score(y_test,svc.predict(X_test))*100))

print('Classification report:\n', classification_report(y_test, preds))

Train Set Accuracy:87.70764119601328
Test Set Accuracy:88.41059602649007
Classification report:
               precision    recall  f1-score   support

           0       0.91      0.86      0.89       158
           1       0.86      0.91      0.88       144

    accuracy                           0.88       302
   macro avg       0.88      0.89      0.88       302
weighted avg       0.89      0.88      0.88       302



In [159]:
print(confusion_matrix(y_test, preds))

[[136  22]
 [ 13 131]]


In [160]:
#new_input = [[0,10,2,0,0,1,0,3,4,6,1,2,5920,4,3,2]]
new_input = [[1,11,2,37800,0,2,1,1,1,2,1,3,64200,5,3,0]]
new_output = svc.predict(new_input)
# summarize input and output
print(new_input, new_output)

[[1, 11, 2, 37800, 0, 2, 1, 1, 1, 2, 1, 3, 64200, 5, 3, 0]] [0]


In [161]:
svmpred = pd.DataFrame({
    'Actual' : svc.predict(X_test),
    'Predicted' : y_test
})
comparison_column = np.where(svmpred["Actual"] == svmpred["Predicted"], True, False)
svmpred[comparison_column].count()

Actual       267
Predicted    267
dtype: int64

In [163]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()

lda.fit(X_train, y_train)
preds = lda.predict(X_test)


lda_acc= accuracy_score(y_test,lda.predict(X_test))


print("Train Set Accuracy:"+str(accuracy_score(y_train,lda.predict(X_train))*100))
print("Test Set Accuracy:"+str(accuracy_score(y_test,lda.predict(X_test))*100))

Train Set Accuracy:86.87707641196013
Test Set Accuracy:87.08609271523179


In [164]:
print('Classification report:\n', classification_report(y_test, preds))

Classification report:
               precision    recall  f1-score   support

           0       0.90      0.85      0.87       158
           1       0.84      0.90      0.87       144

    accuracy                           0.87       302
   macro avg       0.87      0.87      0.87       302
weighted avg       0.87      0.87      0.87       302



In [165]:
print(confusion_matrix(y_test, preds))

[[134  24]
 [ 15 129]]


In [167]:
models = pd.DataFrame({
    'Model': ['Logistic','Decision Tree Classifier','Random Forest Classifier','SVC','LDA'],
    'Score': [ log_acc,dtc_acc, rfc_acc, svm_acc, lda_acc]
})

models.sort_values(by = 'Score', ascending = False)

Unnamed: 0,Model,Score
2,Random Forest Classifier,0.900662
3,SVC,0.884106
0,Logistic,0.880795
1,Decision Tree Classifier,0.874172
4,LDA,0.870861


In [168]:
colors=['Logistic','Decision Tree Classifier','Random Forest Classifier','SVC','LDA']
fig = px.bar(models, x='Model', y='Score',color=colors)
fig.show()

In [173]:
# pickling the model 
import pickle 
pickle_out = open("output.pkl", "wb") 
pickle.dump(rfc,pickle_out) 
pickle_out.close()