In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#Loading the dataset 
df=pd.read_csv('/content/insurance_claims.csv')

In [None]:
#Splitting the test data to training and testing data
from sklearn.model_selection import train_test_split

X = df.drop('fraud_reported', axis=1)
y = df['fraud_reported']
#Using the standard 80/20 split for the data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
#Label encoding 
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in df:
    if df[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(df[col].unique())) <= 2:
            # Train on the training data
            le.fit(df[col])
            # Transform both training and testing data
            df[col] = le.transform(df[col])
            
            
            # Keep track of how many columns were label encoded
            le_count += 1
            
print('%d columns were label encoded.' % le_count)

2 columns were label encoded.


In [14]:
#Extracting categorical values and their unique values 
column_name = []
unique_value = []

# Iterate over each column in the dataframe
for col in df.columns:
    # Check if the datatype of the column is object (i.e., categorical)
    if df[col].dtype == 'object':
        # Append the column name and its number of unique values to the respective lists
        column_name.append(col)
        unique_value.append(df[col].nunique())

# Create a new dataframe containing the categorical column names and their unique values
cat_cols_df = pd.DataFrame({'Column Name': column_name, 'Unique Values': unique_value})

# Sort the dataframe in descending order of unique values
cat_cols_df = cat_cols_df.sort_values(by='Unique Values', ascending=False)

# Print the resulting dataframe
print(cat_cols_df)


                Column Name  Unique Values
14        incident_location           1000
0          policy_bind_date            951
7             incident_date             60
18               auto_model             39
5           insured_hobbies             20
4        insured_occupation             14
17                auto_make             14
3   insured_education_level              7
12           incident_state              7
13            incident_city              7
6      insured_relationship              6
11    authorities_contacted              5
8             incident_type              4
10        incident_severity              4
9            collision_type              4
1              policy_state              3
2                policy_csl              3
15          property_damage              3
16  police_report_available              3


In [16]:
# droping columns based on above result
df.drop(['incident_location','policy_bind_date','incident_date','auto_model','insured_occupation','insured_hobbies'],axis=1,inplace=True)

In [30]:
# extracting categorical columns
cat_df = X.select_dtypes(include = ['object'])

In [31]:
cat_df.head()

Unnamed: 0,policy_csl,insured_relationship
0,250/500,husband
1,250/500,other-relative
2,100/300,own-child
3,250/500,unmarried
4,500/1000,unmarried


In [32]:
# printing unique values of each column
for col in cat_df.columns:
    print(f"{col}: \n{cat_df[col].unique()}\n")

policy_csl: 
['250/500' '100/300' '500/1000']

insured_relationship: 
['husband' 'other-relative' 'own-child' 'unmarried' 'wife' 'not-in-family']



In [33]:
cat_df = pd.get_dummies(cat_df, drop_first = True)

In [34]:
cat_df.head()

Unnamed: 0,policy_csl_250/500,policy_csl_500/1000,insured_relationship_not-in-family,insured_relationship_other-relative,insured_relationship_own-child,insured_relationship_unmarried,insured_relationship_wife
0,1,0,0,0,0,0,0
1,1,0,0,1,0,0,0
2,0,0,0,0,1,0,0
3,1,0,0,0,0,1,0
4,0,1,0,0,0,1,0


In [35]:
# extracting the numerical columns

num_df = X.select_dtypes(include = ['int64'])

In [36]:
num_df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_deductable,umbrella_limit,insured_zip,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_year
0,328,48,521585,1000,0,466132,53300,0,5,1,1,2,71610,6510,13020,52080,2004
1,228,42,342868,2000,5000000,468176,0,0,8,1,0,0,5070,780,780,3510,2007
2,134,29,687698,2000,5000000,430632,35100,0,7,3,2,3,34650,7700,3850,23100,2007
3,256,41,227811,2000,6000000,608117,48900,-62400,5,1,1,2,63400,6340,6340,50720,2014
4,228,44,367455,1000,6000000,610706,66000,-46000,20,1,0,1,6500,1300,650,4550,2009


In [37]:
# combining the Numerical and Categorical dataframes to get the final dataset

X = pd.concat([num_df, cat_df], axis = 1)

In [38]:
X.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_deductable,umbrella_limit,insured_zip,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,...,property_claim,vehicle_claim,auto_year,policy_csl_250/500,policy_csl_500/1000,insured_relationship_not-in-family,insured_relationship_other-relative,insured_relationship_own-child,insured_relationship_unmarried,insured_relationship_wife
0,328,48,521585,1000,0,466132,53300,0,5,1,...,13020,52080,2004,1,0,0,0,0,0,0
1,228,42,342868,2000,5000000,468176,0,0,8,1,...,780,3510,2007,1,0,0,1,0,0,0
2,134,29,687698,2000,5000000,430632,35100,0,7,3,...,3850,23100,2007,0,0,0,0,1,0,0
3,256,41,227811,2000,6000000,608117,48900,-62400,5,1,...,6340,50720,2014,1,0,0,0,0,1,0
4,228,44,367455,1000,6000000,610706,66000,-46000,20,1,...,650,4550,2009,0,1,0,0,0,1,0


In [23]:
num_df = X_train[['months_as_customer', 'policy_deductable', 'umbrella_limit',
       'capital-gains', 'capital-loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses', 'injury_claim', 'property_claim',
       'vehicle_claim']]

In [24]:
# Scaling the numeric values in the dataset

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_data = scaler.fit_transform(num_df)

In [25]:
scaled_num_df = pd.DataFrame(data = scaled_data, columns = num_df.columns, index = X_train.index)
scaled_num_df.head()

Unnamed: 0,months_as_customer,policy_deductable,umbrella_limit,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,injury_claim,property_claim,vehicle_claim
29,0.301419,1.399911,-0.489302,-0.90544,0.969265,-0.94995,-0.804695,1.219422,0.465374,1.335925,1.375409,1.327654
535,-1.243193,-0.230262,-0.489302,-0.90544,-1.587633,0.336664,-0.804695,0.0,-1.320234,-0.199262,-0.18985,0.05345
695,0.722676,1.399911,-0.489302,0.759074,-1.60897,-0.378122,-0.804695,1.219422,-0.42743,-1.470111,-1.485594,-1.909299
557,-1.260745,-1.045349,2.060804,-0.90544,-0.328743,-0.94995,-0.804695,-1.219422,0.465374,-1.356243,-1.369495,-1.838598
836,0.441838,-0.230262,-0.489302,0.951544,0.969265,-1.664736,-0.804695,-1.219422,-0.42743,2.806044,-0.073751,0.981007


In [26]:
X_train.drop(columns = scaled_num_df.columns, inplace = True)

In [27]:
X_train = pd.concat([scaled_num_df, X_train], axis = 1)

In [28]:
X_train.head()

Unnamed: 0,months_as_customer,policy_deductable,umbrella_limit,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,injury_claim,...,authorities_contacted,incident_state,incident_city,incident_location,property_damage,police_report_available,total_claim_amount,auto_make,auto_model,auto_year
29,0.301419,1.399911,-0.489302,-0.90544,0.969265,-0.94995,-0.804695,1.219422,0.465374,1.335925,...,Police,SC,Northbrook,2100 Francis Drive,NO,NO,91650,Accura,TL,2011
535,-1.243193,-0.230262,-0.489302,-0.90544,-1.587633,0.336664,-0.804695,0.0,-1.320234,-0.199262,...,Other,NY,Arlington,9611 Pine Ridge,NO,YES,52400,Accura,MDX,2005
695,0.722676,1.399911,-0.489302,0.759074,-1.60897,-0.378122,-0.804695,1.219422,-0.42743,-1.470111,...,Police,WV,Riverwood,8742 4th St,NO,NO,2700,Honda,Accord,2006
557,-1.260745,-1.045349,2.060804,-0.90544,-0.328743,-0.94995,-0.804695,-1.219422,0.465374,-1.356243,...,Police,NC,Northbrook,1810 Elm Hwy,NO,YES,5160,Accura,TL,2004
836,0.441838,-0.230262,-0.489302,0.951544,0.969265,-1.664736,-0.804695,-1.219422,-0.42743,2.806044,...,Police,VA,Northbend,2381 1st Hwy,NO,YES,85320,Nissan,Pathfinder,2006


One-hot encoding 

In [40]:

# extract the target variable
y = df['fraud_reported']

# extract the features and encode the categorical variables using one-hot encoding
X = pd.get_dummies(df.drop('fraud_reported', axis=1))



SVM Classification

In [42]:

#Using the standard 80/20 split for the data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#SVM Classification
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)

In [43]:
# accuracy_score, confusion_matrix and classification_report

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

svc_train_acc = accuracy_score(y_train, svc.predict(X_train))
svc_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of Support Vector Classifier is : {svc_train_acc}")
print(f"Test accuracy of Support Vector Classifier is : {svc_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training accuracy of Support Vector Classifier is : 0.76
Test accuracy of Support Vector Classifier is : 0.725
[[145   0]
 [ 55   0]]
              precision    recall  f1-score   support

           0       0.72      1.00      0.84       145
           1       0.00      0.00      0.00        55

    accuracy                           0.73       200
   macro avg       0.36      0.50      0.42       200
weighted avg       0.53      0.72      0.61       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KNN classification

In [44]:
#KNN Classification

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 30)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

In [45]:
# accuracy_score, confusion_matrix and classification_report

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

knn_train_acc = accuracy_score(y_train, knn.predict(X_train))
knn_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of KNN is : {knn_train_acc}")
print(f"Test accuracy of KNN is : {knn_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training accuracy of KNN is : 0.76
Test accuracy of KNN is : 0.725
[[145   0]
 [ 55   0]]
              precision    recall  f1-score   support

           0       0.72      1.00      0.84       145
           1       0.00      0.00      0.00        55

    accuracy                           0.73       200
   macro avg       0.36      0.50      0.42       200
weighted avg       0.53      0.72      0.61       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Decision Tree Classification

In [46]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)

In [47]:
# accuracy_score, confusion_matrix and classification_report

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

dtc_train_acc = accuracy_score(y_train, dtc.predict(X_train))
dtc_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of Decision Tree is : {dtc_train_acc}")
print(f"Test accuracy of Decision Tree is : {dtc_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training accuracy of Decision Tree is : 1.0
Test accuracy of Decision Tree is : 0.705
[[119  26]
 [ 33  22]]
              precision    recall  f1-score   support

           0       0.78      0.82      0.80       145
           1       0.46      0.40      0.43        55

    accuracy                           0.70       200
   macro avg       0.62      0.61      0.61       200
weighted avg       0.69      0.70      0.70       200



Random Forest Classification

In [48]:
from sklearn.ensemble import RandomForestClassifier

rand_clf = RandomForestClassifier(criterion= 'entropy', max_depth= 10, max_features= 'sqrt', min_samples_leaf= 1, min_samples_split= 3, n_estimators= 140)
rand_clf.fit(X_train, y_train)

y_pred = rand_clf.predict(X_test)

In [49]:
# accuracy_score, confusion_matrix and classification_report

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

rand_clf_train_acc = accuracy_score(y_train, rand_clf.predict(X_train))
rand_clf_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of Random Forest is : {rand_clf_train_acc}")
print(f"Test accuracy of Random Forest is : {rand_clf_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training accuracy of Random Forest is : 0.96375
Test accuracy of Random Forest is : 0.72
[[129  16]
 [ 40  15]]
              precision    recall  f1-score   support

           0       0.76      0.89      0.82       145
           1       0.48      0.27      0.35        55

    accuracy                           0.72       200
   macro avg       0.62      0.58      0.59       200
weighted avg       0.69      0.72      0.69       200



XG Boost Classification

In [50]:
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)

In [51]:
# accuracy_score, confusion_matrix and classification_report

xgb_train_acc = accuracy_score(y_train, xgb.predict(X_train))
xgb_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of XgBoost is : {xgb_train_acc}")
print(f"Test accuracy of XgBoost is : {xgb_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training accuracy of XgBoost is : 0.92875
Test accuracy of XgBoost is : 0.73
[[127  18]
 [ 36  19]]
              precision    recall  f1-score   support

           0       0.78      0.88      0.82       145
           1       0.51      0.35      0.41        55

    accuracy                           0.73       200
   macro avg       0.65      0.61      0.62       200
weighted avg       0.71      0.73      0.71       200



Comparing various models 

In [53]:
models = pd.DataFrame({
    'Model' : ['SVC', 'KNN', 'Decision Tree', 'Random Forest', 'XgBoost'],
    'Score' : [svc_test_acc, knn_test_acc, dtc_test_acc, rand_clf_test_acc, xgb_test_acc]
})


models.sort_values(by = 'Score', ascending = False)

Unnamed: 0,Model,Score
4,XgBoost,0.73
0,SVC,0.725
1,KNN,0.725
3,Random Forest,0.72
2,Decision Tree,0.705
