In [1]:
# General imports
import pandas as pd
import numpy as np
# Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the plotting/ML libraries.
import warnings
warnings.filterwarnings('ignore')

# Apply matplotlib's 'ggplot' style sheet to the figures drawn below.
plt.style.use('ggplot')

In [2]:
# Read the insurance-claims CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer = '../input/auto-insurance-claims-data/insurance_claims.csv')

In [3]:
# Preview the first rows of the loaded data
df.head()

In this case, we know the meaning of the columns. This is good if you want to do an Exploratory Data Analysis (EDA).

In [4]:
# The file marks missing entries with '?'; turn those into real NaN values
df = df.replace(to_replace = '?', value = np.nan)

In [5]:
# Summary statistics of the data
df.describe()

# **Data Pre-Processing**



In [6]:
# Number of missing values in every column
df.isnull().sum()

# **Visualizing Missing Values**

The **Missingno library** offers a convenient way to visualize the distribution of NaN values.<br>
Missingno is a Python library that is compatible with Pandas.

In [7]:
import missingno as msno

# Draw missingno's bar chart for df and render the figure.
msno.bar(df)
plt.show()

# **Handling missing values**

Replace nan values with the **mode.**
**Mode:** it's the value that appears most often.

In [8]:
# Fill the gaps of each listed column with that column's most frequent value (its mode).
for col in ('collision_type', 'property_damage', 'police_report_available'):
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)

In [9]:
# Re-count the missing values after the mode imputation
df.isnull().sum()

### **Now there are no missing values**

# **Dropping columns** 

Some of them are not necessary for prediction.

In [10]:
# Number of distinct values in each column
df.nunique()

In [11]:
# Columns removed before modelling
to_drop = [
    'policy_number', 'policy_bind_date', 'policy_state', 'insured_zip',
    'incident_location', 'incident_date', 'incident_state', 'incident_city',
    'insured_hobbies', 'auto_make', 'auto_model', 'auto_year', '_c39',
]

df = df.drop(columns = to_drop)

In [12]:
# Checking for multicollinearity between the numeric columns
plt.figure(figsize = (18, 12))

# df still contains string columns at this point; recent pandas versions raise
# when corr() meets them, so the correlation is computed on numeric columns only.
corr = df.select_dtypes(include = 'number').corr()
# Boolean mask that is True on the upper triangle (diagonal included), so each
# pair of columns is drawn only once.
mask = np.triu(np.ones_like(corr, dtype = bool))

sns.heatmap(data = corr, mask = mask, annot = True, fmt = '.2g', linewidth = 1)
plt.show()

### **Correlation Conclusions**

From the above plot, we can see that there is high correlation between 'age' and 'months_as_customer'.

**We will drop the "Age" column.**

There is also high correlation between 'total_claim_amount' and 'injury_claim', 'property_claim', 'vehicle_claim', as the total claim is the sum of the other three.

**We will drop the total claim column.**

In [13]:
# Remove the two columns singled out by the correlation analysis
df = df.drop(columns = ['age', 'total_claim_amount'])

In [14]:
# Target column first, then every remaining column as the feature matrix
y = df['fraud_reported']
X = df.drop(columns = ['fraud_reported'])

## **Encoding Categorical columns**

We need to transform categorical values to numerical values.
XGBoost works with numerical values.

In [15]:
# Keep only the object-typed (categorical) feature columns
cat_df = X.select_dtypes(include = 'object')

In [16]:
# Preview the first rows of the categorical columns
cat_df.head()

In [17]:
# Show the distinct values found in every categorical column
for col, values in cat_df.items():
    print(f"{col}: \n{values.unique()}\n")

In [18]:
# One-hot encode the categorical columns; drop_first omits the first level of each column
cat_df = pd.get_dummies(data = cat_df, drop_first = True)

In [19]:
# Preview the encoded indicator columns
cat_df.head()

In [20]:
# Extracting the numerical columns.
# Selecting on 'number' instead of only 'int64' keeps float features as well;
# the int64-only filter silently discarded them (presumably the float-valued
# policy_annual_premium in this dataset — verify against df.dtypes).
# NOTE(review): a float column kept here is not part of the hand-written list of
# columns that gets standardized later; add it there if it should be scaled.
num_df = X.select_dtypes(include = 'number')

In [21]:
# Final feature matrix: numeric columns followed by the encoded categorical ones
X = pd.concat(objs = [num_df, cat_df], axis = 'columns')

In [22]:
# Preview the combined feature matrix
X.head()

## Data distribution

See the distribution of each column

In [23]:
# Distribution of the first 24 feature columns on a 5 x 5 grid.
plt.figure(figsize = (25, 20))

# Slice the columns up front instead of looping over every column and skipping
# the ones past the 24th.
for plotnumber, col in enumerate(X.columns[:24], start = 1):
    ax = plt.subplot(5, 5, plotnumber)
    # sns.distplot is deprecated; histplot with a KDE overlay on a density scale
    # is the replacement seaborn documents for it. The cast to float lets
    # indicator columns be plotted as plain 0/1 numbers.
    sns.histplot(X[col].astype(float), kde = True, stat = 'density', ax = ax)
    plt.xlabel(col, fontsize = 15)

plt.tight_layout()
plt.show()

## **Outliers Detection**

See if we need to correct some columns outliers.

In [24]:
# Box plot of the first 24 feature columns on a 5 x 5 grid, to spot outliers.
plt.figure(figsize = (20, 15))

# Slice the columns up front instead of looping over every column and skipping
# the ones past the 24th.
for plotnumber, col in enumerate(X.columns[:24], start = 1):
    ax = plt.subplot(5, 5, plotnumber)
    # Pass the values as x= explicitly: current seaborn reads a positional Series
    # as `data`, which flips the box orientation while the label stays on the
    # x axis. The cast to float lets indicator columns be plotted as 0/1 numbers.
    sns.boxplot(x = X[col].astype(float), ax = ax)
    plt.xlabel(col, fontsize = 15)

plt.tight_layout()
plt.show()

# Scale Data

**Outliers are present in some numerical columns, so we need to scale the numerical columns.**

In [25]:
# Splitting data into training set (75%) and test set (25%)
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [26]:
# Preview the training features
X_train.head()

In [27]:
# Training-set columns that will be standardized
num_df = X_train.loc[:, [
    'months_as_customer', 'policy_deductable', 'umbrella_limit',
    'capital-gains', 'capital-loss', 'incident_hour_of_the_day',
    'number_of_vehicles_involved', 'bodily_injuries', 'witnesses',
    'injury_claim', 'property_claim', 'vehicle_claim',
]]

In [28]:
# Standardize the selected training columns
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Learn the statistics from the training rows, then apply them to those same rows
scaler.fit(num_df)
scaled_data = scaler.transform(num_df)

In [29]:
# Wrap the scaled array in a DataFrame that reuses the training index and column names
scaled_num_df = pd.DataFrame(scaled_data, index = X_train.index, columns = num_df.columns)
scaled_num_df.head()

In [30]:
# Remove the unscaled versions of the columns that were just standardized
X_train = X_train.drop(columns = scaled_num_df.columns)

In [31]:
# Create new X_train with the scaled data.
X_train = pd.concat([scaled_num_df, X_train], axis = 1)

# Give X_test the same treatment: apply the scaler that was fitted on the
# training rows (transform only, no re-fit) to the same columns, and arrange the
# columns in the same order as X_train. Without this step the models are trained
# on standardized features but asked to predict on raw ones.
scaled_cols = scaled_num_df.columns
scaled_test_df = pd.DataFrame(data = scaler.transform(X_test[scaled_cols]),
                              columns = scaled_cols, index = X_test.index)
X_test = pd.concat([scaled_test_df, X_test.drop(columns = scaled_cols)], axis = 1)

In [32]:
# Preview the training features after scaling
X_train.head()

# **SVM Classifier**

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings, fitted on the training split
svc = SVC().fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = svc.predict(X_test)

In [None]:
# Score the SVC predictions: accuracy, confusion matrix and classification report
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

svc_test_acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy of Support Vector Classifier is : {svc_test_acc}")

for make_report in (confusion_matrix, classification_report):
    print(make_report(y_test, y_pred))

# **KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using 30 neighbours, fitted on the training split
knn = KNeighborsClassifier(n_neighbors = 30).fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = knn.predict(X_test)

In [None]:
# Score the KNN predictions: accuracy, confusion matrix and classification report
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

knn_test_acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy of KNN is : {knn_test_acc}")

for make_report in (confusion_matrix, classification_report):
    print(make_report(y_test, y_pred))

# **Decision Tree Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters. random_state is fixed so the
# fitted tree (and the scores derived from it) can be reproduced on a re-run.
dtc = DecisionTreeClassifier(random_state = 42)
dtc.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = dtc.predict(X_test)

In [None]:
# Score the decision-tree predictions: accuracy, confusion matrix and classification report
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

dtc_test_acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy of Decision Tree is : {dtc_test_acc}")

for make_report in (confusion_matrix, classification_report):
    print(make_report(y_test, y_pred))

# **GridSearch**

Exhaustive search over specified parameter values for an estimator.

Best hyperparameters for Decision Tree Classifier.

In [None]:
# Hyperparameter tuning of the decision tree with a 5-fold cross-validated grid search
from sklearn.model_selection import GridSearchCV

# Candidate values tried for each decision-tree hyperparameter
grid_params = dict(
    criterion = ['gini', 'entropy'],
    max_depth = [3, 5, 7, 10],
    min_samples_split = range(2, 10),
    min_samples_leaf = range(2, 10),
)

grid_search = GridSearchCV(estimator = dtc, param_grid = grid_params, cv = 5, n_jobs = -1, verbose = 1)
grid_search.fit(X_train, y_train)

In [None]:
# Winning parameter combination and its score, one per line
print(grid_search.best_params_, grid_search.best_score_, sep = '\n')

In [None]:
# Keep the tuned tree under the name `dtc`
dtc = grid_search.best_estimator_

# With the default refit, the search object predicts through that same best estimator
y_pred = grid_search.predict(X_test)

In [None]:
# Score the tuned decision tree: accuracy, confusion matrix and classification report
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

dtc_test_acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy of Decision Tree is : {dtc_test_acc}")

for make_report in (confusion_matrix, classification_report):
    print(make_report(y_test, y_pred))

# **Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with hand-picked hyperparameters. random_state is fixed so the
# forest, and the scores reported for it, are reproducible.
rand_clf = RandomForestClassifier(criterion = 'entropy', max_depth = 10, max_features = 'sqrt',
                                  min_samples_leaf = 1, min_samples_split = 3, n_estimators = 140,
                                  random_state = 42)
rand_clf.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = rand_clf.predict(X_test)

In [None]:
# Score the random forest on both splits, then report on the test predictions
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

rand_clf_test_acc = accuracy_score(y_test, y_pred)
rand_clf_train_acc = accuracy_score(y_train, rand_clf.predict(X_train))

print(f"Training accuracy of Random Forest is : {rand_clf_train_acc}")
print(f"Test accuracy of Random Forest is : {rand_clf_test_acc}")

for make_report in (confusion_matrix, classification_report):
    print(make_report(y_test, y_pred))

# **Gradient Boosting Classifier**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default hyperparameters; random_state fixed for reproducibility.
gb = GradientBoostingClassifier(random_state = 42)
gb.fit(X_train, y_train)

# Predict once and reuse the result for all three metrics
# (the original re-ran predict for each of them).
gb_pred = gb.predict(X_test)

# accuracy score, confusion matrix and classification report of gradient boosting classifier
gb_acc = accuracy_score(y_test, gb_pred)

print(f"Test Accuracy of Gradient Boosting Classifier is {gb_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, gb_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, gb_pred)}")

# **Stochastic Gradient Boosting (SGB)**

In [None]:
# Stochastic variant: each tree is fitted on 90% of the rows and each split
# considers 70% of the features. Both samplings are random, so random_state is
# fixed to keep the reported scores reproducible.
sgb = GradientBoostingClassifier(subsample = 0.90, max_features = 0.70, random_state = 42)
sgb.fit(X_train, y_train)

# Predict once and reuse the result for all three metrics
sgb_pred = sgb.predict(X_test)

sgb_acc = accuracy_score(y_test, sgb_pred)

print(f"Test Accuracy of Stochastic Gradient Boosting is {sgb_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, sgb_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, sgb_pred)}")

# **XgBoost Classifier**

In [None]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Recent xgboost releases reject non-numeric class labels, and the target here is
# still the raw fraud_reported column (presumably 'Y'/'N' strings — verify).
# Encode the labels to integers for fitting and decode the predictions back, so
# y_pred stays directly comparable with y_test.
label_encoder = LabelEncoder().fit(y_train)

xgb = XGBClassifier()
xgb.fit(X_train, label_encoder.transform(y_train))

y_pred = label_encoder.inverse_transform(xgb.predict(X_test))

In [None]:
# Score the XgBoost predictions: accuracy, confusion matrix and classification report
xgb_test_acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy of XgBoost is : {xgb_test_acc}")

for make_report in (confusion_matrix, classification_report):
    print(make_report(y_test, y_pred))

# **Cat Boost Classifier**

In [None]:
from catboost import CatBoostClassifier

# CatBoost classifier created with iterations=10 and fitted on the training split.
# The bare fit(...) call is the cell's last expression, so its return value is displayed.
cat = CatBoostClassifier(iterations=10)
cat.fit(X_train, y_train)

In [None]:
# Evaluate the CatBoost model: the test predictions are computed once and reused below
cat_test_pred = cat.predict(X_test)
cat_acc = accuracy_score(y_test, cat_test_pred)

print(f"Training Accuracy of Cat Boost Classifier is {accuracy_score(y_train, cat.predict(X_train))}")
print(f"Test Accuracy of Cat Boost Classifier is {cat_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, cat_test_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, cat_test_pred)}")

# **Extra Trees Classifier**

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble with default hyperparameters; random_state fixed so the
# randomized trees, and the scores reported for them, are reproducible.
etc = ExtraTreesClassifier(random_state = 42)
etc.fit(X_train, y_train)

# Predict once and reuse the result for all three metrics
etc_pred = etc.predict(X_test)

# accuracy score, confusion matrix and classification report of extra trees classifier
etc_acc = accuracy_score(y_test, etc_pred)

print(f"Test Accuracy of Extra Trees Classifier is {etc_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, etc_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, etc_pred)}")

# **LGBM Classifier**

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier fitted on the training split.
# NOTE(review): learning_rate = 1 is unusually high — confirm it is intentional.
lgbm = LGBMClassifier(learning_rate = 1)
lgbm.fit(X_train, y_train)

# Test predictions, computed once and shared by the three metrics below
lgbm_pred = lgbm.predict(X_test)
lgbm_acc = accuracy_score(y_test, lgbm_pred)

print(f"Test Accuracy of LGBM Classifier is {lgbm_acc} \n")

print(f"{confusion_matrix(y_test, lgbm_pred)}\n")
print(classification_report(y_test, lgbm_pred))

# **Voting Classifier**

In [None]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs combined by the voting ensemble.
# The original list also contained ('Ada Boost', ada), but no `ada` model is
# created anywhere in this notebook, so the cell failed with a NameError; that
# entry is left out. Train an AdaBoost model first if it should take part.
classifiers = [('Support Vector Classifier', svc), ('KNN', knn), ('Decision Tree', dtc), ('Random Forest', rand_clf),
               ('XGboost', xgb), ('Gradient Boosting Classifier', gb), ('SGB', sgb),
               ('Cat Boost', cat), ('Extra Trees Classifier', etc), ('LGBM', lgbm)]

# Voting ensemble with its default voting settings, fitted on the training split
vc = VotingClassifier(estimators = classifiers)
vc.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = vc.predict(X_test)

In [None]:
# Score the voting-ensemble predictions: accuracy, confusion matrix and classification report
vc_test_acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy of Voting Classifier is : {vc_test_acc}")

for make_report in (confusion_matrix, classification_report):
    print(make_report(y_test, y_pred))

# **Models Comparison**

In [None]:
# One (model name, test accuracy) record per trained model
model_scores = [
    ('SVC', svc_test_acc),
    ('KNN', knn_test_acc),
    ('Decision Tree', dtc_test_acc),
    ('Random Forest', rand_clf_test_acc),
    ('Gradient Boost', gb_acc),
    ('SGB', sgb_acc),
    ('Cat Boost', cat_acc),
    ('Extra Trees', etc_acc),
    ('LGBM', lgbm_acc),
    ('XgBoost', xgb_test_acc),
    ('Voting Classifier', vc_test_acc),
]
models = pd.DataFrame(model_scores, columns = ['Model', 'Score'])

# Display the table ordered from best to worst score
models.sort_values(by = 'Score', ascending = False)

In [None]:
# Horizontal bar chart of the test scores, one bar per model, coloured by score
px.bar(models, x = 'Score', y = 'Model', color = 'Score',
       title = 'Models Comparison', template = 'plotly_dark')
