In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from xgboost.sklearn import XGBClassifier
import os
# os.chdir(r"/home/deathblade287/Dropbox/ML/Projects/utils")
from utils import define_y_axis, auto_dropper, label_encoder_auto, last
# sklearn is the import name of scikit-learn, an open-source machine-learning library for Python

In [None]:
# Switch the working directory to the project folder; the CSV is read later
# through a path relative to this directory.
# NOTE(review): this is an absolute path on one machine, so the notebook
# stops here on any other setup.
os.chdir(r"/home/deathblade287/Dropbox/ML/Projects/insurance_claims")

# Shared preprocessing objects reused by the cells below.
le=LabelEncoder()
scaler=StandardScaler()
# NOTE(review): xgb1 is not referenced again in the visible cells; the model
# that gets trained is built separately from `params`.
xgb1=XGBClassifier()


# LabelEncoding refers to assigning numeric values to categorical variables (for eg male/female , Yes/No, Sun/Mon/../Sat, etc)
# Scaling lets just park for next class

# We will use XGBoost algorithm for this
# Regression is when we have to predict continuous values for eg 1,1.5,1.6,1.7,1.77,...,2,2.1,2.2,2.3,...,5
# Classification is when we have to predict classes - car/bike , male/female , samsung/iphone/microsoft, etc

#In this case of Fraud, we have to create model which will predict whether it is fraud or not fraud

In [None]:
# XGBoost - eXtreme Gradient Boosting

In [None]:
# read data from csv (the path is relative to the current working directory)

data = pd.read_csv(r"insurance_claims_0723_full.csv")

# pd.read_csv returns a pandas DataFrame.
# define_y_axis comes from the local utils module. Judging by this call it
# hands back the frame together with the "fraud_reported" label as y.
# TODO confirm whether it also removes that column from data.
data, y = define_y_axis(data, "fraud_reported")

# auto_dropper is also from utils; its return value is only printed here for
# inspection.
# NOTE(review): cannot tell from this notebook whether it modifies data in
# place as well; verify in utils.
all_column = auto_dropper(data, y)
print(all_column)
print(type(all_column))

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
data.info()

In [None]:
# Keep the current column index and show how many columns there are.
cols = data.columns
len(cols)

In [None]:
# Derive two numeric features from the incident date: the day of the month and
# the day of the week (0 = Monday ... 6 = Sunday).
data["incident_date"] = pd.to_datetime(data["incident_date"])
incident_dt = data["incident_date"].dt
data["incident_monthdate"] = incident_dt.day
data["incident_weekday"] = incident_dt.dayofweek

# The raw date itself is not used as a feature once the parts are extracted.
data = data.drop(columns=["incident_date"])

In [None]:
# Display the derived day-of-month column.
data["incident_monthdate"]

In [None]:
# Display the derived day-of-week column.
data["incident_weekday"]

In [None]:
# Split the "100/200"-style policy_csl value into two columns: one with the
# part before the "/" and one with the part after it.
# Both new columns hold strings at this point.
csl = data["policy_csl"].str.split("/", n=1, expand=True)
data["lower_csl"] = csl[0]
data["upper_csl"] = csl[1]


# Use "Others" instead of the "?" placeholder, then label-encode.
# Series.replace returns a new Series instead of changing the column in place,
# so its result has to be passed on; calling it as a bare statement (as this
# cell used to) silently left the "?" values untouched.
data["collision_type"] = le.fit_transform(
    data["collision_type"].replace("?", "Others")
)

data["property_damage"] = le.fit_transform(
    data["property_damage"].replace("?", "Others")
)


In [None]:
# Display the collision_type column after label encoding.
data["collision_type"]

In [None]:
# this is to use labelencoder wherever the cols have categorical data (not numeric data)

# data['insured_education_level']=le.fit_transform(data['insured_education_level'])
# data['insured_occupation']=le.fit_transform(data['insured_occupation'])
# data['insured_hobbies']=le.fit_transform(data['insured_hobbies'])
# data['insured_relationship']=le.fit_transform(data['insured_relationship'])
# data['incident_type']=le.fit_transform(data['incident_type'])
# data['incident_severity']=le.fit_transform(data['incident_severity'])
# data['authorities_contacted']=le.fit_transform(data['authorities_contacted'])
# data['incident_state']=le.fit_transform(data['incident_state'])
# data['police_report_available']=le.fit_transform(data['police_report_available'])
# data['auto_make']=le.fit_transform(data['auto_make'])
# data['auto_model']=le.fit_transform(data['auto_model'])
# data['auto_year']=le.fit_transform(data['auto_year'])

# label_encoder_auto()

# os.chdir(path)
# Label encoding: turn every remaining non-numeric column into integer codes.
#
# Columns are picked by dtype rather than by checking each value with
# `type(x) == int`. The per-value check treated floats as "not int", so every
# float column was label-encoded too, replacing real numeric values with
# category codes. Selecting on dtype leaves int and float columns alone and
# encodes only the rest (strings/objects, booleans, ...).
le = LabelEncoder()
columns_encoded = []  # names of the columns that were encoded, in frame order
for column_now in data.select_dtypes(exclude="number").columns:
    data[column_now] = le.fit_transform(data[column_now])
    columns_encoded.append(column_now)

In [None]:
# dropping those columns which we think will have NO impact/effect on fraud

# data=data.drop("_c39",axis=1)


# data=data.drop("policy_number",axis=1)
# data=data.drop("policy_bind_date",axis=1)
# data=data.drop("policy_csl",axis=1)
# data=data.drop("insured_zip",axis=1)
# data=data.drop("insured_sex",axis=1)
# data=data.drop("incident_city",axis=1)
# data=data.drop("incident_hour_of_the_day",axis=1)
# data=data.drop("incident_location",axis=1)
# data=data.drop("injury_claim",axis=1)
# data=data.drop("property_claim",axis=1)
# data=data.drop("age",axis=1)
# data=data.drop("policy_state",axis=1)
# data=data.drop("policy_deductable",axis=1)
# data=data.drop("umbrella_limit",axis=1)
# data=data.drop("insured_education_level",axis=1)
# data=data.drop("insured_relationship",axis=1)
# data=data.drop("capital-gains",axis=1)
# data=data.drop("capital-loss",axis=1)
# data=data.drop("incident_type",axis=1)
# data=data.drop("collision_type",axis=1)
# data=data.drop("authorities_contacted",axis=1)
# data=data.drop("incident_state",axis=1)
# data=data.drop("number_of_vehicles_involved",axis=1)
# data=data.drop("property_damage",axis=1)
# data=data.drop("witnesses",axis=1)
# data=data.drop("auto_model",axis=1)
# data=data.drop("auto_year",axis=1)
# data=data.drop("lower_csl",axis=1)
# data=data.drop("upper_csl",axis=1)
# data=data.drop("incident_monthdate",axis=1)
# Called again after the encoding steps, with the same arguments as when the
# data was loaded.
# NOTE(review): if define_y_axis removes "fraud_reported" from data, this
# second call has no label column left to find; if it does not remove it, the
# label is still among the feature columns that get scaled and used for
# training. Confirm against utils which of the two applies.
data, y = define_y_axis(data, "fraud_reported")
# NOTE(review): the return value is discarded here (the earlier call stored it
# in all_column), so this line only matters if auto_dropper changes data in
# place. TODO confirm.
auto_dropper(data, y)

In [None]:
# Show summary statistics (count, mean, std, min, quartiles, max) per column.
data.describe()
# .describe is used to view/analyze data statistically

# 1. Identify outliers with the help of mean, std, max - values that are too high or too low
# 2. See if these outliers have a relation with the label (fraud or not fraud)
# 3. If they have no relation, which means that these outliers are not frauds, then we can remove the rows
# to avoid outliers in our data

In [None]:
# Show the rows whose total claim amount is above 100000 (outlier candidates).
data[data["total_claim_amount"]>100000]

In [None]:
# Remember the feature column names now: scaling below replaces the frame, and
# these names are used later to label the feature-importance chart.
cols_refined = data.columns
cols_refined

In [None]:
# we need to apply scaling before splitting into train_test

In [None]:
# Standardise every column to zero mean and unit variance.
# By default fit_transform returns a NumPy array, so `data` is no longer a
# DataFrame after this line; the column names are kept in cols_refined.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows contribute to the mean/std. Fitting on the training split
# only would avoid that.
data = scaler.fit_transform(data)

In [None]:
# Display the scaled feature matrix.
data

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible from run to run.
seed, test_size = 9, 0.3
split_parts = train_test_split(
    data,
    y,
    stratify=y,
    random_state=seed,
    test_size=test_size,
)
X_train, X_test, y_train, y_test = split_parts

# stratify=y keeps the proportion of each label (fraud yes/no in our case) the
# same in the train and test parts; without it the label mix in the two parts
# can differ from the mix in the full data.

In [None]:
# Hyper-parameters handed to XGBClassifier when the model is built.
params = dict(
    objective="binary:logistic",  # two-class problem: fraud / not fraud
    max_depth=4,                  # maximum depth of each tree
    booster="gbtree",             # tree-based booster
    learning_rate=0.09,           # shrinkage applied to each new tree
    n_estimators=5,               # number of boosting rounds (trees)
)

In [None]:
# Build the classifier from `params` and train it on the training split.
model_data_xgb_1 = XGBClassifier(**params).fit(X_train, y_train)

# using the .fit function means training the algorithm
# with booster="gbtree" the model is a sum of decision trees rather than a set
# of theta coefficients per column, so training works roughly like this:
# 1. Start from a constant initial prediction
# 2. Calculate the gradient of the cost function for the current predictions
# 3. Fit a new small tree to that gradient and add it, scaled by learning_rate
# 4. Repeat the above steps until n_estimators trees have been built

In [None]:
# Predict a label for every row of the test split.
y_pred_data_xgb_1 = model_data_xgb_1.predict(X_test)

# capturing the predictions of test data
# test data is what the algorithm has NOT used for training purposes
# it means that the model doesn't know the actual answers; it predicts on the basis of the trees it built during training


In [None]:
# Accuracy is calculated by comparing the actual labels with the predicted
# labels of the test data; it is reported here as a percentage.
accuracy_pct = accuracy_score(y_test, y_pred_data_xgb_1) * 100
print("Accuracy for XGBoost model: %.2f" % accuracy_pct)

In [None]:
# One importance score per input feature, in the same order as the columns
# that were scaled and passed to training.
print(model_data_xgb_1.feature_importances_)

In [None]:
import matplotlib.pyplot as plt

# Plot the importance of every feature against its column name.
# Horizontal bars are used because there are many columns: names on the y-axis
# stay readable, whereas on the x-axis of a vertical bar chart they overlap.
# The figure height grows with the number of features for the same reason.
fig, ax = plt.subplots(figsize=(8, max(4, 0.25 * len(cols_refined))))
ax.barh(cols_refined, model_data_xgb_1.feature_importances_)
ax.set_xlabel("Feature importance")
ax.set_ylabel("Feature")
ax.set_title("XGBoost feature importances")
fig.tight_layout()
plt.show()

In [None]:
# we use pickle library to store/save the model
# .pkl file

In [None]:
# How to use the model in the actual environment
# 1. Store the model (pickle)
# 2. Create an application (for eg website) which takes 12 cols from the customer while he/she logs a claim
# 3. apply the model on 12 cols (.predict)
# 4 finally the outcome is either of the class - fraud or not fraud
# 5. an action can be taken for eg, an email can be sent automatically to the investigation team (insurance company)

In [None]:
# Next class
# 1. Explain parameters
# 2. Scaling
# 3. deleting rows
# 4. demonstrate complete cycle of applying model
# 5. how to use .describe function


# The standard score of a sample x is calculated as:
# z = (x - u) / s


# where u is the mean of the training samples
# and s is the standard deviation of the training samples

# standard scaling rescales each column so that it has mean 0 and standard deviation 1

In [None]:
# Assignment for July 30th is to try and create model with all cols - dont delete any col
# observe the feature_importance when you input all cols
# dont forget to do label encoding and scaling on all cols
# take reference from above example

# In fact, change any parameter and see how the accuracy increases or decreases