In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import confusion_matrix, accuracy_score,precision_score, roc_auc_score,classification_report,roc_curve,auc
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
import shap
shap.initjs()
import warnings
warnings.filterwarnings('ignore')
import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


# Introduction

Our client is an insurance company that has provided Health Insurance to its customers. Now they need your help in building a model to predict whether the policyholders (customers) from the past year will also be interested in the Vehicle Insurance provided by the company.

An insurance policy is an arrangement by which a company undertakes to provide a guarantee of compensation for specified loss, damage, illness, or death in return for the payment of a specified premium. A premium is a sum of money that the customer needs to pay regularly to an insurance company for this guarantee.

For example, you may pay a premium of Rs. 5000 each year for a health insurance cover of Rs. 200,000/- so that if, God forbid, you fall ill and need to be hospitalised in that year, the insurance provider company will bear the cost of hospitalisation etc. for upto Rs. 200,000. Now if you are wondering how can company bear such high hospitalisation cost when it charges a premium of only Rs. 5000/-, that is where the concept of probabilities comes in picture. For example, like you, there may be 100 customers who would be paying a premium of Rs. 5000 every year, but only a few of them (say 2-3) would get hospitalised that year and not everyone. This way everyone shares the risk of everyone else.

Just like medical insurance, there is vehicle insurance, where every year the customer needs to pay a premium of a certain amount to the insurance provider company so that, in case of an unfortunate accident involving the vehicle, the insurance provider company will provide a compensation (called ‘sum assured’) to the customer.

# Business Goal

Building a model to predict whether a customer would be interested in Vehicle Insurance is extremely helpful for the company because it can then accordingly plan its communication strategy to reach out to those customers and optimize its business model and revenue.


In [None]:
# Load the labelled training data and the test data from the local CSV files.
train = pd.read_csv(r'data/health-insurance-cross-sell-prediction/train.csv')
test = pd.read_csv(r'data/health-insurance-cross-sell-prediction/test.csv')

# Preview the first rows of the training frame.
train.head()

In [None]:
# Report (rows, columns) for the training frame, then the test frame.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Class balance of the target column.
train['Response'].value_counts()

# Columns description

* **id**     Unique ID for the customer
* **Gender**     Gender of the customer
* **Age**         Age of the customer
* **Driving_License**  **0** : Customer does not have DL, **1** : Customer already has DL
* **Region_Code**       Unique code for the region of the customer
* **Previously_Insured**  -  1 : Customer already has Vehicle Insurance, 0 : Customer doesn't have Vehicle Insurance
* **Vehicle_Age**       Age of the Vehicle
* **Vehicle_Damage**  1 : Customer got his/her vehicle damaged in the past. 0 : Customer didn't get his/her vehicle damaged in the past.
* **Annual_Premium**  The amount customer needs to pay as premium in the year
* **PolicySalesChannel**	Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.
* **Vintage**   Number of Days, Customer has been associated with the company
* **Response**  1 : Customer is interested, 0 : Customer is not interested

# **EDA** - Exploratory data analysis

In [None]:
# Summarise the training frame: column names, dtypes and non-null counts.
train.info()

In [None]:
# Number of missing values in each column of the test frame.
test.isnull().sum()

In [None]:
# Remove the customer identifier, which carries no predictive signal.
# `errors='ignore'` keeps the cell safe to re-run after `id` is already gone;
# the redundant `axis=1` is dropped because `columns=` already selects the axis.
train = train.drop(columns='id', errors='ignore')

In [None]:
# Pie chart of (Response, Previously_Insured) combinations as percentages.
fig, ax = plt.subplots(figsize=(12, 6))
pair_counts = train.groupby(['Response'])['Previously_Insured'].value_counts()
pair_counts.plot(kind='pie', autopct='%.0f%%', ax=ax)
plt.show()


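Each tuple is **(Response, Previously_Insured)**:

* **(0,1)** represents customers who were previously insured but are **not** interested this time
* **(1,0)** represents customers who were previously **not** insured but are interested this time
* **(0,0)** represents customers who were neither previously insured nor interested this time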

# Interpretation

* The company should review its communication strategy and investigate why 46% of customers quit
* Customers between 22-30 tend not to buy the service a second time
* Fortunately no missing values



In [None]:
# Customer count per age, split by whether they already had vehicle insurance.
# seaborn >= 0.12 no longer accepts the plotted vector as a positional
# argument (the first positional parameter is `data`), so the columns are
# passed by keyword.
plt.figure(figsize=(15, 5))
sns.countplot(data=train, x='Age', hue='Previously_Insured')
plt.show()

In [None]:
# Correlation of every numeric feature with the target.
# `train` still contains text columns at this point; pandas >= 2.0 raises on
# them in DataFrame.corr(), so only numeric columns are correlated (which is
# what older pandas versions silently did).
numeric_corr = train.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr[['Response']], annot=True)
plt.show()

# Scale & Split & Dummies

In [None]:
# Label-encode, in place, every text column of `train` that has at most two
# distinct values. Text columns with more categories are left untouched.
le = LabelEncoder()
le_count = 0

for col in train.columns:
    is_text = train[col].dtype == 'object'
    # dropna=False mirrors `.unique()`, which counts NaN as a value.
    if is_text and train[col].nunique(dropna=False) <= 2:
        train[col] = le.fit_transform(train[col])
        le_count += 1

# Report once, after the loop, instead of once per encoded column.
print('%d columns were label encoded.' % le_count)

In [None]:
# One-hot encode the remaining text columns of `train`.
train_dummies=pd.get_dummies(train)

In [None]:
# Separate the target from the features.
y = train_dummies['Response']
x = train_dummies.drop(columns='Response')

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=11
)

# Standardise features; the scaler is fitted on the training split only and
# then applied unchanged to the hold-out split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Cat Boost

In [None]:
# Baseline CatBoost classifier trained on the full (imbalanced) data, with the
# hold-out split passed as the evaluation set.
catboost_options = {
    'iterations': 100,
    'learning_rate': 0.001,
    'verbose': 5,
}
clf = CatBoostClassifier(**catboost_options)
clf.fit(X_train, y_train, eval_set=(X_test, y_test))

In [None]:
# Class predictions of the CatBoost model on the hold-out split.
clf_pred=clf.predict(X_test)

In [None]:
# Confusion matrix and accuracy of the baseline CatBoost predictions.
cm = confusion_matrix(y_test, clf_pred)
print(cm)
print(accuracy_score(y_test, clf_pred))

# Draw the matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, ax=ax)
plt.show()

# Model Interpretation
* At first glance we get an 87% accuracy score, but that does not mean we have a good model. The model predicts almost exclusively the majority (negative) class, so the confusion matrix contains practically only true negatives (TN) and false negatives (FN) and essentially no true positives (TP). That problem comes from the imbalanced data.
* Most customers were not interested (86%), while interested customers made up only 14% of the data frame.

### Sub Sample
* In this scenario, our subsample will be a data frame with a 50/50 ratio of interested and non-interested customers. Which means our sub-sample has the same amount of responses.

* Our dataset has 46710 cases of interested customers, so we take 46710 cases of non-interested customers to create our new sub-data frame. We concatenate the 46710 interested and the 46710 non-interested customers to create the new sub-sample.

In [None]:
# Class balance of the target in the full training data.
# The column is passed by keyword because seaborn >= 0.12 no longer accepts
# the plotted vector as a positional argument.
plt.figure(figsize=(12, 6))
sns.countplot(data=train, x='Response')
plt.show()

In [None]:
# Build a balanced 50/50 sub-sample by undersampling the majority class.
# Response == 1 means the customer IS interested (the minority class).
interested = train[train.Response == 1]

# Draw as many non-interested rows as there are interested ones. The count is
# derived from the data instead of being hard-coded, and rows are sampled at
# random (seeded) rather than taken from the top of the file, so the result
# does not depend on row order.
non_interested = train[train.Response == 0].sample(
    n=len(interested), random_state=42
)

df = pd.concat([interested, non_interested])

# Shuffle so the two classes are interleaved.
data = df.sample(frac=1, random_state=42)
data.head()

In [None]:
# Class balance of the target in the undersampled frame `data`.
# The column is passed by keyword because seaborn >= 0.12 no longer accepts
# the plotted vector as a positional argument.
plt.figure(figsize=(12, 6))
sns.countplot(data=data, x='Response')
plt.show()

In [None]:
# One-hot encode the remaining text columns of the balanced sub-sample.
data1=pd.get_dummies(data)

In [None]:
# Separate the target from the features of the balanced sub-sample.
y = data1['Response']
x = data1.drop(columns='Response')

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=11
)

# Standardise features; the scaler is fitted on the training split only and
# then applied unchanged to the hold-out split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Cat Boost

In [None]:

# CatBoost classifier retrained on the balanced sub-sample, with the hold-out
# split passed as the evaluation set.
catboost_options = {
    'iterations': 100,
    'learning_rate': 0.001,
    'verbose': 20,
}
clf = CatBoostClassifier(**catboost_options)
clf.fit(X_train, y_train, eval_set=(X_test, y_test))

In [None]:
# Evaluate the CatBoost model on the hold-out split.
clf_pred = clf.predict(X_test)
print(classification_report(y_test, clf_pred))
print('Accuracy', accuracy_score(y_test, clf_pred))

# Confusion matrix as an annotated heatmap.
cm = confusion_matrix(y_test, clf_pred)
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, ax=ax)
plt.show()

# Light GBM

In [None]:
# LightGBM classifier with early stopping on the hold-out split.
# `n_estimators` is the scikit-learn-API name for the number of boosting
# rounds (the original used its alias `num_iterations`).
params = {
    'learning_rate': 0.005,
    'max_depth': 12,
    'num_leaves': 24,
    'max_bin': 512,
    'n_estimators': 10000,
}
model = lgb.LGBMClassifier(**params)

# Early stopping is requested through a callback: the `early_stopping_rounds`
# keyword of `fit()` was removed in LightGBM 4.0.
# NOTE(review): the same split is used for early stopping and for the final
# evaluation, so the reported scores are optimistic.
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[lgb.early_stopping(stopping_rounds=1000)],
)

In [None]:
# Evaluate the LightGBM model on the hold-out split.
model_pred = model.predict(X_test)
print(classification_report(y_test, model_pred))
print('Accuracy', accuracy_score(y_test, model_pred))

# Confusion matrix as an annotated heatmap.
cm = confusion_matrix(y_test, model_pred)
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, cmap='turbo_r', ax=ax)
plt.show()

# XG Boost

In [None]:
# XGBoost classifier with AUC-based early stopping.
# `eval_metric` and `early_stopping_rounds` are set on the estimator: passing
# them to `fit()` has been deprecated since XGBoost 1.6 and is no longer
# accepted by the 2.x releases.
xg = xgb.XGBClassifier(
    n_estimators=1000,
    eval_metric="auc",
    early_stopping_rounds=20,
)

# The last entry of the evaluation list is the one monitored for early
# stopping, so the hold-out split comes second.
evaluation = [(X_train, y_train), (X_test, y_test)]

xg.fit(X_train, y_train, eval_set=evaluation, verbose=100)

In [None]:
# Evaluate the XGBoost model on the hold-out split.
xg_pred = xg.predict(X_test)
print(classification_report(y_test, xg_pred))
print('Accuracy', accuracy_score(y_test, xg_pred))

# Confusion matrix as an annotated heatmap.
cm = confusion_matrix(y_test, xg_pred)
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, cmap='tab20', ax=ax)
plt.show()

In [None]:
# ROC-AUC must be computed from continuous scores, not from hard 0/1
# predictions; otherwise the "curve" collapses to a single operating point.
# Column 1 of predict_proba is the probability of the positive class.
print('Cat Boost', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
print('XG Boost', roc_auc_score(y_test, xg.predict_proba(X_test)[:, 1]))
print('Light GBM', roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

In [None]:
# ROC curves of the three fitted models on the hold-out split.
# Each curve is built from positive-class probabilities and is labelled with
# its OWN AUC (previously every label reused the CatBoost AUC and no legend
# was drawn).
roc_models = [
    ('Cat Boost', clf, 'red'),
    ('XG Boost', xg, 'yellow'),
    ('Light GBM', model, 'green'),
]

plt.title('Receiver Operating Characteristic')
for model_name, estimator, line_color in roc_models:
    scores = estimator.predict_proba(X_test)[:, 1]
    fpr, tpr, _thresholds = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)
    plt.plot(
        fpr, tpr,
        color=line_color,
        label='%s (AUC = %0.2f)' % (model_name, roc_auc),
    )

# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc='lower right')
plt.show()


# Model Evaluation
* LightGBM performs better than the other models this time.
* Accuracy was 79.8%, and we got an AUC score of 0.80, which we can interpret as good.
* For further interpretation, I used the SHAP library's plots to understand the features' importance and roles.

# Shap

In [None]:
# Build a SHAP explainer for `model` (the LightGBM classifier fitted above),
# passing the scaled training split and the feature names taken from `x`.
explainer = shap.Explainer(model, X_train,feature_names=x.columns.values.tolist())
# Compute SHAP values for every row of the hold-out split.
shap_values = explainer(X_test)
# NOTE(review): the title is set on the current pyplot figure before the SHAP
# plot is drawn; whether it appears on the waterfall depends on how shap
# manages figures -- confirm on the rendered output.
plt.title('Feature Importance')
# Waterfall plot for the first hold-out row only, showing at most 12 features.
shap.plots.waterfall(shap_values[0],max_display=12)



In [None]:
# Beeswarm plot of the SHAP values for all hold-out rows, at most 12 features.
shap.plots.beeswarm(shap_values,max_display=12)

# Conclusion
* Previously insured, Age, and Vehicle Damage are significant features.
* We got lower SHAP values where Previously_Insured equals 1. In other words, most previously insured customers didn't buy the insurance. The company should investigate the reason behind that result; the cause could be the communication strategy, an irrelevant product, etc.
* SHAP values are low where Vehicle_Damage has a low value. We assume this is normal, because people tend not to think about insurance until they have had an accident. The company should consider starting a marketing campaign to address that.
* The annual premium shows a SHAP value of about 0.12, which needs consideration and further research into the pricing strategy.