# Compare various models with dalex ft. h2o, autokeras, catboost, lightgbm

## Health Insurance Cross Sell Prediction

In [None]:
import dalex as dx
import pandas as pd
import numpy as np
import sklearn
import tensorflow as tf
import autokeras as ak
import kerastuner as kt
import h2o
import catboost
import lightgbm

import warnings
warnings.filterwarnings('ignore')

In [None]:
# session info: one-row table with the installed version of every library used below
pkg_dict = {
    # a module's import name is available directly as `__name__`,
    # so there is no need to parse it out of the module's repr string
    pkg.__name__: pkg.__version__
    for pkg in [dx, pd, np, sklearn, tf, ak, kt, h2o, catboost, lightgbm]
}
pd.DataFrame(pkg_dict, index=["version"])

# 1. data

data (train.csv) from: https://www.kaggle.com/anmolkumar/health-insurance-cross-sell-prediction?select=train.csv

In [4]:
# Load the Kaggle training file (expected next to the notebook as "train.csv"),
# then show the column schema and the first rows.
data = pd.read_csv("train.csv")
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


We have about 381k observations, 11 variables, and a binary target. For the purpose of this comparison, let's use 10% of the data.

Clean data:

remove id, Policy_Sales_Channel, Region_Code

convert Vehicle_Age to integer

convert Gender, Vehicle_Damage to binary

investigate Driving_License, Annual_Premium

In [None]:
from sklearn.model_selection import train_test_split

# Keep a 10% sample, stratified on the target so the class ratio is preserved;
# the remaining 90% is discarded.
# NOTE: `data` is overwritten with its own sample, so re-running this cell
# without reloading the CSV samples the sample again.
data, _ = train_test_split(
    data,
    train_size=0.1,
    random_state=1,
    stratify=data["Response"],
)

In [None]:
# drop the identifier and the two code-like columns listed in the cleaning plan
unused_columns = ["id", "Region_Code", "Policy_Sales_Channel"]
data = data.drop(columns=unused_columns)

In [None]:
# convert three columns
print(data.Vehicle_Age.unique())

# Spell out every mapping explicitly. `unique()` returns values in order of first
# appearance, which depends on how the sample was shuffled, so pairing it
# positionally with [2, 1, 0] could silently permute the ordinal encoding.
category_codes = {
    "Gender": {"Male": 1, "Female": 0},
    "Vehicle_Damage": {"Yes": 1, "No": 0},
    "Vehicle_Age": {"< 1 Year": 0, "1-2 Year": 1, "> 2 Years": 2},
}
data = (
    data
    .replace(category_codes)
    # cast explicitly so the encoded columns are integer-typed no matter how
    # `replace` infers the resulting dtype; this also fails loudly if an
    # unexpected (unmapped) label is still present
    .astype({column: "int64" for column in category_codes})
)

In [None]:
# what about Driving License in selling the vehicle insurance?
# share of people who hold a driving license
print(data["Driving_License"].mean())

# response rate among people without a Driving License
# (the original run reported about 5%)
without_license = data["Driving_License"] == 0
print(data.loc[without_license, "Response"].mean())

# let's remove this variable for clarity
# in the model we could assign: IF Driving_License == 0 THEN Response = 0
data = data.drop(columns="Driving_License")

In [None]:
# what about the distribution of Annual_Premium?
import matplotlib.pyplot as plt

# log-scaled counts, so sparsely populated bins stay visible
fig, ax = plt.subplots()
_ = ax.hist(data.Annual_Premium, bins='auto', log=True)
plt.show()

In [None]:
# where is the peak? check the smallest premium first
premium_floor = data.Annual_Premium.min()
print(premium_floor)

# a lot of the same values: count the rows sitting exactly at the minimum
print((data.Annual_Premium == premium_floor).sum())

# some very big values (0.2% above 100k)
print((data.Annual_Premium > 100000).sum() / len(data))

In [None]:
# let's make a variable indicating the baseline, and move the annual premium
# Compute the minimum once and reuse it everywhere below, instead of mixing the
# lambda argument with the outer `data` and a hard-coded copy of the minimum.
premium_min = data.Annual_Premium.min()
data = data.assign(
    # 1 where the premium sits exactly at the minimum (the baseline), else 0;
    # evaluated before Annual_Premium is shifted because assign applies keywords in order
    Annual_Premium_Baseline=lambda x: (x.Annual_Premium == premium_min).astype(int),
    # shift the premium so the baseline becomes 0
    Annual_Premium=lambda x: x.Annual_Premium - premium_min,
)

# for the sake of this comparison, let's remove heavy outliers as well
# (the cut-off is 100000 on the original scale, expressed on the shifted scale)
data = data[data.Annual_Premium < 100000 - premium_min]
_ = plt.hist(data.Annual_Premium, bins='auto')
plt.show()

## split

In [None]:
# (rows, columns) remaining after subsampling and cleaning
data.shape

In [None]:
# mean of the 0/1 target, i.e. the fraction of rows with Response == 1
data.Response.mean() # uneven target

In [None]:
# separate the target from the features
y = data["Response"]
X = data.drop(columns="Response")

# hold out 25% for evaluation, stratified so both parts keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
    stratify=y,
)

In [None]:
# quick look at the features the models will be trained on
X_train.head()

# 2. Models

### baseline

In [None]:
# LightGBM with DART boosting as the reference model; `is_unbalance=True`
# asks LightGBM to account for the uneven target
model_baseline = lightgbm.LGBMClassifier(
    boosting_type="dart",
    n_estimators=1000,
    is_unbalance=True,
)
model_baseline.fit(X_train, y_train)

# wrap the fitted model in a dalex explainer evaluated on the hold-out set
exp_baseline = dx.Explainer(
    model_baseline,
    X_test,
    y_test,
    label="lgbm_dart",
    verbose=False,
)
exp_baseline.model_performance()

### H2O

In [None]:
# Start (or connect to) the H2O backend used by the next cells.
# NOTE(review): nthreads=-1 presumably means "use all available cores" and
# max_mem_size=16 presumably means 16 GB — confirm against the h2o.init docs.
_ = h2o.init(nthreads=-1, max_mem_size=16)
# presumably silences H2O progress bars in the cell output — confirm
h2o.no_progress()

In [None]:
# H2O trains from a single frame holding both the features and the target
df = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))

# Mark the target as categorical so H2O treats this as classification.
# Use item assignment, the column-assignment form shown in the H2O docs, so the
# converted column is written back into the frame itself.
df["Response"] = df["Response"].asfactor()

model_h2o = h2o.estimators.H2ORandomForestEstimator(ntrees=1000,
                                                    nfolds=3,
                                                    balance_classes=True,
                                                    seed=1)
model_h2o.train(x=X_train.columns.to_list(),
                y="Response",
                training_frame=df)

In [None]:
# the H2O model predicts on H2OFrames, so convert the hold-out features first
X_test_h2o = h2o.H2OFrame(X_test)
exp_h2o = dx.Explainer(
    model_h2o,
    X_test_h2o,
    y_test,
    model_type='classification',
    label="h2o_rf",
    verbose=False,
)
exp_h2o.model_performance()

### Autokeras

In [None]:
from sklearn.utils import class_weight

# Balanced class weights keyed by the actual class label.
# The weights come back in the same order as `classes`, so the labels are sorted
# and zipped with their weights explicitly. Pairing `enumerate` positions with
# `y_train.unique()` (order of first appearance) would swap the two weights
# whenever the first training row belongs to class 1.
label_values = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                     classes=label_values,
                                                     y=y_train)
weights = dict(zip(label_values.tolist(), balanced_weights))
weights

In [None]:
import logging

# only let TensorFlow report errors while the search is running
tf.get_logger().setLevel(logging.ERROR)

# metrics tracked for every trial; "auc" is the one the search maximises
search_metrics = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.AUC(name='auc'),
]

# NOTE(review): from_logits=True assumes the network's output layer emits raw
# logits; if AutoKeras appends a sigmoid to the binary head, this loss would be
# mismatched — confirm against the installed AutoKeras version.
model_autokeras = ak.StructuredDataClassifier(
    num_classes=2,
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=search_metrics,
    objective=kt.Objective("auc", direction="max"),
    tuner="random",
    max_trials=5,
    seed=1,
    overwrite=True,
)

# a quarter of the training data is held back for validation during the search;
# `weights` are the per-class weights computed in the previous cell
model_autokeras.fit(
    X_train,
    y_train,
    validation_split=0.25,
    class_weight=weights,
    epochs=20,
    verbose=0,
)

In [None]:
# export the model found by the AutoKeras search and print its layer summary
model_keras = model_autokeras.export_model()
model_keras.summary()

In [None]:
# explain the exported Keras model on the hold-out set
exp_keras = dx.Explainer(
    model_keras,
    X_test,
    y_test,
    model_type="classification",
    label="autokeras",
    verbose=False,
)
exp_keras.model_performance()

### Catboost

In [None]:
from sklearn.utils import class_weight

# Balanced class weights keyed by the actual class label.
# The labels are sorted and zipped with their weights explicitly, so the result
# does not depend on which class happens to appear first in `y_train`
# (`unique()` + `enumerate` would swap the weights in that case).
label_values = np.unique(y_train)
weights = dict(zip(label_values.tolist(),
                   class_weight.compute_class_weight(class_weight='balanced',
                                                     classes=label_values,
                                                     y=y_train)))

# per-row sample weight: look up each row's class weight by its label
y_weights = y_train.map(weights)
y_weights

In [None]:
# bundle features, labels and the per-row class weights into a CatBoost Pool
pool_train = catboost.Pool(
    data=X_train,
    label=y_train,
    weight=y_weights,
)

model_catboost = catboost.CatBoostClassifier(iterations=2000)
model_catboost.fit(pool_train, verbose=False)

In [None]:
# explain the fitted CatBoost model on the hold-out set
exp_catboost = dx.Explainer(
    model_catboost,
    X_test,
    y_test,
    label="catboost",
    verbose=False,
)
exp_catboost.model_performance()

# 3. Comparison with dalex