In [82]:
#importing the libraries
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd

In [83]:
#loading the data
train_data = pd.read_csv('../input/health-insurance-lead-prediction/train_Df64byy.csv').drop('ID', axis = 1)
test_data = pd.read_csv('../input/health-insurance-lead-prediction/test_YCcRUnU.csv').drop('ID', axis = 1)

In [84]:
train_data.head()

Unnamed: 0,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response
0,C3,3213,Rented,Individual,36,36,No,X1,14+,3.0,22,11628.0,0
1,C5,1117,Owned,Joint,75,22,No,X2,,,22,30510.0,0
2,C5,3732,Owned,Individual,32,32,No,,1.0,1.0,19,7450.0,1
3,C24,4378,Owned,Joint,52,48,No,X1,14+,3.0,19,17780.0,0
4,C8,2190,Rented,Individual,44,44,No,X2,3.0,1.0,16,10404.0,0


# Exploratory Data Analysis

In [85]:
# exploring the target column
print(train_data.Response.describe())
print('-'*30)
print(train_data.Response.value_counts())

count    50882.000000
mean         0.239947
std          0.427055
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: Response, dtype: float64
------------------------------
0    38673
1    12209
Name: Response, dtype: int64


In [86]:
# finding out the columns with null values
train_data.isna().sum()

City_Code                      0
Region_Code                    0
Accomodation_Type              0
Reco_Insurance_Type            0
Upper_Age                      0
Lower_Age                      0
Is_Spouse                      0
Health Indicator           11691
Holding_Policy_Duration    20251
Holding_Policy_Type        20251
Reco_Policy_Cat                0
Reco_Policy_Premium            0
Response                       0
dtype: int64

In [87]:
# data type of variables
train_data.dtypes

City_Code                   object
Region_Code                  int64
Accomodation_Type           object
Reco_Insurance_Type         object
Upper_Age                    int64
Lower_Age                    int64
Is_Spouse                   object
Health Indicator            object
Holding_Policy_Duration     object
Holding_Policy_Type        float64
Reco_Policy_Cat              int64
Reco_Policy_Premium        float64
Response                     int64
dtype: object

In [88]:
# all the columns
train_data.columns

Index(['City_Code', 'Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
       'Upper_Age', 'Lower_Age', 'Is_Spouse', 'Health Indicator',
       'Holding_Policy_Duration', 'Holding_Policy_Type', 'Reco_Policy_Cat',
       'Reco_Policy_Premium', 'Response'],
      dtype='object')

In [89]:
# making a list of columns which have categories in their values
cols = ['City_Code','Region_Code','Accomodation_Type', 'Reco_Insurance_Type','Holding_Policy_Type',
        'Reco_Policy_Cat','Health Indicator']

In [103]:
# let us explore the Region_Code variable
region_count = train_data['Region_Code'].value_counts()
print(len(region_count[region_count<10]))
print(len(region_count[region_count>10]))

3165
1928


We see that  there are many categories which have less that ten value counts, so we decide to replace them all with 'OTHERS'

In [104]:
replace_these = list(region_count[region_count<10].index)
train_data['Region_Code'] = train_data['Region_Code'].replace(replace_these,'OTHERS')

we do same with the test_data 

In [105]:
region_count = test_data['Region_Code'].value_counts()
replace_these = list(region_count[region_count<10].index)
test_data['Region_Code'] = test_data['Region_Code'].replace(replace_these,'OTHERS')

* Now we fill the NaN values
* And create a dummies columns for all categorical columns, I am not using here the get_dummies finction because it is not guaranteed to have same categories in training and testing in realistic dataset, that's why I create a set containing the common categories between test and train dataset; and then create the encoding for only them 

In [106]:
for col in cols:
    train_data[col] = train_data[col].fillna('UA')
    test_data[col] = test_data[col].fillna('UA')
    uniques = list(set(list(train_data[col].unique())).intersection(set(list(test_data[col].unique()))))
    for unique in uniques:
        train_data['Is_'+col+'_equal_'+str(unique)] = train_data[col].apply(lambda x: (x==unique)*1.0)
        test_data['Is_'+col+'_equal_'+str(unique)] = test_data[col].apply(lambda x: (x==unique)*1.0) 
    train_data = train_data.drop(col,axis = 1)
    test_data = test_data.drop(col,axis = 1)

Now let us see the columns list

In [107]:
print(train_data.columns)

Index(['Upper_Age', 'Lower_Age', 'Is_Spouse', 'Holding_Policy_Duration',
       'Reco_Policy_Premium', 'Response', 'Is_City_Code_equal_C34',
       'Is_City_Code_equal_C29', 'Is_City_Code_equal_C36',
       'Is_City_Code_equal_C12',
       ...
       'Is_Health Indicator_equal_X2', 'Is_Health Indicator_equal_X7',
       'Is_Health Indicator_equal_X3', 'Is_Health Indicator_equal_X9',
       'Is_Health Indicator_equal_X6', 'Is_Health Indicator_equal_X4',
       'Is_Health Indicator_equal_X1', 'Is_Health Indicator_equal_UA',
       'Is_Health Indicator_equal_X8', 'Is_Health Indicator_equal_X5'],
      dtype='object', length=527)


So we have 527 columns, Now let us do some feature engineering

## Feature Engineering

In [109]:
def yes_no(x):
    if x== 'Yes': return 1
    return 0

def digited(x):
    dicts = {'1.0':1, '14.0':14,'3.0':3,'7.0':7, 
             '0.0':0, '2.0':2, '11.0':11,'6.0':6, 
             '4.0':4,'8.0':8, '9.0':9, '10.0':10, 
             '5.0':5, '12.0':12, '13.0':13}
    return dicts[x]

def divider(x,y):
    if y==0: return 0
    return x/y

def extreme_z(x):
    if x< -1.5: 
        return 1
    elif x>4:
        return 1
    else:
        return 0

In [111]:
# function for creating new features
def feature_addition(data):
    data['Holding_Policy_Duration'] = data['Holding_Policy_Duration'].fillna('0.0')
    data['Holding_Policy_Duration_more_than_14'] = data['Holding_Policy_Duration'].apply(lambda x: x=='14+')
    data['Holding_Policy_Duration'] = data['Holding_Policy_Duration'].replace('14+','14.0')
    data['Holding_Policy_Duration'] = data['Holding_Policy_Duration'].apply(lambda x: digited(x))
    data['Is_Spouse'] = data['Is_Spouse'].apply(lambda x: yes_no(x))
    data['Age_difference'] = data['Upper_Age'] - data['Lower_Age']
    data['Is_Age_difference_0'] = data['Age_difference'].apply(lambda x: x==0)
    data['policy_duration_difference_ratio'] = data.apply(lambda x: divider(x['Age_difference'],x['Holding_Policy_Duration']),axis = 1)
    data['Reco_Policy_Premium_z'] = data['Reco_Policy_Premium'].apply(lambda x: (x/6500 - 2))
    data['Reco_Policy_extreme'] = data['Reco_Policy_Premium_z'].apply(extreme_z)
    return data

In [112]:
# making new features on train as well as test data
train_data = feature_addition(train_data)
test_data = feature_addition(test_data)

In [59]:
# defining independent and target varible 
X = train_data.drop('Response',axis = 1)
Y = train_data['Response']

###  Train Test Split

In [60]:

from sklearn.model_selection import train_test_split as tts
X_train,X_test,Y_train,Y_test = tts(X,Y,test_size = 0.2,random_state = 42,
                                    stratify = Y)

Now let us make the model with limited parameter and run it on the train data
I am using xgboost fpr the final model because it gave maximum score in comparison to  others
Also I am using scale_pos_weight = 4 in the parameter because the data is imbalanced and the ration of both the classes count is almost 4

In [113]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

classifier = XGBClassifier(
                            eval_metric = 'error',
                            scale_pos_weight = 4)

classifier.fit(X_train,Y_train)
Y_pred_train = classifier.predict(X_train)
Y_pred_test = classifier.predict(X_test)
print(classification_report(Y_train,Y_pred_train))
print(classification_report(Y_test,Y_pred_test))
print(roc_auc_score(Y_train,Y_pred_train))
print(roc_auc_score(Y_test,Y_pred_test))

              precision    recall  f1-score   support

           0       0.94      0.45      0.61     30938
           1       0.34      0.91      0.50      9767

    accuracy                           0.56     40705
   macro avg       0.64      0.68      0.55     40705
weighted avg       0.80      0.56      0.58     40705

              precision    recall  f1-score   support

           0       0.87      0.41      0.56      7735
           1       0.30      0.81      0.44      2442

    accuracy                           0.51     10177
   macro avg       0.59      0.61      0.50     10177
weighted avg       0.74      0.51      0.53     10177

0.6808732698720977
0.6102101925631337


As we have see the roc_auc score on the train data, let us make prediction for test data and create a submission file

In [114]:
submission_pred=classifier.predict(test_data)
df_sub=pd.DataFrame(submission_pred,columns=['Response'])
df_sub.insert(loc=0, column='ID', value=df_sub.index+50883)
df_sub.to_csv('xgboost.csv', index=False)