## --------------------Health Insurance Classification Problem----------------------------

In [2]:
# importing the required libraries and modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Loading the datasets
df_train = pd.read_csv('Health_Insurance_train.csv')
df_test = pd.read_csv('Health_Insurance_test.csv')
df_tst_op = pd.read_csv('sample_submission.csv')

In [3]:
df_train.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [4]:
df_test.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,381110,Male,25,1,11.0,1,< 1 Year,No,35786.0,152.0,53
1,381111,Male,40,1,28.0,0,1-2 Year,Yes,33762.0,7.0,111
2,381112,Male,47,1,28.0,0,1-2 Year,Yes,40050.0,124.0,199
3,381113,Male,24,1,27.0,1,< 1 Year,Yes,37356.0,152.0,187
4,381114,Male,27,1,28.0,1,< 1 Year,No,59097.0,152.0,297


In [4]:
df_train.shape

(381109, 12)

In [5]:
df_test.isna().sum()

id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
dtype: int64

### Steps of operations to be performed on the dataset
1. Look for null values in a dataset
2. Categorical Encoding ['Gender','Vehicle_Age','Vehicle_Damage']
3. Removing the unwantd columns after encoding ['Gender','Vehicle_Age','Vehicle_Damage','id']
4. Choose the model and train the dataset
5. Check the training accuracy_score
6. Predict the output values for testing data
7. Check the accuracy_score for testing data predicted values

#### 1. Look for null values in a dataset

In [6]:
# 1. Look for null values in a dataset
df_train.isnull().sum()

id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

#### 2. Categorical Encoding ['Gender','Vehicle_Age','Vehicle_Damage']
#### 3. Removing the unwantd columns after encoding ['Gender','Vehicle_Age','Vehicle_Damage','id']

In [7]:
# 2. Categorical Encoding ['Gender','Vehicle_Age','Vehicle_Damage']
# Gender: 1-Male; 0-Female
# Vehicle_Age: OneHotEncoding
# Vehicle_Damage: 1-Yes; 0-No

def cat_encode(df):
    df['Gender_en'] = np.where(df['Gender']=='Male',1,0)
    df['Vehicle_Damage_en'] = np.where(df['Vehicle_Damage']=='Yes',1,0)
    df['Vehicle_Age_Orig'] = df['Vehicle_Age']
    df = pd.get_dummies(df,columns=['Vehicle_Age'])
    df.drop(['Gender','Vehicle_Age_Orig','Vehicle_Damage','id'],axis=1,inplace=True)    
    return df



In [8]:
# applying the encoding
df1 = df_train.copy()
dt_train = cat_encode(df1)


In [9]:
dt_train.head()

Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Gender_en,Vehicle_Damage_en,Vehicle_Age_1-2 Year,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years
0,44,1,28.0,0,40454.0,26.0,217,1,1,1,0,0,1
1,76,1,3.0,0,33536.0,26.0,183,0,1,0,1,0,0
2,47,1,28.0,0,38294.0,26.0,27,1,1,1,0,0,1
3,21,1,11.0,1,28619.0,152.0,203,0,1,0,0,1,0
4,29,1,41.0,1,27496.0,152.0,39,0,0,0,0,1,0


In [30]:
dt_train['Response'].value_counts()

0    334399
1     46710
Name: Response, dtype: int64

#### Splitting input and o/p features and oversample to balance the dataset

In [13]:
# Splitting input and o/p features and oversample to balance the dataset
def sample_class(df):
    from imblearn.over_sampling import ADASYN
    
    X = df.drop('Response',axis=1)
    y = df['Response']
    
    asyn = ADASYN(sampling_strategy=0.7)
    X_new,y_new = asyn.fit_resample(X,y)
    
    return X_new,y_new

In [14]:
X,y = sample_class(dt_train)

In [15]:
y.value_counts()

0    334399
1    237516
Name: Response, dtype: int64

### Splitting the data into Train and Test

In [17]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=43)

### Creating a function for scaling data

In [19]:
def scale_features(dataset):
    from sklearn.preprocessing import StandardScaler
    ss = StandardScaler()
    return ss.fit_transform(dataset)

### Creating a function for LogisticRegression 

In [20]:
# Building the model for the training data
def logit(X_train,y_train):
    from sklearn.linear_model import LogisticRegression 
    lr_model = LogisticRegression()
    lr_model.fit(scale_features(X_train),y_train)
    return lr_model

### Creating a function for DecisionTreeClassifier

In [23]:
def decision(X_train,y_train):
    from sklearn.tree import DecisionTreeClassifier
    dt_model = DecisionTreeClassifier()
    dt_model.fit(X_train,y_train)
    return dt_model

### Checking the accuracy of the models with test data

In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

y_pred = logit(X_train,y_train).predict(scale_features(X_test))
y_pred1 = decision(X_train,y_train).predict(X_test)

print('Accuracy score of LogisticRegression: ',accuracy_score(y_test,y_pred))
print('Accuracy score of DecisionTreeClassification: ',accuracy_score(y_test,y_pred1))

print( "Logistic Regression:\n", confusion_matrix(y_test,y_pred) )
print( "DecisionTreeClassifier :\n", confusion_matrix(y_test,y_pred1) )

Accuracy score of LogisticRegression:  0.8041556170770799
Accuracy score of DecisionTreeClassification:  0.8664199329739181
Logistic Regression:
 [[79570 20819]
 [12783 58403]]
DecisionTreeClassifier :
 [[88184 12205]
 [10714 60472]]


In [26]:
print("logisticRegression: \n\n",classification_report(y_test,y_pred))
print("DecisionTreeClassifier: \n\n",classification_report(y_test,y_pred1))

logisticRegression: 

               precision    recall  f1-score   support

           0       0.86      0.79      0.83    100389
           1       0.74      0.82      0.78     71186

    accuracy                           0.80    171575
   macro avg       0.80      0.81      0.80    171575
weighted avg       0.81      0.80      0.81    171575

DecisionTreeClassifier: 

               precision    recall  f1-score   support

           0       0.89      0.88      0.88    100389
           1       0.83      0.85      0.84     71186

    accuracy                           0.87    171575
   macro avg       0.86      0.86      0.86    171575
weighted avg       0.87      0.87      0.87    171575

