In [47]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import seaborn as sns
import matplotlib.pyplot as plt

import warnings 
warnings.filterwarnings('ignore')

In [3]:
df=pd.read_csv('train_Insurance.csv')
df.head(5)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


In [6]:
df.drop(['id'],axis=1,inplace=True)
df

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0
...,...,...,...,...,...,...,...,...,...,...,...
381104,Male,74,1,26.0,1,1-2 Year,No,30170.0,26.0,88,0
381105,Male,30,1,37.0,1,< 1 Year,No,40016.0,152.0,131,0
381106,Male,21,1,30.0,1,< 1 Year,No,35118.0,160.0,161,0
381107,Female,68,1,14.0,0,> 2 Years,Yes,44617.0,124.0,74,0


In [8]:
df['Gender'].nunique()

2

In [9]:
df['Gender'].replace({'Male':0, 'Female':1},inplace= True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Gender                381109 non-null  int64  
 1   Age                   381109 non-null  int64  
 2   Driving_License       381109 non-null  int64  
 3   Region_Code           381109 non-null  float64
 4   Previously_Insured    381109 non-null  int64  
 5   Vehicle_Age           381109 non-null  object 
 6   Vehicle_Damage        381109 non-null  object 
 7   Annual_Premium        381109 non-null  float64
 8   Policy_Sales_Channel  381109 non-null  float64
 9   Vintage               381109 non-null  int64  
 10  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(2)
memory usage: 32.0+ MB


In [11]:
df['Vehicle_Age'].nunique()

3

In [13]:
df['Vehicle_Age'].value_counts()

1-2 Year     200316
< 1 Year     164786
> 2 Years     16007
Name: Vehicle_Age, dtype: int64

In [14]:
Vehicle_Age_df=pd.get_dummies(df['Vehicle_Age'],drop_first=True)

In [15]:
Vehicle_Age_df

Unnamed: 0,< 1 Year,> 2 Years
0,0,1
1,0,0
2,0,1
3,1,0
4,1,0
...,...,...
381104,0,0
381105,1,0
381106,1,0
381107,0,1


In [16]:
df=pd.concat([df,Vehicle_Age_df],axis=1)
df

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,< 1 Year,> 2 Years
0,0,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1,0,1
1,0,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0,0,0
2,0,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1,0,1
3,0,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0,1,0
4,1,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
381104,0,74,1,26.0,1,1-2 Year,No,30170.0,26.0,88,0,0,0
381105,0,30,1,37.0,1,< 1 Year,No,40016.0,152.0,131,0,1,0
381106,0,21,1,30.0,1,< 1 Year,No,35118.0,160.0,161,0,1,0
381107,1,68,1,14.0,0,> 2 Years,Yes,44617.0,124.0,74,0,0,1


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Gender                381109 non-null  int64  
 1   Age                   381109 non-null  int64  
 2   Driving_License       381109 non-null  int64  
 3   Region_Code           381109 non-null  float64
 4   Previously_Insured    381109 non-null  int64  
 5   Vehicle_Age           381109 non-null  object 
 6   Vehicle_Damage        381109 non-null  object 
 7   Annual_Premium        381109 non-null  float64
 8   Policy_Sales_Channel  381109 non-null  float64
 9   Vintage               381109 non-null  int64  
 10  Response              381109 non-null  int64  
 11  < 1 Year              381109 non-null  uint8  
 12  > 2 Years             381109 non-null  uint8  
dtypes: float64(3), int64(6), object(2), uint8(2)
memory usage: 32.7+ MB


In [22]:
df.head(5)

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,< 1 Year,> 2 Years
0,0,44,1,28.0,0,Yes,40454.0,26.0,217,1,0,1
1,0,76,1,3.0,0,No,33536.0,26.0,183,0,0,0
2,0,47,1,28.0,0,Yes,38294.0,26.0,27,1,0,1
3,0,21,1,11.0,1,No,28619.0,152.0,203,0,1,0
4,1,29,1,41.0,1,No,27496.0,152.0,39,0,1,0


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Gender                381109 non-null  int64  
 1   Age                   381109 non-null  int64  
 2   Driving_License       381109 non-null  int64  
 3   Region_Code           381109 non-null  float64
 4   Previously_Insured    381109 non-null  int64  
 5   Vehicle_Damage        381109 non-null  object 
 6   Annual_Premium        381109 non-null  float64
 7   Policy_Sales_Channel  381109 non-null  float64
 8   Vintage               381109 non-null  int64  
 9   Response              381109 non-null  int64  
 10  < 1 Year              381109 non-null  uint8  
 11  > 2 Years             381109 non-null  uint8  
dtypes: float64(3), int64(6), object(1), uint8(2)
memory usage: 29.8+ MB


In [24]:
df['Vehicle_Damage'].nunique()

2

In [25]:
df['Vehicle_Damage'].value_counts()

Yes    192413
No     188696
Name: Vehicle_Damage, dtype: int64

In [29]:
df['Vehicle_Damage'].replace({'Yes':0,'No':1},inplace=True)
df

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,< 1 Year,> 2 Years
0,0,44,1,28.0,0,0,40454.0,26.0,217,1,0,1
1,0,76,1,3.0,0,1,33536.0,26.0,183,0,0,0
2,0,47,1,28.0,0,0,38294.0,26.0,27,1,0,1
3,0,21,1,11.0,1,1,28619.0,152.0,203,0,1,0
4,1,29,1,41.0,1,1,27496.0,152.0,39,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
381104,0,74,1,26.0,1,1,30170.0,26.0,88,0,0,0
381105,0,30,1,37.0,1,1,40016.0,152.0,131,0,1,0
381106,0,21,1,30.0,1,1,35118.0,160.0,161,0,1,0
381107,1,68,1,14.0,0,0,44617.0,124.0,74,0,0,1


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Gender                381109 non-null  int64  
 1   Age                   381109 non-null  int64  
 2   Driving_License       381109 non-null  int64  
 3   Region_Code           381109 non-null  float64
 4   Previously_Insured    381109 non-null  int64  
 5   Vehicle_Damage        381109 non-null  int64  
 6   Annual_Premium        381109 non-null  float64
 7   Policy_Sales_Channel  381109 non-null  float64
 8   Vintage               381109 non-null  int64  
 9   Response              381109 non-null  int64  
 10  < 1 Year              381109 non-null  uint8  
 11  > 2 Years             381109 non-null  uint8  
dtypes: float64(3), int64(7), uint8(2)
memory usage: 29.8 MB


In [32]:
df.isna().sum()

Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
< 1 Year                0
> 2 Years               0
dtype: int64

In [33]:
x=df.drop(['Response'],axis=1)
y=df['Response']

In [35]:
x

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,< 1 Year,> 2 Years
0,0,44,1,28.0,0,0,40454.0,26.0,217,0,1
1,0,76,1,3.0,0,1,33536.0,26.0,183,0,0
2,0,47,1,28.0,0,0,38294.0,26.0,27,0,1
3,0,21,1,11.0,1,1,28619.0,152.0,203,1,0
4,1,29,1,41.0,1,1,27496.0,152.0,39,1,0
...,...,...,...,...,...,...,...,...,...,...,...
381104,0,74,1,26.0,1,1,30170.0,26.0,88,0,0
381105,0,30,1,37.0,1,1,40016.0,152.0,131,1,0
381106,0,21,1,30.0,1,1,35118.0,160.0,161,1,0
381107,1,68,1,14.0,0,0,44617.0,124.0,74,0,1


In [36]:
y

0         1
1         0
2         1
3         0
4         0
         ..
381104    0
381105    0
381106    0
381107    0
381108    0
Name: Response, Length: 381109, dtype: int64

In [37]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=21,stratify=y)

In [39]:
x_train.size

2934536

In [40]:
x_test.size

1257663

In [41]:
log_model=LogisticRegression()
log_model.fit(x_train,y_train)

LogisticRegression()

In [42]:
y_pred=log_model.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [43]:
y_test

242143    0
326197    1
228765    1
304086    0
290290    0
         ..
305939    0
204066    0
369875    0
370952    0
189829    0
Name: Response, Length: 114333, dtype: int64

## Accuracy

In [45]:
accuracy_score(y_pred,y_test)

0.8774107213140563

In [50]:
y_pred_train=log_model.predict(x_train)


In [51]:
y_train

233978    0
335743    0
369964    1
36273     1
112852    0
         ..
280049    0
178764    0
232582    0
119310    1
32634     0
Name: Response, Length: 266776, dtype: int64

In [52]:
accuracy_score(y_pred_train,y_train)

0.8774027648664048

## Confusion matrix

In [55]:
confusion_matrix(y_test,y_pred)

array([[100317,      3],
       [ 14013,      0]], dtype=int64)

## Classification report

In [59]:
clf_report=classification_report(y_test,y_pred)
print(clf_report)

              precision    recall  f1-score   support

           0       0.88      1.00      0.93    100320
           1       0.00      0.00      0.00     14013

    accuracy                           0.88    114333
   macro avg       0.44      0.50      0.47    114333
weighted avg       0.77      0.88      0.82    114333

