In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings
#sns.set_theme()
#%config InlineBackend.figure_format = 'retina'

import scipy.stats as stats
import pylab
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score,roc_auc_score,precision_score,recall_score,classification_report,precision_recall_curve,confusion_matrix,roc_curve
from sklearn.ensemble import RandomForestClassifier
filterwarnings('ignore')

In [None]:
# Read the training CSV (path is relative to the working directory) into `df`.
train='train.csv'
df=pd.read_csv(train)
# Bare last expression: renders the frame as this cell's output.
df

In [None]:
# Print column names, dtypes and non-null counts for the loaded frame.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
df.describe(include='all')

In [None]:
# Mean of the Age column, then the distinct labels found in Vehicle_Age.
print(df['Age'].mean())
print(df['Vehicle_Age'].unique())

In [None]:
# Flag fully duplicated rows, then tabulate how many rows are / are not duplicates.
duplicate_flags = df.duplicated()
duplicate_flags.value_counts().to_frame('values')

In [None]:
# Remove the identifier and the two code columns from `df` in place.
# NOTE: in-place mutation makes this cell non-re-runnable (a second run raises KeyError).
df.drop(columns=['id', 'Region_Code', 'Policy_Sales_Channel'], inplace=True)

In [None]:
# Column names of interest, in the order they appear in the raw file.
# NOTE(review): 'Region_Code' and 'Policy_Sales_Channel' are listed here although
# the preceding cell drops them from `df`; this list is reassigned later in the
# notebook - confirm whether it is still needed.
feature = [
    'Gender',
    'Age',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Annual_Premium',
    'Policy_Sales_Channel',
    'Vintage',
    'Response',
]

In [None]:
# Skewness of each numeric column.
# numeric_only=True is required because the text columns (encoded only later in
# this notebook) make DataFrame.skew raise on pandas >= 2.0, where non-numeric
# columns are no longer silently skipped.
df.skew(axis=0, numeric_only=True).to_frame('Skewed Values')

In [None]:
# Class balance of the target.
# Use keyword arguments: since seaborn 0.12 the first positional argument is
# `data`, so passing the Series positionally no longer plots it on the x axis.
sns.countplot(x='Response', data=df, palette="viridis")

In [None]:
# Share of each target class as a percentage of all rows.
response_counts = df['Response'].value_counts()
row_total = len(df)
print('Negative Responses %', (response_counts[0] / row_total) * 100)
print('Positive Responses %', (response_counts[1] / row_total) * 100)

In [None]:
# Two side-by-side panels: overall gender counts, and gender counts split by Response.
# Columns are passed by name with data=df because seaborn >= 0.12 treats the
# first positional argument as `data`, not as the x variable.
plt.figure(figsize = (13,5))
plt.subplot(1,2,1)
sns.countplot(x='Gender', data=df, palette='husl')
plt.title("count of male and female")
plt.subplot(1,2,2)
sns.countplot(x='Gender', hue='Response', data=df, palette="husl")
plt.title("Response in Male and female category")
plt.show()

In [None]:
# Distribution of Age as a horizontal box plot.
# x= is given explicitly: a positional Series is interpreted as `data` by
# seaborn >= 0.12, which changes how the plot is drawn.
sns.boxplot(x='Age', data=df)

In [None]:
# Driving_License counts split by Response, using the same keyword form as the
# other count plots in this notebook (positional Series breaks on seaborn >= 0.12).
sns.countplot(x='Driving_License', hue='Response', data=df)

In [None]:
# Previously_Insured counts, coloured by Response.
sns.countplot(
    data=df,
    x='Previously_Insured',
    hue='Response',
    palette='husl',
)

In [None]:
# Vehicle_Age counts, coloured by Response.
sns.countplot(
    data=df,
    x='Vehicle_Age',
    hue='Response',
    palette='husl',
)

In [None]:
# Box plot of Annual_Premium to expose outliers.
# NOTE(review): subplot(2,1,2) draws in the lower half of a 2-row grid and leaves
# the upper half empty - presumably a second plot used to sit above; confirm.
plt.figure(figsize=(13,7))
plt.subplot(2,1,2)
# Keyword form: a positional Series is taken as `data` by seaborn >= 0.12.
sns.boxplot(x='Annual_Premium', data=df)
plt.title("boxplot of Annual premium")
plt.show()

In [None]:
# Pearson correlation between numeric columns, shown as an annotated heatmap.
# Text columns are excluded explicitly: DataFrame.corr() raises on them in
# pandas >= 2.0, and select_dtypes works on every pandas version.
corr = df.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(10,10))
sns.heatmap(corr, ax=ax, annot=True,linewidths=3,cmap='YlGn')
plt.title("Pearson correlation of Features", y=1.05, size=15)

In [None]:
# Bar chart of Response against Gender.
sns.catplot(
    data=df,
    kind='bar',
    x='Response',
    y='Gender',
    palette="viridis",
)
plt.title('Gender vs Response', fontsize=15)

In [None]:
# Split Age into four equal-width groups between its minimum and maximum.
# The edge array is no longer called `bin`, which shadowed the Python builtin.
age_bin_edges = np.linspace(df['Age'].min(), df['Age'].max(), 5)
groups=['Young_Age','Medium_Age','Old_Age','Senior_Citizen']
# include_lowest=True closes the first interval on the left; without it, rows
# whose Age equals the minimum fall outside every bin and become NaN.
df['Age-binned'] = pd.cut(df['Age'], bins=age_bin_edges, labels=groups, include_lowest=True)

In [None]:
# Number of customers in each age group, split by Response.
plt.figure(figsize=(10,8))
# Keyword form: positional Series arguments are misread as `data` on seaborn >= 0.12.
sns.countplot(x='Age-binned', hue='Response', data=df, palette='viridis')
plt.title('Response Vs Age Group')
# The y axis of a count plot is a row count; Response is shown through the hue.
plt.ylabel('Count')

In [None]:
# Age by Gender, coloured by Previously_Insured, one panel per Response value.
sns.catplot(
    data=df,
    kind='bar',
    x='Gender',
    y='Age',
    hue='Previously_Insured',
    col='Response',
    palette='rocket',
)

In [None]:
# Age by Gender, coloured by Vehicle_Damage, one panel per Response value.
sns.catplot(
    data=df,
    kind='bar',
    x='Gender',
    y='Age',
    hue='Vehicle_Damage',
    col='Response',
    palette='rocket',
)

In [None]:
# 4x2 grid of box plots: every listed column against Response.
fig, axes = plt.subplots(4, 2, figsize=(22, 20))

boxplot_targets = [
    'Gender', 'Age',
    'Driving_License', 'Previously_Insured',
    'Vehicle_Age', 'Vehicle_Damage',
    'Annual_Premium', 'Vintage',
]
# axes.flat walks the grid row by row, matching the order of the list above.
for panel, target in zip(axes.flat, boxplot_targets):
    sns.boxplot(ax=panel, data=df, x='Response', y=target)

In [None]:
def outlier_treatment(datacolumn, factor=1.5):
    """Return the IQR-based outlier bounds of a numeric column.

    Parameters
    ----------
    datacolumn : array-like of numbers
        Values to analyse (list, NumPy array, pandas Series, ...).
    factor : float, default 1.5
        Multiple of the inter-quartile range added beyond each quartile.

    Returns
    -------
    tuple
        ``(lower_range, upper_range)`` where
        ``lower_range = Q1 - factor * IQR`` and ``upper_range = Q3 + factor * IQR``.
    """
    # np.percentile does its own ordering, so no explicit sort is needed
    # (the previous sorted(...) call discarded its result anyway).
    Q1, Q3 = np.percentile(datacolumn, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (factor * IQR)
    upper_range = Q3 + (factor * IQR)
    return lower_range, upper_range

In [None]:
# IQR-based lower/upper limits for Annual_Premium.
lowerbound, upperbound = outlier_treatment(df['Annual_Premium'])

In [None]:
# Remove, in place, every row whose Annual_Premium lies outside [lowerbound, upperbound].
premium_outlier_mask = (df['Annual_Premium'] > upperbound) | (df['Annual_Premium'] < lowerbound)
df.drop(index=df[premium_outlier_mask].index, inplace=True)

In [None]:
# Annual_Premium distribution per Response value, after the outlier rows were dropped.
sns.boxplot(
    data=df,
    x='Response',
    y='Annual_Premium',
    palette='Spectral',
)

In [None]:
# One-hot encode Gender and, because of drop_first=True, keep only the indicator
# for the remaining category; the result overwrites the original column.
df['Gender'] = pd.get_dummies(df['Gender'],drop_first=True)

In [None]:
# Integer-encode the two remaining text columns.
# Gender is not re-encoded here: the preceding cell already replaced it with its
# dummy indicator, and applying get_dummies a second time was a redundant copy.
# NOTE(review): LabelEncoder numbers classes in sorted label order, which may not
# match the natural ordering of Vehicle_Age - confirm this is acceptable.
label = LabelEncoder()
for text_column in ('Vehicle_Age', 'Vehicle_Damage'):
    df[text_column] = label.fit_transform(df[text_column])

In [None]:
# Reduced column list.
# A comma was missing after 'Vintage', so Python's implicit string concatenation
# produced the single bogus entry 'VintageResponse'.
feature = [ 'Gender', 'Driving_License',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage',
       'Response']

In [None]:
# Target vector and feature matrix.
y = df['Response']
# Select features by name rather than by position: 'Age-binned' was appended
# after 'Response', so `df.iloc[:, :-1]` removed 'Age-binned' and left the
# target itself inside the features (label leakage).
# errors='ignore' keeps the cell working if 'Age-binned' was never created.
x = df.drop(columns=['Response', 'Age-binned'], errors='ignore')

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
from collections import Counter

# Balance the classes by randomly duplicating minority-class rows.
# A fixed random_state makes the resampled set identical on every run.
# NOTE(review): resampling happens before the train/test split below, so copies
# of the same row can land in both sets - confirm this is intended.
randomsample = RandomOverSampler(random_state=0)
x_new,y_new=randomsample.fit_resample(x,y)

print('Original dataset shape {}'.format(Counter(y)))
print('Resampled dataset shape {}'.format(Counter(y_new)))
# Keyword form: a positional Series is taken as `data` by seaborn >= 0.12.
sns.countplot(x=y_new, palette='husl')

In [None]:
# Hold out 30% of the oversampled rows for testing; fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x_new,
    y_new,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Standardize features using statistics learned from the training split only.
# NOTE: x_train / x_test are rebound to the scaled arrays, so re-running this
# cell scales already-scaled data.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Baseline logistic regression on the randomly oversampled data.
lr=LogisticRegression(random_state = 666,n_jobs = -1)
clf_l = lr.fit(x_train,y_train)
y_pred_log = clf_l.predict(x_test)
# ROC AUC must be computed from scores, not hard 0/1 predictions; column 1 of
# predict_proba is the probability of the second entry of clf_l.classes_.
y_proba_log = clf_l.predict_proba(x_test)[:, 1]
print(accuracy_score(y_test,y_pred_log)*100)
print(roc_auc_score(y_test,y_proba_log)*100)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt

# Fit a decision tree with default settings on the scaled training split;
# fit() returns the estimator, which is then shown as the cell output.
df_model1 = DecisionTreeClassifier().fit(x_train, y_train)
df_model1

In [None]:
# Balance the classes with SMOTE (synthetic minority-class samples).
# n_jobs is not passed: the parameter was deprecated and later removed from SMOTE
# in imbalanced-learn, so supplying it raises TypeError on current releases.
smote = SMOTE(random_state=1)
# Features are selected by name: `df.iloc[:, :-1]` kept 'Response' among the
# inputs because 'Age-binned' (not 'Response') is the last column of `df`.
x_sample,y_sample = smote.fit_resample(
    df.drop(columns=['Response', 'Age-binned'], errors='ignore'),
    df['Response'],
)

In [None]:
# Wrap the SMOTE output in DataFrames with explicit column names.
smote_feature_columns = [
    'Gender', 'Age', 'Driving_License', 'Previously_Insured',
    'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium', 'Vintage',
]
x_frame = pd.DataFrame(x_sample, columns=smote_feature_columns)
y_frame = pd.DataFrame(y_sample, columns=['Response'])

In [None]:
# Remove Driving_License from the SMOTE features, then take an independent copy.
# The previous form bound x1 to the None returned by an inplace drop before
# overwriting it, and raised KeyError when the cell was run a second time.
x_frame = x_frame.drop(columns=['Driving_License'], errors='ignore')
x1 = x_frame.copy()

In [None]:
# `y1` was never assigned in the notebook, so this cell raised NameError on a
# fresh kernel. Use the SMOTE target as a 1-D Series.
y1 = y_frame['Response']
# 50/50 split of the SMOTE-balanced data with a fixed seed.
x_train1,x_test1,y_train1,y_test1=train_test_split(x1,y1,test_size=0.5,random_state=42)

In [None]:
# Standardize the SMOTE features with statistics from the training half only.
# This rebinds `scaler`, replacing the one fitted on the oversampled data.
scaler = StandardScaler()
x_train_s = scaler.fit_transform(x_train1)
x_test_s = scaler.transform(x_test1)

In [None]:
# Logistic regression on the SMOTE-balanced, scaled data.
lr=LogisticRegression(random_state = 666,n_jobs = -1)
clf_l = lr.fit(x_train_s,y_train1)
y_pred_log = clf_l.predict(x_test_s)
# ROC AUC needs ranking scores; hard 0/1 predictions collapse the curve to a
# single operating point. Column 1 is the probability of clf_l.classes_[1].
y_proba_log = clf_l.predict_proba(x_test_s)[:, 1]
print(accuracy_score(y_test1,y_pred_log)*100)
print(roc_auc_score(y_test1,y_proba_log)*100)

# Training-set accuracy, displayed as the cell output.
lr.score(x_train_s,y_train1)

In [None]:
# Estimator and search space for tuning logistic regression.
# The default 'lbfgs' solver supports only the l2 penalty, so every 'l1' and
# 'elasticnet' candidate failed to fit and was scored NaN. 'saga' supports all
# three penalties; max_iter is raised because saga may need more iterations.
logreg_tuning = LogisticRegression(solver='saga', max_iter=1000, random_state=666)
logreg_c_values = [1, 0.5, 0.1, 5, 9]
param_logreg = [
    {'C': logreg_c_values, 'penalty': ['l2', 'l1']},
    # elasticnet additionally requires l1_ratio; 0.5 weights l1 and l2 equally.
    {'C': logreg_c_values, 'penalty': ['elasticnet'], 'l1_ratio': [0.5]},
]

In [None]:
from sklearn.model_selection import GridSearchCV

# 3-fold grid search over the logistic-regression grid, selecting on recall.
model_logreg_tuned = GridSearchCV(
    estimator=logreg_tuning,
    param_grid=param_logreg,
    scoring='recall',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model_logreg_tuned.fit(x_train_s, y_train1)

# Best refitted estimator, its test-set predictions, and (as cell output) its
# accuracy on the training data.
logreg_tuned = model_logreg_tuned.best_estimator_
y_tuned_logreg = logreg_tuned.predict(x_test_s)
logreg_tuned.score(x_train_s, y_train1)

In [None]:
# Estimator and search space for tuning the decision tree.
# random_state fixes the feature sub-sampling used when max_features < all features.
DT_tuning = DecisionTreeClassifier(random_state=42)
param_DT =  {
    "max_depth": [None, 4,9,15,20,50],
    # 0.1 is a fraction of the samples; integers are absolute counts.
    "min_samples_leaf": [ 1,4,0.1,2,10],
    # Float values are fractions and must lie in (0, 1]; the former 2.0 was
    # invalid and made every candidate using it fail. The integer 2 means
    # "consider 2 features per split".
    "max_features" : [None, 0.2, 0.8, 2],
    "min_samples_split": [2,9,15,25]}

In [None]:
# 5-fold grid search over the decision-tree grid, selecting on recall.
model_DT_tuned = GridSearchCV(
    estimator=DT_tuning,
    param_grid=param_DT,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
model_DT_tuned.fit(x_train_s, y_train1)

# Best refitted tree, its test-set predictions, and (as cell output) its
# accuracy on the training data.
DT_tuned = model_DT_tuned.best_estimator_
y_tuned_DT = DT_tuned.predict(x_test_s)
DT_tuned.score(x_train_s, y_train1)