In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
#reading the dataset 
insurance_df=pd.read_csv('Seguro-Trimmed.csv')
insurance_df

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54447,136609,0,1,1,6,0,0,1,0,0,...,7,5,4,4,0,1,1,0,0,0
54448,136611,0,0,2,2,0,0,0,0,0,...,4,0,2,11,0,1,1,0,0,0
54449,136614,0,2,1,5,1,4,0,0,1,...,3,1,4,11,1,1,0,0,1,1
54450,136615,0,1,1,3,1,0,0,1,0,...,9,3,3,4,0,1,0,1,0,0


In [4]:
y=insurance_df['target']
y
insurance_df.drop(columns='target',inplace=True)

In [5]:
insurance_df.replace(to_replace=-1,value=np.nan,inplace=True)

In [6]:
type(insurance_df)

pandas.core.frame.DataFrame

In [7]:
na_df=pd.DataFrame({'col_name':insurance_df.columns,
                   'na_count':insurance_df.isnull().sum(),
                   'na_percentage': insurance_df.isnull().sum()/insurance_df.shape[0]*100}) 
na_df.sort_values(by='na_percentage',ascending=False)
cols_to_drop=list(na_df[na_df['na_percentage']>50]['col_name'])
cols_to_drop.append('id') 


insurance_df.drop(columns=cols_to_drop,inplace=True)



                

In [8]:
#categorical columns
cat_cols=[col for col in insurance_df.columns if 'cat' in col] 
cat_cols

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [9]:
#binary columns
bin_cols=[col for col in insurance_df.columns if 'bin' in col]
bin_cols

['ps_ind_06_bin',
 'ps_ind_07_bin',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_16_bin',
 'ps_ind_17_bin',
 'ps_ind_18_bin',
 'ps_calc_15_bin',
 'ps_calc_16_bin',
 'ps_calc_17_bin',
 'ps_calc_18_bin',
 'ps_calc_19_bin',
 'ps_calc_20_bin']

In [10]:
#continous columns list
combined_cols=(cat_cols+bin_cols)
combined_cols
con_cols=[col for col in insurance_df.columns if col not in combined_cols]
con_cols

['ps_ind_01',
 'ps_ind_03',
 'ps_ind_14',
 'ps_ind_15',
 'ps_reg_01',
 'ps_reg_02',
 'ps_reg_03',
 'ps_car_11',
 'ps_car_12',
 'ps_car_13',
 'ps_car_14',
 'ps_car_15',
 'ps_calc_01',
 'ps_calc_02',
 'ps_calc_03',
 'ps_calc_04',
 'ps_calc_05',
 'ps_calc_06',
 'ps_calc_07',
 'ps_calc_08',
 'ps_calc_09',
 'ps_calc_10',
 'ps_calc_11',
 'ps_calc_12',
 'ps_calc_13',
 'ps_calc_14']

In [11]:
#train test split
X_train,X_test,y_train,y_test=train_test_split(insurance_df,y,test_size=0.2,random_state=43)

In [12]:
#filling missing values of continuous columns using mean
for col in con_cols:
    X_train[col]=X_train[col].fillna(X_train[col].mean())
    X_test[col]=X_test[col].fillna(X_train[col].mean())
X_test[col]
    


51552    13
6235     10
31913    10
17733     6
6278      4
         ..
27623     7
45804     7
36739    14
26308    10
48453     5
Name: ps_calc_14, Length: 10891, dtype: int64

In [13]:
for col in combined_cols:
    X_train[col]=X_train[col].fillna(X_train[col].mode()[0])
    X_test[col]=X_test[col].fillna(X_train[col].mode()[0])

In [14]:
sm=SMOTE(sampling_strategy=0.2, random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train,y_train)

In [15]:
y_train_smote.value_counts(normalize=True)

0    0.833333
1    0.166667
Name: target, dtype: float64

In [16]:
#finding missing values count
X_train_smote.isnull().sum()

ps_ind_01         0
ps_ind_02_cat     0
ps_ind_03         0
ps_ind_04_cat     0
ps_ind_05_cat     0
ps_ind_06_bin     0
ps_ind_07_bin     0
ps_ind_08_bin     0
ps_ind_09_bin     0
ps_ind_10_bin     0
ps_ind_11_bin     0
ps_ind_12_bin     0
ps_ind_13_bin     0
ps_ind_14         0
ps_ind_15         0
ps_ind_16_bin     0
ps_ind_17_bin     0
ps_ind_18_bin     0
ps_reg_01         0
ps_reg_02         0
ps_reg_03         0
ps_car_01_cat     0
ps_car_02_cat     0
ps_car_04_cat     0
ps_car_05_cat     0
ps_car_06_cat     0
ps_car_07_cat     0
ps_car_08_cat     0
ps_car_09_cat     0
ps_car_10_cat     0
ps_car_11_cat     0
ps_car_11         0
ps_car_12         0
ps_car_13         0
ps_car_14         0
ps_car_15         0
ps_calc_01        0
ps_calc_02        0
ps_calc_03        0
ps_calc_04        0
ps_calc_05        0
ps_calc_06        0
ps_calc_07        0
ps_calc_08        0
ps_calc_09        0
ps_calc_10        0
ps_calc_11        0
ps_calc_12        0
ps_calc_13        0
ps_calc_14        0


In [17]:
#one-hot encoding
one_hot_train=pd.get_dummies(X_train_smote[cat_cols])
one_hot_test=pd.get_dummies(X_test[cat_cols])
one_hot_train.shape
oe_train,oe_test=one_hot_train.align(one_hot_test,join='inner',axis=1,fill_value=0)  



In [18]:
#Min Max Scaling
scaler=MinMaxScaler()
for col in con_cols:
    X_train_smote[col]=scaler.fit_transform(np.array(X_train_smote[col]).reshape(-1,1))
    X_test[col]=scaler.transform(np.array(X_test[col]).reshape(-1,1))



In [19]:
train_final=pd.concat([X_train_smote[con_cols],oe_train],axis=1)
test_final=pd.concat([X_test[con_cols],oe_test],axis=1)

In [20]:
logreg=LogisticRegression()
logreg.fit(train_final,y_train_smote)

LogisticRegression()

In [26]:
pred=logreg.predict(test_final)

print('confusion_matrix', confusion_matrix(y_test,pred))
print('precision_score', precision_score(y_test,pred))
print('recall_score', recall_score(y_test,pred))
print('f1_score', f1_score(y_test,pred))

confusion_matrix [[10345   144]
 [  397     5]]
precision_score 0.03355704697986577
recall_score 0.012437810945273632
f1_score 0.018148820326678767


In [27]:
gnb = GaussianNB()
gnb.fit(train_final,y_train_smote)
gnb_pred = gnb.predict(test_final)

print('confusion_matrix', confusion_matrix(y_test,gnb_pred))
print('precision_score', precision_score(y_test,gnb_pred))
print('recall_score', recall_score(y_test,gnb_pred))
print('f1_score', f1_score(y_test,gnb_pred))

confusion_matrix [[8903 1586]
 [ 316   86]]
precision_score 0.05143540669856459
recall_score 0.21393034825870647
f1_score 0.08293153326904533


In [23]:
#Decision Tree

tree = DecisionTreeClassifier(max_depth=8,random_state=42)
tree.fit(train_final,y_train_smote)
test_pred = tree.predict(test_final)
print('train predictions', tree.predict(train_final))
print('test predictions', test_pred)

train predictions [0 0 0 ... 0 1 1]
test predictions [0 0 0 ... 0 0 0]


In [28]:
#decision tree metrics

print('confusion_matrix', confusion_matrix(y_test,test_pred))
print('precision_score', precision_score(y_test,test_pred))
print('recall_score', recall_score(y_test,test_pred))
print('f1_score', f1_score(y_test,test_pred))

confusion_matrix [[10465    24]
 [  401     1]]
precision_score 0.04
recall_score 0.0024875621890547263
f1_score 0.004683840749414519
