<a href="https://colab.research.google.com/github/ckkhandare/DS_firstProject/blob/main/DSProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries and loading Data

In [82]:
! pip install catboost

Collecting catboost
  Downloading catboost-0.26.1-cp37-none-manylinux1_x86_64.whl (67.4 MB)
[K     |████████████████████████████████| 67.4 MB 27 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26.1


In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder,RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV,cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [6]:
df=pd.read_csv('train.csv',na_values='unknown')

# Exploratory Data Analysis

In [7]:
df

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,subscribed
0,26110,56,admin.,married,,no,1933,no,no,telephone,19,nov,44,2,-1,0,,no
1,40576,31,,married,secondary,no,3,no,no,cellular,20,jul,91,2,-1,0,,no
2,15320,27,services,married,secondary,no,891,yes,no,cellular,18,jul,240,1,-1,0,,no
3,43962,57,management,divorced,tertiary,no,3287,no,no,cellular,22,jun,867,1,84,3,success,yes
4,29842,31,technician,married,secondary,no,119,yes,no,cellular,4,feb,380,1,-1,0,,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31642,36483,29,management,single,tertiary,no,0,yes,no,cellular,12,may,116,2,-1,0,,no
31643,40178,53,management,divorced,tertiary,no,380,no,yes,cellular,5,jun,438,2,-1,0,,yes
31644,19710,32,management,single,tertiary,no,312,no,no,cellular,7,aug,37,3,-1,0,,no
31645,38556,57,technician,married,secondary,no,225,yes,no,telephone,15,may,22,7,337,12,failure,no


In [8]:
df.shape

(31647, 18)

In [9]:
df.isnull().sum()*100/31647

ID             0.000000
age            0.000000
job            0.650931
marital        0.000000
education      4.152052
default        0.000000
balance        0.000000
housing        0.000000
loan           0.000000
contact       28.998009
day            0.000000
month          0.000000
duration       0.000000
campaign       0.000000
pdays          0.000000
previous       0.000000
poutcome      81.931937
subscribed     0.000000
dtype: float64

In [10]:
df.select_dtypes(include='O').columns

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome', 'subscribed'],
      dtype='object')

In [11]:
df.select_dtypes(exclude='O').columns

Index(['ID', 'age', 'balance', 'day', 'duration', 'campaign', 'pdays',
       'previous'],
      dtype='object')

#### Observation

poutcome has more than 80% data missing so it can be droped

education is the only ordinal feature rest all as Nominal ['job','marital','default', 'housing','loan', 'contact','month', 'poutcome','subscribed']

job , education and contact have missing values that need to be imputed

# Preprocessing

In [12]:
df.drop(columns=['poutcome'],inplace=True)

In [13]:
df.drop(columns=['ID'],inplace=True)

In [14]:
df.shape

(31647, 16)

In [15]:
X=df.drop(columns=['subscribed'])
y=df['subscribed']

In [16]:
y.value_counts(normalize=True)

no     0.882611
yes    0.117389
Name: subscribed, dtype: float64

In [20]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30, random_state=42,stratify=y)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((22152, 15), (9495, 15), (22152,), (9495,))

In [47]:
le=LabelEncoder()

In [52]:
y_train1=le.fit_transform(y_train)
y_test1=le.transform(y_test)

In [57]:
pd.DataFrame(y_train1).value_counts(normalize=True)

0    0.882629
1    0.117371
dtype: float64

In [58]:
pd.DataFrame(y_test1).value_counts(normalize=True)

0    0.88257
1    0.11743
dtype: float64

In [59]:
education=['primary','secondary','tertiary']

In [60]:
ordi=['education']
nomi=['job', 'marital', 'default',  'housing', 'loan', 'contact',
       'month']

In [61]:
#continuous
continuous=['age', 'balance', 'day', 'duration', 'campaign', 'pdays',
       'previous']

In [62]:
p_numeric=Pipeline([
                    ('StandardS',RobustScaler())
])

In [63]:
p_cat_ordi=Pipeline([
                     ('Impute_num',SimpleImputer(strategy='most_frequent')),
                ('Ordinal_encode',OrdinalEncoder(categories=[education]))
])

In [64]:
p_cat_nomi=Pipeline([
                     ('Impute_num',SimpleImputer(strategy='most_frequent')),
                     ('One_hot',OneHotEncoder(drop='first'))
])

In [65]:
col_trans=ColumnTransformer(transformers=[                                          
    ('scale',p_numeric,continuous),
    ('O_encode',p_cat_ordi,ordi),
    ('N_encode',p_cat_nomi,nomi)

],remainder='drop')

In [67]:
col_trans.fit_transform(X_train,y_train1)

array([[ 0.86666667, -0.33038999,  0.76923077, ...,  0.        ,
         0.        ,  0.        ],
       [-0.53333333, -0.33038999, -0.15384615, ...,  0.        ,
         0.        ,  0.        ],
       [-0.13333333, -0.45548197,  0.92307692, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.13333333,  0.26122149,  0.30769231, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.73333333, -0.27078734,  0.38461538, ...,  1.        ,
         0.        ,  0.        ],
       [-0.53333333,  0.18543046,  0.53846154, ...,  0.        ,
         0.        ,  0.        ]])

In [68]:
X_train1=col_trans.transform(X_train)
X_test1=col_trans.transform(X_test)

In [69]:
X_train1.shape,X_test1.shape

((22152, 35), (9495, 35))

In [85]:
sm=SMOTE(random_state=42)

In [86]:
X_train_bal,y_train_bal=sm.fit_resample(X_train1,y_train1)



In [87]:
pd.DataFrame(y_train_bal).value_counts(normalize=True)

1    0.5
0    0.5
dtype: float64

# Feature selection 

In [83]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [89]:
cl0=RandomForestClassifier()
cl1=LogisticRegression()
cl2=CatBoostClassifier()
cl3=XGBClassifier()
label=['RandomForestClassifier','logistic','CatBoostClassifier','XGBClassifier']
for clf,label in zip([cl0,cl1,cl3,cl2],label):
  score=cross_val_score(clf,X_train_bal,y_train_bal,cv=5,scoring='roc_auc',n_jobs=-1,verbose=2)
  print(score.mean(),label)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   20.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   20.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


0.9920186669219438 RandomForestClassifier


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


0.8936965856420169 logistic


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   15.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   15.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


0.9734625313858677 CatBoostClassifier
0.9821696669533176 XGBClassifier


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.8min finished
