<a href="https://colab.research.google.com/github/ckkhandare/DS_firstProject/blob/main/DSProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries and loading Data

In [1]:
! pip install catboost

Collecting catboost
  Downloading catboost-0.26.1-cp37-none-manylinux1_x86_64.whl (67.4 MB)
[K     |████████████████████████████████| 67.4 MB 30 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26.1


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder,RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV,cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE



In [3]:
df=pd.read_csv('train.csv',na_values='unknown')

# Exploratory Data Analysis

In [4]:
df

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,subscribed
0,26110,56,admin.,married,,no,1933,no,no,telephone,19,nov,44,2,-1,0,,no
1,40576,31,,married,secondary,no,3,no,no,cellular,20,jul,91,2,-1,0,,no
2,15320,27,services,married,secondary,no,891,yes,no,cellular,18,jul,240,1,-1,0,,no
3,43962,57,management,divorced,tertiary,no,3287,no,no,cellular,22,jun,867,1,84,3,success,yes
4,29842,31,technician,married,secondary,no,119,yes,no,cellular,4,feb,380,1,-1,0,,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31642,36483,29,management,single,tertiary,no,0,yes,no,cellular,12,may,116,2,-1,0,,no
31643,40178,53,management,divorced,tertiary,no,380,no,yes,cellular,5,jun,438,2,-1,0,,yes
31644,19710,32,management,single,tertiary,no,312,no,no,cellular,7,aug,37,3,-1,0,,no
31645,38556,57,technician,married,secondary,no,225,yes,no,telephone,15,may,22,7,337,12,failure,no


In [5]:
df.shape

(31647, 18)

In [6]:
df.isnull().sum()*100/31647

ID             0.000000
age            0.000000
job            0.650931
marital        0.000000
education      4.152052
default        0.000000
balance        0.000000
housing        0.000000
loan           0.000000
contact       28.998009
day            0.000000
month          0.000000
duration       0.000000
campaign       0.000000
pdays          0.000000
previous       0.000000
poutcome      81.931937
subscribed     0.000000
dtype: float64

In [7]:
df.select_dtypes(include='O').columns

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome', 'subscribed'],
      dtype='object')

In [8]:
df.select_dtypes(exclude='O').columns

Index(['ID', 'age', 'balance', 'day', 'duration', 'campaign', 'pdays',
       'previous'],
      dtype='object')

#### Observation

poutcome has more than 80% data missing so it can be droped

education is the only ordinal feature rest all as Nominal ['job','marital','default', 'housing','loan', 'contact','month', 'poutcome','subscribed']

job , education and contact have missing values that need to be imputed

# Preprocessing

In [9]:
df.drop(columns=['poutcome'],inplace=True)

In [10]:
df.drop(columns=['ID'],inplace=True)

In [11]:
df.shape

(31647, 16)

In [12]:
X=df.drop(columns=['subscribed'])
y=df['subscribed']

In [13]:
y.value_counts(normalize=True)

no     0.882611
yes    0.117389
Name: subscribed, dtype: float64

In [14]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30, random_state=42,stratify=y)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((22152, 15), (9495, 15), (22152,), (9495,))

In [15]:
le=LabelEncoder()

In [16]:
y_train1=le.fit_transform(y_train)
y_test1=le.transform(y_test)

In [17]:
pd.DataFrame(y_train1).value_counts(normalize=True)

0    0.882629
1    0.117371
dtype: float64

In [18]:
pd.DataFrame(y_test1).value_counts(normalize=True)

0    0.88257
1    0.11743
dtype: float64

In [19]:
education=['primary','secondary','tertiary']

In [20]:
ordi=['education']
nomi=['job', 'marital', 'default',  'housing', 'loan', 'contact',
       'month']

In [21]:
#continuous
continuous=['age', 'balance', 'day', 'duration', 'campaign', 'pdays',
       'previous']

In [22]:
p_numeric=Pipeline([
                    ('StandardS',RobustScaler())
])

In [23]:
p_cat_ordi=Pipeline([
                     ('Impute_num',SimpleImputer(strategy='most_frequent')),
                ('Ordinal_encode',OrdinalEncoder(categories=[education]))
])

In [24]:
p_cat_nomi=Pipeline([
                     ('Impute_num',SimpleImputer(strategy='most_frequent')),
                     ('One_hot',OneHotEncoder(drop='first'))
])

In [25]:
col_trans=ColumnTransformer(transformers=[                                          
    ('scale',p_numeric,continuous),
    ('O_encode',p_cat_ordi,ordi),
    ('N_encode',p_cat_nomi,nomi)

],remainder='drop')

In [26]:
col_trans.fit_transform(X_train,y_train1)

array([[ 0.86666667, -0.33038999,  0.76923077, ...,  0.        ,
         0.        ,  0.        ],
       [-0.53333333, -0.33038999, -0.15384615, ...,  0.        ,
         0.        ,  0.        ],
       [-0.13333333, -0.45548197,  0.92307692, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.13333333,  0.26122149,  0.30769231, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.73333333, -0.27078734,  0.38461538, ...,  1.        ,
         0.        ,  0.        ],
       [-0.53333333,  0.18543046,  0.53846154, ...,  0.        ,
         0.        ,  0.        ]])

In [27]:
X_train1=col_trans.transform(X_train)
X_test1=col_trans.transform(X_test)

In [28]:
X_train1.shape,X_test1.shape

((22152, 35), (9495, 35))

In [29]:
sm=SMOTE(random_state=42)

In [30]:
X_train_bal,y_train_bal=sm.fit_resample(X_train1,y_train1)



In [31]:
pd.DataFrame(y_train_bal).value_counts(normalize=True)

1    0.5
0    0.5
dtype: float64

#Feature selection

In [32]:
  # !pip install boruta

In [33]:
# sel_X_train=np.array(X_train)
# sel_y_train=np.array(y_train)
# rf =RandomForestClassifier()
# selected_fet= BorutaPy(rf, random_state=7,max_iter=10,perc=90 ,verbose=2 )

In [34]:
# selected_fet.fit(sel_X_train,sel_y_train)

In [35]:
# len(selected_fet.support_)

In [36]:
# selected_col= pd.DataFrame({'columns' : X_train.columns , 'imp_fet': list(selected_fet.support_)})

In [37]:
# sorted_sel=selected_col.sort_values(by='imp_fet',ascending=False).head(16)

In [38]:
# sorted_sel

In [39]:
# X_train2 = pd.DataFrame(X_train,columns = sorted_sel['columns'])
# X_test2 = pd.DataFrame(X_test,columns = sorted_sel['columns'])
# X_train2

# model selection 

In [40]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier

In [41]:
cl0=KNeighborsClassifier()
cl1=SVC()
cl2=GaussianNB()
cl3=LogisticRegression()
label=['KNeighborsClassifier','SVC','GaussianNB','LogisticRegression']
for clf,label in zip([cl0,cl1,cl2,cl3],label):
  score=cross_val_score(clf,X_train_bal,y_train_bal,cv=5,scoring='roc_auc',n_jobs=-1,verbose=2)
  print(score.mean(),label)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


0.9587465608283132 KNeighborsClassifier


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


0.8593652062529801 SVC


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


0.8054842059279703 GaussianNB
0.8936965856420169 LogisticRegression


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.7s finished


In [42]:
cl4=RandomForestClassifier()
cl5=XGBClassifier()
cl6=CatBoostClassifier()
label=['RandomForestClassifier','XGBClassifier','CatBoostClassifier']
for clf,label in zip([cl4,cl5,cl6],label):
  score=cross_val_score(clf,X_train_bal,y_train_bal,cv=5,scoring='roc_auc',n_jobs=-1,verbose=2)
  print(score.mean(),label)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   15.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   15.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


0.9921006347026843 RandomForestClassifier


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


0.9734625248465051 XGBClassifier




0.9821696669533176 CatBoostClassifier


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.2min finished


In [43]:
score=cross_val_score(MLPClassifier(),X_train_bal,y_train_bal,cv=5,scoring='roc_auc',n_jobs=-1,verbose=2)
print(score.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


0.9601445389018501 CatBoostClassifier


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min finished


Selected Model :-

RandomForestClassifier

# HyperParameter tuning for RandomForestClassifier

In [49]:
grid_param={'bootstrap': [True],
 'max_depth':np.arange(2,15),
 'max_features': ['auto'],
 'min_samples_leaf': np.arange(0,5),
 'min_samples_split': np.arange(0,5),
 'n_estimators': [200,300,400,500]
 }


In [50]:
RF=RandomForestClassifier()

In [53]:
rf_random = RandomizedSearchCV(RF, grid_param, n_iter = 30, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [54]:
rf_random.fit(X_train_bal,y_train_bal)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  3.5min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [55]:
rf_random.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=14, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=14, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [56]:
rf_random.best_score_

0.9165307058303563

In [61]:
grid_param2={
 'max_depth':np.arange(12,16),
 'min_samples_leaf': np.arange(1,3),
 'min_samples_split': np.arange(1,3),
 'n_estimators': [300,400]
 }


In [62]:
rf_grid = GridSearchCV(RF, grid_param2, cv = 2, verbose=2,n_jobs = -1,scoring='roc_auc')

In [63]:
rf_grid.fit(X_train_bal,y_train_bal)

Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:  2.9min finished


GridSearchCV(cv=2, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [64]:
rf_grid.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [65]:
rf_grid.best_score_

0.9816543552247932

In [66]:
pred=rf_grid.predict(X_test1)

In [67]:
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score

In [68]:
confusion_matrix(y_test1,pred)

array([[7663,  717],
       [ 409,  706]])

In [69]:
accuracy_score(y_test1,pred)

0.8814112690889943

In [70]:
roc_auc_score(y_test1,pred)

0.7738114986568491

In [75]:
model3=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [71]:
from sklearn.decomposition import PCA

In [72]:
pca=PCA()

In [73]:
X_train_pca=pca.fit_transform(X_train_bal)

In [74]:
X_test_pca=pca.transform(X_test1)

In [77]:
model3.fit(X_train_pca,y_train_bal)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [78]:
pca_pred=model3.predict(X_test_pca)

In [79]:
confusion_matrix(y_test1,pca_pred)

array([[7445,  935],
       [ 268,  847]])

In [80]:
roc_auc_score(y_test1,pca_pred)

0.8240330383038839