In [1]:
from vecstack import stacking
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score

from imblearn.over_sampling import SMOTE 
from collections import Counter 

import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('Cell Phone Churn-TRAIN.csv')
train.head()

Unnamed: 0,MOUMO,MOUPMO,MOU3MO,MOUCH1M,MOUCH3M,CUM3MCH,CUSTMOS,LONGD,CALLW,LINES,...,INCOME,SPORTS,NATURE,ARTS,HRS_TV,TRAVEL,EDUC,TOTMOU,TOTCHNG,TARGET
0,105,15,99,90,-84,6,11,0,0,2,...,51000,B,1,N,10,,Bachelors,219,6,0
1,999,1200,1145,-201,55,-146,3,1,1,2,...,80000,A,0,N,40,Domestic,Masters,3344,-146,1
2,5,2,1,3,1,4,23,0,0,1,...,80000,C,0,Y,50,International,HighSchool,8,4,1
3,600,456,398,144,58,202,13,0,1,1,...,88000,A,1,N,0,International,Masters,1454,202,1
4,2,0,0,0,0,0,1,0,0,1,...,29000,B,0,N,75,Domestic,HighSchool,0,0,0


In [3]:
test = pd.read_csv('Cell Phone Churn-TEST.csv')
display(train.shape, test.shape)

(1695, 24)

(375, 24)

In [6]:
train.columns

Index(['MOUMO', 'MOUPMO', 'MOU3MO', 'MOUCH1M', 'MOUCH3M', 'CUM3MCH', 'CUSTMOS',
       'LONGD', 'CALLW', 'LINES', 'VOICEM', 'CELL', 'CONVB', 'SEX', 'INCOME',
       'SPORTS', 'NATURE', 'ARTS', 'HRS_TV', 'TRAVEL', 'EDUC', 'TOTMOU',
       'TOTCHNG', 'TARGET'],
      dtype='object')

In [7]:
test.columns

Index(['MOUMO', 'MOUPMO', 'MOU3MO', 'MOUCH1M', 'MOUCH3M', 'CUM3MCH', 'CUSTMOS',
       'LONGD', 'CALLW', 'LINES', 'VOICEM', 'CELL', 'CONVB', 'SEX', 'INCOME',
       'SPORTS', 'NATURE', 'ARTS', 'HRS_TV', 'TRAVEL', 'EDUC', 'TOTMOU',
       'TOTCHNG', 'TARGET'],
      dtype='object')

In [8]:
test.head()

Unnamed: 0,MOUMO,MOUPMO,MOU3MO,MOUCH1M,MOUCH3M,CUM3MCH,CUSTMOS,LONGD,CALLW,LINES,...,INCOME,SPORTS,NATURE,ARTS,HRS_TV,TRAVEL,EDUC,TOTMOU,TOTCHNG,TARGET
0,344,232,455,112,-223,-111,11,1,1,2,...,75000,B,1,Y,25,InterGalactic,GED,1031,-111,0
1,555,444,678,111,-234,-123,27,1,1,2,...,134000,na,1,Y,5,Domestic,PhD,1677,-123,0
2,2,0,4,1,-3,-2,4,0,0,1,...,53000,na,1,Y,125,,Bachelors,6,-2,0
3,5678,3457,2156,2221,1301,3522,14,1,1,1,...,33000,A,1,Y,25,Domestic,Masters,11291,3522,0
4,333,122,144,211,-22,189,9,0,1,1,...,44000,F,1,N,35,International,Masters,599,189,1


In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 24 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MOUMO    1695 non-null   int64 
 1   MOUPMO   1695 non-null   int64 
 2   MOU3MO   1695 non-null   int64 
 3   MOUCH1M  1695 non-null   int64 
 4   MOUCH3M  1695 non-null   int64 
 5   CUM3MCH  1695 non-null   int64 
 6   CUSTMOS  1695 non-null   int64 
 7   LONGD    1695 non-null   int64 
 8   CALLW    1695 non-null   int64 
 9   LINES    1695 non-null   int64 
 10  VOICEM   1695 non-null   int64 
 11  CELL     1695 non-null   int64 
 12  CONVB    1695 non-null   object
 13  SEX      1695 non-null   object
 14  INCOME   1695 non-null   int64 
 15  SPORTS   1695 non-null   object
 16  NATURE   1695 non-null   int64 
 17  ARTS     1695 non-null   object
 18  HRS_TV   1695 non-null   int64 
 19  TRAVEL   1695 non-null   object
 20  EDUC     1695 non-null   object
 21  TOTMOU   1695 non-null   int64 
 22  

In [22]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [23]:
# seperate features from target for train data set
X_train = train.drop("TARGET",axis=1)
Y_train = train["TARGET"]

In [24]:
# seperate features from target for test data set
X_test = test.drop("TARGET",axis=1)
Y_test = test["TARGET"]

In [29]:
# Default Decision Tree model
clf = DecisionTreeClassifier()
clf.fit(X_train,Y_train)
clf_predict=clf.predict(X_test)
print("accuracy Score (training) for Decision Tree:{0:6f}".format(clf.score(X_test,Y_test)))
print("Confusion Matrix for Decision Tree")
print(confusion_matrix(Y_test,clf_predict))
roc_auc_score(Y_test,clf_predict)
print(classification_report(Y_test, clf_predict))

accuracy Score (training) for Decision Tree:0.994667
Confusion Matrix for Decision Tree
[[235   0]
 [  2 138]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       235
           1       1.00      0.99      0.99       140

    accuracy                           0.99       375
   macro avg       1.00      0.99      0.99       375
weighted avg       0.99      0.99      0.99       375



In [31]:
# Default Random Forest model
rfc = RandomForestClassifier()
rfc.fit(X_train,Y_train)
rfc_predict=rfc.predict(X_test)
print("accuracy Score (training) for Random Forest:{0:6f}".format(rfc.score(X_test,Y_test)))
print("Confusion Matrix for Random Forest")
print(confusion_matrix(Y_test,rfc_predict))
roc_auc_score(Y_test,rfc_predict)
print(classification_report(Y_test, rfc_predict))

accuracy Score (training) for Random Forest:0.994667
Confusion Matrix for Random Forest
[[235   0]
 [  2 138]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       235
           1       1.00      0.99      0.99       140

    accuracy                           0.99       375
   macro avg       1.00      0.99      0.99       375
weighted avg       0.99      0.99      0.99       375



In [32]:
# K-nearest neighbours
neigh = KNeighborsClassifier()
neigh.fit(X_train,Y_train)
neigh_predict=neigh.predict(X_test)
print("accuracy Score (training) for KNN:{0:6f}".format(neigh.score(X_test,Y_test)))
print("Confusion Matrix for KNN")
print(confusion_matrix(Y_test,neigh_predict))
roc_auc_score(Y_test,neigh_predict)
print(classification_report(Y_test, neigh_predict))

accuracy Score (training) for KNN:0.994667
Confusion Matrix for KNN
[[235   0]
 [  2 138]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       235
           1       1.00      0.99      0.99       140

    accuracy                           0.99       375
   macro avg       1.00      0.99      0.99       375
weighted avg       0.99      0.99      0.99       375



In [33]:
# Multilayer Perceptron
mlp = MLPClassifier()
mlp.fit(X_train, Y_train)
mlp_predict=mlp.predict(X_test)
print("accuracy Score (training) for MultiLayer Perceptron:{0:6f}".format(mlp.score(X_test,Y_test)))
print("Confusion Matrix for MultiLayer Perceptron:")
print(confusion_matrix(Y_test,mlp_predict))
roc_auc_score(Y_test,mlp_predict)
print(classification_report(Y_test, mlp_predict))

accuracy Score (training) for MultiLayer Perceptron:0.728000
Confusion Matrix for MultiLayer Perceptron:
[[235   0]
 [102  38]]
              precision    recall  f1-score   support

           0       0.70      1.00      0.82       235
           1       1.00      0.27      0.43       140

    accuracy                           0.73       375
   macro avg       0.85      0.64      0.62       375
weighted avg       0.81      0.73      0.67       375



In [34]:
# SVM
svm = SVC() 
svm.fit(X_train, Y_train) 
svm_predict=svm.predict(X_test) 
print("accuracy Score (training) for SVM Classifier:{0:6f}".format(svm.score(X_test,Y_test))) 
print("Confusion Matrix for SVM Classifier:") 
print(confusion_matrix(Y_test,svm_predict))
roc_auc_score(Y_test,svm_predict)
print(classification_report(Y_test, svm_predict))

accuracy Score (training) for SVM Classifier:0.650667
Confusion Matrix for SVM Classifier:
[[226   9]
 [122  18]]
              precision    recall  f1-score   support

           0       0.65      0.96      0.78       235
           1       0.67      0.13      0.22       140

    accuracy                           0.65       375
   macro avg       0.66      0.55      0.50       375
weighted avg       0.66      0.65      0.57       375



In [38]:
## KNN with hyper-parameter tuning using RANDOM search

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
params = {  'n_neighbors': [5,7,9],
                'leaf_size': [20,25,30],
                'metric' : ['str','callable', 'minkowski']}
randomized_search_knn = RandomizedSearchCV(KNeighborsClassifier(), params, cv=5)
randomized_search_knn.fit(X_train, Y_train)
randomized_search_knn.best_estimator_

KNeighborsClassifier(leaf_size=25)

In [51]:
neigh1 = KNeighborsClassifier(leaf_size=25)
neigh1.fit(X_train,Y_train)
neigh1_predict=neigh1.predict(X_test)
print("accuracy Score (training) for tuned KNN:{0:6f}".format(neigh1.score(X_test,Y_test)))
print("Confusion Matrix for KNN")
print(confusion_matrix(Y_test,neigh1_predict))
roc_auc_score(Y_test,neigh1_predict)
print(classification_report(Y_test, neigh1_predict))

accuracy Score (training) for tuned KNN:0.994667
Confusion Matrix for KNN
[[235   0]
 [  2 138]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       235
           1       1.00      0.99      0.99       140

    accuracy                           0.99       375
   macro avg       1.00      0.99      0.99       375
weighted avg       0.99      0.99      0.99       375



In [40]:
## MLP with hyper-parameter tuning using RANDOM search

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
params = {  'activation': ['identity', 'logistic', 'tanh', 'relu'],
                'solver': ['lbfgs','sgd','adam'],
                'max_iter' : [100,200,300]}
randomized_search_mlp = RandomizedSearchCV(MLPClassifier(), params, cv=5)
randomized_search_mlp.fit(X_train, Y_train)
randomized_search_mlp.best_estimator_

MLPClassifier(activation='logistic', max_iter=100, solver='lbfgs')

In [52]:
# Multilayer Perceptron
mlp1 = MLPClassifier(activation='logistic', max_iter=100, solver='lbfgs')
mlp1.fit(X_train, Y_train)
mlp1_predict=mlp1.predict(X_test)
print("accuracy Score (training) for MultiLayer Perceptron:{0:6f}".format(mlp1.score(X_test,Y_test)))
print("Confusion Matrix for MultiLayer Perceptron:")
print(confusion_matrix(Y_test,mlp1_predict))
roc_auc_score(Y_test,mlp1_predict)
print(classification_report(Y_test, mlp1_predict))

accuracy Score (training) for MultiLayer Perceptron:0.765333
Confusion Matrix for MultiLayer Perceptron:
[[216  19]
 [ 69  71]]
              precision    recall  f1-score   support

           0       0.76      0.92      0.83       235
           1       0.79      0.51      0.62       140

    accuracy                           0.77       375
   macro avg       0.77      0.71      0.72       375
weighted avg       0.77      0.77      0.75       375



## STACKING

In [45]:
models = [ KNeighborsClassifier(), MLPClassifier(), SVC(), RandomForestClassifier(), DecisionTreeClassifier() ]
      
S_Train, S_Test = stacking(models, X_train,Y_train, X_test,                  
                           regression=False,      
                           mode='oof_pred_bag',        
                           needs_proba=False,         
                           save_dir=None,             
                           metric=accuracy_score, 
                           n_folds=4, 
                           stratified=True,           
                           shuffle=True,              
                           random_state=0,    
                           verbose=2)

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [5]

model  0:     [KNeighborsClassifier]
    fold  0:  [0.96933962]
    fold  1:  [0.98820755]
    fold  2:  [0.98820755]
    fold  3:  [0.97635934]
    ----
    MEAN:     [0.98052851] + [0.00807014]
    FULL:     [0.98053097]

model  1:     [MLPClassifier]
    fold  0:  [0.76415094]
    fold  1:  [0.70990566]
    fold  2:  [0.69811321]
    fold  3:  [0.70449173]
    ----
    MEAN:     [0.71916538] + [0.02630567]
    FULL:     [0.71917404]

model  2:     [SVC]
    fold  0:  [0.63679245]
    fold  1:  [0.66981132]
    fold  2:  [0.64386792]
    fold  3:  [0.65484634]
    ----
    MEAN:     [0.65132951] + [0.01245940]
    FULL:     [0.65132743]

model  3:     [RandomForestClassifier]
    fold  0:  [0.98113208]
    fold  1:  [0.98820755]
    fold  2:  [0.98820755]
    fold  3:  [0.99290780]
    ----
    MEAN:     [0.98761374] + [0.00420548]
    FULL:     [0.98761062]

In [49]:
# USING KNN AS STACKING CLASSIFIER
model = KNeighborsClassifier()
model = model.fit(S_Train, Y_train)
Y_pred = model.predict(S_Test)

In [50]:
print("accuracy Score (training) for Stacked Random Forest:{0:6f}".format(model.score(S_Test,Y_test)))
print("Confusion Matrix for Stacked Random Forest")
print(confusion_matrix(Y_test,Y_pred))
roc_auc_score(Y_test,Y_pred)
print(classification_report(Y_test, Y_pred))

accuracy Score (training) for Stacked Random Forest:0.994667
Confusion Matrix for Stacked Random Forest
[[235   0]
 [  2 138]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       235
           1       1.00      0.99      0.99       140

    accuracy                           0.99       375
   macro avg       1.00      0.99      0.99       375
weighted avg       0.99      0.99      0.99       375

