In [1]:
## Importing necessary libraries 

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
## Installing our favourite pycaret library
!pip install pycaret



In [3]:
# Load dataset
data = pd.read_csv('https://raw.githubusercontent.com/diazers/dataset/main/WA_Fn-UseC_-Telco-Customer-Churn.csv')
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# cek data duplicate
data.duplicated().any()

False

In [5]:
## Checking for the null values
data.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [6]:
data['TotalCharges'][data['TotalCharges'] == ' '].count()

11

In [7]:
# mengubah value '1' menjadi 'yes' di kolom SeniorCitizen 
data['SeniorCitizen'] = data['SeniorCitizen'].map({1 : 'Yes', 0 : "No"})

print(data['SeniorCitizen'].unique())
data

['No' 'Yes']


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,No,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,No,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,No,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,No,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,No,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,No,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,No,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,No,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,Yes,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [8]:
# drop baris data dengan spasi

drop_rows = data[(data['TotalCharges'] == ' ')].index
data.drop(drop_rows, inplace = True)


In [9]:
for col in data:
  print(col)
  space = data[col][data[col] == ' '].count()
  print(f"Number of space value:", {space})

customerID
Number of space value: {0}
gender
Number of space value: {0}
SeniorCitizen
Number of space value: {0}
Partner
Number of space value: {0}
Dependents
Number of space value: {0}
tenure
Number of space value: {0}
PhoneService
Number of space value: {0}
MultipleLines
Number of space value: {0}
InternetService
Number of space value: {0}
OnlineSecurity
Number of space value: {0}
OnlineBackup
Number of space value: {0}
DeviceProtection
Number of space value: {0}
TechSupport
Number of space value: {0}
StreamingTV
Number of space value: {0}
StreamingMovies
Number of space value: {0}
Contract
Number of space value: {0}
PaperlessBilling
Number of space value: {0}
PaymentMethod
Number of space value: {0}
MonthlyCharges
Number of space value: {0}
TotalCharges
Number of space value: {0}
Churn
Number of space value: {0}


In [10]:
## Checking the data type of the columns
data.dtypes

customerID           object
gender               object
SeniorCitizen        object
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [11]:
data['TotalCharges'] = data['TotalCharges'].astype('float64')
data['tenure'] = data['tenure'].astype('float64')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   object 
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   float64
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


In [12]:
data = data.drop(['customerID'],axis=1)
data.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [13]:
# This command will basically import all the modules from pycaret that are necessary for classification tasks
from pycaret.classification import *

In [14]:
from imblearn.over_sampling import SMOTE, ADASYN

In [None]:
clf=setup(data=data, 
          data_split_stratify = True,
          target='Churn', 
          fix_imbalance = True,
          fix_imbalance_method = ADASYN(),	
          )

Unnamed: 0,Description,Value
0,session_id,7055
1,Target,Churn
2,Target Type,Binary
3,Label Encoded,"No: 0, Yes: 1"
4,Original Data,"(7032, 20)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,16
8,Ordinal Features,False
9,High Cardinality Features,False


In [None]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8017,0.8484,0.6024,0.6364,0.6179,0.4844,0.4854,1.469
lightgbm,Light Gradient Boosting Machine,0.7952,0.8372,0.5504,0.6345,0.5886,0.4533,0.4559,0.433
ada,Ada Boost Classifier,0.7924,0.8448,0.6315,0.6063,0.6178,0.4755,0.4763,0.478
rf,Random Forest Classifier,0.7842,0.822,0.5092,0.6154,0.5562,0.4155,0.4194,1.118
et,Extra Trees Classifier,0.7666,0.7924,0.4878,0.5722,0.5262,0.3727,0.3751,1.016
nb,Naive Bayes,0.7389,0.8308,0.8027,0.5059,0.6205,0.4368,0.4641,0.066
lr,Logistic Regression,0.7308,0.8406,0.8211,0.4964,0.6185,0.4295,0.4624,0.188
ridge,Ridge Classifier,0.73,0.0,0.8272,0.4955,0.6196,0.4302,0.4643,0.065
lda,Linear Discriminant Analysis,0.7277,0.8403,0.828,0.4929,0.6179,0.4269,0.4616,0.09
dt,Decision Tree Classifier,0.7259,0.655,0.5024,0.4854,0.493,0.3055,0.306,0.1


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=7055, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [15]:
# Setting up the classifier
# Pass the complete dataset as data and the featured to be predicted as target
clf=setup(
          data=data, 
          data_split_stratify = True,
          target='Churn', 
          fix_imbalance = True,
          fix_imbalance_method = SMOTE(),
          normalize =  True,
          transformation = True,
          session_id = 123	
         )

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Churn
2,Target Type,Binary
3,Label Encoded,"No: 0, Yes: 1"
4,Original Data,"(7032, 20)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,16
8,Ordinal Features,False
9,High Cardinality Features,False


In [16]:
# This model will be used to compare all the model along with the cross validation
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7881,0.8463,0.6705,0.5899,0.6272,0.4802,0.4824,1.304
lightgbm,Light Gradient Boosting Machine,0.7859,0.8345,0.591,0.5983,0.5943,0.4489,0.4491,0.318
rf,Random Forest Classifier,0.781,0.8179,0.5696,0.5921,0.58,0.4321,0.4327,1.057
ada,Ada Boost Classifier,0.7733,0.8457,0.7179,0.5581,0.6273,0.4683,0.4765,0.41
et,Extra Trees Classifier,0.7696,0.7935,0.5122,0.5737,0.5405,0.3877,0.3891,0.996
lr,Logistic Regression,0.7523,0.8439,0.7753,0.5233,0.6246,0.4501,0.4697,0.435
ridge,Ridge Classifier,0.7503,0.0,0.783,0.5205,0.6249,0.4491,0.4705,0.076
lda,Linear Discriminant Analysis,0.7499,0.8445,0.7906,0.5196,0.6268,0.4506,0.4735,0.104
nb,Naive Bayes,0.7393,0.8302,0.7799,0.507,0.6142,0.4307,0.4537,0.076
dummy,Dummy Classifier,0.7343,0.5,0.0,0.0,0.0,0.0,0.0,0.07


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=123, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
GradientBoostingClassifier=create_model('gbc')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7708,0.814,0.6336,0.5608,0.595,0.436,0.4376
1,0.7708,0.8346,0.6489,0.5592,0.6007,0.4412,0.4436
2,0.8171,0.8822,0.7231,0.6351,0.6763,0.5495,0.5518
3,0.7967,0.8647,0.6615,0.6056,0.6324,0.4923,0.4932
4,0.7541,0.8169,0.626,0.5325,0.5754,0.4039,0.4065
5,0.7642,0.8414,0.6565,0.5478,0.5972,0.4325,0.436
6,0.8008,0.8493,0.7099,0.6078,0.6549,0.5161,0.5192
7,0.8089,0.864,0.7023,0.6259,0.6619,0.5293,0.531
8,0.8049,0.833,0.6718,0.6241,0.6471,0.5125,0.5131
9,0.7927,0.8651,0.6489,0.6028,0.625,0.482,0.4826


In [None]:
LGBMClassifier = create_model('lightgbm')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7728,0.8153,0.542,0.5772,0.5591,0.4063,0.4066
1,0.7667,0.8246,0.5344,0.5645,0.549,0.3919,0.3921
2,0.7988,0.8698,0.6,0.624,0.6118,0.476,0.4762
3,0.7886,0.8394,0.5846,0.6032,0.5938,0.4509,0.451
4,0.7703,0.7987,0.5725,0.5682,0.5703,0.4136,0.4136
5,0.7744,0.8348,0.6183,0.5704,0.5934,0.4376,0.4383
6,0.7866,0.8293,0.6183,0.5956,0.6067,0.4604,0.4605
7,0.7866,0.8533,0.6107,0.597,0.6038,0.4578,0.4578
8,0.8069,0.8307,0.6107,0.6452,0.6275,0.4973,0.4976
9,0.7988,0.8436,0.6107,0.625,0.6178,0.4812,0.4813


In [None]:
RandomForestClassifier = create_model('rf')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7951,0.8021,0.5649,0.6271,0.5944,0.4578,0.459
1,0.7769,0.8063,0.542,0.5868,0.5635,0.4139,0.4145
2,0.8069,0.8519,0.5923,0.6471,0.6185,0.4896,0.4904
3,0.7846,0.8131,0.5385,0.6034,0.5691,0.4261,0.4273
4,0.7602,0.7749,0.5344,0.5512,0.5426,0.3802,0.3802
5,0.7744,0.8173,0.5878,0.5746,0.5811,0.4268,0.4268
6,0.7886,0.8377,0.6336,0.5971,0.6148,0.4693,0.4697
7,0.8008,0.8354,0.6031,0.632,0.6172,0.4827,0.4829
8,0.7805,0.8053,0.5115,0.6036,0.5537,0.4095,0.412
9,0.7683,0.8265,0.4962,0.5752,0.5328,0.3798,0.3817


In [17]:
AdaBoostClassifier = create_model('ada')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7647,0.8141,0.6794,0.546,0.6054,0.4406,0.446
1,0.7627,0.8321,0.7176,0.5402,0.6164,0.4495,0.459
2,0.811,0.8921,0.8,0.6082,0.691,0.5585,0.5694
3,0.7846,0.8548,0.6846,0.5779,0.6268,0.4768,0.4802
4,0.748,0.8233,0.7176,0.5193,0.6026,0.4249,0.4368
5,0.7663,0.8342,0.7328,0.5455,0.6254,0.4608,0.4714
6,0.7703,0.8467,0.7481,0.5506,0.6343,0.4725,0.4843
7,0.7886,0.8572,0.7328,0.5818,0.6486,0.5003,0.5071
8,0.7764,0.8412,0.6794,0.5669,0.6181,0.4618,0.4656
9,0.7846,0.86,0.7176,0.5767,0.6395,0.4884,0.4943


In [18]:
LogisticRegression = create_model('lr')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7323,0.8156,0.7176,0.4974,0.5875,0.3988,0.4135
1,0.7424,0.8296,0.7481,0.5104,0.6068,0.4252,0.4424
2,0.7764,0.8936,0.8538,0.5495,0.6687,0.5117,0.54
3,0.7805,0.8552,0.8,0.5591,0.6582,0.5039,0.5215
4,0.7317,0.8067,0.771,0.4975,0.6048,0.4157,0.4385
5,0.7459,0.8408,0.7634,0.5155,0.6154,0.4362,0.4549
6,0.7703,0.8526,0.8015,0.5469,0.6502,0.4881,0.5079
7,0.7805,0.8617,0.7786,0.5635,0.6538,0.4991,0.5131
8,0.7581,0.8355,0.7557,0.5323,0.6246,0.454,0.4692
9,0.7378,0.8443,0.7405,0.5052,0.6006,0.4156,0.4325


In [None]:
# Whenenver we compare different models or build a model, the model uses deault
# hyperparameter values. Hence, we need to tune our model to get better performance

tuned_GradientBoosting_classifier = tune_model(GradientBoostingClassifier, optimize = 'F1')
print(tuned_GradientBoosting_classifier)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7485,0.8153,0.6947,0.52,0.5948,0.4178,0.427
1,0.7748,0.8318,0.7023,0.561,0.6237,0.4659,0.4719
2,0.7967,0.8861,0.7615,0.5893,0.6644,0.522,0.5309
3,0.7927,0.8591,0.7,0.5909,0.6408,0.4966,0.5001
4,0.752,0.8148,0.7099,0.5254,0.6039,0.4292,0.4395
5,0.7622,0.8361,0.7099,0.5407,0.6139,0.4466,0.4552
6,0.7866,0.855,0.7863,0.5722,0.6624,0.512,0.5258
7,0.8028,0.8664,0.7481,0.6049,0.6689,0.5308,0.5369
8,0.7927,0.8318,0.6947,0.5948,0.6408,0.4964,0.4993
9,0.7886,0.8574,0.6947,0.5871,0.6364,0.4888,0.4923


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=7,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.05, min_impurity_split=None,
                           min_samples_leaf=2, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=140,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=123, subsample=0.35, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)


In [None]:
tuned_LGBMClassifier = tune_model(LGBMClassifier, optimize = 'F1')
print(tuned_LGBMClassifier)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7505,0.8266,0.7176,0.5222,0.6045,0.4288,0.4404
1,0.789,0.8318,0.7176,0.5839,0.6438,0.4962,0.5015
2,0.7988,0.8835,0.7538,0.5939,0.6644,0.5236,0.5312
3,0.7846,0.8527,0.7077,0.575,0.6345,0.4841,0.4893
4,0.7459,0.8083,0.7176,0.5165,0.6006,0.4215,0.4338
5,0.7602,0.8243,0.7176,0.5371,0.6144,0.4455,0.4554
6,0.7825,0.8466,0.7557,0.569,0.6492,0.4961,0.5066
7,0.8069,0.8589,0.7176,0.6184,0.6643,0.5298,0.5327
8,0.7825,0.825,0.687,0.5769,0.6272,0.4753,0.4789
9,0.7785,0.8593,0.687,0.5696,0.6228,0.4679,0.4721


LGBMClassifier(bagging_fraction=0.7, bagging_freq=2, boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=0.8,
               importance_type='split', learning_rate=0.0005, max_depth=-1,
               min_child_samples=1, min_child_weight=0.001, min_split_gain=0.2,
               n_estimators=30, n_jobs=-1, num_leaves=256, objective=None,
               random_state=123, reg_alpha=3, reg_lambda=0.001, silent='warn',
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


In [None]:
tuned_RandomForestClassifier = tune_model(RandomForestClassifier, optimize = 'F1')
print(tuned_RandomForestClassifier)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7465,0.816,0.7099,0.5167,0.5981,0.4195,0.4308
1,0.7647,0.8274,0.7328,0.5424,0.6234,0.4578,0.4687
2,0.8008,0.8908,0.8385,0.586,0.6899,0.5499,0.569
3,0.7886,0.8606,0.7462,0.5774,0.651,0.5029,0.5114
4,0.748,0.8146,0.7328,0.5189,0.6076,0.4298,0.4437
5,0.7419,0.8217,0.7939,0.5098,0.6209,0.439,0.4637
6,0.7642,0.8541,0.8092,0.5381,0.6463,0.48,0.5025
7,0.7927,0.8594,0.7863,0.5819,0.6688,0.5228,0.5353
8,0.7764,0.8294,0.7023,0.5644,0.6259,0.4691,0.4748
9,0.7602,0.8447,0.7023,0.538,0.6093,0.4406,0.4487


RandomForestClassifier(bootstrap=False, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='gini',
                       max_depth=6, max_features='log2', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.001,
                       min_impurity_split=None, min_samples_leaf=6,
                       min_samples_split=9, min_weight_fraction_leaf=0.0,
                       n_estimators=190, n_jobs=-1, oob_score=False,
                       random_state=123, verbose=0, warm_start=False)


In [19]:
# AdaBoostClassifier

tuned_AdaBoostClassifier = tune_model(AdaBoostClassifier, optimize = 'F1')
print(tuned_AdaBoostClassifier)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7404,0.8153,0.7252,0.508,0.5975,0.4145,0.4288
1,0.7546,0.8368,0.7481,0.5269,0.6183,0.4453,0.4602
2,0.7967,0.8925,0.8846,0.575,0.697,0.5542,0.5833
3,0.7805,0.8628,0.7538,0.5632,0.6447,0.4907,0.5016
4,0.7459,0.8239,0.7557,0.5156,0.613,0.4338,0.4513
5,0.7317,0.8403,0.7786,0.4976,0.6071,0.4181,0.4423
6,0.7703,0.8537,0.8397,0.5446,0.6607,0.4987,0.5255
7,0.8008,0.864,0.8092,0.5922,0.6839,0.5435,0.5576
8,0.7683,0.8384,0.7099,0.5503,0.62,0.4572,0.4648
9,0.7541,0.8505,0.7328,0.5275,0.6134,0.44,0.4528


AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=0.4,
                   n_estimators=230, random_state=123)


In [20]:
# LogisticRegression

tuned_LogisticRegression = tune_model(LogisticRegression, optimize = 'F1')
print(tuned_LogisticRegression)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7383,0.8165,0.7176,0.5054,0.5931,0.4087,0.4223
1,0.7444,0.8305,0.7634,0.5128,0.6135,0.4334,0.4525
2,0.7805,0.8942,0.8538,0.555,0.6727,0.5185,0.5458
3,0.7764,0.858,0.7846,0.5543,0.6497,0.4925,0.5086
4,0.7337,0.8079,0.7786,0.5,0.609,0.4213,0.4451
5,0.7459,0.84,0.771,0.5153,0.6177,0.4385,0.4585
6,0.7663,0.8525,0.8168,0.5404,0.6505,0.4856,0.509
7,0.7866,0.8602,0.7786,0.573,0.6602,0.5098,0.5226
8,0.7622,0.8341,0.7557,0.538,0.6286,0.4609,0.4753
9,0.7337,0.845,0.7405,0.5,0.5969,0.4091,0.4267


LogisticRegression(C=0.049, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


hasil test dari Gradient Boosting yang sudah di tuning

In [None]:
predict_model(tuned_GradientBoosting_classifier)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7763,0.8374,0.7326,0.5607,0.6352,0.478,0.4869


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,SeniorCitizen_Yes,Partner_Yes,Dependents_Yes,MultipleLines_No,MultipleLines_Yes,InternetService_DSL,...,Contract_One year,Contract_Two year,PaperlessBilling_No,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn,Label,Score
0,40.0,85.050003,3355.649902,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,No,No,0.5029
1,29.0,100.550003,2878.750000,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,No,Yes,0.5049
2,19.0,20.850000,467.500000,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,No,No,0.5205
3,65.0,24.750000,1715.099976,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,No,No,0.5206
4,54.0,114.650002,6049.500000,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,No,No,0.5101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2105,57.0,90.650002,5199.799805,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,No,No,0.5059
2106,28.0,91.000000,2626.149902,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,No,No,0.5044
2107,71.0,106.800003,7623.200195,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,No,No,0.5133
2108,59.0,54.150002,3116.149902,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,No,No,0.5159


hasil test dari LGBM yang sudah di tuning

In [None]:
predict_model(tuned_LGBMClassifier);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.7782,0.8361,0.7255,0.5645,0.6349,0.4792,0.487


hasil test dari random forest yang sudah di tuning

In [None]:
predict_model(tuned_RandomForestClassifier);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7692,0.8391,0.7398,0.5489,0.6302,0.4678,0.4788


hasil dari AdaBoost yang sudah di tuning

In [21]:
predict_model(tuned_AdaBoostClassifier)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.7678,0.8447,0.7647,0.5451,0.6365,0.4728,0.4875


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Male,SeniorCitizen_Yes,Partner_No,Dependents_Yes,PhoneService_Yes,MultipleLines_No,MultipleLines_Yes,...,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn,Label,Score
0,0.704866,1.216222,1.242410,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,No,No,0.5396
1,-1.364351,-1.110543,-1.298694,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,Yes,Yes,0.5357
2,0.054800,0.845716,0.651641,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,No,No,0.5309
3,-0.956951,0.584408,-0.610470,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,Yes,Yes,0.5585
4,1.498302,1.483919,1.702654,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,No,No,0.6489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2105,-1.260806,-0.561514,-1.194966,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,No,Yes,0.5132
2106,-0.907340,0.692524,-0.556244,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,Yes,Yes,0.5526
2107,-1.364351,0.332483,-1.253528,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,Yes,Yes,0.5850
2108,1.498302,-1.280602,0.141922,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,No,No,0.6864


hasil dari LogisticRegression yang sudah di tuning

In [22]:
predict_model(tuned_LogisticRegression)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7602,0.8433,0.7594,0.5345,0.6274,0.4584,0.4737


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Male,SeniorCitizen_Yes,Partner_No,Dependents_Yes,PhoneService_Yes,MultipleLines_No,MultipleLines_Yes,...,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn,Label,Score
0,0.704866,1.216222,1.242410,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,No,No,0.7161
1,-1.364351,-1.110543,-1.298694,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,Yes,Yes,0.6517
2,0.054800,0.845716,0.651641,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,No,No,0.5930
3,-0.956951,0.584408,-0.610470,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,Yes,Yes,0.9209
4,1.498302,1.483919,1.702654,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,No,No,0.9498
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2105,-1.260806,-0.561514,-1.194966,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,No,Yes,0.6545
2106,-0.907340,0.692524,-0.556244,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,Yes,Yes,0.9166
2107,-1.364351,0.332483,-1.253528,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,Yes,Yes,0.9134
2108,1.498302,-1.280602,0.141922,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,No,No,0.9861


In [None]:
# Setting up the classifier
# Pass the complete dataset as data and the featured to be predicted as target
clf=setup(data=data, 
          data_split_stratify = True,
          target='Churn', 
          fix_imbalance = True,
          fix_imbalance_method = ADASYN(),
          normalize =  True,
          transformation = True,	
          )

Unnamed: 0,Description,Value
0,session_id,4901
1,Target,Churn
2,Target Type,Binary
3,Label Encoded,"No: 0, Yes: 1"
4,Original Data,"(7032, 20)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,16
8,Ordinal Features,False
9,High Cardinality Features,False


In [None]:
Model	                            Accuracy	AUC	   Recall	Prec.	     F1	   Kappa	MCC	    TT (Sec)
gbc	Gradient Boosting Classifier	0.7926	0.8480	0.6728	0.5982	0.6326	0.4889	0.4911	1.164

In [None]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7857,0.8412,0.6652,0.5862,0.6228,0.4741,0.4762,1.319
lightgbm,Light Gradient Boosting Machine,0.7791,0.8239,0.5818,0.5847,0.5829,0.4328,0.4331,0.472
rf,Random Forest Classifier,0.7743,0.8092,0.5658,0.5766,0.5708,0.4178,0.418,1.204
ada,Ada Boost Classifier,0.7682,0.8372,0.7256,0.5485,0.6241,0.4613,0.4714,0.545
et,Extra Trees Classifier,0.7603,0.7831,0.513,0.553,0.5317,0.3711,0.3719,1.155
dummy,Dummy Classifier,0.7343,0.5,0.0,0.0,0.0,0.0,0.0,0.207
lr,Logistic Regression,0.7249,0.8385,0.8234,0.49,0.6141,0.4212,0.4559,0.656
lda,Linear Discriminant Analysis,0.7233,0.8384,0.8265,0.4885,0.6137,0.4198,0.4555,0.235
ridge,Ridge Classifier,0.7231,0.0,0.8265,0.4882,0.6134,0.4194,0.4552,0.206
nb,Naive Bayes,0.7188,0.826,0.8188,0.4833,0.6076,0.4105,0.4455,0.208


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=4901, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)