In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.combine import SMOTETomek

In [22]:
df = pd.read_csv('data/telecom_churn.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,MonthlyCharges,TotalCharges,Churn,gender_Male,SeniorCitizen_Yes,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,False,False,True,False,False,True,...,False,True,False,True,False,False,False,False,False,False
1,1,56.95,1889.5,0,True,False,False,False,True,False,...,False,False,False,False,True,False,True,False,False,False
2,2,53.85,108.15,1,True,False,False,False,True,False,...,False,True,False,False,True,False,False,False,False,False
3,3,42.3,1840.75,0,True,False,False,False,False,True,...,False,False,False,False,False,False,False,True,False,False
4,4,70.7,151.65,1,False,False,False,False,True,False,...,False,True,False,True,False,False,False,False,False,False


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 36 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Unnamed: 0                             7032 non-null   int64  
 1   MonthlyCharges                         7032 non-null   float64
 2   TotalCharges                           7032 non-null   float64
 3   Churn                                  7032 non-null   int64  
 4   gender_Male                            7032 non-null   bool   
 5   SeniorCitizen_Yes                      7032 non-null   bool   
 6   Partner_Yes                            7032 non-null   bool   
 7   Dependents_Yes                         7032 non-null   bool   
 8   PhoneService_Yes                       7032 non-null   bool   
 9   MultipleLines_No phone service         7032 non-null   bool   
 10  MultipleLines_Yes                      7032 non-null   bool   
 11  Inte

In [24]:
df = df.drop(columns={'Unnamed: 0'},axis=1)
df.head()

Unnamed: 0,MonthlyCharges,TotalCharges,Churn,gender_Male,SeniorCitizen_Yes,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,29.85,29.85,0,False,False,True,False,False,True,False,...,False,True,False,True,False,False,False,False,False,False
1,56.95,1889.5,0,True,False,False,False,True,False,False,...,False,False,False,False,True,False,True,False,False,False
2,53.85,108.15,1,True,False,False,False,True,False,False,...,False,True,False,False,True,False,False,False,False,False
3,42.3,1840.75,0,True,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
4,70.7,151.65,1,False,False,False,False,True,False,False,...,False,True,False,True,False,False,False,False,False,False


In [25]:
X = df.drop(columns={'Churn'},axis=1)
y = df['Churn']

In [26]:
print(X.shape)
print(y.shape)

(7032, 34)
(7032,)


In [27]:
X.head()

Unnamed: 0,MonthlyCharges,TotalCharges,gender_Male,SeniorCitizen_Yes,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,...,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,29.85,29.85,False,False,True,False,False,True,False,False,...,False,True,False,True,False,False,False,False,False,False
1,56.95,1889.5,True,False,False,False,True,False,False,False,...,False,False,False,False,True,False,True,False,False,False
2,53.85,108.15,True,False,False,False,True,False,False,False,...,False,True,False,False,True,False,False,False,False,False
3,42.3,1840.75,True,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
4,70.7,151.65,False,False,False,False,True,False,False,True,...,False,True,False,True,False,False,False,False,False,False


In [28]:
y.head()

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

In [29]:
sm = SMOTETomek(sampling_strategy='minority',n_jobs=-1,random_state=42)
X_res, y_res = sm.fit_resample(X,y)



In [33]:
print(X_res.shape)
print(y_res.shape)

(9472, 34)
(9472,)


In [35]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,confusion_matrix

In [50]:
models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "XGBClassifier": XGBClassifier(), 
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier(),
}

In [51]:
def evaluate(y_test,y_pred):
    acc = accuracy_score(y_test,y_pred)
    prec = precision_score(y_test,y_pred)
    rec = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    roc_auc = roc_auc_score(y_test,y_pred)
    return acc,f1,prec,rec,roc_auc

In [52]:
def evaluate_model(X,y,models):
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
    models_list = []
    accuracy_list = []
    recall_list = []

    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train,y_train)

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        model_train_accuracy, model_train_f1,model_train_precision,model_train_recall,model_train_rocauc_score = evaluate(y_train ,y_train_pred)

        model_test_accuracy,model_test_f1,model_test_precision, model_test_recall,model_test_rocauc_score=evaluate(y_test, y_test_pred)
        
        print(list(models.keys())[i])
        models_list.append(list(models.keys())[i])

        print('Model performance for Training set')
        print("- Accuracy: {:.4f}".format(model_train_accuracy))
        print('- F1 score: {:.4f}'.format(model_train_f1)) 
        print('- Precision: {:.4f}'.format(model_train_precision))
        print('- Recall: {:.4f}'.format(model_train_recall))
        print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

        print('----------------------------------')

        print('Model performance for Test set')
        print('- Accuracy: {:.4f}'.format(model_test_accuracy))
        print('- F1 score: {:.4f}'.format(model_test_f1))
        print('- Precision: {:.4f}'.format(model_test_precision))
        print('- Recall: {:.4f}'.format(model_test_recall))
        print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
        
        accuracy_list.append(model_test_accuracy)
        recall_list.append(model_test_recall)
        print('='*35)
        print('\n')

    report = pd.DataFrame(list(zip(models_list, accuracy_list, recall_list)), columns=['Model Name', 'Accuracy','Recall']).sort_values(by=["Recall"],ascending=False)
        
    return report

In [53]:
evaluate_model(X_res,y_res,models)

Random Forest
Model performance for Training set
- Accuracy: 0.9984
- F1 score: 0.9984
- Precision: 0.9982
- Recall: 0.9987
- Roc Auc Score: 0.9984
----------------------------------
Model performance for Test set
- Accuracy: 0.8660
- F1 score: 0.8624
- Precision: 0.8441
- Recall: 0.8815
- Roc Auc Score: 0.8667


Decision Tree
Model performance for Training set
- Accuracy: 0.9984
- F1 score: 0.9984
- Precision: 0.9997
- Recall: 0.9971
- Roc Auc Score: 0.9984
----------------------------------
Model performance for Test set
- Accuracy: 0.8148
- F1 score: 0.8152
- Precision: 0.7771
- Recall: 0.8571
- Roc Auc Score: 0.8167


Gradient Boosting
Model performance for Training set
- Accuracy: 0.8649
- F1 score: 0.8686
- Precision: 0.8548
- Recall: 0.8829
- Roc Auc Score: 0.8646
----------------------------------
Model performance for Test set
- Accuracy: 0.8517
- F1 score: 0.8498
- Precision: 0.8213
- Recall: 0.8804
- Roc Auc Score: 0.8530


Logistic Regression
Model performance for Training 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


K-Neighbors Classifier
Model performance for Training set
- Accuracy: 0.8614
- F1 score: 0.8703
- Precision: 0.8264
- Recall: 0.9191
- Roc Auc Score: 0.8607
----------------------------------
Model performance for Test set
- Accuracy: 0.7984
- F1 score: 0.8041
- Precision: 0.7488
- Recall: 0.8682
- Roc Auc Score: 0.8015


XGBClassifier
Model performance for Training set
- Accuracy: 0.9586
- F1 score: 0.9594
- Precision: 0.9510
- Recall: 0.9679
- Roc Auc Score: 0.9584
----------------------------------
Model performance for Test set
- Accuracy: 0.8633
- F1 score: 0.8608
- Precision: 0.8361
- Recall: 0.8870
- Roc Auc Score: 0.8644


CatBoosting Classifier
Model performance for Training set
- Accuracy: 0.9192
- F1 score: 0.9208
- Precision: 0.9137
- Recall: 0.9280
- Roc Auc Score: 0.9191
----------------------------------
Model performance for Test set
- Accuracy: 0.8686
- F1 score: 0.8658
- Precision: 0.8435
- Recall: 0.8893
- Roc Auc Score: 0.8695






AdaBoost Classifier
Model performance for Training set
- Accuracy: 0.8349
- F1 score: 0.8407
- Precision: 0.8213
- Recall: 0.8609
- Roc Auc Score: 0.8346
----------------------------------
Model performance for Test set
- Accuracy: 0.8412
- F1 score: 0.8400
- Precision: 0.8078
- Recall: 0.8749
- Roc Auc Score: 0.8427




Unnamed: 0,Model Name,Accuracy,Recall
6,CatBoosting Classifier,0.868602,0.889258
5,XGBClassifier,0.863325,0.887043
0,Random Forest,0.865963,0.881506
2,Gradient Boosting,0.851715,0.880399
7,AdaBoost Classifier,0.841161,0.874862
4,K-Neighbors Classifier,0.798417,0.868217
1,Decision Tree,0.814776,0.857143
3,Logistic Regression,0.83219,0.822813


In [54]:
from imblearn.combine import SMOTEENN
sm = SMOTEENN(sampling_strategy='minority',random_state=42)
X_resampled1, y_resampled1 = sm.fit_resample(X,y)

In [55]:
evaluate_model(X_resampled1,y_resampled1,models)

Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.9494
- F1 score: 0.9529
- Precision: 0.9416
- Recall: 0.9645
- Roc Auc Score: 0.9485


Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.9392
- F1 score: 0.9435
- Precision: 0.9295
- Recall: 0.9580
- Roc Auc Score: 0.9379


Gradient Boosting
Model performance for Training set
- Accuracy: 0.9619
- F1 score: 0.9653
- Precision: 0.9604
- Recall: 0.9702
- Roc Auc Score: 0.9610
----------------------------------
Model performance for Test set
- Accuracy: 0.9529
- F1 score: 0.9563
- Precision: 0.9406
- Recall: 0.9725
- Roc Auc Score: 0.9516


Logistic Regression
Model performance for Training 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


K-Neighbors Classifier
Model performance for Training set
- Accuracy: 0.9713
- F1 score: 0.9740
- Precision: 0.9650
- Recall: 0.9831
- Roc Auc Score: 0.9701
----------------------------------
Model performance for Test set
- Accuracy: 0.9503
- F1 score: 0.9537
- Precision: 0.9431
- Recall: 0.9645
- Roc Auc Score: 0.9494


XGBClassifier
Model performance for Training set
- Accuracy: 0.9996
- F1 score: 0.9996
- Precision: 0.9996
- Recall: 0.9996
- Roc Auc Score: 0.9996
----------------------------------
Model performance for Test set
- Accuracy: 0.9520
- F1 score: 0.9554
- Precision: 0.9419
- Recall: 0.9693
- Roc Auc Score: 0.9509


CatBoosting Classifier
Model performance for Training set
- Accuracy: 0.9876
- F1 score: 0.9887
- Precision: 0.9852
- Recall: 0.9922
- Roc Auc Score: 0.9871
----------------------------------
Model performance for Test set
- Accuracy: 0.9572
- F1 score: 0.9600
- Precision: 0.9509
- Recall: 0.9693
- Roc Auc Score: 0.9564






AdaBoost Classifier
Model performance for Training set
- Accuracy: 0.9490
- F1 score: 0.9536
- Precision: 0.9481
- Recall: 0.9592
- Roc Auc Score: 0.9480
----------------------------------
Model performance for Test set
- Accuracy: 0.9357
- F1 score: 0.9403
- Precision: 0.9263
- Recall: 0.9548
- Roc Auc Score: 0.9345




Unnamed: 0,Model Name,Accuracy,Recall
2,Gradient Boosting,0.952871,0.972536
5,XGBClassifier,0.952014,0.969305
6,CatBoosting Classifier,0.957155,0.969305
0,Random Forest,0.949443,0.964459
4,K-Neighbors Classifier,0.9503,0.964459
1,Decision Tree,0.93916,0.957997
7,AdaBoost Classifier,0.935733,0.954766
3,Logistic Regression,0.912596,0.925687


In [56]:
gb = GradientBoostingClassifier()
X_train,X_test,y_train,y_test = train_test_split(X_resampled1,y_resampled1,test_size=0.2,random_state=42)
gb.fit(X_train,y_train)
y_pred = gb.predict(X_test)
accuracy_score(y_test,y_pred)

0.9528706083976007

In [57]:
recall_score(y_test,y_pred)

0.9725363489499192