In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
df = pd.read_csv('../data/Telco-Customer-Churn.csv')

In [6]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
# drop the customerID 
df = df.drop(columns=['customerID'],axis=1)

In [8]:
# conver the invalid data type and remove the null values TotalCharges column

df['TotalCharges'] = pd.to_numeric(df['TotalCharges'],errors='coerce')
print(f"Null Values Count for Total Charges Column: {df['TotalCharges'].isnull().sum()}") 
df['TotalCharges'].fillna(df['TotalCharges'].median(),inplace=True)
print(f"Null Values Count for Total Charges Column: {df['TotalCharges'].isnull().sum()}") 

Null Values Count for Total Charges Column: 11
Null Values Count for Total Charges Column: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(),inplace=True)


In [9]:
df.head(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [10]:
# change Chrun column values to 0 and 1 -> because models cannot understand string values only can understand numeric values

df['Churn'] = df['Churn'].map({'Yes':1 , 'No':0})
df.head(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0


In [11]:
# seperate the independent features and dependeant features
X = df.drop(columns=['Churn'],axis=1)
y = df['Churn']

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [13]:
# separate the numerical and categorical features sepratly

num_feature =  X.select_dtypes(exclude='object').columns
cat_feature =  X.select_dtypes(include='object').columns

In [14]:
num_feature,cat_feature

(Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object'),
 Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
        'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
        'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
        'PaperlessBilling', 'PaymentMethod'],
       dtype='object'))

In [15]:
## split the dataset into train and test
from sklearn.model_selection import train_test_split


X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

X_train.shape,X_test.shape

((5634, 19), (1409, 19))

In [16]:
y_train

2142    0
1623    0
6074    1
1362    1
6754    0
       ..
3772    1
5191    0
5226    0
5390    1
860     0
Name: Churn, Length: 5634, dtype: int64

In [17]:
## independent features standeralization and encoding

from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer



In [18]:
# create pipelines
from sklearn.pipeline import Pipeline

num_pipeline = Pipeline(
    [
        ('scaling',StandardScaler())

    ]
)

cat_pipeline = Pipeline(
    [
        ('ohe',OneHotEncoder()),
        ('scaling',StandardScaler(with_mean=False))
    ]
)


preprocessor = ColumnTransformer(
    [
        ('num_pipeline',num_pipeline,num_feature),
        ('cat_pipeline',cat_pipeline,cat_feature)
    ]
)



In [19]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [20]:
X_train

array([[-4.37749204e-01, -4.65683364e-01, -4.73723375e-04, ...,
         0.00000000e+00,  0.00000000e+00,  2.38521970e+00],
       [-4.37749204e-01,  8.85536787e-01,  1.07475386e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-4.37749204e-01, -1.28460467e+00, -1.37649913e+00, ...,
         0.00000000e+00,  2.11879552e+00,  0.00000000e+00],
       ...,
       [-4.37749204e-01, -8.34197950e-01, -1.45294499e+00, ...,
         0.00000000e+00,  2.11879552e+00,  0.00000000e+00],
       [ 2.28441306e+00, -8.34197950e-01,  1.14953785e+00, ...,
         0.00000000e+00,  2.11879552e+00,  0.00000000e+00],
       [-4.37749204e-01, -2.60953038e-01, -1.49781538e+00, ...,
         2.42283052e+00,  0.00000000e+00,  0.00000000e+00]],
      shape=(5634, 45))

In [21]:
X_test

array([[-0.4377492 , -1.28460467, -1.33162874, ...,  0.        ,
         2.11879552,  0.        ],
       [-0.4377492 ,  0.35323794, -1.31667194, ...,  0.        ,
         0.        ,  0.        ],
       [-0.4377492 ,  0.80364466, -1.51277218, ...,  0.        ,
         0.        ,  2.3852197 ],
       ...,
       [-0.4377492 , -0.62946762, -1.49449165, ...,  0.        ,
         0.        ,  0.        ],
       [-0.4377492 ,  1.49972776, -0.69513389, ...,  2.42283052,
         0.        ,  0.        ],
       [-0.4377492 , -1.28460467, -1.11392424, ...,  0.        ,
         0.        ,  2.3852197 ]], shape=(1409, 45))

In [22]:
# import the machine leaning algorithms

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier


from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [23]:
# create the model

models = {
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "XGBClassifier": XGBClassifier()
}

In [24]:
def evaluate_model(true,pred) :
    clr= classification_report(true,pred)
    acc = accuracy_score(true,pred)
    cm = confusion_matrix(true,pred)

    return (
        clr,
        acc,
        cm
    )

In [25]:
model_list = []

for name, model in models.items():

    # Train the model
    model.fit(X_train, y_train)

    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluation
    clr_train, acc_train, cm_train = evaluate_model(y_train, y_train_pred)
    clr_test, acc_test, cm_test = evaluate_model(y_test, y_test_pred)

    print(name)
    model_list.append(name)

    print('Model performance for Training set')
    print("Classification Report:\n", clr_train)
    print("Accuracy Score:", acc_train)
    print("Confusion Matrix:\n", cm_train)

    print('----------------------------------')

    print('Model performance for Testing set')
    print("Classification Report:\n", clr_test)
    print("Accuracy Score:", acc_test)
    print("Confusion Matrix:\n", cm_test)
    print('\n')


DecisionTreeClassifier
Model performance for Training set
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4138
           1       1.00      1.00      1.00      1496

    accuracy                           1.00      5634
   macro avg       1.00      1.00      1.00      5634
weighted avg       1.00      1.00      1.00      5634

Accuracy Score: 0.9985800496982605
Confusion Matrix:
 [[4136    2]
 [   6 1490]]
----------------------------------
Model performance for Testing set
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.80      0.81      1036
           1       0.46      0.47      0.47       373

    accuracy                           0.72      1409
   macro avg       0.64      0.64      0.64      1409
weighted avg       0.72      0.72      0.72      1409

Accuracy Score: 0.7154009936124911
Confusion Matrix:
 [[831 205]
 [196 177]]


RandomFores