In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Telco customer-churn CSV (relative path, resolved against the
# current working directory) into a DataFrame.
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
# Preview the first rows as this cell's displayed output.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Converting 'TotalCharges' to numeric.
# errors='coerce' turns every value that cannot be parsed as a number into NaN
# instead of raising, so such entries show up as missing values afterwards.

df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [5]:
# Count the missing (NaN) entries in each column

df.isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [6]:
# Fill the missing 'TotalCharges' values with 0.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: the in-place form is a
# chained assignment, which raises a FutureWarning on pandas 2.x and does not
# modify `df` at all once Copy-on-Write is active.

df['TotalCharges'] = df['TotalCharges'].fillna(0)

In [8]:
# Sanity check: number of NaN values still present in 'TotalCharges'
df['TotalCharges'].isna().sum()

0

In [9]:
# Convert 'Churn' to binary: 1 for 'Yes', 0 for every other value.
# A value that is already 1 is also kept as 1, which makes this cell safe to
# re-run: comparing against 'Yes' alone would turn every previously encoded
# label into 0 on a second execution.

df['Churn'] = df['Churn'].isin(['Yes', 1]).astype('int64')

In [11]:
# Train-Test Split

from sklearn.model_selection import train_test_split

# Separate the target column from the predictor columns
y = df['Churn']
X = df.drop('Churn', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [12]:
# Feature Engineering

# Columns to be one-hot encoded. The order is significant: it determines the
# column order of the encoded feature matrix.
categorical = [
    # customer profile
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    # subscribed services
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    # contract and billing
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# Columns to be standardised.
numerical = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]

In [15]:
# Scaling Numerical Features

from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training rows only and then applied unchanged
# to the test rows. Each result is wrapped straight back into a DataFrame
# that keeps the original column names and row index.

scaler = StandardScaler()

X_train_numerical = pd.DataFrame(
    scaler.fit_transform(X_train[numerical]),
    columns=numerical,
    index=X_train.index,
)

X_test_numerical = pd.DataFrame(
    scaler.transform(X_test[numerical]),
    columns=numerical,
    index=X_test.index,
)

In [20]:
# One-Hot Encoding Categorical Features

from sklearn.preprocessing import OneHotEncoder

# Dense output; the first level of every feature is dropped to avoid
# multicollinearity between the indicator columns.
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Fit on the training rows and wrap the encoded array back into a DataFrame
# carrying the generated indicator names and the original row index.
X_train_categorical = pd.DataFrame(
    encoder.fit_transform(X_train[categorical]),
    columns=encoder.get_feature_names_out(categorical),
    index=X_train.index,
)

# Apply the already-fitted encoder to the test rows.
X_test_categorical = pd.DataFrame(
    encoder.transform(X_test[categorical]),
    columns=encoder.get_feature_names_out(categorical),
    index=X_test.index,
)

In [21]:
# Place the scaled numerical columns and the one-hot encoded categorical
# columns side by side (numerical first) to form the model inputs.

X_train_processed = pd.concat(
    objs=[X_train_numerical, X_train_categorical],
    axis='columns',
)
X_test_processed = pd.concat(
    objs=[X_test_numerical, X_test_categorical],
    axis='columns',
)

In [22]:
# Model Training and Evaluation

# Fit a RandomForestClassifier and an ExtraTreesClassifier on the processed
# training features, then predict labels for the processed test features.

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Random Forest (fit() returns the fitted estimator itself)

rf = RandomForestClassifier(random_state=1).fit(X_train_processed, y_train)
rf_pred = rf.predict(X_test_processed)

# Extra Trees

et = ExtraTreesClassifier(random_state=1).fit(X_train_processed, y_train)
et_pred = et.predict(X_test_processed)


In [26]:
# Evaluate Random Forest on the held-out test set:
# overall accuracy, confusion matrix, and per-class precision/recall/F1.

print("Random Forest Classifier:")
print("Accuracy:", accuracy_score(y_test, rf_pred))
print("Confusion matrix:")
print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))

Random Forest Classifier:
Accuracy 0.8005677785663591
consfusion matrix:
[[936 125]
 [156 192]]
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1061
           1       0.61      0.55      0.58       348

    accuracy                           0.80      1409
   macro avg       0.73      0.72      0.72      1409
weighted avg       0.80      0.80      0.80      1409



In [27]:
# Evaluate Extra Trees on the held-out test set:
# overall accuracy, confusion matrix, and per-class precision/recall/F1.

print("Extra Trees Classifier:")
print("Accuracy:", accuracy_score(y_test, et_pred))
print("Confusion matrix:")
print(confusion_matrix(y_test, et_pred))
print(classification_report(y_test, et_pred))

Extra Trees Classifier:
Accuracy: 0.7828246983676366
consfusion matrix:
[[926 135]
 [171 177]]
              precision    recall  f1-score   support

           0       0.84      0.87      0.86      1061
           1       0.57      0.51      0.54       348

    accuracy                           0.78      1409
   macro avg       0.71      0.69      0.70      1409
weighted avg       0.78      0.78      0.78      1409



In [30]:
! pip install xgboost
! pip install lightbgm

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/99.8 MB 3.6 MB/s eta 0:00:28
   ---------------------------------------- 0.4/99.8 MB 4.9 MB/s eta 0:00:21
    --------------------------------------- 1.5/99.8 MB 10.5 MB/s eta 0:00:10
    --------------------------------------- 2.3/99.8 MB 12.2 MB/s eta 0:00:08
   - -------------------------------------- 3.1/99.8 MB 13.1 MB/s eta 0:00:08
   - -------------------------------------- 3.9/99.8 MB 13.9 MB/s eta 0:00:07
   - -------------------------------------- 4.9/99.8 MB 14.8 MB/s eta 0:00:07
   -- ------------------------------------- 5.9/99.8 MB 15.7 MB/s eta 0:00:06
   -- ------------------------------------- 6.9/99.8 MB 16.3 MB/s eta 0:00:06
   --- ------------------------------------ 8.0/99.8 MB 17.0 MB/s eta 0:00:06
   --

ERROR: Could not find a version that satisfies the requirement lightbgm (from versions: none)
ERROR: No matching distribution found for lightbgm


In [31]:
# Train XGBoost and LightGBM Models
# Fit an XGBClassifier and an LGBMClassifier on the processed training
# features, then predict labels for the processed test features.

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Train XGBoost
# `use_label_encoder` is intentionally not passed: the option was deprecated
# in XGBoost 1.x and is no longer a recognised parameter in 2.x, where
# supplying it only produces an "unused parameter" warning.
xgb = XGBClassifier(random_state=1, eval_metric='logloss')
xgb.fit(X_train_processed, y_train)
xgb_pred = xgb.predict(X_test_processed)

# Train LightGBM
lgb = LGBMClassifier(random_state=1)
lgb.fit(X_train_processed, y_train)
lgb_pred = lgb.predict(X_test_processed)

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002565 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 638
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [32]:
# Report XGBoost results on the test set: accuracy, confusion matrix and
# the per-class classification report, one item per output line.
print(
    "XGBoost Classifier:",
    "Accuracy: " + str(accuracy_score(y_test, xgb_pred)),
    confusion_matrix(y_test, xgb_pred),
    classification_report(y_test, xgb_pred),
    sep="\n",
)

XGBoost Classifier:
Accuracy: 0.7991483321504613
[[929 132]
 [151 197]]
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1061
           1       0.60      0.57      0.58       348

    accuracy                           0.80      1409
   macro avg       0.73      0.72      0.72      1409
weighted avg       0.80      0.80      0.80      1409



In [33]:
# Report LightGBM results on the test set: accuracy, confusion matrix and
# the per-class classification report, one item per output line.
print(
    "LightGBM Classifier:",
    "Accuracy: " + str(accuracy_score(y_test, lgb_pred)),
    confusion_matrix(y_test, lgb_pred),
    classification_report(y_test, lgb_pred),
    sep="\n",
)

LightGBM Classifier:
Accuracy: 0.8147622427253371
[[948 113]
 [148 200]]
              precision    recall  f1-score   support

           0       0.86      0.89      0.88      1061
           1       0.64      0.57      0.61       348

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.81      0.81      0.81      1409

