## ML_Assignment2

# Telco Customer Churn

In [1]:
# Data source: https://www.kaggle.com/datasets/blastchar/telco-customer-churn/data

## Import Required Libraries

In [2]:
# Data handling
import pandas as pd
import numpy as np
import streamlit as st
import joblib

from sklearn.metrics import confusion_matrix

# Visualization (optional but useful)
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Evaluation Metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report
)

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")


### Load the Dataset

In [3]:
# Read the Telco churn CSV (expected in the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Report the (rows, columns) dimensions.
print("Dataset Shape:", df.shape)

# Preview the leading five records as the cell output.
df.head(n=5)


Dataset Shape: (6673, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Dataset Overview & Understanding

In [4]:
# List the column labels.
print("Columns:\n", df.columns)

# Show the dtype pandas inferred for each column.
print("\nData Types:\n", df.dtypes, sep="\n")

# Count how many rows fall in each class of the target.
print("\nChurn Distribution:\n", df['Churn'].value_counts(), sep="\n")


Columns:
 Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

Data Types:

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: 

### Data Cleaning
#### Remove unnecessary columns and fix incorrect data types

In [5]:
# Drop customerID (not useful for prediction).
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df = df.drop(columns='customerID', errors='ignore')

# TotalCharges is loaded as object dtype; convert it to numeric.
# errors='coerce' turns entries that cannot be parsed into NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Fill the missing values with the column median.
# The result is assigned back rather than calling fillna(..., inplace=True)
# on the selected column: that chained in-place form is deprecated in
# pandas 2.x and does not update `df` when Copy-on-Write is enabled.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Show the remaining missing-value count per column as the cell output.
df.isnull().sum()



### Encode Categorical Variables

In [6]:
# Columns still stored with object dtype are treated as categorical.
categorical_cols = df.select_dtypes(include='object').columns

print("Categorical Columns:\n", categorical_cols)


Categorical Columns:
 Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'Churn'],
      dtype='object')


In [7]:
# le = LabelEncoder()

# for col in categorical_cols:
#     df[col] = le.fit_transform(df[col])


In [8]:
from sklearn.preprocessing import LabelEncoder

# Fit a separate LabelEncoder for every categorical column, replace the column
# with its integer codes, and keep each fitted encoder keyed by column name.
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder().fit(df[col])
    label_encoders[col] = le
    df[col] = le.transform(df[col])


In [9]:
# Display the frame after label encoding (the rendered table is truncated).
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.50,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.30,1840.75,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6668,1,0,1,1,24,1,2,0,2,0,2,2,2,2,1,1,3,84.80,1990.50,0
6669,0,0,1,1,72,1,2,1,0,2,2,0,2,2,1,1,1,103.20,7362.90,0
6670,0,0,1,1,11,0,1,0,2,0,0,0,0,0,0,1,2,29.60,346.45,0
6671,1,1,1,0,4,1,2,1,0,0,0,0,0,0,0,1,3,74.40,306.60,1


In [10]:
# Separate the prediction target (y) from the model inputs (X).
y = df['Churn']
X = df.drop(columns=['Churn'])

# Report the dimensions of both parts.
print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (6673, 19)
y shape: (6673,)


In [11]:
# Hold out 20% of the rows as unseen test data. The split is stratified on y
# and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 5338
Testing samples: 1335


### Feature Scaling

In [12]:
# scaler = StandardScaler()

# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)


In [13]:


# scaler = StandardScaler()

# # Fit ONLY on training FEATURES
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)


In [14]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test partition.
feature_scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled = (
    feature_scaler.transform(part) for part in (X_train, X_test)
)


In [15]:
# Define a shared evaluation function, since the same six metrics are required for every model

def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Features to score, in the representation the model was
            trained on.
        y_test: True labels for ``X_test``.

    Returns:
        dict: Metric name -> value, with keys in the order Accuracy, AUC,
        Precision, Recall, F1 Score, MCC.
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the score fed to ROC AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # (metric name, scoring function, predictions it consumes), in output order.
    metric_specs = (
        ("Accuracy", accuracy_score, predicted_labels),
        ("AUC", roc_auc_score, positive_scores),
        ("Precision", precision_score, predicted_labels),
        ("Recall", recall_score, predicted_labels),
        ("F1 Score", f1_score, predicted_labels),
        ("MCC", matthews_corrcoef, predicted_labels),
    )
    return {name: scorer(y_test, preds) for name, scorer, preds in metric_specs}


### Logistic Regression

In [16]:
# Train logistic regression on the standardized features and score it on the
# standardized test set.
lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

lr_metrics = evaluate_model(model=lr, X_test=X_test_scaled, y_test=y_test)
lr_metrics


{'Accuracy': 0.8112359550561797,
 'AUC': 0.8609052218133233,
 'Precision': 0.6783216783216783,
 'Recall': 0.5480225988700564,
 'F1 Score': 0.60625,
 'MCC': 0.4887102639090668}

### Decision Tree Classifier

In [17]:
# Train the decision tree on the unscaled features; the seed fixes its
# tie-breaking so results are reproducible.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

dt_metrics = evaluate_model(model=dt, X_test=X_test, y_test=y_test)
dt_metrics


{'Accuracy': 0.7438202247191011,
 'AUC': 0.6708578816726849,
 'Precision': 0.5171428571428571,
 'Recall': 0.5112994350282486,
 'F1 Score': 0.5142045454545454,
 'MCC': 0.34026502866478237}

### K-Nearest Neighbors (KNN)

In [18]:
# Train a 5-nearest-neighbours classifier on the standardized features and
# score it on the standardized test set.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

knn_metrics = evaluate_model(model=knn, X_test=X_test_scaled, y_test=y_test)
knn_metrics


{'Accuracy': 0.753558052434457,
 'AUC': 0.7752092008039762,
 'Precision': 0.5389408099688473,
 'Recall': 0.4887005649717514,
 'F1 Score': 0.5125925925925926,
 'MCC': 0.34895395721306394}

### Naive Bayes

In [19]:
# Train Gaussian Naive Bayes on the standardized features and score it on the
# standardized test set.
nb = GaussianNB().fit(X_train_scaled, y_train)

nb_metrics = evaluate_model(model=nb, X_test=X_test_scaled, y_test=y_test)
nb_metrics


{'Accuracy': 0.7655430711610487,
 'AUC': 0.8383553044569992,
 'Precision': 0.5427974947807933,
 'Recall': 0.7344632768361582,
 'F1 Score': 0.6242496998799519,
 'MCC': 0.4704795645966955}

### Random Forest

In [20]:
# Train a seeded 100-tree random forest on the unscaled features.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

rf_metrics = evaluate_model(model=rf, X_test=X_test, y_test=y_test)
rf_metrics


{'Accuracy': 0.799250936329588,
 'AUC': 0.8360674856165449,
 'Precision': 0.6666666666666666,
 'Recall': 0.4858757062146893,
 'F1 Score': 0.5620915032679739,
 'MCC': 0.4451748861681467}

### XGBoost

In [21]:
from xgboost import XGBClassifier

# Train XGBoost on the unscaled features with a fixed seed.
# `use_label_encoder` is intentionally not passed: the option is deprecated in
# XGBoost and current releases report it as an unused parameter. The target is
# already integer-encoded, so no label encoding is needed.
xgb = XGBClassifier(
    eval_metric='logloss',
    random_state=42
)

xgb.fit(X_train, y_train)

xgb_metrics = evaluate_model(xgb, X_test, y_test)
xgb_metrics


{'Accuracy': 0.8,
 'AUC': 0.8459458525544671,
 'Precision': 0.6494845360824743,
 'Recall': 0.5338983050847458,
 'F1 Score': 0.586046511627907,
 'MCC': 0.45965272386835015}

### Comparison Table

To select the best-performing model, all evaluation metrics (Accuracy, AUC, Precision, Recall, F1-score, and MCC) were normalized and combined into a single overall score. The model with the highest overall score was selected as the best model, and its confusion matrix was displayed for focused performance analysis.

In [22]:
# Assemble the comparison table: one row per model, one column per metric.
# The order of `data` and `index` must stay aligned.
results = pd.DataFrame(
    data=[lr_metrics, dt_metrics, knn_metrics, nb_metrics, rf_metrics, xgb_metrics],
    index=[
        "Logistic Regression",
        "Decision Tree",
        "KNN",
        "Naive Bayes",
        "Random Forest",
        "XGBoost",
    ],
)

results


Unnamed: 0,Accuracy,AUC,Precision,Recall,F1 Score,MCC
Logistic Regression,0.811236,0.860905,0.678322,0.548023,0.60625,0.48871
Decision Tree,0.74382,0.670858,0.517143,0.511299,0.514205,0.340265
KNN,0.753558,0.775209,0.538941,0.488701,0.512593,0.348954
Naive Bayes,0.765543,0.838355,0.542797,0.734463,0.62425,0.47048
Random Forest,0.799251,0.836067,0.666667,0.485876,0.562092,0.445175
XGBoost,0.8,0.845946,0.649485,0.533898,0.586047,0.459653


In [23]:
### Confusion Matrix (For Streamlit)

In [24]:
# cm = confusion_matrix(y_test, xgb.predict(X_test))

# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# plt.xlabel("Predicted")
# plt.ylabel("Actual")
# plt.title("Confusion Matrix - XGBoost")
# plt.show()


In [25]:
### Create an Overall Score Using ALL Metrics

In [26]:
# Compute Overall Score - the model with the highest mean normalized metric is the best model

from sklearn.preprocessing import MinMaxScaler

# Min-max normalize every metric column across the models so all metrics
# contribute on the same [0, 1] scale.
# The normalizer gets its own name instead of the generic `scaler`, so it
# cannot be confused with (or saved in place of) the feature scaler.
metric_scaler = MinMaxScaler()

# Build a new frame from the normalized values; `results` is left unmodified.
score_df = pd.DataFrame(
    metric_scaler.fit_transform(results),
    index=results.index,
    columns=results.columns,
)

# Overall score = unweighted mean of the normalized metrics for each model.
score_df['Overall_Score'] = score_df.mean(axis=1)

# The model (row label) with the highest overall score.
best_model_name = score_df['Overall_Score'].idxmax()

print("Best Model (based on all metrics):", best_model_name)

score_df


Best Model (based on all metrics): Logistic Regression


Unnamed: 0,Accuracy,AUC,Precision,Recall,F1 Score,MCC,Overall_Score
Logistic Regression,1.0,1.0,1.0,0.25,0.838795,1.0,0.848132
Decision Tree,0.0,0.0,0.0,0.102273,0.014437,0.0,0.019452
KNN,0.144444,0.549081,0.135241,0.011364,0.0,0.058533,0.149777
Naive Bayes,0.322222,0.881346,0.159169,1.0,1.0,0.877189,0.706654
Random Forest,0.822222,0.869308,0.927689,0.0,0.443312,0.706724,0.628209
XGBoost,0.833333,0.921286,0.821086,0.193182,0.657853,0.804254,0.705166


In [27]:
# Map Models to Objects
# Lookup table from model display name to its fitted estimator.
# Both lists must stay in the same order.
models = dict(
    zip(
        [
            "Logistic Regression",
            "Decision Tree",
            "KNN",
            "Naive Bayes",
            "Random Forest",
            "XGBoost",
        ],
        [lr, dt, knn, nb, rf, xgb],
    )
)


In [28]:
# Generic Confusion Matrix Code (Best Model)

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def plot_confusion_matrix_best_model(
    models,
    best_model_name,
    X_test,
    X_test_scaled,
    y_test,
    scaled_model_names=("Logistic Regression", "KNN", "Naive Bayes"),
):
    """Plot the confusion matrix of the selected model as a heatmap.

    Args:
        models: Mapping of model display name -> fitted estimator.
        best_model_name: Key in ``models`` of the model to plot.
        X_test: Unscaled test features.
        X_test_scaled: Scaled test features.
        y_test: True test labels.
        scaled_model_names: Names of the models that must be given
            ``X_test_scaled``; every other model receives ``X_test``.
            Defaults to the models this notebook trains on scaled features.

    Raises:
        KeyError: If ``best_model_name`` is not a key of ``models``.
    """
    if best_model_name not in models:
        raise KeyError(
            f"Unknown model {best_model_name!r}; available models: {list(models)}"
        )
    model = models[best_model_name]

    # Feed the model the same feature representation it was trained on.
    features = X_test_scaled if best_model_name in scaled_model_names else X_test

    cm = confusion_matrix(y_test, model.predict(features))

    # Draw on a fresh, explicit Axes so the heatmap never lands on a figure
    # that happens to be current.
    _, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(f"Confusion Matrix - Best Model ({best_model_name})")
    plt.show()


In [29]:
# Plot the confusion matrix of the best-scoring model.
plot_confusion_matrix_best_model(
    models, best_model_name, X_test, X_test_scaled, y_test
)


In [30]:
### Model Selection Explanation

In [31]:
def explain_best_model(results, score_df):
    """Build, print and return a text explanation of the selected model.

    Args:
        results: DataFrame of raw metrics indexed by model name; must contain
            the columns 'AUC', 'F1 Score' and 'MCC'.
        score_df: DataFrame indexed by model name with an 'Overall_Score'
            column; the row with the highest value is the selected model.

    Returns:
        str: The explanation text. It is returned as well as printed so that
        callers can reuse it (for example ``st.markdown(explanation)``);
        previously the function returned None.
    """
    best_model = score_df['Overall_Score'].idxmax()
    best_scores = results.loc[best_model]

    explanation = f"""
The best performing model selected for this problem is **{best_model}**.

This model was chosen based on a comprehensive evaluation using multiple performance
metrics including Accuracy, AUC, Precision, Recall, F1-score, and Matthews Correlation
Coefficient (MCC). After normalizing all metrics to a common scale and computing an
overall performance score, **{best_model}** achieved the highest combined score among
all evaluated models.

Specifically, {best_model} demonstrated strong class discrimination capability (AUC = {best_scores['AUC']:.3f}),
balanced predictive performance (F1-score = {best_scores['F1 Score']:.3f}),
and reliable performance on imbalanced data (MCC = {best_scores['MCC']:.3f}).

Therefore, this model was selected as the most robust and reliable classifier for the
Telco Customer Churn prediction task.
"""
    print(explanation)
    return explanation


In [32]:
# def explain_best_model(results, score_df):
#     best_model = score_df['Overall_Score'].idxmax()
#     best_scores = results.loc[best_model]

#     explanation = f"""
# ### 🏆 Selected Best Model: **{best_model}**

# The best performing model was selected based on a comprehensive evaluation
# using multiple performance metrics including Accuracy, AUC, Precision, Recall,
# F1-score, and Matthews Correlation Coefficient (MCC).

# All metrics were first normalized to a common scale and combined into a single
# overall performance score. **{best_model}** achieved the highest overall score
# among all evaluated models.

# Key performance highlights for this model:
# - **AUC:** {best_scores['AUC']:.3f} (strong class separation)
# - **F1-score:** {best_scores['F1 Score']:.3f} (balanced precision & recall)
# - **MCC:** {best_scores['MCC']:.3f} (robust performance on imbalanced data)

# Therefore, **{best_model}** was selected as the most reliable classifier for the
# Telco Customer Churn prediction task.
# """
#     return explanation


In [33]:
# Keep the result of explain_best_model for reuse further down.
# NOTE(review): `explanation` only holds text if explain_best_model returns the
# string; if the function only prints, this variable is None.
explanation = explain_best_model(results, score_df)



The best performing model selected for this problem is **Logistic Regression**.

This model was chosen based on a comprehensive evaluation using multiple performance
metrics including Accuracy, AUC, Precision, Recall, F1-score, and Matthews Correlation
Coefficient (MCC). After normalizing all metrics to a common scale and computing an
overall performance score, **Logistic Regression** achieved the highest combined score among
all evaluated models.

Specifically, Logistic Regression demonstrated strong class discrimination capability (AUC = 0.861),
balanced predictive performance (F1-score = 0.606),
and reliable performance on imbalanced data (MCC = 0.489).

Therefore, this model was selected as the most robust and reliable classifier for the
Telco Customer Churn prediction task.



In [34]:
# Same call as in the previous cell, without keeping the result.
explain_best_model(results, score_df)


The best performing model selected for this problem is **Logistic Regression**.

This model was chosen based on a comprehensive evaluation using multiple performance
metrics including Accuracy, AUC, Precision, Recall, F1-score, and Matthews Correlation
Coefficient (MCC). After normalizing all metrics to a common scale and computing an
overall performance score, **Logistic Regression** achieved the highest combined score among
all evaluated models.

Specifically, Logistic Regression demonstrated strong class discrimination capability (AUC = 0.861),
balanced predictive performance (F1-score = 0.606),
and reliable performance on imbalanced data (MCC = 0.489).

Therefore, this model was selected as the most robust and reliable classifier for the
Telco Customer Churn prediction task.



In [35]:
# The same explanation string can be shown on the Streamlit web page.
# NOTE(review): executed from a notebook kernel, Streamlit emits a message
# pointing at `streamlit run` (see this cell's output); this call presumably
# belongs in the Streamlit app script rather than in the notebook - confirm.
# NOTE(review): `explanation` must be a string here; verify that
# explain_best_model returns the text rather than None.
st.markdown(explanation)
    

2026-02-09 20:38:05.169 
  command:

    streamlit run C:\Users\erajedu\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [73]:
# Save Models & Preprocessing Objects
import os

import joblib

# Single place that names the output folder, so every artifact is written
# under the same directory.
MODEL_DIR = "model"
os.makedirs(MODEL_DIR, exist_ok=True)

# Column order of the training features.
feature_names = X_train.columns.tolist()

# Save ALL Models
models = {
    "Logistic Regression": lr,
    "Decision Tree": dt,
    "KNN": knn,
    "Naive Bayes": nb,
    "Random Forest": rf,
    "XGBoost": xgb
}

# File name -> object to persist with joblib.
# The scaler saved here is `feature_scaler` (the StandardScaler fitted on the
# training features), not the MinMaxScaler used to normalize the metric table.
artifacts = {
    "scaler.pkl": feature_scaler,
    "feature_names.pkl": feature_names,
    "label_encoders.pkl": label_encoders,
    "models.pkl": models,
}
for file_name, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(MODEL_DIR, file_name))

# Save Results Table to csv file
results.to_csv(os.path.join(MODEL_DIR, "model_results.csv"))



In [74]:
# Round-trip check: reload the persisted scaler and print the feature names
# recorded on it when it was fitted.
scaler = joblib.load(filename="model/scaler.pkl")

print(scaler.feature_names_in_)


['gender' 'SeniorCitizen' 'Partner' 'Dependents' 'tenure' 'PhoneService'
 'MultipleLines' 'InternetService' 'OnlineSecurity' 'OnlineBackup'
 'DeviceProtection' 'TechSupport' 'StreamingTV' 'StreamingMovies'
 'Contract' 'PaperlessBilling' 'PaymentMethod' 'MonthlyCharges'
 'TotalCharges']
