In [1]:
# importing libraries
!pip install scikit-learn xgboost joblib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
import os
import joblib
import json

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score,
    f1_score, matthews_corrcoef, confusion_matrix, classification_report
)
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Create the output directory up front: later cells write the default test CSV,
# the trained models, the fitted preprocessor and metrics.json into "model/".
# exist_ok=True makes this a no-op when the directory is already present.
os.makedirs("model", exist_ok=True)



In [2]:
# Load the Telco churn data. A local copy is used when present; otherwise the
# CSV is fetched from GitHub once, stored next to the notebook, and then read.
csv_path = "telco-customer-churn.csv"

if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
    print("Loaded dataset from local file.")
else:
    print("Local dataset not found. Downloading...")
    url = "https://raw.githubusercontent.com/SohelRaja/Customer-Churn-Analysis/master/Datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv"
    urllib.request.urlretrieve(url, csv_path)
    df = pd.read_csv(csv_path)
    print("Dataset downloaded successfully!")

df.head()

Local dataset not found. Attempting to download...
Dataset downloaded successfully!
Dataset preview:


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Clean the raw frame. Every step is written so that re-running this cell on an
# already-cleaned `df` is a no-op instead of raising or corrupting the target.

# Convert TotalCharges to numeric; values that cannot be parsed become NaN.
# `assign` returns a new frame, so the frame produced by the load cell is not
# mutated in place.
df = df.assign(TotalCharges=pd.to_numeric(df["TotalCharges"], errors="coerce"))

# Drop rows with any missing value (this includes the NaNs coerced above).
df = df.dropna()

# Drop the customerID column; errors="ignore" tolerates a second run where the
# column has already been removed.
df = df.drop(columns="customerID", errors="ignore")

# Encode the target as 1 = "Yes", 0 = "No". Skipped when Churn is already
# numeric: re-mapping an encoded column would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df["Churn"]):
    df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

print("Dataset shape after cleaning:", df.shape)
df.head()

Dataset shape after cleaning: (7032, 20)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [4]:
# ------------------------------------------------------------
# Split Features (X) and Target (y)
# ------------------------------------------------------------
#   - X holds every input column (everything except "Churn").
#   - y holds the target column "Churn" (1 = Yes, 0 = No).
#
# The feature columns are then partitioned into numeric and
# non-numeric groups for the preprocessing step. The two groups
# are complementary ("number" vs. everything else), so every
# column of X belongs to exactly one of them. Selecting only
# specific dtypes (int64/float64/object) would leave a column of
# any other dtype (e.g. bool, category, int32) in neither group,
# and ColumnTransformer drops unlisted columns by default.
# ------------------------------------------------------------

X = df.drop(columns="Churn")
y = df["Churn"]

numeric_cols = X.select_dtypes(include="number").columns
categorical_cols = X.select_dtypes(exclude="number").columns

print("Numeric columns:", numeric_cols.tolist())
print("Categorical columns:", categorical_cols.tolist())

Numeric columns: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Categorical columns: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [5]:
# ------------------------------------------------------------
# Preprocessing Pipeline
# ------------------------------------------------------------
# The classifiers need purely numerical input, so one
# ColumnTransformer applies, in a single step:
#
# 1. StandardScaler  → numeric columns
#    - Rescales each column to mean 0 / standard deviation 1,
#      which helps scale-sensitive models (Logistic Regression,
#      KNN).
#
# 2. OneHotEncoder   → categorical columns
#    - Turns each category (e.g. 'Yes'/'No', 'DSL') into 0/1
#      indicator columns.
#    - handle_unknown='ignore' encodes a category that was not
#      seen during fit as all zeros instead of raising.
#
# sparse_threshold=0 forces a dense array as output. With the
# default (0.3) the result is sparse or dense depending on how
# dense the encoded data happens to be, and GaussianNB cannot be
# fitted on a sparse matrix.
# ------------------------------------------------------------

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ],
    sparse_threshold=0,
)

print("Preprocessing pipeline ready.")

Preprocessing pipeline created successfully.


In [6]:
# ------------------------------------------------------------
# Train–Test Split
# ------------------------------------------------------------
# Hold out part of the data for evaluation:
#   - test_size=0.20  → 80% of rows train the models, 20% score them.
#   - random_state=42 → the same rows land in each split on every run.
#   - stratify=y      → both splits keep the Churn / No-Churn ratio
#                       of the full dataset (matters for imbalanced
#                       targets).
# ------------------------------------------------------------

split_parts = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Write the held-out features (without the index) as the default test set
# for Streamlit.
X_test.to_csv("model/test_default.csv", index=False)
print("Saved test_default.csv")

# Report the size of each split.
for caption, part in (
    ("Training features shape:", X_train),
    ("Testing features shape:", X_test),
    ("Training labels shape:", y_train),
    ("Testing labels shape:", y_test),
):
    print(caption, part.shape)

Default test dataset saved to: model/test_default.csv
Training features shape: (5625, 19)
Testing features shape: (1407, 19)
Training labels shape: (5625,)
Testing labels shape: (1407,)


In [7]:
import json
import os

# ------------------------------------------------------------
# Universal function to train, evaluate, save, and print metrics
# ------------------------------------------------------------
def train_evaluate_save(model_name: str, model_obj) -> dict:
    """Train one classifier, evaluate it on the test split, and persist it.

    Relies on the notebook-level globals ``preprocessor``, ``X_train``,
    ``X_test``, ``y_train`` and ``y_test``.

    Parameters
    ----------
    model_name : str
        Human-readable model name. Used in the printed report and, lower-cased
        with spaces replaced by underscores, as the pickle file name under
        ``model/``.
    model_obj
        Unfitted estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        It is fitted in place.

    Returns
    -------
    dict
        Keys ``accuracy``, ``auc``, ``precision``, ``recall``, ``f1``, ``mcc``
        (floats), ``confusion_matrix`` (nested list) and
        ``classification_report`` (dict as produced by scikit-learn).

    Side effects
    ------------
    Re-fits the global ``preprocessor`` on ``X_train`` and writes
    ``model/<name>.pkl`` and ``model/preprocessor.pkl``.
    """

    # 1. Fit preprocessing on the training data only, then apply it to both
    #    splits so no test-set statistics leak into the transformation.
    X_train_trans = preprocessor.fit_transform(X_train)
    X_test_trans = preprocessor.transform(X_test)

    # 2. Train model
    model_obj.fit(X_train_trans, y_train)

    # 3. Predictions: hard labels for the threshold metrics, and the
    #    probability of the second class (column 1) for AUC.
    y_pred = model_obj.predict(X_test_trans)
    y_proba = model_obj.predict_proba(X_test_trans)[:, 1]

    # 4. Metrics (cast to built-in types so the dict can be dumped as JSON)
    metrics = {
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "auc": float(roc_auc_score(y_test, y_proba)),
        "precision": float(precision_score(y_test, y_pred)),
        "recall": float(recall_score(y_test, y_pred)),
        "f1": float(f1_score(y_test, y_pred)),
        "mcc": float(matthews_corrcoef(y_test, y_pred)),
        "confusion_matrix": confusion_matrix(y_test, y_pred).tolist(),
        "classification_report": classification_report(y_test, y_pred, output_dict=True)
    }

    # Print metrics in console. The values are read from `metrics`; there are
    # no standalone `accuracy`, `auc`, ... variables in this scope.
    print("--------------------------------------------------")
    print(f"{model_name} - Evaluation Metrics")
    print("--------------------------------------------------")
    print(f"Accuracy:  {metrics['accuracy']:.4f}")
    print(f"AUC:       {metrics['auc']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall:    {metrics['recall']:.4f}")
    print(f"F1 Score:  {metrics['f1']:.4f}")
    print(f"MCC:       {metrics['mcc']:.4f}")

    # 5. Save model (compressed, small)
    filename = f"model/{model_name.lower().replace(' ', '_')}.pkl"
    joblib.dump(model_obj, filename, compress=3)
    print(f"Saved {filename}")

    # 6. Save the fitted preprocessor. This runs on every call and overwrites
    #    the previous file; the preprocessor is re-fitted on X_train each time,
    #    so the saved object matches the one this model was trained with.
    joblib.dump(preprocessor, "model/preprocessor.pkl", compress=3)

    return metrics

In [8]:
# ------------------------------------------------------------
# Model zoo and training loop
# ------------------------------------------------------------
# Six classifiers are trained and evaluated with the same
# helper, train_evaluate_save. Hyper-parameters of the two
# ensemble models are kept in dicts so they are easy to scan
# and tweak.
# ------------------------------------------------------------

random_forest_params = dict(
    n_estimators=100,
    max_depth=8,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=42,
)

xgboost_params = dict(
    n_estimators=80,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.7,
    tree_method="approx",
    eval_metric="logloss",
    random_state=42,
)

# Display name -> unfitted estimator. Insertion order is the training order.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(max_depth=6, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(**random_forest_params),
    "XGBoost": XGBClassifier(**xgboost_params),
}

# Display name -> metrics dict returned by train_evaluate_save.
all_metrics = {}

for name, model in models.items():
    print("\nTraining:", name)
    all_metrics[name] = train_evaluate_save(name, model)

print("\nDONE TRAINING ALL MODELS")

--------------------------------------------------
Logistic Regression - Evaluation Metrics
--------------------------------------------------
Accuracy:  0.8038
AUC:       0.8359
Precision: 0.6485
Recall:    0.5722
F1 Score:  0.6080
MCC:       0.4795
--------------------------------------------------
Decision Tree - Evaluation Metrics
--------------------------------------------------
Accuracy:  0.7306
AUC:       0.6565
Precision: 0.4934
Recall:    0.4973
F1 Score:  0.4953
MCC:       0.3116
--------------------------------------------------
KNN - Evaluation Metrics
--------------------------------------------------
Accuracy:  0.7605
AUC:       0.7796
Precision: 0.5468
Recall:    0.5775
F1 Score:  0.5618
MCC:       0.3974
--------------------------------------------------
Naive Bayes - Evaluation Metrics
--------------------------------------------------
Accuracy:  0.6823
AUC:       0.8049
Precision: 0.4472
Recall:    0.8262
F1 Score:  0.5803
MCC:       0.4033
--------------------------

In [9]:
# ------------------------------------------------------------
# Persist the collected metrics of every model as JSON
# ------------------------------------------------------------

# Make sure the target directory is there (no-op if it already exists).
os.makedirs("model", exist_ok=True)

# Pretty-print with a 4-space indent so the file stays human-readable.
with open("model/metrics.json", mode="w") as metrics_file:
    json.dump(all_metrics, fp=metrics_file, indent=4)

print("Saved model/metrics.json successfully!")


All model metrics have been saved to model/metrics.json
