In [1]:
# ============================================
# BITS WILP ML Assignment-2 (Classification)
# Dataset: Telco Customer Churn
# Models: LR, DT, KNN, NB, RF, XGBoost
# Metrics: Accuracy, AUC, Precision, Recall, F1, MCC
# ============================================
# ============================================================
# STEP 0: Imports + Setup
# ============================================================
import os

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score,
    f1_score, matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

import joblib

# XGBoost
from xgboost import XGBClassifier

# For Naive Bayes dense conversion (because OneHotEncoder returns sparse matrix)
import scipy.sparse as sp


In [2]:
# ============================================================
# STEP 1: Load Dataset
# ============================================================
# Path is relative to the notebook's working directory.
DATA_PATH = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Read the raw CSV; no cleaning happens in this cell.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

print(f"Shape: {df.shape}")
df.head()

Shape: (7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# ============================================================
# STEP 2: Basic EDA (shape, nulls, class balance)
# ============================================================
# Schema overview: column names and the dtype pandas inferred for each.
print("Columns:", df.columns.tolist())
print("\nData types:\n", df.dtypes)

# Per-column NaN counts, largest first (only the top 10 are printed).
null_counts = df.isna().sum().sort_values(ascending=False)
print("\nMissing values (raw):\n", null_counts.head(10))

# Class balance of the target, as absolute counts and as percentages.
churn_column = df["Churn"]
print("\nTarget (Churn) value counts:\n", churn_column.value_counts())
print("\nTarget (Churn) percentage:\n", churn_column.value_counts(normalize=True) * 100)


Columns: ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

Data types:
 customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

Missing values (raw):
 customerID         0
gender

In [4]:
# ============================================================
# STEP 3: Required Cleaning / Preprocessing Setup
# - TotalCharges safe numeric conversion (coerce blanks/spaces)
# - Drop customerID
# - y = Churn (Yes/No -> 1/0)
# ============================================================
# ---- 3.1 Clean TotalCharges: strip spaces -> numeric -> coerce invalid to NaN ----
# Blank / whitespace-only strings become NaN here; the median imputer in the
# preprocessing step is what fills them later.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"].astype(str).str.strip(), errors="coerce")

# ---- 3.2 Drop customerID ----
# errors="ignore" keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop() would raise KeyError.
df = df.drop(columns=["customerID"], errors="ignore")

# ---- 3.3 Target encoding (Yes/No -> 1/0) ----
y = df["Churn"].map({"No": 0, "Yes": 1})
if y.isna().any():
    # Any label other than "Yes"/"No" maps to NaN; fail with the offending
    # labels instead of an opaque NaN-to-int casting error.
    unexpected = sorted(df.loc[y.isna(), "Churn"].astype(str).unique())
    raise ValueError(f"Unexpected Churn labels (expected 'Yes'/'No'): {unexpected}")
y = y.astype(int)
X = df.drop(columns=["Churn"])

# ---- 3.4 Identify numeric & categorical columns ----
# Everything that is not int64/float64 is treated as categorical.
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = [c for c in X.columns if c not in numeric_features]

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)
print("Total features:", len(numeric_features) + len(categorical_features))

Numeric features: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Categorical features: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Total features: 19


In [5]:
# ============================================================
# STEP 4: Train/Test Split (with stratify)
# ============================================================
# Stratified 80/20 hold-out split; stratify=y keeps the churn ratio the same
# in both partitions, and the fixed seed makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# y is 0/1, so its mean is the churn rate.
train_churn_pct = y_train.mean()*100
test_churn_pct = y_test.mean()*100

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Train churn %:", train_churn_pct, "Test churn %:", test_churn_pct)


Train shape: (5634, 19) Test shape: (1409, 19)
Train churn %: 26.53532126375577 Test churn %: 26.54364797728886


In [14]:
# ============================================================
# Create ONE test CSV file (NOT used during training)
# - Uses X_test + y_test from train_test_split
# - Saves a single CSV you can upload directly to Streamlit
# ============================================================

# `os` comes from the notebook's import cell; it is not re-imported here.
OUT_DIR = "data_exports"
os.makedirs(OUT_DIR, exist_ok=True)

# Build the hold-out frame (these rows were NOT used for training) and add the
# target back as Yes/No labels so Streamlit can show metrics.
test_df = X_test.assign(Churn=y_test.map({0: "No", 1: "Yes"}))

# Every row must carry a label; a NaN here would mean y_test held a value
# other than 0/1 or its index did not line up with X_test.
assert test_df["Churn"].notna().all(), "test export contains rows without a Churn label"

# Save single test CSV (row index is intentionally not written).
test_csv_path = os.path.join(OUT_DIR, "telco_test_data.csv")
test_df.to_csv(test_csv_path, index=False)

print("✅ Test CSV created:", test_csv_path)
print("Rows in test file:", len(test_df))

# Preview
test_df.head()


✅ Test CSV created: data_exports/telco_test_data.csv
Rows in test file: 1409


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
437,Male,0,Yes,Yes,72,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),114.05,8468.2,No
2280,Female,1,No,No,8,Yes,Yes,Fiber optic,No,No,No,Yes,Yes,Yes,Month-to-month,Yes,Credit card (automatic),100.15,908.55,No
2235,Female,0,Yes,Yes,41,Yes,Yes,DSL,Yes,Yes,Yes,No,Yes,No,One year,Yes,Credit card (automatic),78.35,3211.2,No
4460,Male,0,Yes,No,18,Yes,No,Fiber optic,No,No,Yes,Yes,No,No,Month-to-month,No,Electronic check,78.2,1468.75,No
3761,Female,0,Yes,No,72,Yes,Yes,DSL,Yes,Yes,Yes,No,Yes,Yes,Two year,Yes,Credit card (automatic),82.65,5919.35,No


In [6]:
# ============================================================
# STEP 5: ColumnTransformer (Mandatory)
# - Numeric: impute median + StandardScaler
# - Categorical: impute most_frequent + OneHotEncoder(handle_unknown='ignore')
# ============================================================
# Numeric branch: fill gaps with the column median (covers the NaNs produced
# when TotalCharges was coerced), then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch; columns in neither list are dropped.
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_routes, remainder="drop")

preprocessor


In [7]:
# ============================================================
# STEP 6: Define a PICKLE-SAFE sparse->dense function
# GaussianNB needs dense input, and joblib cannot pickle lambdas.
# ============================================================
def sparse_to_dense(x):
    """Return a dense array for scipy sparse input; pass anything else through.

    Defined as a named module-level function (not a lambda) so that a
    FunctionTransformer wrapping it can be pickled with joblib.
    NOTE(review): pickle stores functions by reference, so whatever process
    loads the saved pipeline presumably needs a function with this same name
    importable from the same module path - confirm in the loading app.
    """
    if not sp.issparse(x):
        return x
    return x.toarray()

# Wrap the densifier as a transformer so it can be used as a Pipeline step;
# accept_sparse=True declares that sparse input is allowed.
to_dense = FunctionTransformer(func=sparse_to_dense, accept_sparse=True)


In [8]:
# ============================================================
# STEP 7: Define the candidate classifiers
# Keys are the display names reused by the pipeline, metrics and save cells.
# ============================================================
# XGBoost hyperparameters kept together so they are easy to scan and tune.
xgb_params = dict(
    n_estimators=250,
    max_depth=4,
    learning_rate=0.08,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    eval_metric="logloss",
    tree_method="hist",
)

models = {
    "Logistic Regression": LogisticRegression(max_iter=2000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "kNN": KNeighborsClassifier(n_neighbors=7),
    "Naive Bayes (Gaussian)": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    "XGBoost": XGBClassifier(**xgb_params),
}


In [9]:
# ============================================================
# STEP 8: Build pipelines
# Note: Only Naive Bayes pipeline includes to_dense step.
# ============================================================
pipelines = {}

for name, model in models.items():
    # Each pipeline gets its OWN unfitted copy of the preprocessor. Sharing one
    # instance would make every fit() re-fit (and mutate) the transformer that
    # is also embedded in all the other pipelines.
    steps = [("preprocess", clone(preprocessor))]

    # GaussianNB needs dense input, so only that pipeline gets the extra step.
    # The check on the estimator type keeps this correct if the dict key is
    # renamed; the name check preserves the original rule.
    if isinstance(model, GaussianNB) or "Naive Bayes" in name:
        steps.append(("to_dense", to_dense))

    steps.append(("model", model))
    pipelines[name] = Pipeline(steps=steps)

list(pipelines.keys())



['Logistic Regression',
 'Decision Tree',
 'kNN',
 'Naive Bayes (Gaussian)',
 'Random Forest',
 'XGBoost']

In [10]:
# ============================================================
# STEP 9: Metrics helpers (Accuracy, AUC, Precision, Recall, F1, MCC)
# ============================================================
def get_probabilities(pipeline, X_data):
    """
    Return one positive-class score per row of ``X_data`` (AUC needs scores,
    not hard labels).

    Resolution order:
      1. ``predict_proba``     -> column 1 (the positive class when the target
                                  is encoded as 0/1, as it is in this notebook).
      2. ``decision_function`` -> raw margins squashed into (0, 1) with a
                                  sigmoid; these are scores, not calibrated
                                  probabilities.
      3. ``predict``           -> hard labels cast to float (last resort).

    Parameters
    ----------
    pipeline : fitted estimator or Pipeline exposing at least one of the
        three methods above.
    X_data : features to score; passed through to the estimator unchanged.

    Returns
    -------
    numpy.ndarray of float scores, one per row.
    """
    if hasattr(pipeline, "predict_proba"):
        return pipeline.predict_proba(X_data)[:, 1]
    if hasattr(pipeline, "decision_function"):
        scores = np.asarray(pipeline.decision_function(X_data), dtype=float)
        # Overflow-free sigmoid: exp() is only ever taken of a non-positive
        # number, so large-magnitude margins cannot overflow (the naive
        # 1 / (1 + exp(-s)) overflows for very negative s).
        decay = np.exp(-np.abs(scores))
        return np.where(scores >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return pipeline.predict(X_data).astype(float)  # last resort


def evaluate_model(name, pipeline, X_test, y_test):
    """
    Score one fitted pipeline on the hold-out data.

    Hard labels are derived by thresholding the positive-class score at 0.5;
    AUC is computed on the scores themselves.

    Returns a dict with the keys "ML Model Name", "Accuracy", "AUC",
    "Precision", "Recall", "F1" and "MCC" (in that order).
    """
    positive_scores = get_probabilities(pipeline, X_test)
    hard_labels = (positive_scores >= 0.5).astype(int)

    report = {"ML Model Name": name}
    report["Accuracy"] = accuracy_score(y_test, hard_labels)
    report["AUC"] = roc_auc_score(y_test, positive_scores)

    # These three share the zero_division=0 convention: report 0 instead of
    # warning when a denominator is empty.
    thresholded_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
    for label, scorer in thresholded_scorers:
        report[label] = scorer(y_test, hard_labels, zero_division=0)

    report["MCC"] = matthews_corrcoef(y_test, hard_labels)
    return report


In [11]:
# ============================================================
# STEP 10: Train + Evaluate all models (Mandatory)
# ============================================================
trained_pipelines = {}
results = []

# Fit every pipeline on the training split and score it on the hold-out split.
for name, pipe in pipelines.items():
    print(f"Training: {name} ...")
    trained_pipelines[name] = pipe.fit(X_train, y_train)  # fit() returns the pipeline itself

    metrics = evaluate_model(name, trained_pipelines[name], X_test, y_test)
    results.append(metrics)

# One row per model, built in a single pass from the collected dicts.
metrics_df = pd.DataFrame(results)

# Clean comparison table (rounded), best AUC first.
comparison_df = metrics_df.sort_values(by="AUC", ascending=False).reset_index(drop=True)
score_columns = ["Accuracy", "AUC", "Precision", "Recall", "F1", "MCC"]
comparison_df_rounded = comparison_df.round({column: 4 for column in score_columns})

comparison_df_rounded


Training: Logistic Regression ...
Training: Decision Tree ...
Training: kNN ...
Training: Naive Bayes (Gaussian) ...
Training: Random Forest ...
Training: XGBoost ...


Unnamed: 0,ML Model Name,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.8055,0.8419,0.6572,0.5588,0.604,0.479
1,XGBoost,0.8006,0.8378,0.6525,0.5321,0.5862,0.4607
2,Random Forest,0.7793,0.8171,0.6068,0.4786,0.5351,0.3978
3,Naive Bayes (Gaussian),0.6948,0.8074,0.4589,0.8369,0.5928,0.4245
4,kNN,0.7651,0.8049,0.5556,0.5749,0.565,0.4043
5,Decision Tree,0.7282,0.6573,0.4884,0.5053,0.4967,0.3107


In [12]:
# ============================================================
# STEP 11: Auto-generate short observations per model (based on your results)
# ============================================================
def make_observation(row, best_auc, best_f1, best_mcc):
    """
    Compose a one-line textual observation for a single model's metrics row.

    Parameters
    ----------
    row : mapping-like (dict or pandas Series) with the keys "ML Model Name",
        "AUC", "F1", "MCC", "Recall" and "Precision".
    best_auc, best_f1, best_mcc : the best value of each metric across all
        models; a row whose metric equals it gets the matching "Best ..." note.

    Returns
    -------
    str : the notes joined by single spaces ("" when nothing applies).
    """
    model_name = row["ML Model Name"]
    row_auc = row["AUC"]
    row_f1 = row["F1"]
    row_mcc = row["MCC"]
    row_recall = row["Recall"]
    row_precision = row["Precision"]

    # (condition, note) pairs for the "best in table" badges, in output order.
    leaderboard_notes = (
        (row_auc == best_auc, "Best AUC (strong class separation)."),
        (row_f1 == best_f1, "Best F1 (best precision–recall balance)."),
        (row_mcc == best_mcc, "Best MCC (strong overall correlation; good for imbalance)."),
    )

    # (substring of the model name, note) pairs; matched in this order.
    family_notes = (
        ("Logistic", "Interpretable baseline; often stable."),
        ("Decision Tree", "Can overfit; sensitive to depth/splits."),
        ("kNN", "Sensitive to scaling and k; slower prediction on large data."),
        ("Naive Bayes", "Fast baseline; assumes feature independence."),
        ("Random Forest", "Robust ensemble; reduces overfitting vs single tree."),
        ("XGBoost", "Strong boosting model; often top performer with tuning."),
    )

    remarks = [text for is_best, text in leaderboard_notes if is_best]
    remarks.extend(text for keyword, text in family_notes if keyword in model_name)

    # Precision/recall trade-off; no note when the two are exactly equal.
    if row_recall > row_precision:
        remarks.append("Higher recall than precision (catches more churn but more false positives).")
    elif row_precision > row_recall:
        remarks.append("Higher precision than recall (fewer false positives but may miss churn).")

    return " ".join(remarks)


# Best value of each headline metric across all models (unrounded table).
best_auc, best_f1, best_mcc = (
    comparison_df[metric].max() for metric in ("AUC", "F1", "MCC")
)

OBSERVATION_COLUMN = "Observation about model performance"

# Row-wise apply: each row is passed to make_observation together with the
# three best values; the result is attached as a new text column on a copy.
comparison_with_obs = comparison_df.copy()
comparison_with_obs[OBSERVATION_COLUMN] = comparison_with_obs.apply(
    make_observation, axis=1, args=(best_auc, best_f1, best_mcc)
)

comparison_with_obs[["ML Model Name", OBSERVATION_COLUMN]]


Unnamed: 0,ML Model Name,Observation about model performance
0,Logistic Regression,Best AUC (strong class separation). Best F1 (b...
1,XGBoost,Strong boosting model; often top performer wit...
2,Random Forest,Robust ensemble; reduces overfitting vs single...
3,Naive Bayes (Gaussian),Fast baseline; assumes feature independence. H...
4,kNN,Sensitive to scaling and k; slower prediction ...
5,Decision Tree,Can overfit; sensitive to depth/splits. Higher...


In [15]:
# ============================================================
# STEP 12: Save all trained pipelines to the model/ directory
# ============================================================
MODEL_DIR = "model"
os.makedirs(MODEL_DIR, exist_ok=True)

# Display name -> file name on disk. Every key of trained_pipelines must
# appear here, otherwise the lookup below raises KeyError.
filename_map = dict([
    ("Logistic Regression", "logistic_regression.joblib"),
    ("Decision Tree", "decision_tree.joblib"),
    ("kNN", "knn.joblib"),
    ("Naive Bayes (Gaussian)", "naive_bayes_gaussian.joblib"),
    ("Random Forest", "random_forest.joblib"),
    ("XGBoost", "xgboost.joblib"),
])

# Persist each fitted pipeline (preprocessing + model) as one compressed file.
for name, pipe in trained_pipelines.items():
    out_path = os.path.join(MODEL_DIR, filename_map[name])
    joblib.dump(pipe, out_path, compress=3)
    print("Saved:", out_path)


Saved: model/logistic_regression.joblib
Saved: model/decision_tree.joblib
Saved: model/knn.joblib
Saved: model/naive_bayes_gaussian.joblib
Saved: model/random_forest.joblib
Saved: model/xgboost.joblib
