# 📊 Ensemble Learning: Bagging vs Boosting vs Stacking (Telco Churn Dataset)


This notebook demonstrates three major ensemble techniques:
- **Bagging** (Random Forest)
- **Boosting** (XGBoost)
- **Stacking** (Random Forest and SVC base learners combined by a Logistic Regression meta-learner)

We will use the Telco Customer Churn dataset, perform preprocessing, train all models, and compare their results with ROC curves, confusion matrices, and bar plots.


## Step 1: Load and Explore the Telco Churn Dataset

In [None]:

import pandas as pd

# Read the raw Telco churn export; the CSV is resolved relative to the
# notebook's working directory, so it must be present there.
csv_path = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)

# Preview the first rows to confirm the file parsed as expected.
df.head()


## Step 2: Preprocess the Data

In [None]:

# Build the cleaned frame under a new name instead of overwriting `df`.
# The raw frame stays untouched, so this cell can be re-run any number of
# times (re-running the old version failed because `customerID` was already
# dropped and `Churn` was already mapped).
df_clean = (
    df.drop(columns='customerID')  # identifier column, not a feature
    # TotalCharges is parsed as numbers here; values that cannot be parsed
    # become NaN (errors='coerce') and are removed by the dropna() below.
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    .dropna()
    # Encode target: 'Yes' -> 1 (churned), 'No' -> 0
    .assign(Churn=lambda frame: frame['Churn'].map({'Yes': 1, 'No': 0}))
)

# Any label other than 'Yes'/'No' would have been mapped to NaN silently.
assert df_clean['Churn'].notna().all(), "unexpected label in 'Churn' column"

# One-hot encode the categorical features; drop_first=True drops one level
# per category to avoid perfectly collinear dummy columns.
X = pd.get_dummies(df_clean.drop(columns='Churn'), drop_first=True)
y = df_clean['Churn']


## Step 3: Train-Test Split

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. Stratifying on the target keeps the
# churn ratio the same in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


## Step 4: Bagging - Random Forest

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Bagging baseline: a default random forest, seeded so results are repeatable.
# `fit` returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions on the held-out split (reused by later cells).
y_pred_rf = rf.predict(X_test)

# Header and report on separate lines, exactly as two consecutive prints.
print("Random Forest Report:", classification_report(y_test, y_pred_rf), sep="\n")


## Step 5: Boosting - XGBoost

In [None]:

from xgboost import XGBClassifier

# Boosting model.
# - `use_label_encoder` is no longer passed: it is deprecated and ignored by
#   recent XGBoost releases (passing it only produces a warning), and the
#   target is already encoded as integers 0/1 in the preprocessing cell.
# - `random_state` is set explicitly so the run is reproducible and consistent
#   with the seeded random forest used elsewhere in this notebook.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

# Hard class predictions on the held-out split (reused by later cells).
y_pred_xgb = xgb.predict(X_test)

print("XGBoost Report:")
print(classification_report(y_test, y_pred_xgb))


## Step 6: Stacking Ensemble

In [None]:

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Base learners for the stack.
# - The SVC is wrapped in a scaling pipeline: its default RBF kernel is
#   distance-based, so unscaled columns with large magnitudes (e.g.
#   TotalCharges) would otherwise dominate the 0/1 dummy features. Putting the
#   scaler inside the pipeline means it is fit only on the training folds
#   during stacking's internal cross-validation (no leakage).
# - probability=True enables predict_proba; that calibration step is
#   randomized, so it is seeded for reproducibility.
estimators = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('svc', make_pipeline(StandardScaler(), SVC(probability=True, random_state=42))),
]

# Meta-learner: logistic regression trained on the base learners'
# cross-validated predictions.
stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stack.fit(X_train, y_train)

# Hard class predictions on the held-out split (reused by later cells).
y_pred_stack = stack.predict(X_test)

print("Stacking Classifier Report:")
print(classification_report(y_test, y_pred_stack))


## Step 7: ROC Curves and Confusion Matrices

In [None]:

from sklearn.metrics import roc_curve, auc, ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

# Positive-class probabilities (column 1 of predict_proba) for each fitted model
rf_probs, xgb_probs, stack_probs = (
    clf.predict_proba(X_test)[:, 1] for clf in (rf, xgb, stack)
)

# ROC points per model; the thresholds array is not needed, hence `_`
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_probs)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, xgb_probs)
fpr_stack, tpr_stack, _ = roc_curve(y_test, stack_probs)

# One entry per curve: legend template plus its false/true positive rates
roc_series = [
    ('Random Forest (AUC = {:.2f})', fpr_rf, tpr_rf),
    ('XGBoost (AUC = {:.2f})', fpr_xgb, tpr_xgb),
    ('Stacking (AUC = {:.2f})', fpr_stack, tpr_stack),
]

# Draw all ROC curves on a single figure
plt.figure(figsize=(8, 6))
for legend_template, fpr, tpr in roc_series:
    plt.plot(fpr, tpr, label=legend_template.format(auc(fpr, tpr)))
# Diagonal reference line: the ROC of a random classifier
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for All Ensemble Methods')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Confusion matrices: one panel per model, built from the hard predictions
models = [
    ('Random Forest', y_pred_rf),
    ('XGBoost', y_pred_xgb),
    ('Stacking', y_pred_stack),
]
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 4))
for ax, (name, preds) in zip(axes, models):
    cm = confusion_matrix(y_test, preds)
    ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax)
    ax.set_title(f'{name} Confusion Matrix')
plt.tight_layout()
plt.show()


## Step 8: Bar Plot of F1-Scores

In [None]:

from sklearn.metrics import f1_score

# F1 on the held-out split for each model's hard predictions, keyed by the
# model's display name (insertion order fixes the bar order below).
f1_scores = {
    model_name: f1_score(y_test, predictions)
    for model_name, predictions in (
        ('Random Forest', y_pred_rf),
        ('XGBoost', y_pred_xgb),
        ('Stacking', y_pred_stack),
    )
}

# One bar per model; dict views are materialised into lists for plotting
plt.figure(figsize=(6, 4))
plt.bar(list(f1_scores), list(f1_scores.values()), color='skyblue')
plt.ylabel('F1 Score')
plt.title('F1 Score Comparison Across Ensemble Methods')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
