# 📉 Customer Churn Prediction
**Skills:** Classification · Business Impact  
**Dataset:** [Kaggle Telco Customer Churn](https://www.kaggle.com/datasets/blastchar/telco-customer-churn)  
**Goal:** Predict which customers will leave — and understand *why*.

## 0. Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    roc_curve, roc_auc_score,
    classification_report
)

import warnings
# Silence every warning category for the rest of the session to keep cell output tidy.
# NOTE(review): this also hides deprecation/FutureWarnings raised by the libraries
# used below — consider narrowing the filter when debugging.
warnings.filterwarnings('ignore')
# Apply seaborn's "whitegrid" theme to all subsequent plots.
sns.set_theme(style='whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Single random seed shared by the notebook so that runs are reproducible.
SEED = 42

## 1. Load Data

In [None]:
# One-time setup, run in a terminal (not in this notebook):
#   kaggle datasets download -d blastchar/telco-customer-churn
#   unzip telco-customer-churn.zip

# Read the extracted CSV from the working directory, report its
# (rows, columns) shape and preview the first five records.
raw_csv = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(raw_csv)
print(df.shape)
df.head(5)

## 2. EDA

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

In [None]:
# Churn class balance
#
# The class order is pinned explicitly. `value_counts()` orders by frequency and
# `countplot` orders by first appearance, so relying on either would let the
# hard-coded labels and colours silently attach to the wrong class if the data
# (or a filtered subset of it) had a different majority class or row order.
class_order  = ['No', 'Yes']            # raw values of the Churn column
class_labels = ['No Churn', 'Churn']    # display labels, aligned with class_order
class_colors = ['#4C72B0', '#DD8452']   # colours, aligned with class_order

# Counts per class in the pinned order; a class absent from the data counts as 0.
churn_counts = df['Churn'].value_counts().reindex(class_order, fill_value=0)

fig, axes = plt.subplots(1, 2, figsize=(11, 4))

# Left panel: share of each class.
axes[0].pie(churn_counts, labels=class_labels,
            autopct='%1.1f%%', startangle=90,
            colors=class_colors)
axes[0].set_title('Churn Distribution')

# Right panel: absolute counts, with bars in the same order and colours as the pie.
sns.countplot(data=df, x='Churn', order=class_order,
              palette=class_colors, ax=axes[1])
axes[1].set_title('Churn Count')

plt.tight_layout()
plt.show()

In [None]:
# Numeric feature distributions by churn
# Coerce TotalCharges to numeric; entries that cannot be parsed become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# One panel per numeric feature, each holding one density curve per Churn value.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))
for idx, col in enumerate(num_cols):
    panel = axes[idx]
    per_class = df.groupby('Churn')[col]
    per_class.plot(kind='kde', ax=panel, legend=True)
    panel.set(title=col, xlabel=col)
plt.tight_layout()
plt.show()

In [None]:
# Churn rate by contract type
# Percentage of rows with Churn == 'Yes' within each contract type,
# ordered from the highest rate to the lowest.
is_churned = df['Churn'] == 'Yes'
contract_churn = (
    is_churned.groupby(df['Contract'])
    .mean()
    .mul(100)
    .sort_values(ascending=False)
)

bar_ax = contract_churn.plot(kind='bar', color='#DD8452', figsize=(7, 4))
bar_ax.set_title('Churn Rate by Contract Type (%)')
bar_ax.set_ylabel('Churn Rate (%)')
plt.xticks(rotation=0)  # keep the contract names horizontal
plt.tight_layout()
plt.show()

## 3. Preprocessing

In [None]:
# Work on a copy of the raw frame without the customer identifier column.
data = df.drop(columns=['customerID']).copy()

# Coerce TotalCharges to numeric (unparseable entries become NaN), then impute
# the gaps with the column median. The filled column is assigned back explicitly:
# `data['TotalCharges'].fillna(..., inplace=True)` acts on a temporary column
# selection and leaves `data` unchanged under pandas Copy-on-Write, which would
# let NaNs flow into the scaler and the models.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'] = data['TotalCharges'].fillna(data['TotalCharges'].median())

# Label encode all categoricals.
# Every non-numeric column is selected (rather than only `object` dtype) so that
# categoricals stored with a dedicated string dtype are encoded as well.
# One encoder is kept per column in `encoders`, so the integer codes can be
# mapped back to the original labels (e.g. encoders['Contract'].classes_).
# NOTE(review): integer codes impose an arbitrary order on multi-valued
# categoricals such as Contract; one-hot encoding may suit the linear model better.
encoders = {}
for col in data.select_dtypes(exclude='number').columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    encoders[col] = le

# Features / target. LabelEncoder assigns codes in sorted label order, so a
# 'No'/'Yes' target becomes 0/1.
X = data.drop(columns=['Churn'])
y = data['Churn']

# Stratified 80/20 split keeps the churn rate the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Fit the scaler on the training partition only, then apply it to both.
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc  = scaler.transform(X_test)

print(f'Train: {X_train_sc.shape} | Test: {X_test_sc.shape}')
print(f'Churn rate in test set: {y_test.mean():.2%}')

## 4. Train Models

In [None]:
# Candidate classifiers keyed by display name; all share SEED for reproducibility.
models = {
    'Logistic Regression' : LogisticRegression(max_iter=1000, random_state=SEED),
    'Random Forest'       : RandomForestClassifier(n_estimators=100, random_state=SEED),
    'Gradient Boosting'   : GradientBoostingClassifier(n_estimators=200, random_state=SEED),
}

# Fit every model on the scaled training data and keep, per model:
#   'model' - the fitted estimator
#   'preds' - hard class predictions on the test set
#   'proba' - predicted probability of the positive class (column 1)
#   'auc'   - ROC AUC on the test set
results = {}
for name, model in models.items():
    model.fit(X_train_sc, y_train)
    proba = model.predict_proba(X_test_sc)[:, 1]
    auc = roc_auc_score(y_test, proba)
    results[name] = {
        'model' : model,
        'preds' : model.predict(X_test_sc),
        'proba' : proba,
        'auc'   : auc,
    }
    # Header line uses real box-drawing characters (the previous literal was
    # mis-encoded and printed as garbage).
    print(f'\n── {name}  (AUC: {auc:.3f}) ──')
    print(classification_report(y_test, results[name]['preds'],
                                 target_names=['No Churn', 'Churn']))

## 5. Confusion Matrix

In [None]:
# One confusion matrix per trained model, drawn side by side on the test set.
class_names = ['No Churn', 'Churn']
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))

for (model_name, outcome), panel in zip(results.items(), axes):
    matrix = confusion_matrix(y_test, outcome['preds'])
    ConfusionMatrixDisplay(matrix, display_labels=class_names).plot(
        ax=panel, colorbar=False, cmap='Blues'
    )
    panel.set_title(model_name)

# y > 1 lifts the shared title clear of the per-panel titles.
fig.suptitle('Confusion Matrices', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## 6. ROC Curve

In [None]:
# ROC curves of all trained models on the test set, overlaid on one figure.
plt.figure(figsize=(8, 5))

for name, res in results.items():
    # Curve points and area are both derived from the positive-class probabilities.
    fpr, tpr, _ = roc_curve(y_test, res['proba'])
    auc = roc_auc_score(y_test, res['proba'])
    plt.plot(fpr, tpr, label=f'{name}  (AUC = {auc:.3f})', linewidth=2)

# Diagonal = performance of a random classifier; the area beneath it is
# shaded faintly as a visual reference.
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')
plt.fill_between([0, 1], [0, 1], alpha=0.05, color='grey')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Title uses a real em dash (the previous literal was mis-encoded).
plt.title('ROC Curve — All Models')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

## 7. Feature Importance

In [None]:
# Rank the features by the fitted Gradient Boosting model's importances.
gb_model = results['Gradient Boosting']['model']

importances = gb_model.feature_importances_
feat_imp = pd.Series(importances, index=X.columns).sort_values(ascending=False)

# Horizontal bar chart of the 15 highest-ranked features.
fig, imp_ax = plt.subplots(figsize=(9, 6))
top_features = feat_imp.head(15)
top_features.plot(kind='barh', color='#DD8452', ax=imp_ax)
imp_ax.set_title('Top 15 Feature Importances (Gradient Boosting)')
imp_ax.set_xlabel('Importance')
imp_ax.invert_yaxis()  # put the most important feature at the top
plt.tight_layout()
plt.show()

## 8. 💼 Business Impact Analysis

In [None]:
# ── Assumptions (tweak to your business context) ──────────────────────────────
AVG_MONTHLY_REVENUE  = 65    # $ per customer per month
AVG_TENURE_REMAINING = 12    # months a retained customer would have stayed
RETENTION_COST       = 50    # $ per customer outreach (discount / support call)
RETENTION_SUCCESS    = 0.30  # 30% of flagged churners successfully retained

# Test-set outcome counts for the Gradient Boosting predictions.
# Rows of the matrix are the true class, columns the predicted class.
best_preds = results['Gradient Boosting']['preds']
(TN, FP), (FN, TP) = confusion_matrix(y_test, best_preds)

# Only correctly flagged churners can be retained, but every flagged customer
# (true or false alarm) incurs the outreach cost.
flagged_customers = TP + FP
revenue_saved     = TP * RETENTION_SUCCESS * AVG_MONTHLY_REVENUE * AVG_TENURE_REMAINING
intervention_cost = flagged_customers * RETENTION_COST
net_value         = revenue_saved - intervention_cost

heavy_rule = '=' * 47
light_rule = '-' * 47
summary_lines = [
    heavy_rule,
    '          BUSINESS IMPACT SUMMARY',
    heavy_rule,
    f'  True Positives  (caught churners)  : {TP}',
    f'  False Positives (false alarms)     : {FP}',
    f'  Missed churners (False Negatives)  : {FN}',
    light_rule,
    f'  Estimated revenue saved            : ${revenue_saved:>8,.0f}',
    f'  Intervention cost                  : ${intervention_cost:>8,.0f}',
    f'  Net value of model                 : ${net_value:>8,.0f}',
    heavy_rule,
]
print('\n'.join(summary_lines))

## 9. Summary

| Model | AUC | Notes |
|---|---|---|
| Logistic Regression | ~0.84 | Good baseline, fast |
| Random Forest | ~0.83 | High precision, lower recall |
| **Gradient Boosting** | **~0.85** | Best overall, used for business calc |

**Top churn drivers:** `tenure`, `MonthlyCharges`, `TotalCharges`, `Contract`, `InternetService`

**Business takeaway:** Month-to-month customers with high monthly charges and short tenure are highest risk. A targeted retention campaign on model-flagged customers provides strong ROI over random outreach.