# Handling Imbalanced Data

This notebook demonstrates:
- Why accuracy fails for imbalanced datasets
- PR curves vs ROC curves
- SMOTE and sampling techniques
- Cost-sensitive learning (class_weight)
- Threshold optimization
- Balanced ensemble methods

**Requirements**: `pip install imbalanced-learn` (in addition to numpy, pandas, matplotlib, and scikit-learn, which the cells below import)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    classification_report, f1_score, precision_score, recall_score,
    precision_recall_curve, roc_curve, auc, average_precision_score,
    matthews_corrcoef, ConfusionMatrixDisplay, make_scorer
)

# Create a synthetic binary dataset with a ~95/5 class ratio; flip_y=0.01
# randomly reassigns about 1% of the labels as noise.
X, y = make_classification(
    n_samples=5000, n_features=20, n_informative=10,
    weights=[0.95, 0.05], random_state=42, flip_y=0.01
)
# Stratify the split: with so few positives, an unstratified split lets the
# minority share differ between train and test purely by chance, which makes
# every metric computed below noisier than it needs to be.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
print(f'Train: {Counter(y_train)}, Test: {Counter(y_test)}')

## 1. Baseline and Why Accuracy Fails

In [None]:
# Reference model: a random forest trained with no imbalance handling at all.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(X_train, y_train)

# Positive-class probabilities feed the ranking metrics; hard labels feed the
# threshold-dependent ones.
y_proba = rf.predict_proba(X_test)[:, 1]
y_pred = rf.predict(X_test)

baseline_report = classification_report(y_test, y_pred, target_names=['Negative', 'Positive'])
baseline_mcc = matthews_corrcoef(y_test, y_pred)
baseline_pr_auc = average_precision_score(y_test, y_proba)

print('Baseline (no imbalance handling):')
print(baseline_report)
print(f'MCC: {baseline_mcc:.4f}')
print(f'PR-AUC: {baseline_pr_auc:.4f}')

## 2. PR Curve vs ROC Curve

In [None]:
# Side-by-side ROC and precision-recall curves for the baseline model's
# positive-class probabilities (`y_proba`) on the test set.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: ROC curve with the chance diagonal for reference.
fpr, tpr, _ = roc_curve(y_test, y_proba)
axes[0].plot(fpr, tpr, 'b-', lw=2, label=f'ROC AUC={auc(fpr,tpr):.3f}')
axes[0].plot([0,1],[0,1],'k--',alpha=0.5)
axes[0].set_xlabel('FPR'); axes[0].set_ylabel('TPR')
axes[0].set_title('ROC Curve'); axes[0].legend()

# Right panel: PR curve. Summarize it with average precision rather than
# auc(rec, prec): the trapezoidal rule linearly interpolates between PR points
# and can be too optimistic, and average precision is the same "PR-AUC" number
# reported everywhere else in this notebook.
prec, rec, _ = precision_recall_curve(y_test, y_proba)
pr_auc = average_precision_score(y_test, y_proba)
axes[1].plot(rec, prec, 'r-', lw=2, label=f'PR AUC={pr_auc:.3f}')
# A no-skill classifier's precision equals the positive-class prevalence.
prevalence = y_test.mean()
axes[1].axhline(prevalence, color='k', ls='--', alpha=0.5, label=f'Prevalence={prevalence:.3f}')
axes[1].set_xlabel('Recall'); axes[1].set_ylabel('Precision')
axes[1].set_title('PR Curve'); axes[1].legend()

plt.tight_layout()
plt.show()

## 3. Sampling Techniques

In [None]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline as ImbPipeline

# Resamplers to compare; None means the classifier sees the data unchanged.
strategies = {
    'No Sampling': None,
    'Random Over': RandomOverSampler(random_state=42),
    'Random Under': RandomUnderSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42),
    'Borderline-SMOTE': BorderlineSMOTE(random_state=42),
    'SMOTE-ENN': SMOTEENN(random_state=42),
}

# One splitter for every strategy: the integer seed makes each split() call
# yield the same folds, so all strategies are scored on identical partitions.
cv = StratifiedKFold(5, shuffle=True, random_state=42)
scoring = {'f1': 'f1', 'pr_auc': 'average_precision'}

results = {}
for name, sampler in strategies.items():
    forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    # The sampler sits inside the pipeline so it is applied to the training
    # folds only during cross-validation.
    estimator = forest if sampler is None else ImbPipeline(
        steps=[('sampler', sampler), ('clf', forest)]
    )
    cv_scores = cross_validate(estimator, X_train, y_train, cv=cv,
                               scoring=scoring, n_jobs=-1)
    results[name] = {
        'F1': np.mean(cv_scores['test_f1']),
        'PR-AUC': np.mean(cv_scores['test_pr_auc']),
    }

pd.DataFrame.from_dict(results, orient='index').sort_values('F1', ascending=False).round(4)

## 4. Cost-Sensitive Learning

In [None]:
# Cost-sensitive variant: class_weight='balanced' reweights the classes
# instead of resampling the training data.
rf_balanced = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)
y_pred_bal = rf_balanced.predict(X_test)

balanced_report = classification_report(y_test, y_pred_bal, target_names=['Neg', 'Pos'])
print('class_weight="balanced":')
print(balanced_report)

## 5. Threshold Optimization

In [None]:
from sklearn.model_selection import cross_val_predict

# Choose the decision threshold from out-of-fold predictions on the TRAINING
# data. Selecting it on the test set would use test labels for model selection
# and make the reported "best" F1 optimistically biased.
# cross_val_predict clones `rf`, so the already-fitted baseline is untouched.
threshold_cv = StratifiedKFold(5, shuffle=True, random_state=42)
y_proba_oof = cross_val_predict(
    rf, X_train, y_train, cv=threshold_cv, method='predict_proba'
)[:, 1]

# Sweep candidate thresholds and record precision / recall / F1 at each one.
thresholds = np.arange(0.05, 0.95, 0.05)
metrics = []
for t in thresholds:
    yp = (y_proba_oof >= t).astype(int)
    metrics.append({'threshold': t, 'f1': f1_score(y_train, yp, zero_division=0),
                    'precision': precision_score(y_train, yp, zero_division=0),
                    'recall': recall_score(y_train, yp, zero_division=0)})

mdf = pd.DataFrame(metrics)
fig, ax = plt.subplots(figsize=(10, 5))
for col in ['precision', 'recall', 'f1']:
    ax.plot(mdf['threshold'], mdf[col], lw=2, label=col)
# Row with the highest out-of-fold F1; its threshold is the tuned value.
best = mdf.loc[mdf['f1'].idxmax()]
ax.axvline(best['threshold'], color='red', ls='--', label=f'Best F1={best["f1"]:.3f} @ {best["threshold"]:.2f}')
ax.set_xlabel('Threshold'); ax.set_ylabel('Score')
ax.set_title('Threshold Optimization (out-of-fold, training data)'); ax.legend()
plt.tight_layout()
plt.show()

# Evaluate on the held-out test set exactly once, comparing the model's
# default decision rule with the threshold tuned above.
y_proba = rf.predict_proba(X_test)[:, 1]
y_pred_tuned = (y_proba >= best['threshold']).astype(int)
print(f'Test F1 @ default threshold: {f1_score(y_test, rf.predict(X_test), zero_division=0):.4f}')
print(f'Test F1 @ tuned threshold {best["threshold"]:.2f}: {f1_score(y_test, y_pred_tuned, zero_division=0):.4f}')

## 6. Balanced Ensemble Methods

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier

# Candidate models, all sharing the same seed; n_estimators is spelled out by
# keyword so the ensemble sizes are obvious at a glance.
models = {
    'RF (default)': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'RF (balanced)': RandomForestClassifier(n_estimators=100, class_weight='balanced',
                                            random_state=42, n_jobs=-1),
    'BalancedRF': BalancedRandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'EasyEnsemble': EasyEnsembleClassifier(n_estimators=10, random_state=42, n_jobs=-1),
}

# Fit each model on the training split and score it on the test split.
res = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    test_labels = model.predict(X_test)
    test_scores = model.predict_proba(X_test)[:, 1]
    res[name] = {
        'F1': f1_score(y_test, test_labels),
        'PR-AUC': average_precision_score(y_test, test_scores),
        'MCC': matthews_corrcoef(y_test, test_labels),
    }

pd.DataFrame.from_dict(res, orient='index').round(4)