**Boosting Techniques**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
np.random.seed(42)

from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, RocCurveDisplay
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# xgboost is an optional dependency: HAS_XGB records whether it could be
# imported so later sections can pick between XGBoost and an sklearn fallback.
try:
    import xgboost as xgb
    from xgboost import XGBClassifier
except Exception:
    # Broad on purpose (best effort): whatever goes wrong while importing the
    # optional package, the script simply switches to the fallback path.
    HAS_XGB = False
else:
    HAS_XGB = True

# ======================================================
# Q1: What is Boosting? How does it work?
# ======================================================
# Boosting is an ensemble technique that builds a strong learner by sequentially
# training weak learners. Each new learner focuses on correcting errors of the previous ensemble
# (by reweighting samples or fitting residuals). Final prediction is a weighted combination
# of all learners. Boosting reduces bias and can produce very accurate models.

# ======================================================
# Q2: What is AdaBoost and how does it work?
# ======================================================
# AdaBoost (Adaptive Boosting) trains weak learners sequentially (often decision stumps).
# After each round, it increases weights for misclassified samples so the next learner
# focuses on hard examples. Final prediction is a weighted vote of the learners.
# Key hyperparameters: number of estimators, learning_rate, and the complexity of the base (weak) learner.

# ======================================================
# Q3: What is Gradient Boosting and how is it different from AdaBoost?
# ======================================================
# Gradient Boosting builds learners to fit the negative gradient (residuals) of a loss function.
# Each new model learns to predict residuals of the previous ensemble. It is a functional
# gradient descent approach. Differences: AdaBoost reweights samples; Gradient Boosting
# fits residuals directly and generalizes to many loss functions (regression, classification, ranking).

# ======================================================
# Q4: XGBoost / LightGBM — benefits and why used
# ======================================================
# XGBoost, LightGBM (and CatBoost) are optimized gradient boosting frameworks.
# Benefits:
# - Speed and scalability (histogram-based splits, multi-threading)
# - Regularization options (L1/L2, shrinkage)
# - Built-in handling for missing values
# - Advanced features: tree pruning, approximate algorithms, early stopping
# Use them when you need top-of-the-line GB performance on tabular data.

# ======================================================
# Q5: When to use boosting & pitfalls
# ======================================================
# Use boosting when:
# - You want high predictive accuracy on tabular data
# - You can tolerate longer training time than simple models
# Pitfalls:
# - Can overfit if not regularized
# - Sensitive to noisy labels/outliers (regularization and early stopping help)
# - Less interpretable than a single decision tree (but feature importance + SHAP help)

# ======================================================
# Q6: AdaBoost on Iris — compare with base decision stump
# ======================================================
print("\n" + "="*8 + " Q6: AdaBoost on Iris " + "="*8)
iris = datasets.load_iris()
X, y = iris.data, iris.target
# Stratified 70/30 split so every class is represented in both parts
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Baseline: a single decision stump (depth-1 tree)
stump = DecisionTreeClassifier(max_depth=1, random_state=42)
stump.fit(Xtr, ytr)
pred_stump = stump.predict(Xte)
acc_stump = accuracy_score(yte, pred_stump)

# AdaBoost over decision stumps.
# The weak learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works on every version.
adb = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         n_estimators=100, learning_rate=1.0, random_state=42)
adb.fit(Xtr, ytr)
pred_adb = adb.predict(Xte)
acc_adb = accuracy_score(yte, pred_adb)

print(f"Decision stump accuracy: {acc_stump:.4f}")
print(f"AdaBoost accuracy: {acc_adb:.4f}")

# Show classification report for AdaBoost
print("\nAdaBoost classification report:\n", classification_report(yte, pred_adb, target_names=iris.target_names))

# Plot feature importances (AdaBoost); skipped if the fitted model does not expose them
if hasattr(adb, "feature_importances_"):
    fi = adb.feature_importances_
    plt.figure(figsize=(6,3))
    plt.barh(iris.feature_names, fi)
    plt.title("AdaBoost Feature Importances (Iris)")
    plt.xlabel("Importance")
    plt.show()

# ======================================================
# Q7: Gradient Boosting on Breast Cancer dataset (classification report)
# ======================================================
print("\n" + "="*8 + " Q7: Gradient Boosting on Breast Cancer " + "="*8)
bc = datasets.load_breast_cancer()
Xb = bc.data
yb = bc.target
# Stratified 70/30 split of the breast cancer data
Xb_tr, Xb_te, yb_tr, yb_te = train_test_split(
    Xb, yb, test_size=0.3, stratify=yb, random_state=42
)

# Scaler + gradient boosting in one pipeline. Tree-based boosting is largely
# insensitive to feature scaling; the scaler is kept as a harmless
# preprocessing step that is fitted on the training split only.
pipe_gb = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("gb", GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42)),
])
pred_gb = pipe_gb.fit(Xb_tr, yb_tr).predict(Xb_te)
acc_gb = accuracy_score(yb_te, pred_gb)
print(f"Gradient Boosting accuracy: {acc_gb:.4f}")
print("\nClassification report:\n", classification_report(yb_te, pred_gb, target_names=bc.target_names))

# Ten most important features of the fitted booster, largest bar on top
gb = pipe_gb.named_steps["gb"]
fi_gb = pd.Series(gb.feature_importances_, index=bc.feature_names).nlargest(10)
plt.figure(figsize=(6, 4))
fi_gb.plot.barh()
plt.gca().invert_yaxis()
plt.title("Top 10 Feature Importances (GradientBoosting on Breast Cancer)")
plt.show()

# ======================================================
# Q8: XGBoost on Wine dataset — GridSearchCV over n_estimators, max_depth and learning_rate (if XGBoost available)
# If XGBoost not available, use HistGradientBoostingClassifier as a modern alternative.
# ======================================================
print("\n" + "="*8 + " Q8: XGBoost (or HGB) on Wine with GridSearchCV " + "="*8)
wine = datasets.load_wine()
Xw, yw = wine.data, wine.target
Xw_tr, Xw_te, yw_tr, yw_te = train_test_split(Xw, yw, test_size=0.3, random_state=42, stratify=yw)

if HAS_XGB:
    print("XGBoost is available. Running GridSearchCV on XGBClassifier (may take some time)...")
    xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42, verbosity=0)
    param_grid = {
        "n_estimators": [50, 100],
        "max_depth": [3, 5],
        "learning_rate": [0.01, 0.1]
    }
    # 4-fold CV over the 2x2x2 grid, using all available cores
    grid_xgb = GridSearchCV(xgb_model, param_grid, cv=4, n_jobs=-1)
    grid_xgb.fit(Xw_tr, yw_tr)
    best = grid_xgb.best_estimator_
    print("Best XGBoost params:", grid_xgb.best_params_)
    print("CV best score:", grid_xgb.best_score_)
    pred_xgb = best.predict(Xw_te)
    print("Test accuracy (best XGBoost):", accuracy_score(yw_te, pred_xgb))
    # Feature importances: still best effort, but a failure is reported
    # instead of being silently discarded.
    try:
        fi = pd.Series(best.feature_importances_, index=wine.feature_names).nlargest(10)
        plt.figure(figsize=(6,4))
        fi.plot.barh()
        plt.gca().invert_yaxis()
        plt.title("XGBoost Feature Importances")
        plt.show()
    except Exception as err:
        print("Could not plot XGBoost feature importances:", err)
else:
    print("XGBoost not found — using HistGradientBoostingClassifier (sklearn) as alternative.")
    hgb = HistGradientBoostingClassifier(random_state=42)
    param_grid = {"max_iter": [100, 200], "max_depth": [None, 10]}
    grid_hgb = GridSearchCV(hgb, param_grid, cv=4, n_jobs=-1)
    grid_hgb.fit(Xw_tr, yw_tr)
    best = grid_hgb.best_estimator_
    print("Best HGB params:", grid_hgb.best_params_)
    print("CV best score:", grid_hgb.best_score_)
    pred_hgb = best.predict(Xw_te)
    print("Test accuracy (best HGB):", accuracy_score(yw_te, pred_hgb))
    # Feature importances.
    # HistGradientBoostingClassifier does not provide `feature_importances_`,
    # so the original plot was always skipped without any message. Use the
    # native attribute when it exists, otherwise fall back to permutation
    # importance measured on the held-out test split.
    try:
        if hasattr(best, "feature_importances_"):
            hgb_importances = best.feature_importances_
            hgb_title = "HGB Feature Importances"
        else:
            from sklearn.inspection import permutation_importance
            perm = permutation_importance(best, Xw_te, yw_te, n_repeats=10, random_state=42)
            hgb_importances = perm.importances_mean
            hgb_title = "HGB Feature Importances (permutation)"
        fi = pd.Series(hgb_importances, index=wine.feature_names).nlargest(10)
        plt.figure(figsize=(6,4))
        fi.plot.barh()
        plt.gca().invert_yaxis()
        plt.title(hgb_title)
        plt.show()
    except Exception as err:
        print("Could not plot HGB feature importances:", err)

# ======================================================
# Q9: Feature importance comparison & a simple partial-dependence style plot
# ======================================================
print("\n" + "="*8 + " Q9: Feature importances & PD-style plot " + "="*8)
# Collect the models fitted in the earlier sections. The globals() checks let
# this section run even if an earlier section was skipped; a missing model is
# stored as None and ignored below.
models_for_fi = {}
models_for_fi["AdaBoost(Iris)"] = adb if 'adb' in globals() else None
models_for_fi["GradientBoosting(BreastCancer)"] = gb if 'gb' in globals() else None
if HAS_XGB and 'best' in globals():
    # `best` is the grid-search winner from Q8; only registered when it is an XGBoost model
    models_for_fi["XGBoost/Winner"] = best

# Print and plot the top-5 importances of every available model
for name, model in models_for_fi.items():
    if model is None or not hasattr(model, "feature_importances_"):
        continue
    try:
        importances = model.feature_importances_
        print(f"\n{name} top features:")
        # The label prefix identifies which dataset's feature names apply
        if name.startswith("AdaBoost"):
            cols = iris.feature_names
        elif name.startswith("GradientBoosting"):
            cols = bc.feature_names
        elif name.startswith("XGBoost"):
            cols = wine.feature_names
        else:
            cols = [f"f{i}" for i in range(len(importances))]
        fi_series = pd.Series(importances, index=cols).nlargest(5)
        print(fi_series)
        # Plot top 5, largest bar on top
        plt.figure(figsize=(5,2))
        fi_series.plot.barh()
        plt.gca().invert_yaxis()
        plt.title(f"{name} - Top features")
        plt.show()
    except Exception as e:
        print("Could not extract importances for", name, ":", e)

# Simple PD-like plot: vary a single feature and show model prediction (for AdaBoost on Iris)
try:
    feat_idx = 2  # petal length for iris
    grid_vals = np.linspace(X[:, feat_idx].min(), X[:, feat_idx].max(), 50)
    X_sample = Xte.copy() if 'Xte' in globals() else X.copy()
    preds = []
    for v in grid_vals:
        # Overwrite the chosen feature with the grid value for every sample
        X_tmp = X_sample.copy()
        X_tmp[:, feat_idx] = v
        preds.append(adb.predict(X_tmp).mean())  # mean predicted label value (not strict PDP, but illustrative)
    plt.figure(figsize=(6,3))
    plt.plot(grid_vals, preds)
    plt.title("PD-style plot (AdaBoost on Iris) — avg predicted label vs feature value")
    plt.xlabel(iris.feature_names[feat_idx])
    plt.ylabel("Average predicted label")
    plt.show()
except Exception as e:
    # Still best effort, but report the failure like the importance loop does
    # instead of silently dropping the plot.
    print("Could not draw PD-style plot:", e)

# ======================================================
# Q10: Handling class imbalance & early stopping (practical demo)
# - Show: (a) class weight or sample_weight usage, (b) early stopping with XGBoost/HGB
# ======================================================
print("\n" + "="*8 + " Q10: Class imbalance & Early Stopping " + "="*8)
# Build an artificially imbalanced copy of the breast cancer data: every
# class-0 row is kept and class 1 is cut down to 20% of the class-0 count.
Xb_df = pd.DataFrame(Xb, columns=bc.feature_names)
yb_ser = pd.Series(yb)
# Class distribution before any resampling
print("Original class counts:\n", yb_ser.value_counts())

from sklearn.utils import resample
df = Xb_df.assign(target=yb_ser)
# Class 1 is bc.target_names[1] ('benign' in scikit-learn's encoding). It is
# the class that gets downsampled, so it becomes the minority class here.
df_major = df.loc[df['target'].eq(0)]
df_minor = df.loc[df['target'].eq(1)]
df_minor_down = resample(
    df_minor,
    replace=False,
    n_samples=int(len(df_major)*0.2),
    random_state=42,
)
df_imbal = pd.concat([df_major, df_minor_down])
print("Imbalanced class counts:\n", df_imbal['target'].value_counts())

# Back to plain arrays, then a stratified 70/30 split of the imbalanced data
X_imb = df_imbal.drop(columns='target').to_numpy()
y_imb = df_imbal['target'].to_numpy()
Xtr_imb, Xte_imb, ytr_imb, yte_imb = train_test_split(
    X_imb, y_imb, test_size=0.3, stratify=y_imb, random_state=42
)

# (a) Use class_weight in base estimator / sample_weight
# Example: AdaBoost accepts sample_weight in fit; we can pass higher weights to minority class
weights = np.where(ytr_imb==1, 5, 1)  # give minority class (label 1) five times the weight
# The weak learner is passed positionally: the `base_estimator` keyword was
# renamed to `estimator` in scikit-learn 1.2 and removed in 1.4, while the
# first positional slot works on every version.
adb_imb = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=100, random_state=42)
adb_imb.fit(Xtr_imb, ytr_imb, sample_weight=weights)
pred_imb = adb_imb.predict(Xte_imb)
print("\nAdaBoost on imbalanced data (with sample_weight) accuracy:", accuracy_score(yte_imb, pred_imb))
print("Classification report:\n", classification_report(yte_imb, pred_imb))

# (b) Early stopping demonstration with XGBoost (if available) or HistGradientBoostingClassifier
if HAS_XGB:
    print("\nDemonstrating early stopping with XGBoost...")
    # Early stopping must be monitored on data the final evaluation does not
    # use. Carve a validation split out of the training data so the test set
    # stays untouched until accuracy is reported.
    Xfit_imb, Xval_imb, yfit_imb, yval_imb = train_test_split(
        Xtr_imb, ytr_imb, test_size=0.2, random_state=42, stratify=ytr_imb)
    dtrain = xgb.DMatrix(Xfit_imb, label=yfit_imb)
    dval = xgb.DMatrix(Xval_imb, label=yval_imb)
    dtest = xgb.DMatrix(Xte_imb, label=yte_imb)
    params = {"objective":"binary:logistic", "eval_metric":"logloss", "verbosity":0}
    # Train for at most 200 rounds; stop once the validation logloss has not
    # improved for 10 consecutive rounds. The validation set is listed last
    # because that is the entry early stopping monitors.
    watchlist = [(dtrain, "train"), (dval, "eval")]
    bst = xgb.train(params, dtrain, num_boost_round=200, evals=watchlist, early_stopping_rounds=10, verbose_eval=False)
    preds_prob = bst.predict(dtest)
    preds = (preds_prob > 0.5).astype(int)  # threshold probabilities at 0.5
    print("XGBoost with early stopping accuracy:", accuracy_score(yte_imb, preds))
else:
    print("\nXGBoost not installed; using HistGradientBoostingClassifier with early stopping instead.")
    # early_stopping=True makes the estimator hold out part of the training
    # data internally, so the test set is not involved in the stopping decision.
    hgb_es = HistGradientBoostingClassifier(max_iter=1000, early_stopping=True, random_state=42)
    hgb_es.fit(Xtr_imb, ytr_imb)
    preds = hgb_es.predict(Xte_imb)
    print("HGB with early stopping accuracy:", accuracy_score(yte_imb, preds))

