# XGBoost in Python (Colab)

This Colab notebook is a practical, **end-to-end** template:
- Load a tabular dataset
- Train a baseline model
- Evaluate
- Tune hyperparameters (RandomizedSearchCV)
- Interpret with feature importance + SHAP


## 1) Install & Imports
Uncomment the `pip install` line in the next cell if `xgboost` or `shap` is not already installed in your Colab runtime.


In [None]:
# If needed (uncomment to install the two libraries imported by later cells):
# !pip -q install xgboost shap
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import matplotlib.pyplot as plt

# Single seed passed to the train/test split, the XGBoost models, the
# randomized search and the SHAP row sample in the cells below.
RANDOM_STATE = 42


## 2) Dataset (Breast Cancer)
We use the breast cancer dataset bundled with scikit-learn, so the notebook runs anywhere without external data files.


In [None]:
# Load the dataset as pandas objects (as_frame=True) and pull out the
# feature table and the target column.
data = load_breast_cancer(as_frame=True)
X, y = data.data, data.target

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Cell output: (rows, columns) of the training and test feature tables.
(X_train.shape, X_test.shape)


## 3) Baseline XGBoost (sklearn API)


In [None]:
from xgboost import XGBClassifier

# Baseline classifier with hand-picked settings; section 4 tunes these
# same knobs with a randomized search.
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=RANDOM_STATE,
    eval_metric='logloss',
)
xgb.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of class 1; threshold it at
# 0.5 to get hard 0/1 predictions for the label-based metrics.
proba = xgb.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

print('Accuracy:', accuracy_score(y_test, pred))
print('ROC AUC:', roc_auc_score(y_test, proba))
# Print the blank line and the report separately: passing '\n' and the report
# as two print() arguments inserts a separator space in front of the report,
# which shifts its header row out of alignment with the table below it.
print()
print(classification_report(y_test, pred))


## 4) Hyperparameter tuning (RandomizedSearchCV)


In [None]:
# Candidate values for each hyperparameter. RandomizedSearchCV draws n_iter
# random combinations from these lists instead of trying every combination.
param_dist = {
    'n_estimators': [200, 400, 700],
    'max_depth': [2, 3, 4, 6],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.85, 1.0],
    'colsample_bytree': [0.7, 0.85, 1.0],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.3],
    'reg_lambda': [0.5, 1.0, 2.0],
}

search = RandomizedSearchCV(
    # n_jobs=1 keeps each individual XGBoost fit single-threaded. The search
    # itself already spreads the cross-validation fits over all cores
    # (n_jobs=-1 below); letting every worker also start a full XGBoost
    # thread pool would oversubscribe the CPU.
    estimator=XGBClassifier(
        random_state=RANDOM_STATE,
        eval_metric='logloss',
        n_jobs=1,
    ),
    param_distributions=param_dist,
    n_iter=20,            # 20 sampled combinations ...
    scoring='roc_auc',
    cv=5,                 # ... each scored with 5-fold cross-validation
    verbose=1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
search.fit(X_train, y_train)
print('Best params:', search.best_params_)
print('Best CV ROC AUC:', search.best_score_)

# Score the best configuration found by the search on the held-out test set,
# which the search never saw.
best_xgb = search.best_estimator_
proba = best_xgb.predict_proba(X_test)[:, 1]
print('Test ROC AUC:', roc_auc_score(y_test, proba))


## 5) Feature importance + SHAP


In [None]:
# Pair each importance score of the tuned model with its feature name and
# rank the features from most to least important.
# (pandas is already imported as `pd` in the imports cell.)
importances = (
    pd.Series(best_xgb.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
top15 = importances.head(15)
display(top15)

# Reverse the order before plotting: a horizontal bar chart draws the first
# row at the bottom, so this puts the most important feature at the top.
top15.iloc[::-1].plot.barh()
plt.title('Top 15 Feature Importances (XGBoost)')
plt.show()


In [None]:
import shap

# NOTE(review): initjs() presumably only matters for SHAP's interactive
# JavaScript plots; it is kept here as in the original cell.
shap.initjs()

# Explain at most 200 test rows, drawn with the shared seed so the same rows
# are picked on every run.
n_explain = min(200, len(X_test))
sample = X_test.sample(n=n_explain, random_state=RANDOM_STATE)

explainer = shap.TreeExplainer(best_xgb)
shap_values = explainer.shap_values(sample)

# Draw the summary twice: first as a bar chart (plot_type='bar'), then with
# the default plot type.
for plot_options in ({'plot_type': 'bar'}, {}):
    shap.summary_plot(shap_values, sample, **plot_options)
