# LightGBM in Python (Colab)

This Colab notebook is a practical, **end-to-end** template:
- Load a tabular dataset
- Train a baseline model
- Evaluate
- Tune hyperparameters (RandomizedSearchCV)
- Interpret with feature importance + SHAP


## 1) Install & Imports
Uncomment the `pip install` line below if `lightgbm` or `shap` is not already available in your Colab runtime.


In [None]:
# If needed:
# !pip -q install lightgbm shap
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import matplotlib.pyplot as plt

# Single seed shared by the train/test split, the LightGBM models, the
# randomized search, and the SHAP row sample, so reruns are repeatable.
RANDOM_STATE = 42


## 2) Dataset (Breast Cancer)
We use scikit-learn's built-in Breast Cancer dataset (a binary classification task), so the notebook runs anywhere without downloading any files.


In [None]:
# Load the bundled dataset as pandas objects (as_frame=True) and pull out
# the feature table and the label column.
data = load_breast_cancer(as_frame=True)
X, y = data.data, data.target

# Hold out 20% of the rows for testing; stratify on the label so the class
# balance is the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Last expression of the cell: the notebook displays both shapes.
(X_train.shape, X_test.shape)


## 3) Baseline LightGBM (sklearn API)


In [None]:
from lightgbm import LGBMClassifier

# Baseline model with hand-picked settings, trained on the training split
# and scored once on the held-out test split.
lgbm = LGBMClassifier(
    n_estimators=800,
    learning_rate=0.03,
    num_leaves=31,
    max_depth=-1,  # no explicit depth limit; tree size is bounded by num_leaves
    subsample=0.9,
    # LightGBM only performs row bagging when subsample_freq > 0; with the
    # default of 0 the subsample fraction above is silently ignored.
    # Re-sample the rows at every boosting iteration so it takes effect.
    subsample_freq=1,
    colsample_bytree=0.9,
    random_state=RANDOM_STATE,
)
lgbm.fit(X_train, y_train)

# Probability of the positive class (column 1), thresholded at 0.5 to get
# hard labels for the accuracy and classification report.
proba = lgbm.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

print('Accuracy:', accuracy_score(y_test, pred))
print('ROC AUC:', roc_auc_score(y_test, proba))
print('\n', classification_report(y_test, pred))


## 4) Hyperparameter tuning (RandomizedSearchCV)


In [None]:
# Discrete candidate values for each hyperparameter; RandomizedSearchCV
# draws n_iter random combinations from this grid.
param_dist = {
    'n_estimators': [400, 800, 1200],
    'learning_rate': [0.01, 0.03, 0.07],
    'num_leaves': [15, 31, 63, 127],
    'max_depth': [-1, 3, 5, 8],
    'min_child_samples': [10, 20, 40],
    'subsample': [0.7, 0.85, 1.0],
    'colsample_bytree': [0.7, 0.85, 1.0],
    'reg_alpha': [0.0, 0.1, 0.5],
    'reg_lambda': [0.0, 0.1, 1.0],
}

search = RandomizedSearchCV(
    estimator=LGBMClassifier(
        random_state=RANDOM_STATE,
        # Row bagging is only active when subsample_freq > 0; without this
        # the 'subsample' values in param_dist would have no effect.
        subsample_freq=1,
        # The search already runs candidate fits in parallel (n_jobs=-1
        # below); keep each individual fit single-threaded so the two
        # levels of parallelism do not oversubscribe the CPU.
        n_jobs=1,
    ),
    param_distributions=param_dist,
    n_iter=25,          # 25 sampled candidates x 5 folds = 125 fits, plus the final refit
    scoring='roc_auc',
    cv=5,
    verbose=1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
search.fit(X_train, y_train)
print('Best params:', search.best_params_)
print('Best CV ROC AUC:', search.best_score_)

# best_estimator_ is the winning configuration refit on the full training
# split; score it once on the held-out test split.
best_lgbm = search.best_estimator_
proba = best_lgbm.predict_proba(X_test)[:, 1]
print('Test ROC AUC:', roc_auc_score(y_test, proba))


## 5) Feature importance + SHAP


In [None]:
import pandas as pd

# Pair each importance score of the tuned model with its feature name and
# rank the features from most to least important.
importances = (
    pd.Series(best_lgbm.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
top_features = importances.head(15)
display(top_features)

# Reverse the order before plotting so the highest-ranked feature ends up
# at the top of the horizontal bar chart.
top_features.iloc[::-1].plot(kind='barh')
plt.title('Top 15 Feature Importances (LightGBM)')
plt.show()


In [None]:
import shap
shap.initjs()

# Explain at most 200 test rows to keep the SHAP computation quick.
sample = X_test.sample(n=min(200, len(X_test)), random_state=RANDOM_STATE)
explainer = shap.TreeExplainer(best_lgbm)
shap_values = explainer.shap_values(sample)

# Depending on the installed SHAP version, a binary LightGBM classifier
# yields either a list with one array per class, a 3-D array whose last axis
# is the class, or a single 2-D array. Reduce all of these to the 2-D
# (rows x features) matrix for the positive class (the last one) so the two
# plots below render as a single-class bar chart and a beeswarm plot.
if isinstance(shap_values, list):
    shap_values = shap_values[-1]
elif getattr(shap_values, 'ndim', 2) == 3:
    shap_values = shap_values[..., -1]

# Mean |SHAP| per feature (global ranking), then the per-row beeswarm view.
shap.summary_plot(shap_values, sample, plot_type='bar')
shap.summary_plot(shap_values, sample)
