# SHAP Explainability for PAM50 Subtype Prediction (Google Colab Version)
This notebook:
- Loads your clinical dataset (a CSV file uploaded from your machine; it must contain the `pam50_numeric` target column)
- Trains an XGBoost model
- Computes SHAP values for interpretability
- Visualizes global feature influence

In [None]:
# Install Required Packages (Colab-safe)
!pip install shap xgboost pandas matplotlib seaborn

In [None]:
# Upload your dataset
# Colab opens a file picker; `uploaded` maps each uploaded filename to the
# file's raw bytes.
from google.colab import files
uploaded = files.upload()

import pandas as pd
import io

# Prefer the expected export name, but accept a single file uploaded under a
# different name (e.g. a renamed copy) instead of failing with a bare KeyError.
expected_name = 'capstone_step_6_model_data_for_shap_export.csv'
if expected_name in uploaded:
    csv_name = expected_name
elif len(uploaded) == 1:
    csv_name = next(iter(uploaded))
else:
    raise ValueError(
        f"Expected an upload named {expected_name!r} (or exactly one CSV file); "
        f"got: {sorted(uploaded)}"
    )

# Parse the uploaded bytes in memory; no file path on the Colab VM is needed.
data = pd.read_csv(io.BytesIO(uploaded[csv_name]))
data.head()

In [None]:
# Separate the label column from the predictors.
# y: the 'pam50_numeric' column; X: every remaining column of `data`.
y = data['pam50_numeric']
X = data.drop('pam50_numeric', axis=1)

In [None]:
# Train the XGBoost model
from xgboost import XGBClassifier

# Multiclass soft-probability objective over 5 classes, scored with
# multiclass log-loss, using all available cores. A fixed random_state makes
# retraining (and therefore the SHAP values computed from the model)
# reproducible across kernel restarts.
# NOTE(review): the model is fit on the full dataset with no hold-out split,
# so it is suitable for explanation here, not for performance estimates.
model = XGBClassifier(
    objective='multi:softprob',
    num_class=5,
    eval_metric='mlogloss',
    n_jobs=-1,
    random_state=42,
)
model.fit(X, y)

In [None]:
# Build a SHAP explainer for the fitted model and explain every row of X.
import shap

explainer = shap.Explainer(model=model)
# Calling the explainer on the feature frame yields the SHAP explanation object
# used by the plotting and export cells below.
shap_values = explainer(X)

In [None]:
# Visualize SHAP summary
# For a multiclass model the SHAP values carry a third (per-class) axis.
# summary_plot interprets a bare 3-D array as SHAP *interaction* values, so
# split it into one 2-D (samples x features) matrix per class and use the
# multi-output bar summary, which stacks mean |SHAP| per class for each feature.
_shap_matrix = shap_values.values
if _shap_matrix.ndim == 3:
    _per_class = [_shap_matrix[:, :, k] for k in range(_shap_matrix.shape[2])]
    shap.summary_plot(_per_class, X, plot_type='bar')
else:
    # Single-output case: the default summary works directly.
    shap.summary_plot(shap_values, X)

In [None]:
# Optional: persist the raw SHAP value array as a .npy file in the current
# working directory so it can be downloaded later.
import numpy as np

np.save(file='shap_values.npy', arr=shap_values.values)

In [None]:
## Author:
Julian Borges, M.D. — Harvard Medical School (HMS)
Global Clinical Scholars Research Training (GCSRT) 2025  
Capstone Project: *"Hidden Biases in AI-Powered Genomic Subtyping of Breast Cancer"*
---