# SHAP Explainability for PAM50 Subtype Prediction
This notebook:
- Loads the breast cancer subtype dataset
- Trains an XGBoost multi-class classifier
- Computes SHAP values for the trained model
- Visualizes feature attributions and optionally exports the SHAP values

In [None]:
# Install SHAP & XGBoost if not already installed
!pip install shap xgboost pandas matplotlib seaborn

In [None]:
# Load Dataset
import pandas as pd

# Read the exported model table (path is relative to the working directory)
# and preview the first rows as the cell output.
DATASET_CSV = 'capstone_step_6_model_data_for_shap_export.csv'
data = pd.read_csv(DATASET_CSV)
data.head()

In [None]:
# Separate features and target
# 'pam50_numeric' is the label column; every remaining column becomes a model input.
TARGET_COLUMN = 'pam50_numeric'
y = data[TARGET_COLUMN]
X = data.drop(columns=[TARGET_COLUMN])

In [None]:
# Train XGBoost Model
from xgboost import XGBClassifier

# Hyperparameters are gathered in one place so they are easy to read and tune.
xgb_params = {
    'objective': 'multi:softprob',  # per-class probabilities
    'num_class': 5,                 # assumes labels in y are encoded 0-4 — TODO confirm
    'eval_metric': 'mlogloss',
    'n_jobs': -1,
}
model = XGBClassifier(**xgb_params)

# The classifier is fit on the full feature table X; no hold-out split is made here.
model.fit(X, y)

In [None]:
# Compute SHAP Values
# Build an explainer around the fitted classifier and evaluate it on the same
# feature table X that the model was fit on.
import shap
# shap.Explainer selects an explanation algorithm from the model it is given
# (presumably the tree explainer for an XGBoost model — confirm).
explainer = shap.Explainer(model)
# Calling the explainer yields an object whose `.values` attribute holds the
# attributions; later cells plot it and save `.values` to disk.
shap_values = explainer(X)

In [None]:
# Plot SHAP Summary
# A multi:softprob model yields one attribution matrix per class, so
# shap_values.values is 3-D (samples x features x classes). summary_plot's
# default dot plot expects a 2-D matrix and treats a 3-D array as interaction
# values, so split the array into a per-class list and draw the stacked bar
# summary that summary_plot provides for multi-output models.
attributions = shap_values.values
if attributions.ndim == 3:
    per_class_attributions = [
        attributions[:, :, class_idx] for class_idx in range(attributions.shape[2])
    ]
    shap.summary_plot(per_class_attributions, X, plot_type='bar')
else:
    # Single-output case: the Explanation object can be plotted directly.
    shap.summary_plot(shap_values, X)

In [None]:
# Export SHAP Values (Optional)
import numpy as np

# Save the raw attribution array (the Explanation's `.values`, not the wrapper
# object) as a .npy file in the working directory.
attribution_array = shap_values.values
np.save(file='shap_values.npy', arr=attribution_array)

In [None]:
## Author:
Julian Borges, M.D. — Harvard Medical School (HMS)  
Global Clinical Scholars Research Training (GCSRT) 2025  
Capstone Project: *"Hidden Biases in AI-Powered Genomic Subtyping of Breast Cancer"*

---