## 📥 Load Dataset and Prepare Labels

In [None]:

import pandas as pd

# Colab-only step: expose Google Drive on the local filesystem so the CSV below is reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
# Location of the enriched feature table (edit here if the project folder moves).
ENRICHED_FEATURES_CSV = '/content/drive/MyDrive/BRCA-pathway-classifier/data/processed/brca_features_enriched.csv'

drive.mount(DRIVE_MOUNT_POINT)

# Read the enriched dataset into the working frame used by every later cell.
df = pd.read_csv(ENRICHED_FEATURES_CSV)

# Build the binary target from the one-hot CLIN_SIG_* columns.
# A row is positive (1) when at least one of its active (== 1) CLIN_SIG columns
# names a pathogenic class; every other row is negative (0).
label_columns = df.columns[df.columns.str.startswith("CLIN_SIG_")]

# Decide once, per column name, whether the column denotes a pathogenic class.
# The comparison is case-insensitive for every term, so "CLIN_SIG_Pathogenic"
# and "CLIN_SIG_pathogenic" are treated alike.
# "conflicting" is excluded explicitly: "..._of_pathogenicity" contains the
# substring "pathogenic" and would otherwise be counted as a positive label.
# NOTE(review): assumes conflicting-interpretation variants belong in the
# negative class -- confirm against the labelling policy for this project.
_label_names_lower = label_columns.str.lower()
_is_pathogenic_column = (
    _label_names_lower.str.contains("pathogenic", regex=False)
    & ~_label_names_lower.str.contains("benign", regex=False)
    & ~_label_names_lower.str.contains("conflicting", regex=False)
)
pathogenic_columns = label_columns[_is_pathogenic_column]

# Vectorised replacement for the row-wise apply: any active pathogenic column -> 1.
df['label'] = df[pathogenic_columns].eq(1).any(axis=1).astype(int)
y = df['label']


## 🧹 Prepare Features and Train/Test Split

In [None]:

from sklearn.model_selection import train_test_split

# Assemble the feature matrix.
# Removed up front: 'Unnamed: 0' (presumably a leftover index column from the CSV
# export -- verify), every CLIN_SIG_* one-hot the label was derived from, and the
# label itself. errors='ignore' lets the drop succeed when a listed column is absent.
non_feature_columns = ['Unnamed: 0', *label_columns, 'label']
X_all = df.drop(columns=non_feature_columns, errors='ignore')

# Only float/int columns are kept as model inputs; .copy() detaches the result from X_all.
X_numeric = X_all.select_dtypes(include=[float, int]).copy()

# Hold out 20% of the rows, stratified on the label, with a fixed seed for repeatability.
split_parts = train_test_split(
    X_numeric,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


## 🤖 Train XGBoost Classifier

In [None]:

import xgboost as xgb

# Gradient-boosted tree classifier for the binary label, fitted on the training split.
# `use_label_encoder` is intentionally not passed: the flag is deprecated in
# XGBoost (no effect since 1.6, removed in 2.0) and only triggers an
# "unused parameter" warning on current releases. The labels are already 0/1.
model = xgb.XGBClassifier(
    objective='binary:logistic',  # binary classification with probability output
    eval_metric='logloss',
    n_estimators=500,             # number of boosting rounds
    max_depth=7,                  # maximum depth of each tree
    learning_rate=0.05,           # shrinkage applied to every boosting step
    subsample=0.8,                # fraction of rows sampled per tree
    colsample_bytree=0.8,         # fraction of columns sampled per tree
    gamma=0.1,                    # minimum loss reduction required to split
    reg_alpha=0.5,                # L1 regularisation on leaf weights
    reg_lambda=5,                 # L2 regularisation on leaf weights
    random_state=42               # fixed seed for reproducible sampling
)
model.fit(X_train, y_train)


## 🔍 SHAP Global Feature Importance

In [None]:

import shap
import numpy as np

# Dense float32 copy of the test features, used as the feature matrix for plotting.
X_array = X_test.to_numpy().astype(np.float32)

# Explain the fitted tree ensemble directly with the tree-specific explainer.
# The previous form, shap.Explainer(model.predict, X_array), treated the model as a
# black box over hard 0/1 class predictions: it is slow, yields coarse attributions,
# and the permutation explainer raises when 2 * n_features + 1 exceeds its default
# evaluation budget. TreeExplainer is exact for tree models and needs no background
# sample; with its default settings the attributions are on the model's raw margin
# (log-odds) scale rather than on predicted labels.
explainer = shap.TreeExplainer(model)
shap_values = explainer(X_test)

# Global importance: mean |SHAP| per feature, drawn as a bar chart.
shap.summary_plot(shap_values, features=X_array, feature_names=X_test.columns, plot_type="bar")


## 💾 Export Top SHAP Features to CSV

In [None]:

# One row per feature with its global importance: the mean absolute SHAP value
# taken over all explained test rows (axis 0).
shap_df = pd.DataFrame({
    "feature": X_test.columns,
    "mean_abs_shap": np.abs(shap_values.values).mean(axis=0)
})

# sort_values returns a new frame, so keep the ranked result and use it for both
# the export and the preview. Previously the sorted frame was discarded after
# writing the CSV, and head(10) showed the first ten features in column order
# instead of the ten most important ones.
shap_ranked = shap_df.sort_values("mean_abs_shap", ascending=False)
shap_ranked.to_csv("shap_top_features.csv", index=False)
shap_ranked.head(10)
