# 📊 Phase 4B: Grouped SHAP Interpretability by Feature Category

In [None]:

import pandas as pd
import numpy as np
import shap
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Mount Google Drive at /content/drive so files stored on Drive can be read
# by path from this Colab runtime. This import only resolves inside Colab.
from google.colab import drive
drive.mount('/content/drive')


## 📥 Load Data and Prepare Labels

In [None]:

# Load the enriched feature table from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/BRCA-pathway-classifier/data/processed/brca_features_enriched.csv')

# Rebuild the binary target from the one-hot CLIN_SIG_* columns.
label_columns = df.columns[df.columns.str.startswith("CLIN_SIG_")]

# A CLIN_SIG_* column counts as a positive indicator when its name contains
# "pathogenic" and does not contain "benign". Both checks are done on the
# lower-cased name: previously only the "benign" check was case-insensitive,
# so a column such as "CLIN_SIG_Pathogenic" was silently ignored.
# NOTE(review): a name like "conflicting_interpretations_of_pathogenicity"
# also contains "pathogenic" and is therefore treated as positive -- confirm
# that this is intended.
pathogenic_columns = [
    col for col in label_columns
    if "pathogenic" in str(col).lower() and "benign" not in str(col).lower()
]

# label = 1 when at least one positive indicator column equals 1 in the row,
# otherwise 0. Selecting the columns once and reducing column-wise avoids a
# Python-level lambda call per row.
df['label'] = df[pathogenic_columns].eq(1).any(axis=1).astype(int)


## ✂️ Feature Cleanup and Train/Test Split

In [None]:

# Columns that must not be used as model inputs: the saved-index artifact
# ('Unnamed: 0'), the one-hot CLIN_SIG_* columns the target was derived from,
# and the target itself. errors='ignore' tolerates any that are absent.
non_feature_columns = ['Unnamed: 0', *label_columns, 'label']
X_all = df.drop(columns=non_feature_columns, errors='ignore')

# Keep only float/int columns as model inputs.
# NOTE(review): boolean columns (if the CSV has any) would not pass this
# filter -- confirm that is acceptable.
X_numeric = X_all.select_dtypes(include=[float, int]).copy()
y = df['label']

# Hold out 20% of the rows for testing, stratified on the label so both
# splits keep the same class balance; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


## 🤖 Train XGBoost + Compute SHAP Values

In [None]:

# Binary gradient-boosted tree classifier trained on the numeric features.
# NOTE(review): `use_label_encoder` is a legacy XGBoost option; recent
# releases may only emit an "unused parameter" warning for it -- confirm
# against the installed version before removing.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=500,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    reg_alpha=0.5,
    reg_lambda=5,
    random_state=42
)
model.fit(X_train, y_train)

# float32 copy of the test features; kept so any cell that expects
# `X_array` keeps working.
X_array = X_test.to_numpy().astype(np.float32)

# Explain the fitted trees directly with Tree SHAP.
# The previous call, shap.Explainer(model.predict, X_array), wrapped the
# hard 0/1 `predict` output in a model-agnostic explainer: attributions then
# describe thresholded class labels rather than the model's score, the
# computation needs many model evaluations per row, and it can fail its
# evaluation budget on wide feature tables. TreeExplainer reads the tree
# structure itself and attributes the model's raw (margin) output.
explainer = shap.TreeExplainer(model)

# Passing the DataFrame keeps column order aligned with X_test.columns, which
# is what the grouping step indexes by; `.values` has one row per test sample
# and one column per feature.
shap_values = explainer(X_test)


## 📦 Group Features by Category (Molecular, Pathway, Environmental)

In [None]:

import re

# Category -> substring keywords, checked in this order; the first category
# with a matching keyword wins.
_CATEGORY_KEYWORDS = (
    ("molecular", ("sift", "polyphen", "consequence")),
    ("pathway", ("pathway", "reactome", "string")),
    ("environmental", ("chemical", "toxic", "ctd", "env")),
)


def _feature_category(col):
    """Return the category name ("molecular", "pathway", "environmental" or
    "other") inferred from a feature column name."""
    col_lower = col.lower()
    # "af" is matched as a whole token (name split on non-alphanumerics), not
    # as a substring. The previous substring test sent every name that merely
    # contained the letters "af" (e.g. "...trafficking", "...raf...") to
    # "molecular" before the pathway/environmental keywords were even tried.
    if "af" in re.split(r"[^a-z0-9]+", col_lower):
        return "molecular"
    for category, keywords in _CATEGORY_KEYWORDS:
        if any(keyword in col_lower for keyword in keywords):
            return category
    return "other"


# Take every column of X_all as a candidate feature name. This may include
# columns that were filtered out of X_test; the SHAP summary step skips those.
feature_names = X_all.columns.tolist()
grouped_features = {"molecular": [], "pathway": [], "environmental": [], "other": []}

for col in feature_names:
    grouped_features[_feature_category(col)].append(col)


## 📊 Summarize SHAP Values by Feature Group

In [None]:

# For each feature category, average |SHAP| over every test sample and every
# feature of that category that is actually present in X_test. Categories
# with no feature in X_test get no entry.
category_shap_means = {}
test_columns = X_test.columns
for category, cols in grouped_features.items():
    # Positional indices of this category's columns within X_test, which is
    # also their column position in the SHAP value matrix.
    positions = [test_columns.get_loc(name) for name in cols if name in test_columns]
    if positions:
        group_abs = np.abs(shap_values.values[:, positions])
        category_shap_means[category] = np.mean(group_abs)

# One row per category, most influential first.
shap_group_df = (
    pd.DataFrame.from_dict(category_shap_means, orient='index', columns=['mean_abs_shap'])
    .sort_values('mean_abs_shap', ascending=False)
)
shap_group_df


## 📈 Bar Chart: SHAP by Category

In [None]:

# Bar chart of the per-category mean |SHAP| values, one bar per category in
# the order of shap_group_df.
ax = shap_group_df.plot.bar(legend=False)
ax.set_title("Mean SHAP Value by Feature Category")
ax.set_ylabel("Mean(|SHAP value|)")
ax.set_xlabel("Feature Category")
# Tilt the category labels so they do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
