# 🔮 Phase 5: Predict on VUS (Variants of Uncertain Significance)

In [None]:

import pandas as pd
import numpy as np
import shap
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Mount Google Drive at /content/drive; the dataset loaded later in this
# notebook is read from a path underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')


## 📥 Load Enriched Dataset and Extract VUS

In [None]:

# Read the enriched feature table from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/BRCA-pathway-classifier/data/processed/brca_features_enriched.csv')

# The one-hot clinical-significance indicators all share the "CLIN_SIG_" prefix.
label_columns = df.columns[df.columns.str.startswith("CLIN_SIG_")]

# A row is treated as a likely VUS when it is flagged "Uncertain significance"
# or "Not provided" AND exactly one clinical-significance indicator is set
# (i.e. there is no second, conflicting label on the same variant).
# df.get(..., 0) falls back to a scalar 0 when a column is absent, so a
# missing column simply contributes "no match" instead of raising KeyError.
is_uncertain = df.get("CLIN_SIG_Uncertain significance", 0) == 1
is_not_provided = df.get("CLIN_SIG_Not provided", 0) == 1
has_single_label = df[label_columns].sum(axis=1) == 1

# .copy() detaches the subset so later column assignments do not touch df.
vus_df = df[(is_uncertain | is_not_provided) & has_single_label].copy()

print(f"Extracted {vus_df.shape[0]} likely VUS variants.")


## ✂️ Prepare Features for VUS Prediction

In [None]:

# Columns that must not be fed to the model: the CSV's saved index column,
# every clinical-significance indicator, and the derived 'label' column.
# errors='ignore' lets the drop succeed when some of these are not present.
drop_cols = ['Unnamed: 0', *label_columns, 'label']
X_vus = vus_df.drop(columns=drop_cols, errors='ignore')

# Keep only float/int columns; .copy() gives an independent frame.
X_vus_numeric = X_vus.select_dtypes(include=[float, int]).copy()


## 🤖 Retrain Model on Full Dataset (for VUS Prediction)

In [None]:

# Recreate a binary label (1 = pathogenic, 0 = everything else) from the
# one-hot clinical-significance columns.
def _is_pathogenic_column(column_name):
    """Return True if a CLIN_SIG_* column name denotes a pathogenic call.

    The comparison is case-insensitive so that a column such as
    "CLIN_SIG_Pathogenic" matches as well as "CLIN_SIG_Likely pathogenic".
    Names mentioning "benign" are rejected, and so are "conflicting ..."
    categories, whose text contains "pathogenicity" but which are not a
    pathogenic assertion.
    """
    name = str(column_name).lower()
    return (
        "pathogenic" in name
        and "benign" not in name
        and "conflicting" not in name
    )


pathogenic_columns = [col for col in label_columns if _is_pathogenic_column(col)]

# Vectorised form of the row-wise check: a row is positive when any of its
# pathogenic indicator columns equals 1. With no pathogenic columns at all
# this yields an all-zero label instead of failing.
df['label'] = (df[pathogenic_columns] == 1).any(axis=1).astype(int)

X_all = df.drop(columns=['Unnamed: 0'] + list(label_columns) + ['label'], errors='ignore')
X_all_numeric = X_all.select_dtypes(include=[float, int])
y_all = df['label']

# Train on the labeled rows only. The VUS rows extracted earlier have no
# ground truth; leaving them in would teach the model that they are class 0
# and then score those very same rows, pushing their predictions towards 0.
train_mask = ~df.index.isin(vus_df.index)

model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=500,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    reg_alpha=0.5,
    reg_lambda=5,
    random_state=42
)
model.fit(X_all_numeric.loc[train_mask], y_all.loc[train_mask])


## 🔮 Predict Pathogenicity Probabilities for VUS

In [None]:

# predict_proba returns one column per class; column index 1 is taken as the
# score for the positive (label == 1) class.
class_probabilities = model.predict_proba(X_vus_numeric)
vus_preds = class_probabilities[:, 1]
vus_df['predicted_pathogenicity'] = vus_preds

# Rank the VUS from highest to lowest predicted score and preview the top rows.
vus_df_sorted = vus_df.sort_values(by='predicted_pathogenicity', ascending=False)
vus_df_sorted[['predicted_pathogenicity']].head()


## 🧠 Explain Top VUS Predictions with SHAP

In [None]:

# Use SHAP to explain model predictions for VUS.
def _pathogenic_probability(features):
    """Return the model's positive-class probability for each row of *features*.

    This is the same quantity stored in 'predicted_pathogenicity', so the
    SHAP attributions describe the reported score rather than the hard 0/1
    class label that model.predict would return.
    """
    return model.predict_proba(features)[:, 1]


# SHAP is given a plain float32 array; the column names are passed to the
# plot separately so the bars are still labelled by feature.
X_array = X_vus_numeric.to_numpy().astype(np.float32)

# The VUS feature matrix doubles as the background data for the explainer.
explainer = shap.Explainer(_pathogenic_probability, X_array)
shap_values = explainer(X_array)

# Plot SHAP summary (mean absolute contribution per feature) for VUS predictions
shap.summary_plot(shap_values, features=X_array, feature_names=X_vus_numeric.columns, plot_type="bar")
