In [None]:
# Imports
import os, json, warnings
import numpy as np
import pandas as pd
import shap
from umap import UMAP
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp, wasserstein_distance
from sklearn.model_selection import train_test_split
from pathlib import Path

from src import config
from src.utils import ensure_dirs, set_global_seed
from src.data_loader import download_stroke_dataset
from src.preprocess import get_feature_types, build_preprocessor, fit_transform, transform, ctgan_prepare_training_df
from xgboost import XGBClassifier

# Suppress all Python warnings for the rest of the session so cell output stays readable.
warnings.filterwarnings('ignore')

# Project helpers: presumably create the output directories and seed the RNGs
# with the configured seed -- see src.utils for the exact behavior.
ensure_dirs()
set_global_seed(config.SEED)

In [None]:
# Rebuild the train/test split and encoded feature matrices the same way the
# experiment notebook does, so the models analysed here see identical inputs.
csv_path = download_stroke_dataset(config.RAW_DIR)
df = pd.read_csv(csv_path)

# Separate the feature frame from the label vector.
X = df.drop(columns=[config.TARGET_COL])
y = df[config.TARGET_COL].values

# Stratified split driven entirely by the shared config (size and seed).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=config.TEST_SIZE,
    random_state=config.SEED,
    stratify=y,
)

# NOTE(review): `df` still contains the target column at this point; presumably
# get_feature_types excludes it from the returned lists -- confirm in src.preprocess.
num_cols, cat_cols = get_feature_types(df)
pre = build_preprocessor(num_cols, cat_cols)

# Fit the preprocessor on the training split only, then reuse it for the test split.
X_train, feat_names = fit_transform(pre, X_train_raw)
X_test = transform(pre, X_test_raw)

# Cast both encoded matrices to float32.
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

(X_train.shape, X_test.shape)

In [None]:
# Read the CTGAN-generated minority samples written by the experiment notebook.
ctgan_csv = os.path.join(config.ARTIFACTS_DIR, 'synthetic_ctgan_minority.csv')
if not os.path.exists(ctgan_csv):
    raise FileNotFoundError('Run 1_experiment_pipeline to generate CTGAN samples first.')
synth_min = pd.read_csv(ctgan_csv)

# Encode the synthetic features with the preprocessor fitted on the real training split.
synth_features = synth_min.drop(columns=[config.TARGET_COL])
X_synth = transform(pre, synth_features)

# Labels of the synthetic rows as integers.
y_synth = synth_min[config.TARGET_COL].values.astype(int)

# Quick sanity check: encoded shape and share of positive labels.
(X_synth.shape, y_synth.mean())

In [None]:
# Fit the two classifiers compared in this notebook
# (alternatively, load them from disk if the experiment notebook pickled them).
from src.models import get_classifier

# Baseline model: real training data only.
clf_base = get_classifier()
clf_base.fit(X_train, y_train)

# Augmented model: real training data stacked with the synthetic minority rows.
X_aug = np.vstack((X_train, X_synth)).astype('float32')
y_aug = np.concatenate((y_train, y_synth))
clf_ctgan = get_classifier()
clf_ctgan.fit(X_aug, y_aug)

print('Models trained for XAI analysis.')

## SHAP Value Comparisons
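We compare SHAP value distributions for the top features between two groups: real minority samples explained by the baseline model, and synthetic minority samples explained by the CTGAN-augmented model.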

In [None]:
# One tree explainer per fitted classifier.
expl_base = shap.TreeExplainer(clf_base)
expl_ctgan = shap.TreeExplainer(clf_ctgan)

# Row positions of each class within the encoded training matrix.
idx_min = np.flatnonzero(y_train == 1)
idx_maj = np.flatnonzero(y_train == 0)
X_min = X_train[idx_min]
X_maj = X_train[idx_maj]

# Baseline model explained on the real minority and real majority rows.
sh_min_base = expl_base.shap_values(X_min)
sh_maj_base = expl_base.shap_values(X_maj)

# CTGAN-augmented model explained on the synthetic minority rows.
sh_syn_ctgan = expl_ctgan.shap_values(X_synth)

# SHAP summary plots (top 10 features): baseline on real minority,
# then CTGAN-augmented model on synthetic minority.
for shap_vals, feats, title in (
    (sh_min_base, X_min, 'Baseline SHAP: Real Minority'),
    (sh_syn_ctgan, X_synth, 'CTGAN Model SHAP: Synthetic Minority'),
):
    plt.figure(figsize=(10, 5))
    shap.summary_plot(
        shap_vals,
        features=feats,
        feature_names=feat_names,
        show=False,
        max_display=10,
    )
    plt.title(title)
    plt.show()

## 2D Projection (UMAP) with Decision Surface
Project encoded features to 2D; color by predicted probability to visualize decision regions.

In [None]:
# Fit a 2-D UMAP embedding on up to 1000 real rows per class.
umap = UMAP(n_neighbors=20, min_dist=0.1, random_state=config.SEED)
real_min_sub = X_min[:1000]
real_maj_sub = X_maj[:1000]
X_real_sub = np.vstack([real_min_sub, real_maj_sub])

Z_real = umap.fit_transform(X_real_sub)
# Class labels aligned with the stacked rows (1 = minority, 0 = majority).
lab_real = np.concatenate([np.ones(len(real_min_sub)), np.zeros(len(real_maj_sub))])
# Colour values: P(class=1) from the CTGAN-augmented classifier.
prob_real = clf_ctgan.predict_proba(X_real_sub)[:, 1]

# Project the synthetic rows into the embedding learned from real data.
Z_syn = umap.transform(X_synth)
prob_syn = clf_ctgan.predict_proba(X_synth)[:, 1]

# Side-by-side scatter plots: real subset (left) and synthetic minority (right).
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
ax_real, ax_syn = ax

sc0 = ax_real.scatter(Z_real[:, 0], Z_real[:, 1], c=prob_real, cmap='viridis', s=10, alpha=0.7)
ax_real.set_title('Real (UMAP) colored by P(class=1)')
fig.colorbar(sc0, ax=ax_real)

sc1 = ax_syn.scatter(Z_syn[:, 0], Z_syn[:, 1], c=prob_syn, cmap='viridis', s=10, alpha=0.7)
ax_syn.set_title('Synthetic Minority (UMAP) colored by P(class=1)')
fig.colorbar(sc1, ax=ax_syn)

plt.show()

## Quantitative Similarity: KS and Wasserstein
Compare per-feature distributions for real minority vs synthetic minority.

In [None]:
# For interpretability, compare raw (pre-encoding) distributions of the real
# minority rows against the synthetic minority rows, column by column.
# `ctgan_prepare_training_df` is already imported in the notebook's import cell.
train_df_ctgan = X_train_raw.copy()
train_df_ctgan[config.TARGET_COL] = y_train
train_df_ctgan, _ = ctgan_prepare_training_df(train_df_ctgan)
real_min = train_df_ctgan[train_df_ctgan[config.TARGET_COL] == 1]

# Only columns present in both frames can be compared; the target is excluded.
common_cols = [
    c for c in real_min.columns
    if c in synth_min.columns and c != config.TARGET_COL
]

rows = []
for c in common_cols:
    real_col, synth_col = real_min[c], synth_min[c]
    both_numeric = (
        pd.api.types.is_numeric_dtype(real_col)
        and pd.api.types.is_numeric_dtype(synth_col)
    )
    if both_numeric:
        # Drop missing values so a single NaN does not turn the statistic into NaN.
        a = real_col.dropna().to_numpy(dtype=float)
        b = synth_col.dropna().to_numpy(dtype=float)
    else:
        # Encode both samples against ONE shared vocabulary. Encoding each
        # sample separately would give the same integer code to different
        # categories whenever the two samples do not contain exactly the same
        # set of values, which silently corrupts the distances.
        real_lab = real_col.dropna().astype(str)
        synth_lab = synth_col.dropna().astype(str)
        vocab = sorted(set(real_lab) | set(synth_lab))
        a = pd.Categorical(real_lab, categories=vocab).codes.astype(float)
        b = pd.Categorical(synth_lab, categories=vocab).codes.astype(float)
        # NOTE: the codes follow alphabetical order, so for categorical features
        # the magnitudes below depend on that arbitrary ordering; treat them as
        # a rough mismatch indicator, not a true distance.
    if len(a) == 0 or len(b) == 0:
        # Nothing to compare (column entirely missing in one sample).
        continue
    ks = ks_2samp(a, b).statistic
    wd = wasserstein_distance(a, b)
    rows.append({'feature': c, 'ks': float(ks), 'wasserstein': float(wd)})

# Explicit columns keep the sort valid even when no feature could be compared.
dist_df = pd.DataFrame(rows, columns=['feature', 'ks', 'wasserstein']).sort_values('ks', ascending=False)
dist_df.head(15)

## Decision Boundary Inclusion Test
What fraction of synthetic minority has predicted probability above a threshold (e.g., 0.5)?

In [None]:
# Decision threshold on P(class=1) used to call a row "inside" the minority region.
thr = 0.5

# Minority-class probabilities from the CTGAN-augmented classifier.
p_syn = clf_ctgan.predict_proba(X_synth)[:, 1]
p_min = clf_ctgan.predict_proba(X_min)[:, 1]

# Share of rows at or above the threshold, for synthetic and real minority rows.
inside = float(np.mean(p_syn >= thr))
inside_real = float(np.mean(p_min >= thr))

print({'synthetic_inside': inside, 'real_minority_inside': inside_real})

## Conclusion
- Do synthetic minority samples mimic real minority in feature and latent spaces?
- Are they inside the classifier's minority decision boundary?
- Support with the plots/tables above (KS/Wasserstein, UMAP, SHAP).