In [None]:
# Optional: install requirements into the kernel's environment (uncomment if needed)
# %pip install -q -r ../requirements.txt
import os, json, warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from src import config
from src.utils import ensure_dirs, set_global_seed, save_json
from src.data_loader import download_stroke_dataset
from src.preprocess import get_feature_types, build_preprocessor, fit_transform, transform, ctgan_prepare_training_df
from src.models import train_and_eval
from src.oversampling import apply_smote, train_ctgan_and_generate

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation/convergence warnings — consider narrowing it.
warnings.filterwarnings('ignore')
# NOTE(review): presumably creates the project's output directories — confirm in src.utils.
ensure_dirs()
# Seed from the shared config value; which RNGs get seeded is decided inside src.utils.
set_global_seed(config.SEED)
print('Directories ready.')

In [None]:
# Download and load dataset using Kaggle API
# NOTE(review): presumably requires Kaggle credentials to be configured — confirm in src.data_loader.
# The helper is given the raw-data directory and its return value is read as a CSV path.
csv_path = download_stroke_dataset(config.RAW_DIR)
df = pd.read_csv(csv_path)
# Quick sanity check: (rows, columns) followed by the first three records.
print(df.shape)
df.head(3)

In [None]:
# EDA: record shape, dtypes, missing counts and class balance, then persist to artifacts
target = config.TARGET_COL

# Absolute frequency of each target class.
cls_counts = df[target].value_counts().to_dict()

# Relative frequency in percent (2 decimals), keyed as 'class_<label>'.
pct_by_class = df[target].value_counts(normalize=True).mul(100).round(2)
cls_pct = pct_by_class.rename(lambda x: f'class_{x}').to_dict()

summary = {
    'shape': df.shape,
    'dtypes': dict(zip(df.columns, map(str, df.dtypes))),
    'missing': df.isna().sum().to_dict(),
    'class_counts': cls_counts,
    'class_percentages': cls_pct,
}

print('Class distribution (%):', cls_pct)
save_json(summary, os.path.join(config.ARTIFACTS_DIR, 'data_summary.json'))

# Bar chart of the raw class counts.
pd.Series(cls_counts).plot(kind='bar', title='Class Counts')
plt.show()

## Train/Test Split and Preprocessing
- Stratified split with held-out test set.
- Tree models do not require feature scaling; categorical features are one-hot encoded.

In [None]:
# Separate the label from the feature columns
X = df.drop(columns=[target])
y = df[target].values

# Stratified hold-out split (stratified on the label, seeded from config)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=config.TEST_SIZE,
    random_state=config.SEED,
)

# Preprocessing: the preprocessor is fitted on the training split only and
# then applied unchanged to the test split.
num_cols, cat_cols = get_feature_types(df)
pre = build_preprocessor(num_cols, cat_cols)
X_train, feat_names = fit_transform(pre, X_train_raw)
X_test = transform(pre, X_test_raw)

# Cast both matrices to float32.
X_train, X_test = (matrix.astype('float32') for matrix in (X_train, X_test))

# Displayed as the cell output: train shape, test shape, number of feature names.
(X_train.shape, X_test.shape, len(feat_names))

## Baseline: XGBoost (no resampling)
- Also compute class-weighted baseline via scale_pos_weight.

In [None]:
# Majority/minority ratio on the training labels, used as scale_pos_weight
n_min = int(np.count_nonzero(y_train == 1))
n_maj = int(np.count_nonzero(y_train == 0))
# Guard against division by zero when the training split has no positives.
spw = n_maj / (n_min if n_min > 0 else 1)
print('scale_pos_weight:', round(spw, 2))

# Unweighted baseline
cv_base, test_base, model_base = train_and_eval(
    X_train, y_train, X_test, y_test, scale_pos_weight=None
)

# Class-weighted baseline
cv_weighted, test_weighted, model_weighted = train_and_eval(
    X_train, y_train, X_test, y_test, scale_pos_weight=spw
)

print('Baseline test ROC AUC:', round(test_base['roc_auc'], 4))
print('Weighted test ROC AUC:', round(test_weighted['roc_auc'], 4))

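## SMOTE Oversampling (training split only)
- SMOTE is applied to the whole training split before `train_and_eval`; the held-out test set is never resampled. Any cross-validation scores computed on the resampled data may therefore be optimistic.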

In [None]:
# Resample the training split, then train/evaluate against the untouched test split.
X_smote, y_smote = apply_smote(
    X_train, y_train, target_ratio=config.SMOTE_TARGET_RATIO
)
cv_smote, test_smote, model_smote = train_and_eval(X_smote, y_smote, X_test, y_test)
print('SMOTE test ROC AUC:', round(test_smote['roc_auc'], 4))

# Identifying which resampled rows are synthetic is non-trivial after resampling,
# so only the before/after sizes and the requested ratio are logged.
smote_info = dict(
    train_size=int(len(y_train)),
    resampled_size=int(len(y_smote)),
    target_ratio=config.SMOTE_TARGET_RATIO,
)
save_json(smote_info, os.path.join(config.ARTIFACTS_DIR, 'smote_info.json'))

## CTGAN Oversampling (conditional minority generation)
- Train CTGAN on training data (with discrete columns), generate minority rows to reach target ratio.

In [None]:
# Build the CTGAN training frame: raw training features plus the label column,
# then let the helper prepare it and report which columns are discrete.
train_df_ctgan = X_train_raw.copy()
train_df_ctgan[target] = y_train
train_df_ctgan, discrete_cols = ctgan_prepare_training_df(train_df_ctgan)
print('CTGAN discrete columns:', discrete_cols)

# Train CTGAN and sample synthetic minority rows.
synth_min = train_ctgan_and_generate(
    train_df_ctgan,
    discrete_columns=discrete_cols,
    target_col=target,
    target_ratio=config.CTGAN_TARGET_RATIO,
    save_dir=config.CTGAN_DIR,
)
print('Generated minority rows:', len(synth_min))

# Persist the synthetic rows (written even when the frame is empty).
ctgan_csv = os.path.join(config.ARTIFACTS_DIR, 'synthetic_ctgan_minority.csv')
synth_min.to_csv(ctgan_csv, index=False)

# Augment the encoded training set with the encoded synthetic rows.
if len(synth_min) == 0:
    # Nothing was generated: fall back to the original training data.
    X_ctgan, y_ctgan = X_train, y_train
else:
    # Reuse the already-fitted preprocessor so synthetic rows land in the same feature space.
    X_synth_enc = transform(pre, synth_min.drop(columns=[target]))
    y_synth = synth_min[target].values.astype(int)
    X_ctgan = np.vstack((X_train, X_synth_enc)).astype('float32')
    y_ctgan = np.concatenate((y_train, y_synth))

cv_ctgan, test_ctgan, model_ctgan = train_and_eval(X_ctgan, y_ctgan, X_test, y_test)
print('CTGAN test ROC AUC:', round(test_ctgan['roc_auc'], 4))

## Compare Metrics and Curves

In [None]:
import json
from sklearn.metrics import roc_curve, precision_recall_curve, average_precision_score

# Test-set metrics of every variant, keyed by variant name.
comparison = dict(
    baseline=test_base,
    weighted=test_weighted,
    smote=test_smote,
    ctgan=test_ctgan,
)
save_json(comparison, os.path.join(config.ARTIFACTS_DIR, 'metrics.json'))

# One row per variant, one column per metric.
pd.DataFrame(comparison).transpose()

In [None]:
# ROC/PR curves for the three models on the same axes
def collect_curves(model, X, y):
    """Compute ROC and precision-recall curve data for one fitted classifier.

    Parameters
    ----------
    model : object exposing ``predict_proba``; column 1 of its output is used as the score.
    X : feature matrix passed to ``model.predict_proba``.
    y : true labels matching ``X``.

    Returns
    -------
    tuple
        ``((fpr, tpr), (recall, precision), average_precision)``.
    """
    pos_scores = model.predict_proba(X)[:, 1]
    false_pos_rate, true_pos_rate, _ = roc_curve(y, pos_scores)
    precision, recall, _ = precision_recall_curve(y, pos_scores)
    avg_precision = average_precision_score(y, pos_scores)
    roc_points = (false_pos_rate, true_pos_rate)
    # Recall comes first so the pair can be plotted directly as (x, y).
    pr_points = (recall, precision)
    return roc_points, pr_points, avg_precision

# Curve data on the held-out test set for each plotted model.
curves = {
    'baseline': collect_curves(model_base, X_test, y_test),
    'smote': collect_curves(model_smote, X_test, y_test),
    'ctgan': collect_curves(model_ctgan, X_test, y_test),
}

# Left panel: ROC curves; right panel: precision-recall curves.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for k, (roc_c, pr_c, ap) in curves.items():
    fpr, tpr = roc_c
    rec, prec = pr_c
    axes[0].plot(fpr, tpr, label=k)
    # The label was previously a truncated f-string (`label=f`), which raised
    # NameError; show the model name together with its average precision.
    axes[1].plot(rec, prec, label=f'{k} (AP={ap:.3f})')

# Diagonal = chance-level reference for the ROC panel.
axes[0].plot([0, 1], [0, 1], 'k--', alpha=.3)
axes[0].set_title('ROC')
axes[0].set_xlabel('FPR')
axes[0].set_ylabel('TPR')

axes[1].set_title('Precision-Recall')
axes[1].set_xlabel('Recall')
axes[1].set_ylabel('Precision')

axes[0].legend()
axes[1].legend()
plt.show()

## Calibration & Threshold Analysis

In [None]:
from sklearn.metrics import precision_recall_curve
def plot_thresholds(model, X, y):
    """Plot the PR curve and precision/recall as functions of the decision threshold.

    Parameters
    ----------
    model : object exposing ``predict_proba``; column 1 of its output is used as the score.
    X : feature matrix passed to ``model.predict_proba``.
    y : true labels matching ``X``.

    Draws a two-panel matplotlib figure and shows it; returns nothing.
    """
    pos_scores = model.predict_proba(X)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y, pos_scores)

    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    pr_panel, threshold_panel = ax[0], ax[1]

    pr_panel.plot(recall, precision)
    pr_panel.set_title('PR Curve')
    pr_panel.set_xlabel('Recall')
    pr_panel.set_ylabel('Precision')

    # The final precision/recall entry is dropped so both series align with `thresholds`.
    threshold_panel.plot(thresholds, precision[:-1], label='Precision')
    threshold_panel.plot(thresholds, recall[:-1], label='Recall')
    threshold_panel.legend()
    threshold_panel.set_title('Threshold vs Precision/Recall')
    threshold_panel.set_xlabel('Threshold')

    plt.show()

# Threshold trade-off for the CTGAN-augmented model, evaluated on the held-out test split.
plot_thresholds(model_ctgan, X_test, y_test)

## Conclusion (fill after running)
- Did CTGAN improve minority recall vs baseline and SMOTE?
- Numeric changes: recall_class1 and PR AUC deltas.
- Trade-offs in precision and false positives.

In [None]:
# Save trained models to models/
import joblib, os
from src import config

# Make sure the output directory exists, then pickle each fitted model into it.
os.makedirs(config.MODELS_DIR, exist_ok=True)
for _filename, _fitted_model in (
    ('baseline_model.pkl', model_base),
    ('smote_model.pkl', model_smote),
    ('ctgan_model.pkl', model_ctgan),
):
    joblib.dump(_fitted_model, os.path.join(config.MODELS_DIR, _filename))
print('Saved models to', config.MODELS_DIR)