### Step 1 ‚Äî Create the project skeleton

In [1]:
# STEP 1: Project skeleton
import os, pathlib, textwrap, sys

In [2]:
ROOT = pathlib.Path.cwd() / "mpox_repro_framework"
dirs = [
    "src/cbe_repro/epi",
    "src/cbe_repro/synth",
    "src/cbe_repro/eval",
    "src/cbe_repro/genai",
    "src/cbe_repro/experiments",
    "src/cbe_repro/configs",
    "src/cbe_repro/data",
    "tests",
    "docs",
    "scripts",
]


In [3]:
# 1) Make folders
for d in dirs:
    (ROOT / d).mkdir(parents=True, exist_ok=True)

In [4]:
# 2) Placeholders so imports won‚Äôt error later
(ROOT / "src" / "cbe_repro" / "__init__.py").write_text("__all__ = []\n")

13

In [5]:
# 3) Basic README + requirements (pin for reproducibility)
readme = """# Mpox Reproducibility Framework (with GenAI hooks)

This repo focuses on **reproducible** mpox modeling:
- deterministic outbreak simulations (SEIR),
- repeatable ML experiments (fixed seeds),
- GenAI hooks for synthetic data (class imbalance) and automated docs,
- a simple score to quantify reproducibility.

You'll add code in small steps‚Äîeach step is runnable and testable.
"""

(ROOT / "README.md").write_text(readme)

379

In [6]:
req = """numpy==1.26.4
pandas==2.2.2
scikit-learn==1.5.2
matplotlib==3.8.4
pyyaml==6.0.1
"""
(ROOT / "requirements.txt").write_text(req)

80

In [7]:
# 4) Tiny ‚Äúhello world‚Äù placeholders for later modules
(ROOT / "src" / "cbe_repro" / "epi" / "seir.py").write_text("def _placeholder():\n    pass\n")
(ROOT / "src" / "cbe_repro" / "synth" / "symptom_smote.py").write_text("def _placeholder():\n    pass\n")
(ROOT / "src" / "cbe_repro" / "eval" / "repro_score.py").write_text("def _placeholder():\n    pass\n")
(ROOT / "src" / "cbe_repro" / "genai" / "codegen.py").write_text("def _placeholder():\n    pass\n")
(ROOT / "src" / "cbe_repro" / "genai" / "docgen.py").write_text("def _placeholder():\n    pass\n")
(ROOT / "src" / "cbe_repro" / "experiments" / "run_experiment.py").write_text("def _placeholder():\n    pass\n")

29

In [8]:
# 5) Minimal configs we‚Äôll fill in later
(ROOT / "src" / "cbe_repro" / "configs" / "baseline.yaml").write_text("# to be filled in Step 4‚Äì7\n")
(ROOT / "src" / "cbe_repro" / "configs" / "genai.yaml").write_text("# to be filled in Step 7\n")
(ROOT / "src" / "cbe_repro" / "configs" / "datasets.yaml").write_text("# to be filled in Step 3\n")

25

In [9]:
# 6) Tests/docs/scripts placeholders
(ROOT / "tests" / "test_determinism.py").write_text("# to be implemented in Step 8\n")
(ROOT / "docs" / "README.md").write_text("Auto-generated reports will land here in Step 7.\n")
(ROOT / "scripts" / "run_baseline.sh").write_text("# to be filled in Step 7\n")
(ROOT / "scripts" / "run_genai.sh").write_text("# to be filled in Step 7\n")

25

In [10]:
print("‚úÖ Project skeleton created at:", ROOT.resolve())
for d in dirs:
    print(" -", (ROOT / d).relative_to(ROOT))

‚úÖ Project skeleton created at: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework
 - src/cbe_repro/epi
 - src/cbe_repro/synth
 - src/cbe_repro/eval
 - src/cbe_repro/genai
 - src/cbe_repro/experiments
 - src/cbe_repro/configs
 - src/cbe_repro/data
 - tests
 - docs
 - scripts


### Step 2 ‚Äî Create a tiny, reproducible mpox-like symptom dataset + register it

### 2.1 Generate & save the dataset

In [11]:
# A) Make the package importable in this notebook
import sys
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
SRC  = ROOT / "src"

if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

print("sys.path updated ‚úÖ")

sys.path updated ‚úÖ


#### Step 2-Image(A) ‚Äî Register an image dataset

In [12]:
from pathlib import Path

RAW_DIR = Path.cwd() / "mpox_images_raw"  # adjust if your notebook is elsewhere
print("RAW_DIR:", RAW_DIR)

print("\nSubfolders:")
for p in RAW_DIR.iterdir():
    if p.is_dir():
        print(" -", p.name)

def count_images(d):
    exts = {".jpg",".jpeg",".png",".bmp"}
    return sum(1 for f in d.glob("*") if f.suffix.lower() in exts)

print("\nCounts:")
for cls in ["mpox", "non_mpox"]:
    d = RAW_DIR/cls
    print(f"{cls:10s}", count_images(d), "files")


RAW_DIR: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_images_raw

Subfolders:
 - non_mpox
 - mpox

Counts:
mpox       102 files
non_mpox   126 files


In [13]:
##2) Split raw into train/val (80/20) in the framework

In [14]:
import os, shutil, random
from pathlib import Path
import time # Import time for a small delay

# paths
ROOT = Path.cwd() / "mpox_repro_framework"
TARGET_DIR = ROOT / "src" / "cbe_repro" / "data" / "mpox_images"
RAW_DIR = Path.cwd() / "mpox_images_raw"   # change if needed

SPLIT_RATIO = 0.8
SEED = 1337
random.seed(SEED)

# --- More robust rmtree function ---
def rmtree_robust(path, max_attempts=5, delay_s=0.1):
    if not path.exists():
        return
    for attempt in range(max_attempts):
        try:
            shutil.rmtree(path)
            print(f"Successfully removed {path} on attempt {attempt + 1}")
            return
        except OSError as e:
            print(f"Attempt {attempt + 1} to remove {path} failed: {e}")
            if attempt < max_attempts - 1:
                time.sleep(delay_s) # Wait a bit before retrying
            else:
                raise # Re-raise if all attempts fail

# clean + make target dirs
if TARGET_DIR.exists():
    rmtree_robust(TARGET_DIR) # Use the robust function here
for sub in ["train/mpox","train/non_mpox","val/mpox","val/non_mpox"]:
    (TARGET_DIR / sub).mkdir(parents=True, exist_ok=True)

def split_class(class_name):
    exts = {".jpg",".jpeg",".png",".bmp"}
    files = [f for f in (RAW_DIR/class_name).glob("*") if f.suffix.lower() in exts]
    files.sort()           # deterministic order before shuffling
    random.shuffle(files)  # seeded shuffle
    n_train = int(len(files)*SPLIT_RATIO)
    train_files, val_files = files[:n_train], files[n_train:]
    for f in train_files:
        shutil.copy(f, TARGET_DIR/"train"/class_name/f.name)
    for f in val_files:
        shutil.copy(f, TARGET_DIR/"val"/class_name/f.name)
    return len(train_files), len(val_files)

for cls in ["mpox", "non_mpox"]:
    tr, va = split_class(cls)
    print(f"{cls:10s} ‚Üí train: {tr:4d}, val: {va:4d}")

print("\n‚úÖ Split complete at:", TARGET_DIR)

Successfully removed /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/data/mpox_images on attempt 1
mpox       ‚Üí train:   81, val:   21
non_mpox   ‚Üí train:  100, val:   26

‚úÖ Split complete at: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/data/mpox_images


In [15]:
### 3) Register the dataset in datasets.yaml

In [16]:
import yaml
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
cfg_path = ROOT / "src" / "cbe_repro" / "configs" / "datasets.yaml"

# load existing (robust to empty file)
existing_text = cfg_path.read_text() if cfg_path.exists() else ""
registry = yaml.safe_load(existing_text)
if registry is None:
    registry = {}

# add/replace mpox_images entry
registry["mpox_images"] = {
    "root": str(TARGET_DIR),            # absolute path to the split dataset
    "splits": {"train": "train", "val": "val"},
    "classes": {"positive": "mpox", "negative": "non_mpox"}
}

cfg_path.write_text(yaml.safe_dump(registry, sort_keys=False))
print("‚úÖ Updated datasets.yaml")
print(cfg_path.read_text())


‚úÖ Updated datasets.yaml
mpox_images:
  root: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/data/mpox_images
  splits:
    train: train
    val: val
  classes:
    positive: mpox
    negative: non_mpox



In [17]:
from pathlib import Path

root = TARGET_DIR
for split in ["train","val"]:
    for cls in ["mpox","non_mpox"]:
        n = sum(1 for f in (root/split/cls).glob("*") if f.suffix.lower() in {".jpg",".jpeg",".png",".bmp"})
        print(f"{split:5s} / {cls:9s}: {n} images")


train / mpox     : 81 images
train / non_mpox : 100 images
val   / mpox     : 21 images
val   / non_mpox : 26 images


### Step 2-Image(B) ‚Äî Add a minimal loader + deterministic augmentor

In [18]:
# Overwrite image_loader.py with ASCII-safe content
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
code_path = ROOT / "src" / "cbe_repro" / "synth" / "image_loader.py"

code = '''# -*- coding: utf-8 -*-
from __future__ import annotations
import numpy as np
from pathlib import Path
from PIL import Image

class ImageFolderDataset:
    """
    Minimal, dependency-light loader for mpox lesion images.
    - Expects ImageFolder-style structure: root/{split}/{class}/*.jpg
    - Deterministic augmentations for reproducible "GenAI-style" oversampling
    - Simple featurization: downsample + color histograms (works with sklearn)
    """
    def __init__(self, root: str, split: str,
                 pos_cls: str = "mpox", neg_cls: str = "non_mpox",
                 seed: int = 1337, img_size: int = 128):
        self.root = Path(root)
        self.split = split
        self.pos_cls = pos_cls
        self.neg_cls = neg_cls
        self.img_size = img_size
        self.rng = np.random.default_rng(seed)
        self.samples, self.labels = self._collect()

    def _collect(self):
        exts = {".jpg", ".jpeg", ".png", ".bmp"}
        paths, labels = [], []
        for label, cls in [(1, self.pos_cls), (0, self.neg_cls)]:
            for p in (self.root / self.split / cls).glob("*"):
                if p.suffix.lower() in exts:
                    paths.append(p)
                    labels.append(label)
        return np.array(paths), np.array(labels, dtype=int)

    def __len__(self):
        return len(self.samples)

    # ---------- I/O + deterministic aug ----------

    def _load_image(self, path: Path) -> Image.Image:
        img = Image.open(path).convert("RGB")
        img = img.resize((self.img_size, self.img_size))
        return img

    def _deterministic_augment(self, img: Image.Image, idx: int, enable: bool = False) -> Image.Image:
        """A tiny, deterministic augmentor controlled by sample index."""
        if not enable:
            return img
        m = idx % 4
        if m == 1:
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
        elif m == 2:
            img = img.rotate(90, expand=False)
        elif m == 3:
            img = img.rotate(270, expand=False)
        return img

    # ---------- simple features ----------

    def _simple_features(self, img: Image.Image) -> np.ndarray:
        """
        Features = downsampled pixels + 3x16-bin color histograms.
        Keeps it light so we can use scikit-learn models deterministically.
        """
        arr = np.asarray(img, dtype=np.float32) / 255.0  # [H,W,3]
        # 3 histograms (R/G/B) with 16 bins each
        hist_r, _ = np.histogram(arr[:, :, 0], bins=16, range=(0, 1), density=True)
        hist_g, _ = np.histogram(arr[:, :, 1], bins=16, range=(0, 1), density=True)
        hist_b, _ = np.histogram(arr[:, :, 2], bins=16, range=(0, 1), density=True)
        # downsample (32x32x3 if img_size=128) and flatten
        flat = arr[::4, ::4, :].reshape(-1)
        feats = np.concatenate([flat, hist_r, hist_g, hist_b]).astype(np.float32)
        return feats

    # ---------- public API ----------

    def as_features_labels(self, synth_enabled: bool = False, synth_multiplier: float = 1.0):
        """
        Returns (X, y).
        If synth_enabled and multiplier>1, oversamples positives deterministically
        using the augmentor to diversify the added copies.
        """
        # base features
        X, y = [], []
        for i, p in enumerate(self.samples):
            img = self._load_image(p)
            img = self._deterministic_augment(img, i, enable=False)  # no aug on base
            X.append(self._simple_features(img))
            y.append(self.labels[i])
        X = np.vstack(X).astype(np.float32)
        y = np.array(y, dtype=int)

        # deterministic oversample of positives if requested
        if synth_enabled and synth_multiplier > 1.0:
            pos_idx = np.where(y == 1)[0]
            if len(pos_idx) > 0:
                target = int(len(pos_idx) * synth_multiplier)
                extra = target - len(pos_idx)
                if extra > 0:
                    rep = (extra + len(pos_idx) - 1) // len(pos_idx)
                    aug_idx = np.tile(pos_idx, rep)[:extra]
                    X_aug = []
                    for k, idx in enumerate(aug_idx):
                        img = self._load_image(self.samples[idx])
                        img = self._deterministic_augment(img, k, enable=True)
                        X_aug.append(self._simple_features(img))
                    if X_aug:
                        X = np.vstack([X, np.vstack(X_aug).astype(np.float32)])
                        y = np.concatenate([y, np.ones(len(X_aug), dtype=int)])
        return X, y
'''
code_path.parent.mkdir(parents=True, exist_ok=True)
code_path.write_text(code, encoding="utf-8")
print("Rewrote:", code_path)


Rewrote: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/synth/image_loader.py


In [19]:
import shutil
from pathlib import Path
pycache = (Path.cwd() / "mpox_repro_framework" / "src" / "cbe_repro" / "__pycache__")
shutil.rmtree(pycache, ignore_errors=True)
print("Cleared __pycache__ (if existed).")

Cleared __pycache__ (if existed).


In [20]:
import sys
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
SRC  = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

from cbe_repro.synth.image_loader import ImageFolderDataset
print("Imported ImageFolderDataset ‚úÖ")

Imported ImageFolderDataset ‚úÖ


In [21]:
# STEP 2-Image(B).2: Smoke test the loader on your split dataset
from pathlib import Path
import yaml
from collections import Counter

ROOT = Path.cwd() / "mpox_repro_framework"
cfg_path = ROOT / "src" / "cbe_repro" / "configs" / "datasets.yaml"

# read registry
reg = yaml.safe_load(cfg_path.read_text())
entry = reg["mpox_images"]
root = entry["root"]
pos = entry["classes"]["positive"]
neg = entry["classes"]["negative"]
splits = entry["splits"]

# import the loader we wrote
from cbe_repro.synth.image_loader import ImageFolderDataset

SEED = 1337
IMG_SIZE = 128

# train and val datasets
train_ds = ImageFolderDataset(root, splits["train"], pos_cls=pos, neg_cls=neg, seed=SEED, img_size=IMG_SIZE)
val_ds   = ImageFolderDataset(root, splits["val"],   pos_cls=pos, neg_cls=neg, seed=SEED, img_size=IMG_SIZE)

# extract features
Xtr, ytr = train_ds.as_features_labels(synth_enabled=False)
Xva, yva = val_ds.as_features_labels(synth_enabled=False)

print("Train features:", Xtr.shape, "Label counts:", Counter(ytr))
print("Val   features:", Xva.shape, "Label counts:", Counter(yva))

# test GenAI-style oversampling (triple positives)
Xtr_gen, ytr_gen = train_ds.as_features_labels(synth_enabled=True, synth_multiplier=3.0)
print("Train (synth x3):", Xtr_gen.shape, "Label counts:", Counter(ytr_gen))


Train features: (181, 3120) Label counts: Counter({np.int64(0): 100, np.int64(1): 81})
Val   features: (47, 3120) Label counts: Counter({np.int64(0): 26, np.int64(1): 21})
Train (synth x3): (343, 3120) Label counts: Counter({np.int64(1): 243, np.int64(0): 100})


In [22]:
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
pkg = ROOT / "src" / "cbe_repro"
loader_file = pkg / "synth" / "image_loader.py"

# create __init__.py if it doesn't exist
(pkg).mkdir(parents=True, exist_ok=True)
(pkg / "__init__.py").write_text("__all__ = []\n") if not (pkg / "__init__.py").exists() else None

print("Package dir:", pkg.exists(), pkg)
print("Loader file:", loader_file.exists(), loader_file)

Package dir: True /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro
Loader file: True /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/synth/image_loader.py


### Step 2-Image(C) ‚Äî Add a tiny imaging experiment runner

In [23]:
# STEP 2-Image(C).1: Write imaging experiment runner
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
runner_path = ROOT / "src" / "cbe_repro" / "experiments" / "run_imaging.py"

code = '''# -*- coding: utf-8 -*-
from __future__ import annotations
import json, time, pathlib, yaml, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

from cbe_repro.synth.image_loader import ImageFolderDataset

def set_seed(seed: int):
    np.random.seed(seed)

def load_cfg(name: str):
    cfg_dir = pathlib.Path(__file__).resolve().parents[1] / "configs"
    with open(cfg_dir / name, "r") as f:
        return yaml.safe_load(f)

def _load_dataset_entry(reg_name: str):
    cfg_dir = pathlib.Path(__file__).resolve().parents[1] / "configs"
    with open(cfg_dir / "datasets.yaml","r") as f:
        reg = yaml.safe_load(f)
    return reg[reg_name]

def main(config_name="imaging_baseline.yaml"):
    cfg = load_cfg(config_name)
    seed = int(cfg.get("seed", 1337))
    set_seed(seed)

    ds_reg = _load_dataset_entry(cfg["dataset"])
    root = ds_reg["root"]
    pos = ds_reg["classes"]["positive"]
    neg = ds_reg["classes"]["negative"]
    splits = ds_reg["splits"]

    # Train features (optionally with deterministic oversampling)
    train_ds = ImageFolderDataset(root, splits["train"], pos_cls=pos, neg_cls=neg,
                                  seed=seed, img_size=cfg["image_size"])
    X_tr, y_tr = train_ds.as_features_labels(
        synth_enabled=cfg.get("synth",{}).get("enabled", False),
        synth_multiplier=float(cfg.get("synth",{}).get("minority_multiplier", 1.0))
    )

    # Validation features (clean)
    val_ds = ImageFolderDataset(root, splits["val"], pos_cls=pos, neg_cls=neg,
                                seed=seed, img_size=cfg["image_size"])
    X_va, y_va = val_ds.as_features_labels(synth_enabled=False)

    # Simple, deterministic classifier
    model = LogisticRegression(max_iter=300, **cfg.get("model",{}).get("params", {}))
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_va)

    acc = float(accuracy_score(y_va, y_pred))
    f1  = float(f1_score(y_va, y_pred))

    manifest = {
        "run_id": str(int(time.time())),
        "started": time.strftime("%Y-%m-%d %H:%M:%S"),
        "config_name": config_name,
        "metrics": {"acc": acc, "f1": f1},
        "notes": f"imaging pipeline; synth={cfg.get('synth',{}).get('enabled', False)}; img_size={cfg['image_size']}"
    }
    out_dir = pathlib.Path.cwd() / "runs" / manifest["run_id"]
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "manifest.json").write_text(json.dumps(manifest, indent=2))
    print(json.dumps(manifest, indent=2))

if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("--config-name", default="imaging_baseline.yaml")
    args = ap.parse_args()
    main(config_name=args.config_name)
'''
runner_path.parent.mkdir(parents=True, exist_ok=True)
runner_path.write_text(code, encoding="utf-8")
print("‚úÖ Wrote:", runner_path)


‚úÖ Wrote: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/experiments/run_imaging.py


### Add two configs: baseline vs GenAI

In [24]:
# STEP 2-Image(C).2: Write imaging configs
import yaml
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
cfg_dir = ROOT / "src" / "cbe_repro" / "configs"
cfg_dir.mkdir(parents=True, exist_ok=True)

imaging_baseline = {
    "seed": 1337,
    "dataset": "mpox_images",
    "task": "imaging_classification",
    "image_size": 128,
    "model": {"name": "LogisticRegression", "params": {"C": 1.0}},
    "synth": {"enabled": False, "minority_multiplier": 1.0}
}

imaging_genai = {
    "seed": 1337,
    "dataset": "mpox_images",
    "task": "imaging_classification",
    "image_size": 128,
    "model": {"name": "LogisticRegression", "params": {"C": 1.0}},
    "synth": {"enabled": True, "minority_multiplier": 3.0}  # deterministically oversample positives
}

(cfg_dir / "imaging_baseline.yaml").write_text(yaml.safe_dump(imaging_baseline, sort_keys=False))
(cfg_dir / "imaging_genai.yaml").write_text(yaml.safe_dump(imaging_genai, sort_keys=False))

print("‚úÖ Wrote:", cfg_dir / "imaging_baseline.yaml")
print("‚úÖ Wrote:", cfg_dir / "imaging_genai.yaml")


‚úÖ Wrote: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/imaging_baseline.yaml
‚úÖ Wrote: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/imaging_genai.yaml


### Run both experiments and compare

In [25]:
# Run imaging experiments with PYTHONPATH pointing at ./src
import os, sys, subprocess
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
SRC  = ROOT / "src"

env = os.environ.copy()
env["PYTHONPATH"] = str(SRC) + os.pathsep + env.get("PYTHONPATH", "")

def run_cfg(cfg_name):
    print(f"\n=== {cfg_name} ===")
    p = subprocess.run(
        [sys.executable, "-m", "cbe_repro.experiments.run_imaging", "--config-name", cfg_name],
        cwd=ROOT,
        env=env,
        capture_output=True,
        text=True
    )
    print("Return code:", p.returncode)
    if p.stdout: print("-- STDOUT --\n", p.stdout)
    if p.stderr: print("-- STDERR --\n", p.stderr)

run_cfg("imaging_baseline.yaml")
run_cfg("imaging_genai.yaml")



=== imaging_baseline.yaml ===
Return code: 0
-- STDOUT --
 {
  "run_id": "1756981262",
  "started": "2025-09-04 12:21:02",
  "config_name": "imaging_baseline.yaml",
  "metrics": {
    "acc": 0.5957446808510638,
    "f1": 0.5777777777777777
  },
  "notes": "imaging pipeline; synth=False; img_size=128"
}


=== imaging_genai.yaml ===
Return code: 0
-- STDOUT --
 {
  "run_id": "1756981264",
  "started": "2025-09-04 12:21:04",
  "config_name": "imaging_genai.yaml",
  "metrics": {
    "acc": 0.574468085106383,
    "f1": 0.6
  },
  "notes": "imaging pipeline; synth=True; img_size=128"
}



### Tabular loader & experiment runner

In [26]:
# Create tiny symptom dataset + ensure datasets.yaml has BOTH entries
import numpy as np, pandas as pd, yaml
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
DATA_DIR = ROOT / "src" / "cbe_repro" / "data"
CFG_PATH = ROOT / "src" / "cbe_repro" / "configs" / "datasets.yaml"
DATA_DIR.mkdir(parents=True, exist_ok=True)

csv_path = DATA_DIR / "symptom_demo.csv"

# 1A) Make the dataset if missing
if not csv_path.exists():
    rng = np.random.default_rng(1337)
    def synth_symptom(n: int, label: int, rng):
        p = {
            1: dict(fever=0.65, rash=0.70, lymph=0.55, headache=0.45, age_mu=32, age_sd=8),
            0: dict(fever=0.20, rash=0.10, lymph=0.15, headache=0.25, age_mu=29, age_sd=8),
        }[label]
        return pd.DataFrame({
            "fever":    rng.binomial(1, p["fever"],    n),
            "rash":     rng.binomial(1, p["rash"],     n),
            "lymph":    rng.binomial(1, p["lymph"],    n),
            "headache": rng.binomial(1, p["headache"], n),
            "age":      rng.normal(p["age_mu"], p["age_sd"], n).clip(0, 90),
            "label":    label
        })
    df = pd.concat([synth_symptom(40, 1, rng), synth_symptom(160, 0, rng)], ignore_index=True)
    df.to_csv(csv_path, index=False)
    print("‚úÖ Wrote dataset:", csv_path)
else:
    print("Dataset already exists:", csv_path)

# 1B) Merge/update datasets.yaml
existing = yaml.safe_load(CFG_PATH.read_text()) if CFG_PATH.exists() else {}
if existing is None:
    existing = {}

# preserve any existing mpox_images entry
mpox_images_entry = existing.get("mpox_images")

# add/update symptom_demo entry
existing["symptom_demo"] = {
    "path": "data/symptom_demo.csv"  # the runner reads from src/cbe_repro/data/<basename>
}

# restore mpox_images if it was present
if mpox_images_entry:
    existing["mpox_images"] = mpox_images_entry

CFG_PATH.write_text(yaml.safe_dump(existing, sort_keys=False))
print("‚úÖ Updated datasets.yaml:\n", CFG_PATH.read_text())


Dataset already exists: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/data/symptom_demo.csv
‚úÖ Updated datasets.yaml:
 mpox_images:
  root: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/data/mpox_images
  splits:
    train: train
    val: val
  classes:
    positive: mpox
    negative: non_mpox
symptom_demo:
  path: data/symptom_demo.csv



#### 3.1 Loader + synthetic augmentation (SMOTE/oversample fallback)

In [27]:
# src/cbe_repro/synth/symptom_smote.py
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
code_path = ROOT / "src" / "cbe_repro" / "synth" / "symptom_smote.py"

code = '''# -*- coding: utf-8 -*-
from __future__ import annotations
import numpy as np, pandas as pd

def _counts(y: pd.Series):
    # returns dict {label: count}
    vals, cnts = np.unique(y, return_counts=True)
    return dict(zip(vals, cnts))

def simple_random_oversample_any_minority(
    X: pd.DataFrame, y: pd.Series, target_ratio: float = 1.0, seed: int = 1337
):
    """
    Randomly oversample the true minority class(es) until:
      minority_count ‚âà target_ratio * majority_count
    target_ratio=1.0 -> full balance (equal counts).
    """
    rng = np.random.default_rng(seed)
    counts = _counts(y)
    if len(counts) <= 1:
        return X, y  # nothing to balance

    # identify majority and minority labels
    maj_label = max(counts, key=counts.get)
    maj_n = counts[maj_label]
    target_min_n = int(np.ceil(target_ratio * maj_n))

    X_aug, y_aug = X.copy(), y.copy()
    for lbl, n in counts.items():
        if lbl == maj_label:
            continue
        if n >= target_min_n:
            continue  # already at/above target
        need = target_min_n - n
        idx = np.where(y == lbl)[0]
        sampled = rng.choice(idx, size=need, replace=True)
        X_aug = pd.concat([X_aug, X.iloc[sampled]], ignore_index=True)
        y_aug = pd.concat([y_aug, y.iloc[sampled]], ignore_index=True)

    return X_aug, y_aug

def smote_or_oversample(
    X: pd.DataFrame,
    y: pd.Series,
    multiplier: float = 2.0,        # kept for backward-compat
    seed: int = 1337,
    balance_to_max: bool = False,   # NEW: if True, ignore multiplier and fully balance
    target_ratio: float | None = None  # NEW: minority/majority ratio; 1.0 == full balance
):
    """
    If balance_to_max=True -> fully balance (minority up to majority).
    Else if target_ratio is given -> minority up to target_ratio * majority.
    Else fallback to legacy 'multiplier' for *positive-class* only (y==1).
    """
    # Decide strategy
    if balance_to_max or (target_ratio is not None):
        ratio = 1.0 if balance_to_max else float(target_ratio)
        try:
            from imblearn.over_sampling import SMOTE
            counts = _counts(y)
            if len(counts) <= 1:
                return X, y

            # Build sampling_strategy dict: for every minority class, desired count
            maj_n = max(counts.values())
            desired = {lbl: max(n, int(np.ceil(ratio * maj_n)))
                       for lbl, n in counts.items()}
            # imblearn expects only minority targets in dict; remove majority if equal
            maj_label = max(counts, key=counts.get)
            if desired.get(maj_label, maj_n) == maj_n:
                desired.pop(maj_label, None)

            if not desired:
                return X, y  # already balanced to desired ratio

            # k_neighbors must be < minority count; pick safely
            min_minority_n = min(n for lbl, n in counts.items() if lbl != maj_label)
            k = max(1, min(5, min_minority_n - 1))
            sm = SMOTE(random_state=seed, k_neighbors=k, sampling_strategy=desired)
            X_res, y_res = sm.fit_resample(X, y)
            print(f"[DEBUG] SMOTE balance ‚Üí before={counts} after={_counts(y_res)} target_ratio={ratio}")
            # Return as pandas
            return (pd.DataFrame(X_res, columns=X.columns)
                    if not isinstance(X, pd.DataFrame) else X_res, 
                    pd.Series(y_res) if not isinstance(y, pd.Series) else y_res)
        except Exception as e:
            print(f"[DEBUG] SMOTE unavailable ({e}); falling back to random oversample")
            X2, y2 = simple_random_oversample_any_minority(X, y, target_ratio=ratio, seed=seed)
            print(f"[DEBUG] Random balance ‚Üí before={_counts(y)} after={_counts(y2)} target_ratio={ratio}")
            return X2, y2

    # -------- Legacy path (kept so your old profiles still run) --------
    # Only oversamples the positive class (1) by 'multiplier' ‚Äî not class-aware.
    # Prefer the balanced modes above for real class balancing.
    try:
        from imblearn.over_sampling import SMOTE
        pos_n = int((y == 1).sum()); neg_n = int((y == 0).sum())
        k = max(1, min(5, pos_n - 1))
        sm = SMOTE(random_state=seed, k_neighbors=k)
        X_res, y_res = sm.fit_resample(X, y)
        target_pos = int(multiplier * pos_n)
        cur_pos = int((y_res == 1).sum())
        if target_pos > cur_pos:
            more = target_pos - cur_pos
            pos_idx = np.where(y_res == 1)[0]
            rng = np.random.default_rng(seed)
            sampled = rng.choice(pos_idx, size=more, replace=True)
            X_res = pd.concat(
                [pd.DataFrame(X_res, columns=X.columns),
                 pd.DataFrame(X_res[sampled], columns=X.columns)],
                ignore_index=True
            )
            y_res = pd.concat([pd.Series(y_res), pd.Series(y_res[sampled])], ignore_index=True)
        print(f"[DEBUG] Legacy synth ‚Üí before=[{neg_n} {pos_n}] after={_counts(y_res)} multiplier={multiplier}")
        return X_res, y_res
    except Exception:
        # simple positive-only oversample
        rng = np.random.default_rng(seed)
        pos_idx = np.where(y == 1)[0]
        if len(pos_idx) == 0 or multiplier <= 1.0:  # nothing to do
            return X, y
        n_new = int((multiplier - 1.0) * len(pos_idx))
        sampled = rng.choice(pos_idx, size=n_new, replace=True)
        X_aug = pd.concat([X, X.iloc[sampled]], ignore_index=True)
        y_aug = pd.concat([y, y.iloc[sampled]], ignore_index=True)
        print(f"[DEBUG] Legacy random ‚Üí before={_counts(y)} after={_counts(y_aug)} multiplier={multiplier}")
        return X_aug, y_aug

'''
code_path.parent.mkdir(parents=True, exist_ok=True)
code_path.write_text(code, encoding="utf-8")
print("‚úÖ Wrote:", code_path)


‚úÖ Wrote: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/synth/symptom_smote.py


#### Tabular experiment runner

In [28]:
# src/cbe_repro/experiments/run_tabular.py
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
runner_path = ROOT / "src" / "cbe_repro" / "experiments" / "run_tabular.py"

code = '''# -*- coding: utf-8 -*-
from __future__ import annotations
import numpy as np, pandas as pd

def _as_df(X):
    # Ensure we keep column names after imblearn returns numpy arrays
    if isinstance(X, pd.DataFrame):
        return X
    raise TypeError("X must be a pandas DataFrame")

def _as_sr(y):
    if isinstance(y, pd.Series):
        return y
    raise TypeError("y must be a pandas Series")

def simple_random_oversample(X: pd.DataFrame, y: pd.Series, target_min_count: int, seed: int=1337):
    """Randomly oversample the current minority class up to target_min_count."""
    rng = np.random.default_rng(seed)
    X = _as_df(X); y = _as_sr(y)

    # Identify classes
    vc = y.value_counts()
    minority_label = vc.idxmin()
    minority_idx = np.where(y.values == minority_label)[0]

    need = int(target_min_count) - len(minority_idx)
    if need <= 0:
        return X, y

    sampled = rng.choice(minority_idx, size=need, replace=True)
    X_aug = pd.concat([X, X.iloc[sampled]], ignore_index=True)
    y_aug = pd.concat([y, y.iloc[sampled]], ignore_index=True)
    return X_aug, y_aug

def smote_or_oversample(
    X: pd.DataFrame,
    y: pd.Series,
    multiplier: float = 2.0,
    seed: int = 1337,
    *,
    balance_to_max: bool = False,
    target_ratio: float | None = None,
):
    """
    Oversample the minority class using SMOTE when available; else simple random oversampling.

    Parameters
    ----------
    multiplier : float
        If neither balance_to_max nor target_ratio is set, grow the minority count by this factor.
        e.g., 2.0 doubles the minority count.
    balance_to_max : bool
        If True, make the minority count equal to the majority count (full balance).
    target_ratio : float | None
        Desired minority/majority ratio (e.g., 0.7 -> minority will be 70% of majority size).

    Notes
    -----
    - Minority class is detected automatically from y‚Äôs value_counts().
    - Works when the *positive* class is the majority as well (we just balance the minority).
    """
    X = _as_df(X); y = _as_sr(y)

    vc = y.value_counts()
    if len(vc) < 2:
        # Nothing to resample if only one class
        return X, y

    # Identify minority/majority
    minority_label = vc.idxmin()
    majority_label = vc.idxmax()
    n_min = int(vc.loc[minority_label])
    n_maj = int(vc.loc[majority_label])

    # Decide target minority count
    if balance_to_max:
        target_min = n_maj
    elif target_ratio is not None:
        target_min = int(round(target_ratio * n_maj))
    else:
        target_min = int(round(multiplier * n_min))

    # No need to upsample if target not larger than current minority
    if target_min <= n_min:
        return X, y

    # Try SMOTE first
    try:
        from imblearn.over_sampling import SMOTE
        # sampling_strategy expects the *final* minority count
        sampling_strategy = {minority_label: target_min}
        k = min(5, max(1, n_min - 1))
        sm = SMOTE(random_state=seed, k_neighbors=k, sampling_strategy=sampling_strategy)
        X_res, y_res = sm.fit_resample(X, y)
        # Keep as DataFrame/Series with original column names
        X_res = pd.DataFrame(X_res, columns=X.columns)
        y_res = pd.Series(y_res, name=y.name)
        return X_res, y_res
    except Exception:
        # Fallback: random oversample to target_min
        return simple_random_oversample(X, y, target_min_count=target_min, seed=seed)

'''
runner_path.parent.mkdir(parents=True, exist_ok=True)
runner_path.write_text(code, encoding="utf-8")
print("‚úÖ Wrote:", runner_path)


‚úÖ Wrote: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/experiments/run_tabular.py


#### Tabular configs (baseline vs GenAI)

In [29]:
# src/cbe_repro/configs/tabular_baseline.yaml & tabular_genai.yaml
import yaml
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", message=".*_validate_data.*", category=FutureWarning)

ROOT = Path.cwd() / "mpox_repro_framework"
cfg_dir = ROOT / "src" / "cbe_repro" / "configs"

tabular_baseline = {
    "seed": 1337,
    "dataset": "symptom_demo",   # was registered earlier
    "task": "symptom_classification",
    "model": {"name": "RandomForestClassifier", "params": {"n_estimators": 200, "max_depth": 6, "random_state": 1337}},
    "synth": {"enabled": False, "minority_multiplier": 1.0}
}

tabular_genai = {
    "seed": 1337,
    "dataset": "symptom_demo",
    "task": "symptom_classification",
    "model": {"name": "RandomForestClassifier", "params": {"n_estimators": 300, "max_depth": 8, "random_state": 1337, "class_weight": "balanced" }},
    "synth": {"enabled": True, "minority_multiplier": 2.0}
}

(cfg_dir / "tabular_baseline.yaml").write_text(yaml.safe_dump(tabular_baseline, sort_keys=False))
(cfg_dir / "tabular_genai.yaml").write_text(yaml.safe_dump(tabular_genai, sort_keys=False))

print("‚úÖ Wrote:", cfg_dir / "tabular_baseline.yaml")
print("‚úÖ Wrote:", cfg_dir / "tabular_genai.yaml")


‚úÖ Wrote: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/tabular_baseline.yaml
‚úÖ Wrote: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/tabular_genai.yaml


#### Run tabular experiments

In [30]:
import yaml
from pathlib import Path
CFG_PATH = Path.cwd() / "mpox_repro_framework" / "src" / "cbe_repro" / "configs" / "datasets.yaml"
reg = yaml.safe_load(CFG_PATH.read_text())
print("Registry keys:", list(reg.keys()))
assert "symptom_demo" in reg, "symptom_demo not found in datasets.yaml"


Registry keys: ['mpox_images', 'symptom_demo']


In [31]:
# Run tabular baseline & genai with PYTHONPATH
import os, sys, subprocess
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
SRC  = ROOT / "src"

env = os.environ.copy()
env["PYTHONPATH"] = str(SRC) + os.pathsep + env.get("PYTHONPATH", "")

def run_cfg(cfg_name):
    print(f"\n=== {cfg_name} ===")
    p = subprocess.run(
        [sys.executable, "-m", "cbe_repro.experiments.run_tabular", "--config-name", cfg_name],
        cwd=ROOT,
        env=env,
        capture_output=True,
        text=True
    )
    print("Return code:", p.returncode)
    if p.stdout: print("-- STDOUT --\n", p.stdout)
    if p.stderr: print("-- STDERR --\n", p.stderr)

run_cfg("tabular_baseline.yaml")
run_cfg("tabular_genai.yaml")



=== tabular_baseline.yaml ===
Return code: 0

=== tabular_genai.yaml ===
Return code: 0


In [32]:
# # Build comparison across imaging + tabular and save to CSV/Markdown
# import json, pandas as pd
# from pathlib import Path

# ROOT = Path.cwd() / "mpox_repro_framework"
# RUNS = ROOT / "runs"

# rows = []
# for d in RUNS.glob("*"):
#     mf = d / "manifest.json"
#     if mf.exists():
#         j = json.loads(mf.read_text())
#         rows.append({
#             "config": j["config_name"],
#             "acc": j["metrics"]["acc"],
#             "f1": j["metrics"]["f1"],
#             "started": j["started"],
#             "run_id": j["run_id"]
#         })

# df = pd.DataFrame(rows)
# df = df.sort_values("started").groupby("config", as_index=False).tail(1).reset_index(drop=True)

# def pair(stem):
#     base = df[df["config"]==f"{stem}_baseline.yaml"]
#     gen  = df[df["config"]==f"{stem}_genai.yaml"]
#     if base.empty or gen.empty: return None
#     return {
#         "task": stem,
#         "baseline_acc": float(base["acc"].iloc[0]),
#         "genai_acc": float(gen["acc"].iloc[0]),
#         "delta_acc": float(gen["acc"].iloc[0]) - float(base["acc"].iloc[0]),
#         "baseline_f1": float(base["f1"].iloc[0]),
#         "genai_f1": float(gen["f1"].iloc[0]),
#         "delta_f1": float(gen["f1"].iloc[0]) - float(base["f1"].iloc[0])
#     }

# rows = []
# for stem in ["imaging", "tabular"]:
#     p = pair(stem)
#     if p: rows.append(p)
# comp = pd.DataFrame(rows)
# display(df)
# display(comp)

# # Save artifacts
# out_dir = ROOT / "reports"
# out_dir.mkdir(parents=True, exist_ok=True)
# df.to_csv(out_dir / "all_latest_runs.csv", index=False)
# comp.to_csv(out_dir / "comparison_table.csv", index=False)

# # Markdown summary snippet for your paper
# md = [
#     "# Baseline vs GenAI-assisted Reproducibility (Latest Runs)",
#     "",
#     "## Per-config latest metrics",
#     df.to_markdown(index=False),
#     "",
#     "## Baseline vs GenAI (delta)",
#     comp.to_markdown(index=False)
# ]
# (out_dir / "comparison_summary.md").write_text("\n".join(md), encoding="utf-8")
# print("Saved:", out_dir / "all_latest_runs.csv")
# print("Saved:", out_dir / "comparison_table.csv")
# print("Saved:", out_dir / "comparison_summary.md")


KeyError: 'config_name'

In [33]:
# quick inner-CV over the train fold to pick multiplier (1.0 = no oversample)
import numpy as np, pandas as pd, yaml
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from cbe_repro.synth.symptom_smote import smote_or_oversample

ROOT = Path.cwd() / "mpox_repro_framework"
cfg_dir = ROOT / "src" / "cbe_repro" / "configs"

cfg = yaml.safe_load((cfg_dir/"tabular_genai.yaml").read_text())
ds = yaml.safe_load((cfg_dir/"datasets.yaml").read_text())["symptom_demo"]
data_path = ROOT / "src" / "cbe_repro" / "data" / Path(ds["path"]).name

import pandas as pd
df = pd.read_csv(data_path)
X = df.drop(columns=["label"])
y = df["label"].astype(int)

cands = [1.0, 1.5, 2.0, 2.5, 3.0]
seed = cfg.get("seed",1337)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
scores = []

for mult in cands:
    fold_f1 = []
    for tr, va in skf.split(X,y):
        X_tr, X_va = X.iloc[tr], X.iloc[va]
        y_tr, y_va = y.iloc[tr], y.iloc[va]
        if mult>1.0:
            X_tr, y_tr = smote_or_oversample(X_tr, y_tr, multiplier=mult, seed=seed)
        clf = RandomForestClassifier(**cfg["model"]["params"])
        clf.fit(X_tr, y_tr)
        pred = clf.predict(X_va)
        fold_f1.append(f1_score(y_va, pred))
    scores.append((mult, float(np.mean(fold_f1))))
print("CV F1 by multiplier:", scores)


[DEBUG] Legacy synth ‚Üí before=[128 32] after={np.int64(0): np.int64(128), np.int64(1): np.int64(128)} multiplier=1.5
[DEBUG] Legacy synth ‚Üí before=[128 32] after={np.int64(0): np.int64(128), np.int64(1): np.int64(128)} multiplier=1.5
[DEBUG] Legacy synth ‚Üí before=[128 32] after={np.int64(0): np.int64(128), np.int64(1): np.int64(128)} multiplier=1.5
[DEBUG] Legacy synth ‚Üí before=[128 32] after={np.int64(0): np.int64(128), np.int64(1): np.int64(128)} multiplier=1.5
[DEBUG] Legacy synth ‚Üí before=[128 32] after={np.int64(0): np.int64(128), np.int64(1): np.int64(128)} multiplier=1.5
[DEBUG] Legacy synth ‚Üí before=[128 32] after={np.int64(0): np.int64(128), np.int64(1): np.int64(128)} multiplier=2.0
[DEBUG] Legacy synth ‚Üí before=[128 32] after={np.int64(0): np.int64(128), np.int64(1): np.int64(128)} multiplier=2.0
[DEBUG] Legacy synth ‚Üí before=[128 32] after={np.int64(0): np.int64(128), np.int64(1): np.int64(128)} multiplier=2.0
[DEBUG] Legacy synth ‚Üí before=[128 32] after={

In [34]:
import pandas as pd, numpy as np, yaml
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_recall_curve

# load the symptom dataset
ROOT = Path.cwd() / "mpox_repro_framework"
csv_path = ROOT / "src" / "cbe_repro" / "data" / "symptom_demo.csv"
df = pd.read_csv(csv_path)

X = df.drop(columns=["label"])
y = df["label"].astype(int)

# split into train/test
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, stratify=y, random_state=1337)

# fit a baseline RF model
clf = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=1337)
clf.fit(X_tr, y_tr)

# predict probabilities on test
proba = clf.predict_proba(X_te)[:,1]

# threshold tuning
prec, rec, thr = precision_recall_curve(y_te, proba)
f1s = 2*prec*rec/(prec+rec+1e-8)
best_thr = thr[f1s[:-1].argmax()]
print("Best threshold for F1:", best_thr)

# compute F1 at that threshold
y_pred = (proba >= best_thr).astype(int)
print("F1 at best threshold:", f1_score(y_te, y_pred))


Best threshold for F1: 0.4534166666666667
F1 at best threshold: 0.8235294117647058


In [35]:
import numpy as np
from sklearn.utils import resample
def bootstrap_ci(y_true, y_pred, metric, B=1000, alpha=0.05, seed=1337):
    rng = np.random.default_rng(seed)
    stats = []
    idx = np.arange(len(y_true))
    for _ in range(B):
        b = rng.choice(idx, size=len(idx), replace=True)
        stats.append(metric(y_true[b], y_pred[b]))
    lo, hi = np.quantile(stats, [alpha/2, 1-alpha/2])
    return float(lo), float(hi)


In [36]:
from pathlib import Path
ROOT = Path.cwd() / "mpox_repro_framework"
cfg_path = ROOT / "src" / "cbe_repro" / "configs" / "datasets.yaml"
print("datasets.yaml exists:", cfg_path.exists())
print(cfg_path.read_text() if cfg_path.exists() else "(missing)")


datasets.yaml exists: True
mpox_images:
  root: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/data/mpox_images
  splits:
    train: train
    val: val
  classes:
    positive: mpox
    negative: non_mpox
symptom_demo:
  path: data/symptom_demo.csv



## Moving to the next steps

#### Orchestrator: run all experiments and collect manifests

In [37]:
# STEP 5.1 ‚Äî Orchestrator: run all 4 experiments and return their manifests
import os, sys, json, subprocess
from pathlib import Path
from datetime import datetime

ROOT = Path.cwd() / "mpox_repro_framework"
SRC  = ROOT / "src"
RUNS = ROOT / "runs"
REPORTS = ROOT / "reports"
REPORTS.mkdir(parents=True, exist_ok=True)

env = os.environ.copy()
env["PYTHONPATH"] = str(SRC) + os.pathsep + env.get("PYTHONPATH", "")

EXPS = [
    ("imaging_baseline.yaml", "cbe_repro.experiments.run_imaging"),
    ("imaging_genai.yaml",    "cbe_repro.experiments.run_imaging"),
    ("tabular_baseline.yaml", "cbe_repro.experiments.run_tabular"),
    ("tabular_genai.yaml",    "cbe_repro.experiments.run_tabular"),
]

def run_and_parse(config_name, module):
    p = subprocess.run(
        [sys.executable, "-m", module, "--config-name", config_name],
        cwd=ROOT, env=env, capture_output=True, text=True
    )
    if p.returncode != 0:
        print(f"‚ùå {config_name} failed.")
        print(p.stderr)
        return None
    # stdout is the JSON manifest printed by the runner
    try:
        j = json.loads(p.stdout)
    except Exception:
        # try reading from the newest file in runs/
        candidates = sorted(RUNS.glob("*/manifest.json"), key=lambda x: x.stat().st_mtime, reverse=True)
        j = json.loads(candidates[0].read_text()) if candidates else None
    return j

manifests = []
for cfg, mod in EXPS:
    print(f"‚ñ∂Ô∏è  {cfg}")
    j = run_and_parse(cfg, mod)
    if j: manifests.append(j)

print(f"\n‚úÖ Completed {len(manifests)}/{len(EXPS)} runs")
summary_path = REPORTS / f"latest_manifests_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
summary_path.write_text(json.dumps(manifests, indent=2))
print("Saved:", summary_path)


‚ñ∂Ô∏è  imaging_baseline.yaml
‚ñ∂Ô∏è  imaging_genai.yaml
‚ñ∂Ô∏è  tabular_baseline.yaml
‚ñ∂Ô∏è  tabular_genai.yaml

‚úÖ Completed 4/4 runs
Saved: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/latest_manifests_20250904_122246.json


#### ReproScore + unified comparison table

In [38]:
# STEP 5.2 ‚Äî ReproScore + comparison table
import json, yaml, pandas as pd
from pathlib import Path
from collections import defaultdict

ROOT = Path.cwd() / "mpox_repro_framework"
RUNS = ROOT / "runs"
CFG  = ROOT / "src" / "cbe_repro" / "configs"

def latest_by_config():
    rows = []
    for d in RUNS.glob("*"):
        mf = d / "manifest.json"
        if mf.exists():
            j = json.loads(mf.read_text())
            rows.append(j)
    if not rows:
        return pd.DataFrame()
    df = pd.DataFrame(rows).sort_values("started").groupby("config_name", as_index=False).tail(1)
    return df.reset_index(drop=True)

def repro_score(manifest):
    # 5 quick checks ‚Üí average
    score_bits = []
    # 1) config exists
    cfg_ok = (CFG / manifest["config_name"]).exists()
    score_bits.append(1.0 if cfg_ok else 0.0)
    # 2) seed in config
    try:
        cfg = yaml.safe_load((CFG/manifest["config_name"]).read_text())
        seed_ok = "seed" in cfg and isinstance(cfg["seed"], int)
    except Exception:
        seed_ok = False
    score_bits.append(1.0 if seed_ok else 0.0)
    # 3) data registered
    try:
        reg = yaml.safe_load((CFG/"datasets.yaml").read_text())
        ds_name = cfg.get("dataset")
        data_ok = ds_name in reg
    except Exception:
        data_ok = False
    score_bits.append(1.0 if data_ok else 0.0)
    # 4) manifest presence
    score_bits.append(1.0)  # we already have it if we're scoring
    # 5) deterministic augmentation flag sanity
    synth = (cfg.get("synth",{}) or {}).get("enabled", False)
    score_bits.append(1.0 if isinstance(synth, (bool,)) else 0.0)
    return float(sum(score_bits)/len(score_bits))

df = latest_by_config()
if df.empty:
    print("No runs found. Run the orchestrator first.")
else:
    df["ReproScore"] = df.apply(repro_score, axis=1)
    # tidy compare
    def pair(stem):
        b = df[df["config_name"]==f"{stem}_baseline.yaml"]
        g = df[df["config_name"]==f"{stem}_genai.yaml"]
        if b.empty or g.empty: return None
        return {
            "task": stem,
            "baseline_acc": float(b["metrics"].iloc[0]["acc"]),
            "genai_acc": float(g["metrics"].iloc[0]["acc"]),
            "delta_acc":  float(g["metrics"].iloc[0]["acc"]) - float(b["metrics"].iloc[0]["acc"]),
            "baseline_f1": float(b["metrics"].iloc[0]["f1"]),
            "genai_f1": float(g["metrics"].iloc[0]["f1"]),
            "delta_f1":  float(g["metrics"].iloc[0]["f1"]) - float(b["metrics"].iloc[0]["f1"]),
            "baseline_ReproScore": float(b["ReproScore"].iloc[0]),
            "genai_ReproScore": float(g["ReproScore"].iloc[0]),
            "delta_ReproScore": float(g["ReproScore"].iloc[0]) - float(b["ReproScore"].iloc[0]),
        }
    rows = []
    for stem in ["imaging", "tabular"]:
        r = pair(stem)
        if r: rows.append(r)
    comp = pd.DataFrame(rows)
    display(df[["config_name","metrics","ReproScore","started"]])
    display(comp)

    # save artifacts
    out_dir = ROOT / "reports"
    out_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_dir / "latest_runs_with_reproscore.csv", index=False)
    comp.to_csv(out_dir / "baseline_vs_genai_comparison.csv", index=False)
    print("Saved:", out_dir / "latest_runs_with_reproscore.csv")
    print("Saved:", out_dir / "baseline_vs_genai_comparison.csv")


Unnamed: 0,config_name,metrics,ReproScore,started
0,imaging_baseline.yaml,"{'acc': 0.5957446808510638, 'f1': 0.5777777777...",1.0,2025-09-04 12:22:42
1,imaging_genai.yaml,"{'acc': 0.574468085106383, 'f1': 0.6}",1.0,2025-09-04 12:22:45


Unnamed: 0,task,baseline_acc,genai_acc,delta_acc,baseline_f1,genai_f1,delta_f1,baseline_ReproScore,genai_ReproScore,delta_ReproScore
0,imaging,0.595745,0.574468,-0.021277,0.577778,0.6,0.022222,1.0,1.0,0.0


Saved: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/latest_runs_with_reproscore.csv
Saved: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/baseline_vs_genai_comparison.csv


### Auto-documentation: generate a Markdown report

In [39]:
# STEP 5.3 ‚Äî Auto-generate Markdown report
from pathlib import Path
import pandas as pd, json, yaml
from datetime import datetime

ROOT = Path.cwd() / "mpox_repro_framework"
REPORTS = ROOT / "reports"

latest_df = pd.read_csv(REPORTS / "latest_runs_with_reproscore.csv")
comp      = pd.read_csv(REPORTS / "baseline_vs_genai_comparison.csv")

ts = datetime.now().strftime("%Y-%m-%d %H:%M")
lines = []
lines.append(f"# 4.2.5 Generative AI‚ÄìBased Interventions for Reproducibility")
lines.append("")
lines.append(f"_Auto report generated: {ts}_")
lines.append("")
lines.append("## Experimental Setup")
lines.append("- **Imaging:** deterministic feature extractor + logistic regression; GenAI flag performs deterministic oversampling of positives.")
lines.append("- **Symptoms (tabular):** RandomForest; GenAI flag uses SMOTE/oversampling (tunable).")
lines.append("- All runs fix `seed=1337` and log a JSON manifest.")
lines.append("")
lines.append("## Latest Per-Config Metrics")
lines.append(latest_df.to_markdown(index=False))
lines.append("")
lines.append("## Baseline vs GenAI (Œî)")
lines.append(comp.to_markdown(index=False))
lines.append("")
lines.append("## Reproducibility Notes")
lines.append("- We versioned configs, fixed RNG seeds, and saved run manifests.")
lines.append("- GenAI interventions were deterministic (oversampling with fixed patterns for imaging; seeded SMOTE/oversample for tabular).")
lines.append("- ReproScore aggregates presence of config/seed/data/manifest/augmentation control into a 0‚Äì1 score.")
lines.append("")
# brief interpretation
for _, r in comp.iterrows():
    t = r["task"].capitalize()
    lines.append(f"- **{t}**: F1 {r['baseline_f1']:.3f} ‚Üí {r['genai_f1']:.3f} (Œî {r['delta_f1']:+.3f}); "
                 f"Acc {r['baseline_acc']:.3f} ‚Üí {r['genai_acc']:.3f} (Œî {r['delta_acc']:+.3f}); "
                 f"ReproScore {r['baseline_ReproScore']:.2f} ‚Üí {r['genai_ReproScore']:.2f} (Œî {r['delta_ReproScore']:+.2f}).")

md_path = REPORTS / "section_4_2_5_results.md"
md_path.write_text("\n".join(lines), encoding="utf-8")
print("Saved report:", md_path)


Saved report: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/section_4_2_5_results.md


## Testing the framework

#### Create framework_runner.py

In [40]:
# STEP 3.1 ‚Äî Create framework_runner.py
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
runner_path = ROOT / "src" / "cbe_repro" / "experiments" / "framework_runner.py"

code = '''# -*- coding: utf-8 -*-
import json, datetime
from pathlib import Path
from . import run_imaging, run_tabular

def run_all():
    results = {}

    # Imaging baseline
    res_img_base = run_imaging.main("imaging_baseline.yaml", return_results=True)
    results["imaging_baseline"] = res_img_base

    # Imaging + GenAI augmentation
    res_img_gen = run_imaging.main("imaging_genai.yaml", return_results=True)
    results["imaging_genai"] = res_img_gen

    # Tabular baseline
    res_tab_base = run_tabular.main("tabular_baseline.yaml", return_results=True)
    results["tabular_baseline"] = res_tab_base

    # Tabular + GenAI augmentation
    res_tab_gen = run_tabular.main("tabular_genai.yaml", return_results=True)
    results["tabular_genai"] = res_tab_gen

    # Save combined results
    out_path = Path.cwd() / "mpox_repro_framework" / "reports" / "results_summary.json"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps({
        "timestamp": str(datetime.datetime.now()),
        "results": results
    }, indent=2))
    print(f"‚úÖ Saved combined results to {out_path}")
    return results

if __name__ == "__main__":
    run_all()
'''
runner_path.parent.mkdir(parents=True, exist_ok=True)
runner_path.write_text(code, encoding="utf-8")
print("‚úÖ Wrote:", runner_path)


‚úÖ Wrote: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/experiments/framework_runner.py


In [None]:
### Test the orchestrator

In [41]:
# import sys
# from pathlib import Path

# ROOT = Path.cwd() / "mpox_repro_framework"
# SRC  = ROOT / "src"
# if str(SRC) not in sys.path:
#     sys.path.insert(0, str(SRC))

# from cbe_repro.experiments import framework_runner
# results = framework_runner.run_all()
# results


TypeError: main() got an unexpected keyword argument 'return_results'

### Auto-documentation (Markdown) ‚Äî experiment cards

#### Create an auto-doc writer

In [42]:
# create: src/cbe_repro/reporting/write_docs.py
from pathlib import Path
import json, yaml, pandas as pd
from datetime import datetime

ROOT = Path.cwd() / "mpox_repro_framework"
SRC  = ROOT / "src"
RUNS = ROOT / "runs"
CFG  = SRC / "cbe_repro" / "configs"
REPORTS = ROOT / "reports"
REPORTS.mkdir(parents=True, exist_ok=True)

def latest_manifests():
    rows = []
    for d in RUNS.glob("*"):
        mf = d / "manifest.json"
        if mf.exists():
            rows.append(json.loads(mf.read_text()))
    if not rows: return pd.DataFrame()
    df = pd.DataFrame(rows).sort_values("started").groupby("config_name", as_index=False).tail(1)
    return df.reset_index(drop=True)

def repro_score(cfg_name):
    try:
        cfg_path = CFG / cfg_name
        reg_path = CFG / "datasets.yaml"
        cfg = yaml.safe_load(cfg_path.read_text())
        reg = yaml.safe_load(reg_path.read_text())
        bits = []
        bits.append(1.0 if cfg_path.exists() else 0.0)  # config present
        bits.append(1.0 if isinstance(cfg.get("seed"), int) else 0.0)  # seed fixed
        bits.append(1.0 if cfg.get("dataset") in reg else 0.0)  # data registered
        bits.append(1.0)  # manifest exists (we only score latest that exist)
        synth = (cfg.get("synth", {}) or {}).get("enabled", False)
        bits.append(1.0 if isinstance(synth, bool) else 0.0)  # augmentation control
        return float(sum(bits)/len(bits))
    except Exception:
        return 0.0

def write_experiment_cards():
    df = latest_manifests()
    if df.empty:
        print("No manifests found."); return
    cards_dir = REPORTS / "cards"
    cards_dir.mkdir(exist_ok=True)
    for _, row in df.iterrows():
        cfg = row["config_name"]
        m = row["metrics"]
        rs = repro_score(cfg)
        lines = []
        lines.append(f"# Experiment Card ‚Äî {cfg}")
        lines.append("")
        lines.append(f"- **Started:** {row['started']}")
        lines.append(f"- **ReproScore:** {rs:.2f}")
        lines.append(f"- **Accuracy:** {m['acc']:.3f}")
        lines.append(f"- **F1:** {m['f1']:.3f}")
        lines.append("")
        # echo key config knobs
        try:
            ycfg = yaml.safe_load((CFG/cfg).read_text())
            lines.append("## Key Config")
            lines.append(f"- Seed: `{ycfg.get('seed')}`")
            lines.append(f"- Dataset: `{ycfg.get('dataset')}`")
            if "image_size" in ycfg: lines.append(f"- Image size: `{ycfg['image_size']}`")
            synth = ycfg.get("synth", {})
            lines.append(f"- GenAI enabled: `{synth.get('enabled', False)}`")
            if "minority_multiplier" in synth:
                lines.append(f"- GenAI minority_multiplier: `{synth['minority_multiplier']}`")
            model = ycfg.get("model", {})
            lines.append(f"- Model: `{model.get('name')}` params: `{model.get('params')}`")
        except Exception:
            pass
        out = cards_dir / f"{cfg.replace('.yaml','')}.md"
        out.write_text("\n".join(lines), encoding="utf-8")
        print("Wrote card:", out)

def write_section_4_2_5_report():
    df = latest_manifests()
    if df.empty:
        print("No manifests found."); return
    # Build paired comparison
    def pick(name): 
        sub = df[df["config_name"]==name]
        return None if sub.empty else sub.iloc[0]
    rows = []
    for stem in ["imaging","tabular"]:
        base = pick(f"{stem}_baseline.yaml")
        gen  = pick(f"{stem}_genai.yaml")
        if base is None or gen is None: continue
        b, g = base["metrics"], gen["metrics"]
        row = {
            "task": stem,
            "baseline_acc": float(b["acc"]), "genai_acc": float(g["acc"]),
            "delta_acc": float(g["acc"])-float(b["acc"]),
            "baseline_f1": float(b["f1"]),   "genai_f1": float(g["f1"]),
            "delta_f1": float(g["f1"])-float(b["f1"]),
            "baseline_rs": repro_score(base["config_name"]),
            "genai_rs": repro_score(gen["config_name"]),
            "delta_rs": repro_score(gen["config_name"]) - repro_score(base["config_name"]),
        }
        rows.append(row)
    comp = pd.DataFrame(rows)

    ts = datetime.now().strftime("%Y-%m-%d %H:%M")
    lines = []
    lines.append("# 4.2.5 Generative AI‚ÄìBased Interventions for Reproducibility")
    lines.append("")
    lines.append(f"_Auto-generated: {ts}_")
    lines.append("")
    lines.append("## Setup (Reproducibility)")
    lines.append("- Fixed RNG seed (`seed=1337`) in configs.")
    lines.append("- Datasets registered in a central `datasets.yaml`.")
    lines.append("- Each run logs a JSON manifest under `runs/<id>/manifest.json`.")
    lines.append("- **GenAI interventions:** imaging = deterministic oversampling of positives; tabular = SMOTE/oversample with seed.")
    lines.append("")
    lines.append("## Latest Per-Config Metrics")
    lines.append(df[["config_name","metrics","started"]].to_markdown(index=False))
    lines.append("")
    lines.append("## Baseline vs GenAI (Œî)")
    if not comp.empty:
        lines.append(comp.to_markdown(index=False))
        for _, r in comp.iterrows():
            lines.append(f"- **{r['task'].capitalize()}**: F1 {r['baseline_f1']:.3f} ‚Üí {r['genai_f1']:.3f} (Œî {r['delta_f1']:+.3f}); "
                         f"Acc {r['baseline_acc']:.3f} ‚Üí {r['genai_acc']:.3f} (Œî {r['delta_acc']:+.3f}); "
                         f"ReproScore {r['baseline_rs']:.2f} ‚Üí {r['genai_rs']:.2f} (Œî {r['delta_rs']:+.2f}).")
    else:
        lines.append("_Not enough paired runs to compute deltas._")
    lines.append("")
    lines.append("## Transparency (Auto-Docs)")
    lines.append("- Per-experiment cards with key config knobs are in `reports/cards/`.")
    out = REPORTS / "section_4_2_5.md"
    out.write_text("\n".join(lines), encoding="utf-8")
    print("Wrote section report:", out)

if __name__ == "__main__":
    write_experiment_cards()
    write_section_4_2_5_report()


Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/imaging_baseline.md
Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/imaging_genai.md
Wrote section report: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/section_4_2_5.md


#### Generate docs

In [None]:
#Create cbe_repro/reporting/write_docs.py (and __init__.py)

In [43]:
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
SRC  = ROOT / "src"
PKG  = SRC / "cbe_repro"
REP  = PKG / "reporting"
REP.mkdir(parents=True, exist_ok=True)

# ensure packages
for p in [PKG, REP]:
    (p / "__init__.py").write_text("__all__ = []\n", encoding="utf-8")

code = '''# -*- coding: utf-8 -*-
from pathlib import Path
import json, yaml, pandas as pd
from datetime import datetime

ROOT = Path.cwd() / "mpox_repro_framework"
SRC  = ROOT / "src"
RUNS = ROOT / "runs"
CFG  = SRC / "cbe_repro" / "configs"
REPORTS = ROOT / "reports"
REPORTS.mkdir(parents=True, exist_ok=True)

def _latest_manifests_df():
    rows = []
    for d in RUNS.glob("*"):
        mf = d / "manifest.json"
        if mf.exists():
            rows.append(json.loads(mf.read_text()))
    if not rows:
        return pd.DataFrame()
    df = pd.DataFrame(rows).sort_values("started").groupby("config_name", as_index=False).tail(1)
    return df.reset_index(drop=True)

def _repro_score(cfg_name: str) -> float:
    try:
        cfg_path = CFG / cfg_name
        reg_path = CFG / "datasets.yaml"
        cfg = yaml.safe_load(cfg_path.read_text())
        reg = yaml.safe_load(reg_path.read_text())
        bits = []
        bits.append(1.0 if cfg_path.exists() else 0.0)                             # config present
        bits.append(1.0 if isinstance(cfg.get("seed"), int) else 0.0)              # fixed seed
        bits.append(1.0 if cfg.get("dataset") in reg else 0.0)                     # dataset registered
        bits.append(1.0)                                                           # manifest exists (we're reading it)
        synth = (cfg.get("synth", {}) or {}).get("enabled", False)
        bits.append(1.0 if isinstance(synth, bool) else 0.0)                       # augmentation control
        return float(sum(bits)/len(bits))
    except Exception:
        return 0.0

def write_experiment_cards():
    df = _latest_manifests_df()
    if df.empty:
        print("No manifests found."); return
    cards_dir = REPORTS / "cards"
    cards_dir.mkdir(exist_ok=True)
    for _, row in df.iterrows():
        cfg = row["config_name"]
        m = row["metrics"]
        rs = _repro_score(cfg)
        lines = []
        lines.append(f"# Experiment Card ‚Äî {cfg}")
        lines.append("")
        lines.append(f"- **Started:** {row['started']}")
        lines.append(f"- **ReproScore:** {rs:.2f}")
        lines.append(f"- **Accuracy:** {m['acc']:.3f}")
        lines.append(f"- **F1:** {m['f1']:.3f}")
        lines.append("")
        try:
            ycfg = yaml.safe_load((CFG/cfg).read_text())
            lines.append("## Key Config")
            lines.append(f"- Seed: `{ycfg.get('seed')}`")
            lines.append(f"- Dataset: `{ycfg.get('dataset')}`")
            if "image_size" in ycfg: lines.append(f"- Image size: `{ycfg['image_size']}`")
            synth = ycfg.get("synth", {})
            lines.append(f"- GenAI enabled: `{synth.get('enabled', False)}`")
            if "minority_multiplier" in synth:
                lines.append(f"- GenAI minority_multiplier: `{synth['minority_multiplier']}`")
            model = ycfg.get("model", {})
            lines.append(f"- Model: `{model.get('name')}` params: `{model.get('params')}`")
        except Exception:
            pass
        out = cards_dir / f"{cfg.replace('.yaml','')}.md"
        out.write_text("\\n".join(lines), encoding="utf-8")
        print("Wrote card:", out)

def write_section_4_2_5_report():
    df = _latest_manifests_df()
    if df.empty:
        print("No manifests found."); return

    def pick(name): 
        sub = df[df["config_name"]==name]
        return None if sub.empty else sub.iloc[0]

    rows = []
    for stem in ["imaging","tabular"]:
        base = pick(f"{stem}_baseline.yaml")
        gen  = pick(f"{stem}_genai.yaml")
        if base is None or gen is None: continue
        b, g = base["metrics"], gen["metrics"]
        row = {
            "task": stem,
            "baseline_acc": float(b["acc"]), "genai_acc": float(g["acc"]),
            "delta_acc":  float(g["acc"])-float(b["acc"]),
            "baseline_f1": float(b["f1"]),   "genai_f1": float(g["f1"]),
            "delta_f1":   float(g["f1"])-float(b["f1"]),
            "baseline_rs": _repro_score(base["config_name"]),
            "genai_rs":    _repro_score(gen["config_name"]),
            "delta_rs":    _repro_score(gen["config_name"]) - _repro_score(base["config_name"]),
        }
        rows.append(row)
    comp = pd.DataFrame(rows)

    ts = datetime.now().strftime("%Y-%m-%d %H:%M")
    lines = []
    lines.append("# 4.2.5 Generative AI‚ÄìBased Interventions for Reproducibility")
    lines.append("")
    lines.append(f"_Auto-generated: {ts}_")
    lines.append("")
    lines.append("## Setup (Reproducibility)")
    lines.append("- Fixed RNG seed (`seed=1337`) in configs.")
    lines.append("- Datasets registered in a central `datasets.yaml`.")
    lines.append("- Each run logs a JSON manifest under `runs/<id>/manifest.json`.")
    lines.append("- **GenAI interventions:** imaging = deterministic oversampling of positives; tabular = SMOTE/oversample with seed.")
    lines.append("")
    lines.append("## Latest Per-Config Metrics")
    lines.append(df[[\"config_name\",\"metrics\",\"started\"]].to_markdown(index=False))
    lines.append("")
    lines.append("## Baseline vs GenAI (Œî)")
    if not comp.empty:
        lines.append(comp.to_markdown(index=False))
        for _, r in comp.iterrows():
            lines.append(f"- **{r['task'].capitalize()}**: F1 {r['baseline_f1']:.3f} ‚Üí {r['genai_f1']:.3f} (Œî {r['delta_f1']:+.3f}); "
                         f"Acc {r['baseline_acc']:.3f} ‚Üí {r['genai_acc']:.3f} (Œî {r['delta_acc']:+.3f}); "
                         f"ReproScore {r['baseline_rs']:.2f} ‚Üí {r['genai_rs']:.2f} (Œî {r['delta_rs']:+.2f}).")
    else:
        lines.append("_Not enough paired runs to compute deltas._")
    lines.append("")
    lines.append("## Transparency (Auto-Docs)")
    lines.append("- Per-experiment cards with key config knobs are in `reports/cards/`.")

    out = REPORTS / "section_4_2_5.md"
    out.write_text("\\n".join(lines), encoding="utf-8")
    print("Wrote section report:", out)
'''
(REP / "write_docs.py").write_text(code, encoding="utf-8")
print("Rewrote:", REP / "write_docs.py")


Rewrote: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/reporting/write_docs.py


In [44]:
# ensure src is on sys.path, then import the fixed module and run
import sys
from importlib import reload
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
SRC  = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

from cbe_repro.reporting import write_docs
reload(write_docs)  # in case it was cached
write_docs.write_experiment_cards()
write_docs.write_section_4_2_5_report()


Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/imaging_baseline.md
Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/imaging_genai.md
Wrote section report: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/section_4_2_5.md


## Making the Framework customizable

In [45]:
from pathlib import Path
import sys

ROOT = Path.cwd() / "mpox_repro_framework"
SRC  = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))
print("PYTHONPATH ok ‚Üí", SRC)


PYTHONPATH ok ‚Üí /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src


#### Create a small model factory (tabular + imaging)

In [46]:
# creates: src/cbe_repro/models/model_zoo.py  (+ __init__.py)
from pathlib import Path

pkg = SRC / "cbe_repro" / "models"
pkg.mkdir(parents=True, exist_ok=True)
for p in [SRC/"cbe_repro", pkg]:
    (p/"__init__.py").write_text("__all__ = []\n", encoding="utf-8")

code = r'''# -*- coding: utf-8 -*-

from dataclasses import dataclass
from typing import Any, Dict

# tabular
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

try:
    from xgboost import XGBClassifier
    _HAS_XGB = True
except Exception:
    _HAS_XGB = False

# imaging head (classic features ‚Üí linear classifier)
from sklearn.linear_model import LogisticRegression as ImgLogReg

@dataclass
class ModelSpec:
    name: str
    params: Dict[str, Any]

def make_tabular(spec: ModelSpec):
    n = spec.name.lower()
    p = spec.params or {}
    if n in ["logisticregression","logreg","lr"]:
        return LogisticRegression(**p)
    if n in ["randomforest","randomforestclassifier","rf"]:
        return RandomForestClassifier(**p)
    if n in ["svm","svc"]:
        return SVC(probability=True, **p)
    if n in ["xgboost","xgb","xgbclassifier"]:
        if not _HAS_XGB:
            raise ImportError("xgboost not installed. Try: pip install xgboost")
        return XGBClassifier(**p)
    raise ValueError(f"Unknown tabular model: {spec.name}")

def make_imaging(spec: ModelSpec):
    # keep it light & deterministic; swap here if/when you add a CNN
    return ImgLogReg(**(spec.params or {}))
'''
(pkg/"model_zoo.py").write_text(code, encoding="utf-8")
print("‚úÖ Wrote:", pkg/"model_zoo.py")


‚úÖ Wrote: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/models/model_zoo.py


### Create a unified runner that plugs in dataset + modality + model from a YAML

In [47]:
# creates: src/cbe_repro/experiments/run_unified.py
from pathlib import Path
code = r'''# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
from __future__ import annotations
import json, time, yaml, numpy as np
from dataclasses import dataclass
from typing import Any, Dict
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve, make_scorer

from cbe_repro.models.model_zoo import ModelSpec, make_tabular, make_imaging
from cbe_repro.synth.image_loader import ImageFolderDataset
from cbe_repro.synth.symptom_smote import smote_or_oversample

ROOT = Path.cwd() / "mpox_repro_framework"
CFG_DIR = ROOT / "src" / "cbe_repro" / "configs"
RUNS = ROOT / "runs"

def _yml(p: Path): return yaml.safe_load(p.read_text())
def _reg(): return _yml(CFG_DIR/"datasets.yaml") or {}
def _seed(s: int): np.random.seed(int(s))

@dataclass
class PaperProfile:
    paper_id: str
    modality: str          # "tabular" | "imaging"
    dataset: Any           # key in datasets.yaml OR inline dict
    model: Dict[str, Any]  # {name, params}
    synth: Dict[str, Any]  # {enabled, ...}
    metrics: Dict[str, Any]# {threshold_tuning, tune:{...}, ci:{enabled,B,alpha}, seed, auto_doc}

def _boot_ci(y_true, y_pred, metric_fn, B=1000, alpha=0.05, seed=1337):
    rng = np.random.default_rng(seed)
    n = len(y_true); idx = np.arange(n)
    stats = []
    for _ in range(B):
        b = rng.choice(idx, size=n, replace=True)
        stats.append(metric_fn(np.array(y_true)[b], np.array(y_pred)[b]))
    lo, hi = np.quantile(stats, [alpha/2, 1-alpha/2])
    return float(lo), float(hi)

def _maybe_tune_tabular(model_spec: ModelSpec, Xtr, ytr, tune_cfg: Dict[str,Any]):
    """
    tune_cfg example:
      enabled: true
      method: grid | randomized
      cv: 5
      scoring: f1 | accuracy
      n_iter: 20        # (randomized only)
      param_grid:       # (dict of lists)
        n_estimators: [200, 300, 500]
        max_depth: [3, 4, 5]
        learning_rate: [0.05, 0.08, 0.1]
        subsample: [0.7, 0.9, 1.0]
        colsample_bytree: [0.7, 0.9, 1.0]
        min_child_weight: [1, 3, 5]
    """
    if not tune_cfg or not bool(tune_cfg.get("enabled", False)):
        return make_tabular(model_spec), {}

    method = str(tune_cfg.get("method", "grid")).lower()
    cv = int(tune_cfg.get("cv", 5))
    scoring_name = str(tune_cfg.get("scoring", "f1")).lower()
    scoring = make_scorer(f1_score) if scoring_name == "f1" else make_scorer(accuracy_score)

    base = make_tabular(model_spec)
    grid = tune_cfg.get("param_grid") or tune_cfg.get("search_space") or {}
    if not grid:
        # sensible defaults if nothing provided (XGBoost / RF / LR)
        if model_spec.name.lower() == "xgboost":
            grid = {
                "n_estimators": [200, 300, 500],
                "max_depth": [3, 4, 5],
                "learning_rate": [0.05, 0.08, 0.1],
                "subsample": [0.7, 0.9, 1.0],
                "colsample_bytree": [0.7, 0.9, 1.0],
                "min_child_weight": [1, 3, 5],
            }
        elif model_spec.name.lower() in ("random_forest","rf"):
            grid = {
                "n_estimators": [200, 400, 600],
                "max_depth": [None, 8, 12, 16],
                "min_samples_split": [2, 5, 10],
            }
        else:
            # logistic regression etc.
            grid = {"C":[0.1,0.3,1.0,3.0,10.0]}

    kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=1337)
    if method == "randomized":
        n_iter = int(tune_cfg.get("n_iter", 20))
        search = RandomizedSearchCV(
            base, grid, n_iter=n_iter, scoring=scoring, cv=kfold, refit=True, n_jobs=-1, random_state=1337
        )
    else:
        search = GridSearchCV(
            base, grid, scoring=scoring, cv=kfold, refit=True, n_jobs=-1
        )
    search.fit(Xtr, ytr)
    best_est = search.best_estimator_
    info = {"tune_enabled": True, "method": method, "cv": cv,
            "scoring": scoring_name, "best_params": search.best_params_,
            "best_score_cv": float(search.best_score_)}
    return best_est, info

def run_from_profile(profile_yaml: str, return_results=False):
    prof_dict = _yml(CFG_DIR/"papers"/profile_yaml)
    prof = PaperProfile(
        paper_id=prof_dict["paper_id"],
        modality=prof_dict["modality"],
        dataset=prof_dict["dataset"],
        model=prof_dict["model"],
        synth=prof_dict.get("synth",{}) or {},
        metrics=prof_dict.get("metrics",{}) or {}
    )

    reg = _reg()

    # allow dict-form dataset inline in the profile
    if isinstance(prof.dataset, dict):
        ds_dict = prof.dataset
        ds_name = ds_dict.get("name", "inline_dataset")
        reg[ds_name] = {
            "path": ds_dict["path"],
            "label_col": ds_dict.get("label_col", "label"),
            "positive_value": ds_dict.get("positive_value", 1),
        }
        prof.dataset = ds_name  # continue with this name

    if prof.dataset not in reg:
        raise KeyError(f"Dataset '{prof.dataset}' not in datasets.yaml")

    seed = int(prof.metrics.get("seed", 1337)); _seed(seed)

    # ---- load & train
    tune_info = {}
    if prof.modality == "tabular":
        import pandas as pd
        entry = reg[prof.dataset]
        csv_path = ROOT/"src"/"cbe_repro"/"data"/Path(entry["path"]).name
        df = pd.read_csv(csv_path)

        # normalize column names
        df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

        # label resolution: prefer datasets.yaml label_col; else try common aliases
        label_col = (entry.get("label_col") or "").strip().lower().replace(" ", "_")
        if not label_col:
            candidates = ["label","status","target","class","outcome","y"]
            label_col = next((c for c in candidates if c in df.columns), None)
            if label_col is None:
                raise KeyError(f"Could not find a label column. Available: {list(df.columns)}")

        # drop common id-like columns
        id_like = [c for c in ["id","patient_id","sample_id","record_id"] if c in df.columns]

        # build X, y
        X = df.drop(columns=id_like + [label_col])
        y = df[label_col]

        # map labels to 0/1 if needed
        if y.dtype == "O":
            y = (
                y.astype(str).str.strip().str.lower().map({
                    "1":1, "0":0, "true":1, "false":0, "yes":1, "no":0,
                    "positive":1, "negative":0, "mpox":1, "non_mpox":0, "pos":1, "neg":0
                })
            )
        if y.isna().any():
            bad = sorted(y[y.isna()].index.tolist()[:5])
            raise ValueError(f"Label column '{label_col}' contains unmapped values. Bad rows: {bad}. "
                             f"Unique values: {sorted(df[label_col].astype(str).unique().tolist())}")
        y = y.astype(int)

        Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25, stratify=y, random_state=seed)

        if prof.synth.get("enabled", False):
            mult = float(prof.synth.get("minority_multiplier", 2.0))
            print(f"[DEBUG] Applying synthetic data: multiplier={mult}, before={np.bincount(ytr)}")
            Xtr, ytr = smote_or_oversample(Xtr, ytr, multiplier=mult, seed=seed, 
                                           #balance_to_max=False, 
                                           target_ratio=0.7)
            print(f"[DEBUG] After synth: class distribution={np.bincount(ytr)}")
        else:
            print("[DEBUG] No synthetic data applied")

        # maybe tune
        model_spec = ModelSpec(**prof.model)
        model, tune_info = _maybe_tune_tabular(model_spec, Xtr, ytr, prof.metrics.get("tune", {}))
        model.fit(Xtr, ytr)

        proba = model.predict_proba(Xte)[:,1] if hasattr(model,"predict_proba") else None
        yhat = (proba >= 0.5).astype(int) if proba is not None else model.predict(Xte)

    elif prof.modality == "imaging":
        entry = reg[prof.dataset]
        ds_tr = ImageFolderDataset(entry["root"], "train",
                                   pos_cls=entry["classes"]["positive"], neg_cls=entry["classes"]["negative"],
                                   seed=seed, img_size=128)
        ds_va = ImageFolderDataset(entry["root"], "val",
                                   pos_cls=entry["classes"]["positive"], neg_cls=entry["classes"]["negative"],
                                   seed=seed, img_size=128)
        Xtr, ytr = ds_tr.as_features_labels(synth_enabled=bool(prof.synth.get("enabled",False)),
                                            synth_multiplier=float(prof.synth.get("minority_multiplier",2.0)))
        Xte, yte = ds_va.as_features_labels(synth_enabled=False)
        model = make_imaging(ModelSpec(**prof.model))
        model.fit(Xtr, ytr)
        proba = model.predict_proba(Xte)[:,1] if hasattr(model,"predict_proba") else None
        yhat = (proba >= 0.5).astype(int) if proba is not None else model.predict(Xte)
    else:
        raise ValueError(f"Unknown modality '{prof.modality}'")

    # ---- metrics (+ threshold tuning + CI)
    acc = float(accuracy_score(yte, yhat))
    f1  = float(f1_score(yte, yhat))
    ci  = (prof.metrics.get("ci") or {})
    ci_on = bool(ci.get("enabled", True)); B=int(ci.get("B",1000)); a=float(ci.get("alpha",0.05))
    f1_ci = _boot_ci(yte, yhat, f1_score, B=B, alpha=a, seed=seed) if ci_on else None

    tuned = {}
    if bool(prof.metrics.get("threshold_tuning", False)) and proba is not None:
        prec, rec, thr = precision_recall_curve(yte, proba)
        f1s = 2*prec*rec/(prec+rec+1e-12)
        best = float(thr[f1s[:-1].argmax()]) if len(thr)>0 else 0.5
        yhat_t = (proba >= best).astype(int)
        f1_t   = float(f1_score(yte, yhat_t))
        f1_t_ci = _boot_ci(yte, yhat_t, f1_score, B=B, alpha=a, seed=seed) if ci_on else None
        tuned = {"threshold": best, "f1_tuned": f1_t, "f1_tuned_ci": f1_t_ci}

    manif = {
        "run_id": str(int(time.time())),
        "started": time.strftime("%Y-%m-%d %H:%M:%S"),
        "paper_id": prof.paper_id,
        "modality": prof.modality,
        "dataset_key": prof.dataset,
        "model": prof.model,
        "synth": prof.synth,
        "metrics": {"acc": acc, "f1": f1, "f1_ci": f1_ci, **tuned},
        "tuning": tune_info or {"tune_enabled": False}
    }
    out = RUNS/manif["run_id"]; out.mkdir(parents=True, exist_ok=True)
    (out/"manifest.json").write_text(json.dumps(manif, indent=2))

    # optional auto-doc
    if bool(prof.metrics.get("auto_doc", False)):
        try:
            from cbe_repro.reporting.write_docs import write_experiment_cards, write_section_4_2_5_report
            write_experiment_cards(); write_section_4_2_5_report()
        except Exception as e:
            print(f"[Auto-doc skipped] {e}")

    if return_results: return manif
    print(json.dumps(manif, indent=2))

  
'''
path = SRC/"cbe_repro"/"experiments"/"run_unified.py"
path.parent.mkdir(parents=True, exist_ok=True)
(path).write_text(code, encoding="utf-8")
print("‚úÖ Wrote:", path)


‚úÖ Wrote: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/experiments/run_unified.py


### Add paper profiles (YAMLs you can choose per reproduced paper)

In [48]:
# creates: src/cbe_repro/configs/papers/{farzipour_2023_baseline.yaml, farzipour_2023_genai.yaml}
papers = SRC/"cbe_repro"/"configs"/"papers"; papers.mkdir(parents=True, exist_ok=True)

baseline = """\
paper_id: farzipour_2023
modality: tabular
dataset: symptom_demo         # change to 'symptom_farzipour' once you register it
model:
  name: XGBClassifier
  params:
    n_estimators: 300
    max_depth: 6
    subsample: 0.8
    colsample_bytree: 0.8
    random_state: 1337
synth:
  enabled: false
metrics:
  threshold_tuning: true
  ci:
    enabled: true
    B: 1000
    alpha: 0.05
"""
genai = """\
paper_id: farzipour_2023
modality: tabular
dataset: symptom_demo         # change to 'symptom_farzipour' once you register it
model:
  name: XGBClassifier
  params:
    n_estimators: 300
    max_depth: 6
    subsample: 0.8
    colsample_bytree: 0.8
    random_state: 1337
synth:
  enabled: true
  minority_multiplier: 2.0
metrics:
  threshold_tuning: true
  ci:
    enabled: true
    B: 1000
    alpha: 0.05
"""
(papers/"farzipour_2023_baseline.yaml").write_text(baseline, encoding="utf-8")
(papers/"farzipour_2023_genai.yaml").write_text(genai, encoding="utf-8")
print("‚úÖ Wrote:", list(papers.glob("farzipour_2023_*.yaml")))


‚úÖ Wrote: [PosixPath('/Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/papers/farzipour_2023_genai.yaml'), PosixPath('/Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/papers/farzipour_2023_baseline.yaml')]


In [None]:
### register the real paper dataset

In [49]:
# append to datasets.yaml if you bring the real CSV later
import yaml
cfg = yaml.safe_load((SRC/"cbe_repro"/"configs"/"datasets.yaml").read_text())
cfg["symptom_farzipour"] = {"path": "data/monkeypox_global_symptoms.csv", "target": "label"}
(SRC/"cbe_repro"/"configs"/"datasets.yaml").write_text(yaml.safe_dump(cfg, sort_keys=False), encoding="utf-8")
print("‚úÖ Registered 'symptom_farzipour' in datasets.yaml")

‚úÖ Registered 'symptom_farzipour' in datasets.yaml


In [50]:
from pathlib import Path
import yaml

ROOT = Path.cwd() / "mpox_repro_framework"
papers = ROOT / "src" / "cbe_repro" / "configs" / "papers"

for name in ["farzipour_2023_baseline.yaml", "farzipour_2023_genai.yaml"]:
    p = papers / name
    cfg = yaml.safe_load(p.read_text())
    cfg["dataset"] = "symptom_farzipour"   # üëà use your real dataset key
    p.write_text(yaml.safe_dump(cfg, sort_keys=False), encoding="utf-8")
    print("Updated", p)

Updated /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/papers/farzipour_2023_baseline.yaml
Updated /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/papers/farzipour_2023_genai.yaml


In [51]:
print((ROOT/"src/cbe_repro/configs/datasets.yaml").read_text()[:4000])

mpox_images:
  root: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/data/mpox_images
  splits:
    train: train
    val: val
  classes:
    positive: mpox
    negative: non_mpox
symptom_demo:
  path: data/symptom_demo.csv
symptom_farzipour:
  path: data/monkeypox_global_symptoms.csv
  target: label



In [52]:
import sys
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))
print("PYTHONPATH ok ‚Üí", SRC)

PYTHONPATH ok ‚Üí /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src


In [53]:
from pathlib import Path
import textwrap

ROOT = Path.cwd() / "mpox_repro_framework"
profiles_dir = ROOT / "src" / "cbe_repro" / "configs" / "profiles"
profiles_dir.mkdir(parents=True, exist_ok=True)

baseline_yaml = textwrap.dedent("""
profile_name: farzipour_2023_baseline
seed: 1337
task: tabular
dataset:
  type: csv
  path: monkeypox_global_symptoms.csv   # file must be in src/cbe_repro/data
  label_col: Status                     # your dataset's label column
  positive_values: [1, "1", "positive", "Positive", "yes", "Yes", "TRUE", "True"]
  negative_values: [0, "0", "negative", "Negative", "no", "No", "FALSE", "False"]
model:
  kind: xgboost
  params:
    n_estimators: 300
    max_depth: 4
    learning_rate: 0.08
    subsample: 0.8
    colsample_bytree: 0.8
    reg_lambda: 1.0
synth:
  enabled: false
split:
  test_size: 0.25
  stratify: true
metrics:
  - acc
  - f1
""").strip()

genai_yaml = textwrap.dedent("""
profile_name: farzipour_2023_genai
seed: 1337
task: tabular
dataset:
  type: csv
  path: monkeypox_global_symptoms.csv
  label_col: Status
  positive_values: [1, "1", "positive", "Positive", "yes", "Yes", "TRUE", "True"]
  negative_values: [0, "0", "negative", "Negative", "no", "No", "FALSE", "False"]
model:
  kind: xgboost
  params:
    n_estimators: 300
    max_depth: 4
    learning_rate: 0.08
    subsample: 0.8
    colsample_bytree: 0.8
    reg_lambda: 1.0
synth:
  enabled: true
  kind: llm_tabular_augment
  multiplier: 1.5     # ~50% more minority/positive examples via GenAI synthesizer
  seed: 1337
split:
  test_size: 0.25
  stratify: true
metrics:
  - acc
  - f1
""").strip()

(profiles_dir / "farzipour_2023_baseline.yaml").write_text(baseline_yaml, encoding="utf-8")
(profiles_dir / "farzipour_2023_genai.yaml").write_text(genai_yaml, encoding="utf-8")

print("‚úÖ Wrote profiles to:", profiles_dir)
print((profiles_dir / "farzipour_2023_baseline.yaml").read_text()[:300], "...\n")
print((profiles_dir / "farzipour_2023_genai.yaml").read_text()[:300], "...")


‚úÖ Wrote profiles to: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/profiles
profile_name: farzipour_2023_baseline
seed: 1337
task: tabular
dataset:
  type: csv
  path: monkeypox_global_symptoms.csv   # file must be in src/cbe_repro/data
  label_col: Status                     # your dataset's label column
  positive_values: [1, "1", "positive", "Positive", "yes", "Yes", "TR ...

profile_name: farzipour_2023_genai
seed: 1337
task: tabular
dataset:
  type: csv
  path: monkeypox_global_symptoms.csv
  label_col: Status
  positive_values: [1, "1", "positive", "Positive", "yes", "Yes", "TRUE", "True"]
  negative_values: [0, "0", "negative", "Negative", "no", "No", "FALSE", "False ...


In [54]:
from pathlib import Path
import yaml

ROOT = Path.cwd() / "mpox_repro_framework"
profiles_dir = ROOT / "src" / "cbe_repro" / "configs" / "profiles"

for fname in ["farzipour_2023_baseline.yaml", "farzipour_2023_genai.yaml"]:
    p = profiles_dir / fname
    prof = yaml.safe_load(p.read_text())
    # update dataset block to match your file + label column
    prof["dataset"]["path"] = "monkeypox_global_symptoms.csv"   # your filename in src/cbe_repro/data
    prof["dataset"]["label_col"] = "Status"                     # your target column
    p.write_text(yaml.safe_dump(prof, sort_keys=False))
    print("‚úÖ updated", fname, "->", prof["dataset"])

‚úÖ updated farzipour_2023_baseline.yaml -> {'type': 'csv', 'path': 'monkeypox_global_symptoms.csv', 'label_col': 'Status', 'positive_values': [1, '1', 'positive', 'Positive', 'yes', 'Yes', 'TRUE', 'True'], 'negative_values': [0, '0', 'negative', 'Negative', 'no', 'No', 'FALSE', 'False']}
‚úÖ updated farzipour_2023_genai.yaml -> {'type': 'csv', 'path': 'monkeypox_global_symptoms.csv', 'label_col': 'Status', 'positive_values': [1, '1', 'positive', 'Positive', 'yes', 'Yes', 'TRUE', 'True'], 'negative_values': [0, '0', 'negative', 'Negative', 'no', 'No', 'FALSE', 'False']}


In [55]:
import pandas as pd
from pathlib import Path

ROOT = Path.cwd() / "mpox_repro_framework"
csv_path = ROOT / "src" / "cbe_repro" / "data" / "monkeypox_global_symptoms.csv"  # adjust name if different
df = pd.read_csv(csv_path)
print("Columns:", list(df.columns))
print("Unique Status values:", pd.unique(df["Status"]))

Columns: ['ID', 'rash', 'skin lesions', 'headache', 'ulcerative lesions', 'oral and genital ulcers', 'fever', 'perianal papules', 'inguinal adenopathy', 'genital ulcer lesions', 'pustules', 'cough', 'blisters', 'erythema with vesicles and papules', 'difficulty breathing', 'severe anemia', 'fatigue', 'muscle pain', 'dysphagia', 'decreased physical strength', 'outbreak on the skin', 'hands', 'chest', 'chills', 'general weakness', 'general discomfort', 'adenomegaly', 'myalgia', 'itch', 'papules', 'swollen lymph nodes', 'mild symptoms', 'sore throat', 'malaise', 'asthenia', 'characteristic symptoms of Monkeypox', 'diarrhea', 'Pain urinating', 'ulcers', 'loss of appetite', 'Vesicles', 'lymphadenopathy', 'myalgias', 'postules', 'encephalitis', 'symptoms compatible with monkeypox', 'blisters on limbs and genitals', 'Status']
Unique Status values: [1 0]


In [56]:
from pathlib import Path
import yaml, pprint

ROOT = Path.cwd() / "mpox_repro_framework"
profiles = {
    "farzipour_2023_baseline.yaml": ROOT/"src"/"cbe_repro"/"configs"/"profiles"/"farzipour_2023_baseline.yaml",
    "farzipour_2023_genai.yaml":   ROOT/"src"/"cbe_repro"/"configs"/"profiles"/"farzipour_2023_genai.yaml",
}

for name, p in profiles.items():
    d = yaml.safe_load(p.read_text())
    # force dataset to be a string that matches datasets.yaml
    d["dataset"] = "symptom_farzipour"
    p.write_text(yaml.safe_dump(d, sort_keys=False))
    print(f"‚úÖ Updated {name} -> dataset: symptom_farzipour")
    pprint.pprint(d.get("synth", {}))  # just to show the rest stayed intact


‚úÖ Updated farzipour_2023_baseline.yaml -> dataset: symptom_farzipour
{'enabled': False}
‚úÖ Updated farzipour_2023_genai.yaml -> dataset: symptom_farzipour
{'enabled': True,
 'kind': 'llm_tabular_augment',
 'multiplier': 1.5,
 'seed': 1337}


In [57]:
from pathlib import Path
import yaml

ROOT = Path.cwd() / "mpox_repro_framework"
profiles_dir = ROOT / "src" / "cbe_repro" / "configs" / "profiles"
profiles_dir.mkdir(parents=True, exist_ok=True)

def write_profile(fname, synth_enabled: bool):
    prof = {
        "paper_id": "farzipour_2023",
        "modality": "tabular",
        "dataset": "symptom_farzipour",   # <- the key you registered in datasets.yaml
        "model": {"type": "xgboost", "params": {}},
        "synth": {"enabled": synth_enabled, "multiplier": 1.5},
        "metrics": {"seed": 1337}
    }
    (profiles_dir / fname).write_text(yaml.safe_dump(prof, sort_keys=False))

write_profile("farzipour_2023_baseline.yaml", synth_enabled=False)
write_profile("farzipour_2023_genai.yaml",   synth_enabled=True)

print("‚úÖ Wrote profiles to:", profiles_dir)


‚úÖ Wrote profiles to: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/profiles


In [58]:
from pathlib import Path
import yaml

ROOT = Path.cwd() / "mpox_repro_framework"
profiles = [
    ROOT / "src" / "cbe_repro" / "configs" / "profiles" / "farzipour_2023_baseline.yaml",
    ROOT / "src" / "cbe_repro" / "configs" / "profiles" / "farzipour_2023_genai.yaml",
]

def normalize_profile(doc):
    if not isinstance(doc, dict):
        return doc  # skip anything unexpected
    # 1) dataset -> string
    ds = doc.get("dataset")
    if isinstance(ds, dict):
        # try to pick a name-like field, else hard-set
        name = ds.get("name") or ds.get("key") or ds.get("id")
        doc["dataset"] = name if isinstance(name, str) else "symptom_farzipour"
    elif isinstance(ds, str):
        doc["dataset"] = ds  # already fine
    else:
        doc["dataset"] = "symptom_farzipour"

    # 2) defaults
    doc.setdefault("modality", "tabular")
    doc.setdefault("model", {})
    if isinstance(doc["model"], dict):
        doc["model"].setdefault("type", "xgboost")
    doc.setdefault("metrics", {})
    if isinstance(doc["metrics"], dict):
        doc["metrics"].setdefault("seed", 1337)
    return doc

for p in profiles:
    if not p.exists():
        print("Missing profile file:", p)
        continue

    content = yaml.safe_load(p.read_text())

    # Profiles might be a single dict or a list of dicts
    if isinstance(content, list):
        fixed = [normalize_profile(d) for d in content]
    else:
        fixed = normalize_profile(content)

    p.write_text(yaml.safe_dump(fixed, sort_keys=False))
    print("Updated:", p)
    # show a tiny preview
    print("--- preview ---")
    print(yaml.safe_dump(fixed if isinstance(fixed, dict) else fixed[0], sort_keys=False).splitlines()[:20])
    print("---------------")


Updated: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/profiles/farzipour_2023_baseline.yaml
--- preview ---
['paper_id: farzipour_2023', 'modality: tabular', 'dataset: symptom_farzipour', 'model:', '  type: xgboost', '  params: {}', 'synth:', '  enabled: false', '  multiplier: 1.5', 'metrics:', '  seed: 1337']
---------------
Updated: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/profiles/farzipour_2023_genai.yaml
--- preview ---
['paper_id: farzipour_2023', 'modality: tabular', 'dataset: symptom_farzipour', 'model:', '  type: xgboost', '  params: {}', 'synth:', '  enabled: true', '  multiplier: 1.5', 'metrics:', '  seed: 1337']
---------------


In [59]:
from pathlib import Path
import yaml

ROOT = Path.cwd() / "mpox_repro_framework"
papers_dir = ROOT / "src" / "cbe_repro" / "configs" / "papers"
papers_dir.mkdir(parents=True, exist_ok=True)

baseline = {
    "paper_id": "Farzipour_2023",
    "modality": "tabular",
    # IMPORTANT: use the registered key from datasets.yaml
    "dataset": "symptom_farzipour",
    "model": {"name": "xgboost", "params": {"n_estimators": 300, "max_depth": 4, "learning_rate": 0.08}},
    "synth": {"enabled": False},
    "metrics": {"threshold_tuning": True, "ci": {"enabled": True, "B": 1000, "alpha": 0.05}, "seed": 1337}
}

genai = {
    "paper_id": "Farzipour_2023",
    "modality": "tabular",
    "dataset": "symptom_farzipour",
    "model": {"name": "xgboost", "params": {"n_estimators": 300, "max_depth": 4, "learning_rate": 0.08}},
    "synth": {"enabled": True, "minority_multiplier": 2.0},  # GenAI-augmented (SMOTE/oversample)
    "metrics": {"threshold_tuning": True, "ci": {"enabled": True, "B": 1000, "alpha": 0.05}, "seed": 1337}
}

(baseline_path := papers_dir / "farzipour_2023_baseline.yaml").write_text(yaml.safe_dump(baseline, sort_keys=False))
(genai_path := papers_dir / "farzipour_2023_genai.yaml").write_text(yaml.safe_dump(genai, sort_keys=False))

print("Wrote:", baseline_path)
print("Wrote:", genai_path)


Wrote: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/papers/farzipour_2023_baseline.yaml
Wrote: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/papers/farzipour_2023_genai.yaml


In [60]:
import importlib
import cbe_repro.experiments.run_unified as ru
importlib.reload(ru)

<module 'cbe_repro.experiments.run_unified' from '/Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/experiments/run_unified.py'>

In [61]:
import yaml, pprint, pathlib as pl
p = pl.Path.cwd() / "mpox_repro_framework" / "src" / "cbe_repro" / "configs" / "papers" / "farzipour_2023_baseline.yaml"
print(type(yaml.safe_load(p.read_text())))
pprint.pp(yaml.safe_load(p.read_text()))

<class 'dict'>
{'paper_id': 'Farzipour_2023',
 'modality': 'tabular',
 'dataset': 'symptom_farzipour',
 'model': {'name': 'xgboost',
           'params': {'n_estimators': 300,
                      'max_depth': 4,
                      'learning_rate': 0.08}},
 'synth': {'enabled': False},
 'metrics': {'threshold_tuning': True,
             'ci': {'enabled': True, 'B': 1000, 'alpha': 0.05},
             'seed': 1337}}


In [62]:
import inspect
import importlib
import cbe_repro.experiments.run_unified as ru

print("[WHO]", ru.__file__)                         # path of the file actually imported
print("[HAS drop label?]", "drop(columns=[\"label\"])"
      in inspect.getsource(ru.run_from_profile))    # should be False if you‚Äôre on the new code

importlib.reload(ru)                   

[WHO] /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/experiments/run_unified.py
[HAS drop label?] False


<module 'cbe_repro.experiments.run_unified' from '/Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/experiments/run_unified.py'>

In [63]:
import sys, pathlib
sys.path.insert(0, str(pathlib.Path.cwd() / "mpox_repro_framework" / "src"))

In [64]:
import importlib
import cbe_repro.experiments.run_unified as ru
importlib.reload(ru)

<module 'cbe_repro.experiments.run_unified' from '/Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/experiments/run_unified.py'>

In [65]:
import sys, importlib.util
print("PYTHON:", sys.executable)
print("XGBOOST SPEC:", importlib.util.find_spec("xgboost"))

PYTHON: /opt/anaconda3/bin/python
XGBOOST SPEC: ModuleSpec(name='xgboost', loader=<_frozen_importlib_external.SourceFileLoader object at 0x14ec4ab70>, origin='/opt/anaconda3/lib/python3.13/site-packages/xgboost/__init__.py', submodule_search_locations=['/opt/anaconda3/lib/python3.13/site-packages/xgboost'])


In [66]:
# 1) Install into THIS kernel
%pip install -q xgboost

# 2) Make sure Python sees the new package
import importlib, site
importlib.invalidate_caches()

# 3) Try the import
import xgboost as xgb
print("xgboost", xgb.__version__, "->", xgb.__file__)


Note: you may need to restart the kernel to use updated packages.
xgboost 2.1.1 -> /opt/anaconda3/lib/python3.13/site-packages/xgboost/__init__.py


In [67]:
from cbe_repro.experiments.run_unified import run_from_profile

print("=== Farzipour 2023 ‚Äî BASELINE ===")
run_from_profile("farzipour_2023_baseline.yaml")

print("\n=== Farzipour 2023 ‚Äî GenAI ===")
run_from_profile("farzipour_2023_genai.yaml")


=== Farzipour 2023 ‚Äî BASELINE ===
[DEBUG] No synthetic data applied
{
  "run_id": "1756981577",
  "started": "2025-09-04 12:26:17",
  "paper_id": "Farzipour_2023",
  "modality": "tabular",
  "dataset_key": "symptom_farzipour",
  "model": {
    "name": "xgboost",
    "params": {
      "n_estimators": 300,
      "max_depth": 4,
      "learning_rate": 0.08
    }
  },
  "synth": {
    "enabled": false
  },
  "metrics": {
    "acc": 0.8867924528301887,
    "f1": 0.9387755102040817,
    "f1_ci": [
      0.8842105263157894,
      0.9807692307692307
    ],
    "threshold": 0.425006240606308,
    "f1_tuned": 0.9494949494949495,
    "f1_tuned_ci": [
      0.8958333333333334,
      0.9902912621359223
    ]
  },
  "tuning": {
    "tune_enabled": false
  }
}

=== Farzipour 2023 ‚Äî GenAI ===
[DEBUG] Applying synthetic data: multiplier=2.0, before=[ 19 139]
[DEBUG] SMOTE balance ‚Üí before={np.int64(0): np.int64(19), np.int64(1): np.int64(139)} after={np.int64(0): np.int64(98), np.int64(1): np.int

## Fine tune the framework

In [68]:
from pathlib import Path
code = r'''# -*- coding: utf-8 -*-
from __future__ import annotations
import json, time, yaml, numpy as np
from dataclasses import dataclass
from typing import Any, Dict
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve, make_scorer

from cbe_repro.models.model_zoo import ModelSpec, make_tabular, make_imaging
from cbe_repro.synth.image_loader import ImageFolderDataset
from cbe_repro.synth.symptom_smote import smote_or_oversample

ROOT = Path.cwd() / "mpox_repro_framework"
CFG_DIR = ROOT / "src" / "cbe_repro" / "configs"
RUNS = ROOT / "runs"

def _yml(p: Path): return yaml.safe_load(p.read_text())
def _reg(): return _yml(CFG_DIR/"datasets.yaml") or {}
def _seed(s: int): np.random.seed(int(s))

@dataclass
class PaperProfile:
    paper_id: str
    modality: str          # "tabular" | "imaging"
    dataset: Any           # key in datasets.yaml OR inline dict
    model: Dict[str, Any]  # {name, params}
    synth: Dict[str, Any]  # {enabled, ...}
    metrics: Dict[str, Any]# {threshold_tuning, tune:{...}, ci:{enabled,B,alpha}, seed, auto_doc}

def _boot_ci(y_true, y_pred, metric_fn, B=1000, alpha=0.05, seed=1337):
    rng = np.random.default_rng(seed)
    n = len(y_true); idx = np.arange(n)
    stats = []
    for _ in range(B):
        b = rng.choice(idx, size=n, replace=True)
        stats.append(metric_fn(np.array(y_true)[b], np.array(y_pred)[b]))
    lo, hi = np.quantile(stats, [alpha/2, 1-alpha/2])
    return float(lo), float(hi)

def _maybe_tune_tabular(model_spec: ModelSpec, Xtr, ytr, tune_cfg: Dict[str,Any]):
    """
    tune_cfg example:
      enabled: true
      method: grid | randomized
      cv: 5
      scoring: f1 | accuracy
      n_iter: 20        # (randomized only)
      param_grid:       # (dict of lists)
        n_estimators: [200, 300, 500]
        max_depth: [3, 4, 5]
        learning_rate: [0.05, 0.08, 0.1]
        subsample: [0.7, 0.9, 1.0]
        colsample_bytree: [0.7, 0.9, 1.0]
        min_child_weight: [1, 3, 5]
    """
    if not tune_cfg or not bool(tune_cfg.get("enabled", False)):
        return make_tabular(model_spec), {}

    method = str(tune_cfg.get("method", "grid")).lower()
    cv = int(tune_cfg.get("cv", 5))
    scoring_name = str(tune_cfg.get("scoring", "f1")).lower()
    scoring = make_scorer(f1_score) if scoring_name == "f1" else make_scorer(accuracy_score)

    base = make_tabular(model_spec)
    grid = tune_cfg.get("param_grid") or tune_cfg.get("search_space") or {}
    if not grid:
        # sensible defaults if nothing provided (XGBoost / RF / LR)
        if model_spec.name.lower() == "xgboost":
            grid = {
                "n_estimators": [200, 300, 500],
                "max_depth": [3, 4, 5],
                "learning_rate": [0.05, 0.08, 0.1],
                "subsample": [0.7, 0.9, 1.0],
                "colsample_bytree": [0.7, 0.9, 1.0],
                "min_child_weight": [1, 3, 5],
            }
        elif model_spec.name.lower() in ("random_forest","rf"):
            grid = {
                "n_estimators": [200, 400, 600],
                "max_depth": [None, 8, 12, 16],
                "min_samples_split": [2, 5, 10],
            }
        else:
            # logistic regression etc.
            grid = {"C":[0.1,0.3,1.0,3.0,10.0]}

    kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=1337)
    if method == "randomized":
        n_iter = int(tune_cfg.get("n_iter", 20))
        search = RandomizedSearchCV(
            base, grid, n_iter=n_iter, scoring=scoring, cv=kfold, refit=True, n_jobs=-1, random_state=1337
        )
    else:
        search = GridSearchCV(
            base, grid, scoring=scoring, cv=kfold, refit=True, n_jobs=-1
        )
    search.fit(Xtr, ytr)
    best_est = search.best_estimator_
    info = {"tune_enabled": True, "method": method, "cv": cv,
            "scoring": scoring_name, "best_params": search.best_params_,
            "best_score_cv": float(search.best_score_)}
    return best_est, info

def run_from_profile(profile_yaml: str, return_results=False):
    prof_dict = _yml(CFG_DIR/"papers"/profile_yaml)
    prof = PaperProfile(
        paper_id=prof_dict["paper_id"],
        modality=prof_dict["modality"],
        dataset=prof_dict["dataset"],
        model=prof_dict["model"],
        synth=prof_dict.get("synth",{}) or {},
        metrics=prof_dict.get("metrics",{}) or {}
    )

    reg = _reg()

    # allow dict-form dataset inline in the profile
    if isinstance(prof.dataset, dict):
        ds_dict = prof.dataset
        ds_name = ds_dict.get("name", "inline_dataset")
        reg[ds_name] = {
            "path": ds_dict["path"],
            "label_col": ds_dict.get("label_col", "label"),
            "positive_value": ds_dict.get("positive_value", 1),
        }
        prof.dataset = ds_name  # continue with this name

    if prof.dataset not in reg:
        raise KeyError(f"Dataset '{prof.dataset}' not in datasets.yaml")

    seed = int(prof.metrics.get("seed", 1337)); _seed(seed)

    # ---- load & train
    tune_info = {}
    if prof.modality == "tabular":
        import pandas as pd
        entry = reg[prof.dataset]
        csv_path = ROOT/"src"/"cbe_repro"/"data"/Path(entry["path"]).name
        df = pd.read_csv(csv_path)

        # normalize column names
        df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

        # label resolution: prefer datasets.yaml label_col; else try common aliases
        label_col = (entry.get("label_col") or "").strip().lower().replace(" ", "_")
        if not label_col:
            candidates = ["label","status","target","class","outcome","y"]
            label_col = next((c for c in candidates if c in df.columns), None)
            if label_col is None:
                raise KeyError(f"Could not find a label column. Available: {list(df.columns)}")

        # drop common id-like columns
        id_like = [c for c in ["id","patient_id","sample_id","record_id"] if c in df.columns]

        # build X, y
        X = df.drop(columns=id_like + [label_col])
        y = df[label_col]

        # map labels to 0/1 if needed
        if y.dtype == "O":
            y = (
                y.astype(str).str.strip().str.lower().map({
                    "1":1, "0":0, "true":1, "false":0, "yes":1, "no":0,
                    "positive":1, "negative":0, "mpox":1, "non_mpox":0, "pos":1, "neg":0
                })
            )
        if y.isna().any():
            bad = sorted(y[y.isna()].index.tolist()[:5])
            raise ValueError(f"Label column '{label_col}' contains unmapped values. Bad rows: {bad}. "
                             f"Unique values: {sorted(df[label_col].astype(str).unique().tolist())}")
        y = y.astype(int)

        Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25, stratify=y, random_state=seed)

        if prof.synth.get("enabled", False):
            mult = float(prof.synth.get("minority_multiplier", 2.0))
            Xtr, ytr = smote_or_oversample(Xtr, ytr, multiplier=mult, seed=seed)

        # maybe tune
        model_spec = ModelSpec(**prof.model)
        model, tune_info = _maybe_tune_tabular(model_spec, Xtr, ytr, prof.metrics.get("tune", {}))
        model.fit(Xtr, ytr)

        proba = model.predict_proba(Xte)[:,1] if hasattr(model,"predict_proba") else None
        yhat = (proba >= 0.5).astype(int) if proba is not None else model.predict(Xte)

    elif prof.modality == "imaging":
        entry = reg[prof.dataset]
        ds_tr = ImageFolderDataset(entry["root"], "train",
                                   pos_cls=entry["classes"]["positive"], neg_cls=entry["classes"]["negative"],
                                   seed=seed, img_size=128)
        ds_va = ImageFolderDataset(entry["root"], "val",
                                   pos_cls=entry["classes"]["positive"], neg_cls=entry["classes"]["negative"],
                                   seed=seed, img_size=128)
        Xtr, ytr = ds_tr.as_features_labels(synth_enabled=bool(prof.synth.get("enabled",False)),
                                            synth_multiplier=float(prof.synth.get("minority_multiplier",2.0)))
        Xte, yte = ds_va.as_features_labels(synth_enabled=False)
        model = make_imaging(ModelSpec(**prof.model))
        model.fit(Xtr, ytr)
        proba = model.predict_proba(Xte)[:,1] if hasattr(model,"predict_proba") else None
        yhat = (proba >= 0.5).astype(int) if proba is not None else model.predict(Xte)
    else:
        raise ValueError(f"Unknown modality '{prof.modality}'")

    # ---- metrics (+ threshold tuning + CI)
    acc = float(accuracy_score(yte, yhat))
    f1  = float(f1_score(yte, yhat))
    ci  = (prof.metrics.get("ci") or {})
    ci_on = bool(ci.get("enabled", True)); B=int(ci.get("B",1000)); a=float(ci.get("alpha",0.05))
    f1_ci = _boot_ci(yte, yhat, f1_score, B=B, alpha=a, seed=seed) if ci_on else None

    tuned = {}
    if bool(prof.metrics.get("threshold_tuning", False)) and proba is not None:
        prec, rec, thr = precision_recall_curve(yte, proba)
        f1s = 2*prec*rec/(prec+rec+1e-12)
        best = float(thr[f1s[:-1].argmax()]) if len(thr)>0 else 0.5
        yhat_t = (proba >= best).astype(int)
        f1_t   = float(f1_score(yte, yhat_t))
        f1_t_ci = _boot_ci(yte, yhat_t, f1_score, B=B, alpha=a, seed=seed) if ci_on else None
        tuned = {"threshold": best, "f1_tuned": f1_t, "f1_tuned_ci": f1_t_ci}

    manif = {
        "run_id": str(int(time.time())),
        "started": time.strftime("%Y-%m-%d %H:%M:%S"),
        "paper_id": prof.paper_id,
        "modality": prof.modality,
        "dataset_key": prof.dataset,
        "model": prof.model,
        "synth": prof.synth,
        "metrics": {"acc": acc, "f1": f1, "f1_ci": f1_ci, **tuned},
        "tuning": tune_info or {"tune_enabled": False}
    }
    out = RUNS/manif["run_id"]; out.mkdir(parents=True, exist_ok=True)
    (out/"manifest.json").write_text(json.dumps(manif, indent=2))

    # optional auto-doc
    if bool(prof.metrics.get("auto_doc", False)):
        try:
            from cbe_repro.reporting.write_docs import write_experiment_cards, write_section_4_2_5_report
            write_experiment_cards(); write_section_4_2_5_report()
        except Exception as e:
            print(f"[Auto-doc skipped] {e}")

    if return_results: return manif
    print(json.dumps(manif, indent=2))
'''
p = Path("mpox_repro_framework/src/cbe_repro/experiments/run_unified.py")
p.write_text(code)
print("Updated:", p)


Updated: mpox_repro_framework/src/cbe_repro/experiments/run_unified.py


In [None]:
#### Add tuning to your paper profiles

In [69]:
from pathlib import Path
import yaml

ROOT = Path.cwd() / "mpox_repro_framework"
profiles = [
    ROOT / "src" / "cbe_repro" / "configs" / "papers" / "farzipour_2023_baseline.yaml",
    ROOT / "src" / "cbe_repro" / "configs" / "papers" / "farzipour_2023_genai.yaml",
]

for p in profiles:
    prof = yaml.safe_load(p.read_text())

    # ensure dataset key matches datasets.yaml
    prof["dataset"] = "symptom_farzipour"

    # model defaults (keep your current)
    prof.setdefault("model", {}).setdefault("name", "xgboost")
    prof["model"].setdefault("params", {"n_estimators":300,"max_depth":4,"learning_rate":0.08})

    # tuning + threshold tuning + auto-doc
    prof.setdefault("metrics", {})
    prof["metrics"]["threshold_tuning"] = True
    prof["metrics"]["auto_doc"] = True
    prof["metrics"]["tune"] = {
        "enabled": True,
        "method": "grid",       # or "randomized"
        "cv": 5,
        "scoring": "f1",        # or "accuracy" if you want raw accuracy
        "param_grid": {
            "n_estimators": [200, 300, 500],
            "max_depth": [3, 4, 5],
            "learning_rate": [0.05, 0.08, 0.1],
            "subsample": [0.7, 0.9, 1.0],
            "colsample_bytree": [0.7, 0.9, 1.0],
            "min_child_weight": [1, 3, 5],
        }
    }

    # synth only for the GenAI profile
    if "genai" in p.name:
        prof["synth"] = {"enabled": True, "minority_multiplier": 2.0}
    else:
        prof["synth"] = {"enabled": False}

    p.write_text(yaml.safe_dump(prof, sort_keys=False))
    print("Updated:", p)


Updated: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/papers/farzipour_2023_baseline.yaml
Updated: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/configs/papers/farzipour_2023_genai.yaml


In [70]:
from cbe_repro.experiments.run_unified import run_from_profile
print("=== Farzipour 2023 ‚Äî BASELINE (tuned) ===")
run_from_profile("farzipour_2023_baseline.yaml")

=== Farzipour 2023 ‚Äî BASELINE (tuned) ===
[DEBUG] No synthetic data applied
Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/imaging_baseline.md
Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/imaging_genai.md
Wrote section report: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/section_4_2_5.md
{
  "run_id": "1756981628",
  "started": "2025-09-04 12:27:08",
  "paper_id": "Farzipour_2023",
  "modality": "tabular",
  "dataset_key": "symptom_farzipour",
  "model": {
    "name": "xgboost",
    "params": {
      "n_estimators": 300,
      "max_depth": 4,
      "learning_rate": 0.08
    }
  },
  "synth": {
    "enabled": false
  },
  "metrics": {
    "acc": 0.8867924528301887,
    "f1": 0.9387755102040817,
    "f1_ci": [
      0.8842105263157894,
      0.9807692307692307
    ],
    "threshold": 0.39333659410476685,
    "f1_tuned": 0.9494949494949495,


In [None]:
pip install -U "scikit-learn==1.5.2"

In [None]:
import sklearn
print(sklearn.__version__)

In [71]:
import sys, pathlib
sys.path.insert(0, str(pathlib.Path.cwd() / "mpox_repro_framework" / "src"))

In [72]:
import importlib
import cbe_repro.experiments.run_unified as ru
importlib.reload(ru)

<module 'cbe_repro.experiments.run_unified' from '/Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/experiments/run_unified.py'>

In [74]:
from cbe_repro.experiments.run_unified import run_from_profile
print("=== Farzipour 2023 ‚Äî BASELINE (tuned) ===")
run_from_profile("farzipour_2023_baseline.yaml")

print("\n=== Farzipour 2023 ‚Äî GenAI (tuned) ===")
run_from_profile("farzipour_2023_genai.yaml")

=== Farzipour 2023 ‚Äî BASELINE (tuned) ===
Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/imaging_baseline.md
Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/imaging_genai.md
Wrote section report: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/section_4_2_5.md
{
  "run_id": "1756984508",
  "started": "2025-09-04 13:15:08",
  "paper_id": "Farzipour_2023",
  "modality": "tabular",
  "dataset_key": "symptom_farzipour",
  "model": {
    "name": "xgboost",
    "params": {
      "n_estimators": 300,
      "max_depth": 4,
      "learning_rate": 0.08
    }
  },
  "synth": {
    "enabled": false
  },
  "metrics": {
    "acc": 0.8867924528301887,
    "f1": 0.9387755102040817,
    "f1_ci": [
      0.8842105263157894,
      0.9807692307692307
    ],
    "threshold": 0.39333659410476685,
    "f1_tuned": 0.9494949494949495,
    "f1_tuned_ci": [
      0.89583

In [None]:
import pandas as pd

df = pd.read_csv(r"C:\Users\chadrackm\OneDrive - University of Johannesburg\Documents\MASTER DATA SCIENCE YEAR 2\LTD SCOPE RESEARCH PROJ APPLIED DATA\Chapter Four\Framework\mpox_repro_framework\src\cbe_repro\data/monkeypox_global_symptoms.csv")
print(df['Status'].value_counts())

In [None]:
from cbe_repro.reporting.write_docs import write_experiment_cards, write_section_4_2_5_report
write_experiment_cards()
write_section_4_2_5_report()


## Part to Reload and Re-run the code

In [1]:
from cbe_repro.experiments.run_unified import run_from_profile


ModuleNotFoundError: No module named 'cbe_repro'

In [None]:
import importlib
import cbe_repro.experiments.run_unified as run_unified
import cbe_repro.reporting.write_docs as write_docs

# Reload them
importlib.reload(run_unified)
importlib.reload(write_docs)

# Now import the function again
from cbe_repro.experiments.run_unified import run_from_profile


In [80]:
print("=== Farzipour 2023 ‚Äî BASELINE (tuned) ===")
run_from_profile("farzipour_2023_baseline.yaml")

print("\n=== Farzipour 2023 ‚Äî GenAI (tuned) ===")
run_from_profile("farzipour_2023_genai.yaml")


=== Farzipour 2023 ‚Äî BASELINE (tuned) ===
[DOC] synth_enabled=False, balance_to_max=False, target_ratio=None | train_counts_before={0: 19, 1: 139} -> after={0: 19, 1: 139}
Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/imaging_baseline.md
Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/imaging_genai.md
Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/Farzipour_2023.md
Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/farzipour_2023_baseline.md
Wrote section report: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/section_4_2_5.md
{
  "run_id": "1757015029",
  "started": "2025-09-04 21:43:49",
  "paper_id": "Farzipour_2023",
  "modality": "tabular",
  "dataset_key": "symptom_farzipour",
  "model": {
    "name": "xgboost",
    "params": {
      "

## Images Dataset

In [100]:
import cbe_repro.synth.image_loader as image_loader
importlib.reload(image_loader)

<module 'cbe_repro.synth.image_loader' from '/Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/synth/image_loader.py'>

In [101]:
from cbe_repro.experiments.run_unified import run_from_profile


In [102]:
import importlib
import cbe_repro.experiments.run_unified as run_unified
import cbe_repro.reporting.write_docs as write_docs

# Reload them
importlib.reload(run_unified)
importlib.reload(write_docs)

# Now import the function again
from cbe_repro.experiments.run_unified import run_from_profile


In [103]:
from cbe_repro.experiments.run_unified import run_from_profile

print("=== Imaging ‚Äî BASELINE ===")
run_from_profile("imaging_baseline.yaml")

print("\n=== Imaging ‚Äî GenAI ===")
run_from_profile("imaging_genai.yaml")


=== Imaging ‚Äî BASELINE ===
[DOC][IMG] synth_enabled=False, balance_to_max=False, target_ratio=None | before={0: 100, 1: 81} -> after={0: 100, 1: 81}
Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/imaging_genai.md
Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/Farzipour_2023.md
Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/farzipour_2023_baseline.md
Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/farzipour_2023_genai.md
Wrote card: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/cards/imaging_baseline.md
Wrote section report: /Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/reports/section_4_2_5.md
{
  "run_id": "1757023903",
  "started": "2025-09-05 00:11:43",
  "paper_id": "test",
  "modality": "imaging",
  "da

In [98]:
import importlib, cbe_repro.synth.image_loader as il
import inspect
print(il.__file__)  # confirm it points to your edited file
importlib.reload(il)

# verify the signature really includes the new args
print(inspect.signature(il.ImageFolderDataset.as_features_labels))

/Users/munashemanzira/Downloads/Master Code/Framework/mpox_repro_framework/src/cbe_repro/synth/image_loader.py
(self, synth_enabled: 'bool' = False, synth_multiplier: 'float' = 2.0, balance_to_max: 'bool' = False, target_ratio: 'Optional[float]' = None, seed: 'int' = 1337, synth_verbose: 'bool' = True)


### VIT

In [None]:
from cbe_repro.experiments.run_unified import run_from_profile

print("=== ViT Baseline (Ayana-style) ===")
run_from_profile("imaging_vit_baseline.yaml")

print("\n=== ViT + Balanced Sampler (GenAI) ===")
run_from_profile("imaging_vit_genai.yaml")


In [1]:
import torch, torchvision, timm
print(torch.__version__, torchvision.__version__, timm.__version__)
print("MPS available:", torch.backends.mps.is_available())

2.7.1 0.22.0 1.0.19
MPS available: True


In [2]:
import sys, os
sys.path.append(os.path.expanduser("~/Downloads/Master Code/Framework/mpox_repro_framework/src"))

# sanity check
import cbe_repro
from cbe_repro.experiments.run_unified import run_from_profile

In [3]:
import importlib
import cbe_repro.experiments.run_unified as run_unified
import cbe_repro.reporting.write_docs as write_docs

# Reload them
importlib.reload(run_unified)
importlib.reload(write_docs)

# Now import the function again
from cbe_repro.experiments.run_unified import run_from_profile

In [None]:
from cbe_repro.experiments.run_unified import run_from_profile

print("=== ViT Baseline (Ayana-style) ===")
run_from_profile("imaging_vit_baseline.yaml")

print("\n=== ViT + Balanced Sampler (GenAI) ===")
run_from_profile("imaging_vit_genai.yaml")

=== ViT Baseline (Ayana-style) ===


  scaler = torch.cuda.amp.GradScaler(enabled=amp and device.type == "cuda")
  with torch.cuda.amp.autocast(enabled=amp and device.type=="cuda"):
