In [None]:
# ============================
# CatBoost Multi-Output (CPU) — Stable, Web-App-Safe Training + Export
# ============================

# 0) Install deps (Colab)
!pip -q install catboost joblib pandas numpy scikit-learn

import os, json, time, platform, zipfile
from pathlib import Path

import numpy as np
import pandas as pd
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from catboost import CatBoostClassifier
from google.colab import files

# One seed for the whole run: applied to NumPy here and passed to CatBoost
# as `random_seed` when the estimator is built.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# 1) Exact web-app feature names (KEEP THESE EXACT)
# The strings are matched verbatim against the CSV header, and this list's
# order is the column order selected for the model. One name per line so
# that any accidental edit shows up clearly in a diff.
RAW_FEATURES = [
    # "Gender_*" columns
    "Gender_ambiguous",
    "Gender_female",
    "Gender_male",
    # "Blood test result_*" columns
    "Blood test result_abnormal",
    "Blood test result_inconclusive",
    "Blood test result_normal",
    "Blood test result_slightly abnormal",
    # Ages and blood cell count
    "Patient Age",
    "Blood cell count (mcL)",
    "Mother's age",
    "Father's age",
    # "Test N" columns
    "Test 1",
    "Test 2",
    "Test 3",
    "Test 4",
    "Test 5",
    "No. of previous abortion",
    "White Blood cell count (thousand per microliter)",
    # "Symptom N" columns
    "Symptom 1",
    "Symptom 2",
    "Symptom 3",
    "Symptom 4",
    "Symptom 5",
    "Parental Age Diff",
    "Symptom Score",
    "Genes in mother's side",
    "Inherited from father",
    "Maternal gene",
    "Paternal gene",
    "Status",
    "Respiratory Rate (breaths/min)",
    "Heart Rate (rates/min",  # NOTE: no closing ')' -- intentional, do not "fix"
    "Follow-up",
    "Birth asphyxia",
    "Autopsy shows birth defect (if applicable)",
    "Folic acid details (peri-conceptional)",
    "H/O serious maternal illness",
    "H/O radiation exposure (x-ray)",
    "H/O substance abuse",
    "Assisted conception IVF/ART",
    "History of anomalies in previous pregnancies",
    "Birth defects",
]

# Label columns, in the order the per-target estimators are later indexed.
TARGETS = [
    "Genetic Disorder",
    "Disorder Subclass",
]

# 2) Human class names (index 0..K-1)
# The position in each list is the integer class id the name stands for.
DISORDER_NAMES = [
    "Mitochondrial genetic inheritance disorders",
    "Multifactorial genetic inheritance disorders",
    "Single-gene inheritance diseases",
]
SUBCLASS_NAMES = [
    "Cancer",
    "Cystic fibrosis",
    "Diabetes",
    "Down syndrome",
    "Huntington's disease",
    "Klinefelter syndrome",
    "Leber's hereditary optic neuropathy",
    "Leigh syndrome",
    "Turner syndrome",
]

# Target column name -> its ordered list of human-readable class names.
NAME_CATALOG = {}
NAME_CATALOG["Genetic Disorder"] = DISORDER_NAMES
NAME_CATALOG["Disorder Subclass"] = SUBCLASS_NAMES

# 3) Load data (expects the 42 features + 2 targets in train_encoded.csv)
train_df = pd.read_csv("/content/drive/MyDrive/train_test_data/train_encoded.csv")

# Validate the CSV schema with real exceptions rather than `assert`:
# assertions are removed when Python runs with -O, which would let a
# malformed file through to a much less readable failure further down.
missing_feats = [c for c in RAW_FEATURES if c not in train_df.columns]
missing_tgts  = [t for t in TARGETS if t not in train_df.columns]
if missing_feats:
    raise ValueError(f"Missing features in train_encoded.csv: {missing_feats}")
if missing_tgts:
    raise ValueError(f"Missing targets in train_encoded.csv: {missing_tgts}")

# Copies, so the coercion below never writes back into train_df.
X_train = train_df[RAW_FEATURES].copy()
Y_train = train_df[TARGETS].copy()

# (Optional) coerce to numeric just in case source file had strings.
# NOTE: values that cannot be parsed become NaN and are then replaced by 0.0.
for c in X_train.columns:
    X_train[c] = pd.to_numeric(X_train[c], errors="coerce").fillna(0.0)

# 4) CatBoost CPU (use tuned params if you have them; else safe defaults)
# Leave `best` as None (or any other falsy value) to make make_catboost_cpu()
# use its built-in defaults.
best = None
# Example tuned dict if you have it. make_catboost_cpu() indexes every one of
# these seven "clf__"-prefixed keys directly, so a non-empty dict that lacks
# any of them raises KeyError:
# best = {
#   "clf__iterations": 800, "clf__depth": 6, "clf__learning_rate": 0.05,
#   "clf__l2_leaf_reg": 3.0, "clf__random_strength": 1.0,
#   "clf__bagging_temperature": 0.5, "clf__border_count": 128
# }

def make_catboost_cpu(best_params):
    """Build an unfitted multiclass ``CatBoostClassifier``.

    Args:
        best_params: Mapping of tuned hyper-parameters keyed with a ``clf__``
            prefix (``clf__iterations``, ``clf__depth``,
            ``clf__learning_rate``, ``clf__l2_leaf_reg``,
            ``clf__random_strength``, ``clf__bagging_temperature``,
            ``clf__border_count``). Any falsy value (``None``, ``{}``)
            selects the built-in defaults instead.

    Returns:
        A ``CatBoostClassifier`` configured with ``loss_function="MultiClass"``
        and ``eval_metric="TotalF1"``, seeded with ``RANDOM_STATE``.

    Raises:
        KeyError: if ``best_params`` is truthy but lacks one of the keys above.
    """
    # Settings shared by the tuned and the default configuration.
    fixed = dict(
        loss_function="MultiClass",
        eval_metric="TotalF1",
        random_seed=RANDOM_STATE,
        verbose=100,
        allow_writing_files=False,
        thread_count=-1,  # CPU
    )

    if not best_params:
        # Safe defaults
        return CatBoostClassifier(
            iterations=800,
            depth=6,
            learning_rate=0.05,
            l2_leaf_reg=3.0,
            **fixed,
        )

    # Tuned values are looked up under their "clf__" key, in this order.
    tuned_names = (
        "iterations",
        "depth",
        "learning_rate",
        "l2_leaf_reg",
        "random_strength",
        "bagging_temperature",
        "border_count",
    )
    tuned = {name: best_params[f"clf__{name}"] for name in tuned_names}
    return CatBoostClassifier(**fixed, **tuned)

# Single estimator instance handed to MultiOutputClassifier below.
cat_cpu = make_catboost_cpu(best)

# 5) Explicit column selector (NO remainder='passthrough')
# Only the RAW_FEATURES columns are forwarded unchanged; any other column in
# the incoming frame is dropped.
keep_raw_features = ("keep", "passthrough", RAW_FEATURES)
selector = ColumnTransformer(
    [keep_raw_features],
    remainder="drop",
    verbose_feature_names_out=False,
)

# 6) Final pipeline: select -> MultiOutput(CatBoost CPU)
pipeline_steps = [
    ("select", selector),
    ("clf", MultiOutputClassifier(cat_cpu)),
]
full_pipe = Pipeline(pipeline_steps)

print("[FINAL FIT] Training CPU CatBoost pipeline on ALL data …")
full_pipe.fit(X_train, Y_train)

# 7) Sanity check
# One all-zero row carrying exactly the RAW_FEATURES columns must go through
# predict() without raising; the prediction itself is discarded.
zero_row = dict.fromkeys(RAW_FEATURES, 0.0)
probe = pd.DataFrame([zero_row])
_ = full_pipe.predict(probe)

# 8) Export bundle
# Layout: <root>/<task_name>/<version>/, where version is a minute-resolution
# timestamp taken at export time.
task_name = "genetic_disorder"
version   = time.strftime("v%Y-%m-%d_%H-%M")
root      = Path("/content/model_bundles")
out_dir   = root.joinpath(task_name, version)
out_dir.mkdir(parents=True, exist_ok=True)

# 8.1 pipeline.joblib
pipeline_path = out_dir / "pipeline.joblib"
joblib.dump(full_pipe, pipeline_path, compress=3)

# 8.2 schema.json
# Every feature is declared as a non-nullable float64, and all are required.
feature_specs = [
    {"name": feature_name, "dtype": "float64", "allow_null": False}
    for feature_name in RAW_FEATURES
]
schema = {"features": feature_specs, "required": RAW_FEATURES}
schema_path = out_dir / "schema.json"
schema_path.write_text(json.dumps(schema, indent=2), encoding="utf-8")

# 8.3 targets.json (map estimator classes -> human labels)
def _to_int_like(x):
    if isinstance(x, (np.integer, int)): return int(x)
    if isinstance(x, str) and x.isdigit(): return int(x)
    raise ValueError(f"Estimator class '{x}' not int-like; got {x!r}")

# estimators_ is indexed in TARGETS order (assumes one fitted estimator per
# target column, in the column order of Y_train).
estimators = full_pipe.named_steps["clf"].estimators_
human_classes, raw_classes = {}, {}
for i, target in enumerate(TARGETS):
    est = estimators[i]
    raw = list(est.classes_)
    # Raw labels are stringified so the payload is JSON-serialisable.
    raw_classes[target] = [str(x) for x in raw]
    cat = NAME_CATALOG[target]
    names = []
    for x in raw:
        idx = _to_int_like(x)
        # Check the range explicitly: a negative label would otherwise wrap
        # around and silently select the wrong human-readable name, and a
        # too-large one would fail with an uninformative IndexError.
        if not 0 <= idx < len(cat):
            raise ValueError(
                f"Estimator class {x!r} for target '{target}' is outside "
                f"the name catalog range 0..{len(cat) - 1}"
            )
        names.append(cat[idx])
    human_classes[target] = names

targets_payload = {
    "targets": TARGETS,
    "classes": human_classes,
    "raw_estimator_classes": raw_classes  # optional for auditing
}
(out_dir / "targets.json").write_text(json.dumps(targets_payload, indent=2), encoding="utf-8")

# 8.4 metrics.json (fill with OOF if you computed it elsewhere)
# "oof" is written as an empty placeholder; this cell computes no metrics.
metrics_payload = {
    "oof": {},
    "notes": "CPU CatBoost; explicit column selection; no remainder passthrough.",
}
metrics_path = out_dir / "metrics.json"
metrics_path.write_text(json.dumps(metrics_payload, indent=2), encoding="utf-8")

# 8.5 model_info.yaml (helps server-side diagnostics)
# Best effort: if the scikit-learn version cannot be read, record "unknown"
# instead of aborting the export.
try:
    import sklearn as sk
    skver = sk.__version__
except Exception:
    skver = "unknown"

# Environment details recorded next to the model.
trained_at = time.strftime("%Y-%m-%d %H:%M:%S")
python_version = platform.python_version()
platform_name = platform.platform()

model_info = f"""\
name: genetic_disorder_catboost_cpu_select
version: {version}
trained_at: {trained_at}
framework: sklearn+catboost
sklearn_version: {skver}
python_version: {python_version}
platform: {platform_name}
notes: "Stable export: explicit ColumnTransformer passthrough on RAW_FEATURES with remainder='drop'; no notebook-local functions; CPU CatBoost."
"""
model_info_path = out_dir / "model_info.yaml"
model_info_path.write_text(model_info, encoding="utf-8")

# 8.6 Zip & download
# The archive is written to `root`, outside out_dir, so walking out_dir never
# picks up the zip itself. Member names are stored relative to `root`.
zip_path = root / f"{task_name}_{version}.zip"
if zip_path.exists():
    zip_path.unlink()
with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
    for member in out_dir.rglob("*"):
        archive.write(member, arcname=member.relative_to(root))

print("Bundle ready at:", zip_path)
files.download(zip_path.as_posix())

print("sklearn_version used for training:", skver)


[FINAL FIT] Training CPU CatBoost pipeline on ALL data …
0:	learn: 0.5544976	total: 31.7ms	remaining: 25.3s
100:	learn: 0.6146241	total: 3.07s	remaining: 21.2s
200:	learn: 0.6374719	total: 5.85s	remaining: 17.4s
300:	learn: 0.6585415	total: 7.22s	remaining: 12s
400:	learn: 0.6779890	total: 8.58s	remaining: 8.54s
500:	learn: 0.6926012	total: 9.93s	remaining: 5.92s
600:	learn: 0.7080699	total: 11.3s	remaining: 3.74s
700:	learn: 0.7222898	total: 12.7s	remaining: 1.79s
799:	learn: 0.7370004	total: 14s	remaining: 0us
0:	learn: 0.3521167	total: 39.5ms	remaining: 31.5s
100:	learn: 0.4444412	total: 5.56s	remaining: 38.5s
200:	learn: 0.4963341	total: 9.24s	remaining: 27.5s
300:	learn: 0.5339999	total: 12.9s	remaining: 21.4s
400:	learn: 0.5712618	total: 17.6s	remaining: 17.5s
500:	learn: 0.6057266	total: 22.1s	remaining: 13.2s
600:	learn: 0.6367444	total: 25.8s	remaining: 8.53s
700:	learn: 0.6644839	total: 29.4s	remaining: 4.16s
799:	learn: 0.6907340	total: 34.9s	remaining: 0us
Bundle ready at: 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

sklearn_version used for training: 1.6.1
