In [None]:
# ===== 1) Setup =====
!pip -q install scikit-learn==1.7.2 pandas==2.2.2 numpy==1.26.4 joblib==1.4.2 pyyaml==6.0.2

import os, json, joblib, time, shutil, yaml, platform
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# ---- input data: upload the CSV to /content, or mount Drive and edit this path
TRAIN_CSV = "/content/drive/MyDrive/train_test_data/genome_train_cleaned.csv"   # <-- put your file here

# ---- versioned export folder: /content/<task>/<vYYYY-MM-DD_HH-MM>
TASK = "genetic_disorder"
STAMP = time.strftime("v%Y-%m-%d_%H-%M")
EXPORT_DIR = "/".join(("/content", TASK, STAMP))
# exist_ok=True: re-running within the same minute reuses the folder.
os.makedirs(EXPORT_DIR, exist_ok=True)

# ===== 2) Define EXACT 27 feature columns =====
# Names, casing and ORDER are part of the contract with the frontend/backend
# schema; the groups below are concatenated in exactly this order.
_ONE_HOT_COLS = (
    "Gender_ambiguous",
    "Gender_female",
    "Gender_male",
    "Blood test result_abnormal",
    "Blood test result_inconclusive",
    "Blood test result_normal",
    "Blood test result_slightly abnormal",
)
_DEMOGRAPHIC_COLS = (
    "Patient Age",
    "Blood cell count (mcL)",
    "Mother's age",
    "Father's age",
    "No. of previous abortion",
    "White Blood cell count (thousand per microliter)",
)
_SYMPTOM_COLS = tuple(f"Symptom {i}" for i in range(1, 6))  # "Symptom 1" .. "Symptom 5"
_DERIVED_COLS = ("Parental Age Diff", "Symptom Score")
_INHERITANCE_COLS = (
    "Genes in mother's side",
    "Inherited from father",
    "Maternal gene",
    "Paternal gene",
)
_VITALS_COLS = (
    "Status",
    "Respiratory Rate (breaths/min)",
    "Heart Rate (rates/min)",
)

# Kept as a list because later code concatenates it with other lists.
FEATURE_COLS = [
    *_ONE_HOT_COLS,
    *_DEMOGRAPHIC_COLS,
    *_SYMPTOM_COLS,
    *_DERIVED_COLS,
    *_INHERITANCE_COLS,
    *_VITALS_COLS,
]

# Target columns: coarse disorder class and its finer-grained subclass.
TARGET_PARENT = "Genetic Disorder"
TARGET_CHILD = "Disorder Subclass"

# ===== 3) Load & clean =====
df = pd.read_csv(TRAIN_CSV)

# Fail fast, naming every absent column, before any processing happens.
needed = FEATURE_COLS + [TARGET_PARENT, TARGET_CHILD]
missing = [c for c in needed if c not in df.columns]
if missing:
    raise ValueError(f"Columns missing from CSV: {missing}")

# Keep only the needed columns (drop anything else).
df = df[needed].copy()

# Enforce numeric feature columns; values that cannot be parsed become NaN
# and are removed by the dropna below.
for c in FEATURE_COLS:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Drop rows with any NaN in features or targets (prevents 'nan' becoming a class).
df = df.dropna(subset=needed).reset_index(drop=True)

# Constrain targets to the expected label sets so stray values never become classes.
PARENT_ALLOWED = [
    "Mitochondrial genetic inheritance disorders",
    "Multifactorial genetic inheritance disorders",
    "Single-gene inheritance diseases"
]
CHILD_ALLOWED = [
    "Cancer", "Cystic fibrosis", "Diabetes", "Down syndrome",
    "Huntington's disease", "Klinefelter syndrome",
    "Leber's hereditary optic neuropathy", "Leigh syndrome", "Turner syndrome"
]
df = df[df[TARGET_PARENT].isin(PARENT_ALLOWED) & df[TARGET_CHILD].isin(CHILD_ALLOWED)].reset_index(drop=True)

# Stop here with a clear message instead of letting the label encoders or the
# train/val split fail later on an empty frame.
if df.empty:
    raise ValueError(
        "No rows left after cleaning: every row had a non-numeric/missing "
        "feature, a missing target, or a target outside the allowed classes."
    )

X = df[FEATURE_COLS].astype(float)
y_parent_str = df[TARGET_PARENT].astype(str)
y_child_str  = df[TARGET_CHILD].astype(str)

# ===== 4) Label encoders (avoid np.nan as class) =====
# One encoder per target: fitted on the string labels, then used to produce
# the integer codes the classifiers train on.
le_parent = LabelEncoder()
le_parent.fit(y_parent_str)
y_parent = le_parent.transform(y_parent_str)

le_child = LabelEncoder()
le_child.fit(y_child_str)
y_child = le_child.transform(y_child_str)

# ===== 5) Train/val split =====
# 80/20 split with a fixed seed; stratified on the parent target only.
_split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_parent}
(X_train, X_val,
 yp_train, yp_val,
 yc_train, yc_val) = train_test_split(X, y_parent, y_child, **_split_options)

# ===== 6) Train models (fixed params, no CV) =====
# Settings shared by both forests; only tree count and depth differ per target.
_shared_rf_kwargs = dict(
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=42,
    class_weight=None,
)

parent_model = RandomForestClassifier(n_estimators=400, max_depth=14, **_shared_rf_kwargs)
child_model = RandomForestClassifier(n_estimators=500, max_depth=18, **_shared_rf_kwargs)

# Both models see the same feature matrix but different label vectors.
parent_model.fit(X_train, yp_train)
child_model.fit(X_train, yc_train)

# ===== 7) Evaluate (simple metrics) =====
yp_pred = parent_model.predict(X_val)
yc_pred = child_model.predict(X_val)


def _summarize(y_true, y_pred):
    """Return accuracy, macro-F1 and the per-class report for one target.

    zero_division=0 is passed to both f1_score and classification_report so
    a class that is never predicted scores 0 consistently in both places.
    """
    return {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "f1_macro": float(f1_score(y_true, y_pred, average="macro", zero_division=0)),
        "report": classification_report(y_true, y_pred, output_dict=True, zero_division=0),
    }


metrics = {
    "parent": _summarize(yp_val, yp_pred),
    "child": _summarize(yc_val, yc_pred),
}
# ensure JSON-serializable keys (cast any np.int64 to str)
def _to_jsonable(d):
    if isinstance(d, dict):
        return {str(k): _to_jsonable(v) for k, v in d.items()}
    if isinstance(d, (np.floating, np.float32, np.float64)):
        return float(d)
    if isinstance(d, (np.integer, np.int32, np.int64)):
        return int(d)
    if isinstance(d, (list, tuple)):
        return [_to_jsonable(x) for x in d]
    return d
# Normalise the metrics dict (stringified keys, NumPy scalars unwrapped) so it
# can be written with json.dump further down.
metrics = _to_jsonable(metrics)

# ===== 8) Minimal identity pipeline (so backend .transform() is safe) =====
# We just save an identity object; the backend already aligns columns.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer

class IdentityTransformer(BaseEstimator, TransformerMixin):
    """No-op transformer: fit() returns self, transform() returns X unchanged.

    Kept so existing references to this name keep working. It is no longer
    the object that gets exported, for the reason given below.
    """
    def fit(self, X, y=None): return self
    def transform(self, X): return X

# pickle/joblib store a class by reference (module + name). A class defined in
# this notebook lives in `__main__`, so an exported IdentityTransformer can only
# be loaded by a process that defines the same class in its own `__main__`.
# FunctionTransformer() with the default func=None is scikit-learn's built-in
# identity transform and is importable wherever scikit-learn is installed.
pipeline = FunctionTransformer()

# ===== 9) Export artifacts =====
# Each entry is written to <EXPORT_DIR>/<key>.joblib with compression level 3.
_artifacts = {
    "parent_model": parent_model,
    "child_model": child_model,
    "le_parent": le_parent,
    "le_child": le_child,
    "pipeline": pipeline,
}
for _stem, _obj in _artifacts.items():
    joblib.dump(_obj, f"{EXPORT_DIR}/{_stem}.joblib", compress=3)

# schema.json payload: one {"name", "dtype"} entry per feature, in FEATURE_COLS order.
schema = {"features": [dict(name=c, dtype="float") for c in FEATURE_COLS]}

# targets.json payload: target column names plus their class labels as plain strings.
targets = {
    "targets": [TARGET_PARENT, TARGET_CHILD],
    "classes": {
        TARGET_PARENT: list(map(str, le_parent.classes_.tolist())),
        TARGET_CHILD: list(map(str, le_child.classes_.tolist())),
    },
}

# Write schema.json, targets.json and metrics.json with identical formatting.
for _filename, _payload in (
    ("schema.json", schema),
    ("targets.json", targets),
    ("metrics.json", metrics),
):
    with open(f"{EXPORT_DIR}/{_filename}", "w", encoding="utf-8") as f:
        json.dump(_payload, f, indent=2)

# model_info.yaml
import sklearn  # imported here only to record the version that actually trained the models

# Hyper-parameters reported in the info file. They are read back from the
# fitted estimators so the file cannot drift from what was really trained.
_REPORTED_PARAMS = (
    "n_estimators",
    "max_depth",
    "min_samples_split",
    "min_samples_leaf",
    "random_state",
)

info = {
    "task": TASK,
    "version": STAMP,
    "created_at": time.strftime("%Y-%m-%d %H:%M:%S"),
    "feature_count": len(FEATURE_COLS),
    # Installed version rather than a hard-coded string, so it stays correct
    # if the pinned install is changed or did not take effect.
    "sklearn_version": sklearn.__version__,
    "python": platform.python_version(),
    "params": {
        "parent_model": {p: getattr(parent_model, p) for p in _REPORTED_PARAMS},
        "child_model": {p: getattr(child_model, p) for p in _REPORTED_PARAMS},
    },
}
with open(f"{EXPORT_DIR}/model_info.yaml", "w", encoding="utf-8") as f:
    # sort_keys=False keeps the key order defined above.
    yaml.safe_dump(info, f, sort_keys=False)

# bundle_meta.json (optional helper): maps each artifact key to its full path
# inside EXPORT_DIR.
_bundle_files = {
    "parent_model": "parent_model.joblib",
    "child_model": "child_model.joblib",
    "le_parent": "le_parent.joblib",
    "le_child": "le_child.joblib",
    "pipeline": "pipeline.joblib",
    "schema": "schema.json",
    "targets": "targets.json",
    "metrics": "metrics.json",
    "model_info": "model_info.yaml",
}
bundle_meta = {
    "paths": {key: f"{EXPORT_DIR}/{fname}" for key, fname in _bundle_files.items()}
}
with open(f"{EXPORT_DIR}/bundle_meta.json", "w", encoding="utf-8") as f:
    json.dump(bundle_meta, f, indent=2)

print("Exported to:", EXPORT_DIR)

# ===== 10) Make a zip for download =====
# Archive only the version folder (base_dir) relative to the task folder
# (root_dir); make_archive returns the path of the created .zip file.
ZIP_PATH = shutil.make_archive(
    base_name=f"/content/{TASK}_{STAMP}",
    format="zip",
    root_dir=f"/content/{TASK}",
    base_dir=STAMP,
)
ZIP_PATH

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m767.5/767.5 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you h

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject