## AUROC, C-index / OR, HR adjusted with variables included in the CAIDE score

In [None]:
import os, warnings
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.metrics import roc_auc_score
import statsmodels.api as sm
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index

# Suppress UserWarning / RuntimeWarning output for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

# ------------------
# Project root, directory holding one sub-directory per model, and the summary CSV to write.
WORK_DIR  = "/home/hch/dementia"
INFER_DIR = os.path.join(WORK_DIR, "infer_out")
OUT_SUMMARY = os.path.join(INFER_DIR, "summary_metrics_pretty.csv")

# The relative CSV names read below are resolved against WORK_DIR.
os.chdir(WORK_DIR)

# ------------------
def fmt_p(p):
    """Format a p-value for the summary table.

    Returns NaN for missing input, the string "<0.001" for values below
    0.001, and otherwise the value rounded to three decimals (as a float).
    Input that cannot be converted to float is returned unchanged.
    """
    if pd.isna(p):
        return np.nan
    try:
        value = float(p)
    except Exception:
        # e.g. an already formatted "<0.001" string
        return p
    if value < 0.001:
        return "<0.001"
    return float(f"{value:.3f}")

def safe_auc(y, s):
    """Return the ROC AUC of scores ``s`` against labels ``y``, or NaN.

    NaN is returned when ``y`` is empty, when it does not contain exactly two
    distinct values, or when the computation raises for any reason.
    """
    auc = np.nan
    try:
        is_binary = len(y) > 0 and pd.Series(y).nunique() == 2
        if is_binary:
            auc = float(roc_auc_score(y, s))
    except Exception:
        # Best effort: any failure is reported as "no AUC available".
        auc = np.nan
    return auc

def cindex_from_risk(times, risk, events):
    """Return the concordance index for a risk score, or NaN on any failure.

    The risk is negated before it is handed to ``concordance_index`` as
    ``predicted_scores``.
    """
    try:
        durations = np.asarray(times)
        negated_risk = -np.asarray(risk)
        observed = np.asarray(events)
        return concordance_index(
            event_times=durations,
            predicted_scores=negated_risk,
            event_observed=observed,
        )
    except Exception:
        return np.nan

# ------------------
# Detection cohort: keep only rows whose `test` column is True and renumber them from 0.
det_all = pd.read_csv("dementia_detection.csv", low_memory=False)
det_test = det_all[det_all["test"] == True].reset_index(drop=True)

# Prediction (survival) cohort, loaded in full.
surv_all = pd.read_csv("dementia_prediction.csv", low_memory=False).reset_index(drop=True)

def add_common_covs(df):
    """Return a copy of ``df`` with the covariate columns used by the analysis.

    * ``STDY_DT`` (when present) is parsed to datetime; unparseable values
      become NaT.
    * ``SEXINT`` (when absent) is derived from ``SEX``: 1 for ``"M"``, 0 for
      any other non-missing value, and NaN when ``SEX`` is missing or the
      column does not exist.
    * ``EXERCISE_STATUS_bin`` (when absent) is 1 for ``EXERCISE_STATUS >= 2``,
      0 for ``<= 1`` and NaN otherwise (including a missing source column).
    * ``STDY_AGE``, ``cholesterol_updated``, ``sbp``, ``bmi`` and
      ``days_diff`` are coerced to numeric, or created as NaN when absent.

    The input frame is not modified.
    """
    out = df.copy()

    if "STDY_DT" in out.columns:
        out["STDY_DT"] = pd.to_datetime(out["STDY_DT"], errors="coerce")

    if "SEXINT" not in out.columns:
        sex = out["SEX"] if "SEX" in out.columns else pd.Series(np.nan, index=out.index)
        # Keep a missing SEX as NaN; a plain (sex == "M").astype(int) would
        # silently code it as 0 and let the row pass the not-NaN covariate masks.
        out["SEXINT"] = np.where(sex.isna(), np.nan, (sex == "M").astype(int))

    if "EXERCISE_STATUS_bin" not in out.columns:
        if "EXERCISE_STATUS" in out.columns:
            ex = pd.to_numeric(out["EXERCISE_STATUS"], errors="coerce")
            out["EXERCISE_STATUS_bin"] = np.where(ex >= 2, 1, np.where(ex <= 1, 0, np.nan))
        else:
            out["EXERCISE_STATUS_bin"] = np.nan

    for c in ["STDY_AGE", "cholesterol_updated", "sbp", "bmi", "days_diff"]:
        if c in out.columns:
            out[c] = pd.to_numeric(out[c], errors="coerce")
        else:
            out[c] = np.nan

    return out

# Add the derived / numerically coerced covariate columns to both cohorts.
det_test = add_common_covs(det_test)
surv_all = add_common_covs(surv_all)

def recompute_survival_frame(frame: pd.DataFrame) -> pd.DataFrame:
    """Build the landmark survival frame from ``days_diff`` and ``STDY_DT``.

    Rows with ``days_diff >= 730`` are events; rows with a missing
    ``days_diff`` are kept as censored; all other rows are dropped (the index
    is not reset). ``obs_time`` is ``days_diff - 730`` for events and
    ``(2019-01-01 - STDY_DT) in days - 730`` for censored rows. Times above
    3650 are capped at 3650, and events sitting at 3650 are turned into
    censored observations.
    """
    landmark, horizon = 730, 3650
    study_end = pd.Timestamp("2019-01-01")

    work = frame.copy()
    work["event"] = work["days_diff"].ge(landmark).astype(int)
    keep = (work["event"] == 1) | work["days_diff"].isna()
    work = work.loc[keep].copy()

    is_event = work["event"] == 1
    time_if_event = work["days_diff"].sub(landmark).clip(lower=0)
    time_if_censored = (study_end - work["STDY_DT"]).dt.days - landmark
    work["obs_time"] = np.where(is_event, time_if_event, time_if_censored)

    # Administrative censoring at the horizon.
    work.loc[work["obs_time"] > horizon, "obs_time"] = horizon
    work.loc[is_event & (work["obs_time"] == horizon), "event"] = 0
    return work

# Replace surv_all with its survival frame; rows are dropped there and the index is not reset.
surv_all = recompute_survival_frame(surv_all)

def calc_caide_napoe(frame: pd.DataFrame) -> pd.Series:
    """Compute the point score named ``CAIDE_noAPOE`` for every row of ``frame``.

    Points: age <47 -> 0, 47-53 -> 3, >53 -> 4 (NaN age -> NaN score);
    SEXINT == 1 -> 1; sbp >= 140 -> 2; bmi >= 30 -> 2;
    cholesterol_updated * 0.02586 >= 6.5 -> 2; EXERCISE_STATUS >= 2 -> 0,
    otherwise 1. The education term is fixed at 0. Missing inputs other than
    age fall into the "condition not met" branch, so callers should combine
    the score with a validity mask. Absent columns are treated as all-NaN.
    """
    needed = ("STDY_AGE", "SEXINT", "sbp", "bmi", "cholesterol_updated", "EXERCISE_STATUS")
    vals = {}
    for name in needed:
        if name in frame.columns:
            vals[name] = pd.to_numeric(frame[name], errors="coerce")
        else:
            vals[name] = pd.Series(np.nan, index=frame.index)

    age = vals["STDY_AGE"]
    age_pts = np.select(
        [age < 47, (age >= 47) & (age <= 53), age > 53],
        [0, 3, 4],
        default=np.nan,
    )

    def flag(condition):
        # 0/1 integer array for a boolean Series (NaN comparisons are False).
        return condition.to_numpy().astype(int)

    sex_pts = flag(vals["SEXINT"] == 1)
    edu_pts = np.zeros(len(frame), dtype=float)
    sbp_pts = 2 * flag(vals["sbp"] >= 140)
    bmi_pts = 2 * flag(vals["bmi"] >= 30)
    chol_pts = 2 * flag(vals["cholesterol_updated"] * 0.02586 >= 6.5)
    pa_pts = 1 - flag(vals["EXERCISE_STATUS"] >= 2)

    total = age_pts + sex_pts + edu_pts + sbp_pts + bmi_pts + chol_pts + pa_pts
    return pd.Series(total, index=frame.index, name="CAIDE_noAPOE")

def caide_valid_mask(frame: pd.DataFrame) -> pd.Series:
    """Return a boolean Series that is True where no CAIDE input column is missing."""
    required = [
        "STDY_AGE", "SEXINT", "sbp", "bmi", "cholesterol_updated", "EXERCISE_STATUS",
    ]
    has_gap = frame[required].isna().any(axis=1)
    return ~has_gap

# CAIDE score (APOE-free variant) and the rows on which every CAIDE input is present.
det_CAIDE = calc_caide_napoe(det_test)
det_CAIDE_mask = caide_valid_mask(det_test)

surv_CAIDE = calc_caide_napoe(surv_all)
surv_CAIDE_mask = caide_valid_mask(surv_all) & surv_all["obs_time"].notna() & surv_all["event"].notna()

# Covariates entered next to the model prediction in the adjusted OR / HR models.
COVARS = ["STDY_AGE","SEXINT","cholesterol_updated","sbp","bmi","EXERCISE_STATUS_bin"]

X_test_base = det_test[COVARS]
y_test      = pd.to_numeric(det_test.get("label", det_test.get("label_det", np.nan)), errors="coerce") \
              if "label" in det_test.columns or "label_det" in det_test.columns else None

test_cov_mask = X_test_base.notna().all(axis=1)

X_fut_base = surv_all[COVARS]
fut_cov_mask = X_fut_base.notna().all(axis=1)
fut_obs_time = surv_all["obs_time"].to_numpy()
fut_event    = surv_all["event"].to_numpy()

# Column layout of the summary. Passing it to the DataFrame constructor keeps
# the layout stable and lets the sort below work even when no row was produced.
# NOTE: the HR p-value column is named "HRa_p" because the follow-up cells that
# read this CSV look it up under that name.
SUMMARY_COLUMNS = [
    "model_desc",
    "Test_AUC_MODEL", "Test_AUC_CAIDE",
    "OR_adj", "OR_adj_low", "OR_adj_hi", "OR_adj_p",
    "Future_cindex_MODEL", "Future_cindex_CAIDE",
    "HR_adj", "HR_adj_low", "HR_adj_hi", "HRa_p",
    "n_test_used", "n_fut_used",
]


def _round3(value):
    """Return ``value`` rounded to three decimals, or None when it is missing."""
    return None if pd.isna(value) else float(f"{value:.3f}")


# ------------------
# One sub-directory of INFER_DIR per model; only those with test predictions are used.
model_dirs = sorted([p for p in Path(INFER_DIR).glob("*") if p.is_dir() and (p/"test_preds.csv").exists()])

rows = []
for mdir in model_dirs:
    desc = mdir.name

    test_csv = mdir / "test_preds.csv"
    try:
        tp = pd.read_csv(
            test_csv,
            usecols=["idx","label","pred"],
            dtype={"idx": np.int64, "label": np.float64, "pred": np.float64}
        )
    except Exception as e:
        print(f"[WARN] skip {desc} (bad test_preds.csv): {e}")
        continue

    # `idx` is used as a positional index into det_test.
    idx_t = tp["idx"].to_numpy()
    label_det = tp["label"].to_numpy(dtype=float)
    pred_t    = tp["pred"].to_numpy(dtype=float)
    pred10_t  = pred_t * 10.0   # one unit of pred10 equals 0.1 of `pred`

    auc_model = safe_auc(label_det, pred_t)

    mask_caide_t = det_CAIDE_mask.to_numpy()[idx_t]
    auc_caide = safe_auc(label_det[mask_caide_t], det_CAIDE.to_numpy()[idx_t][mask_caide_t])

    # Adjusted OR (per 10% ↑)
    OR_a = ORa_low = ORa_hi = ORa_p = np.nan
    try:
        mask_logit = (
            test_cov_mask.to_numpy()[idx_t] &
            np.isfinite(pred10_t) & np.isfinite(label_det)
        )
        if mask_logit.any():
            Xlog = X_test_base.iloc[idx_t[mask_logit]].copy()
            Xlog = sm.add_constant(pd.concat([pd.Series(pred10_t[mask_logit], name="pred10", index=Xlog.index), Xlog], axis=1))
            ylog = pd.Series(label_det[mask_logit], index=Xlog.index, name="label_det")
            lg = sm.Logit(ylog, Xlog).fit(disp=False)
            OR_a = float(np.exp(lg.params["pred10"]))
            ci = lg.conf_int().loc["pred10"]
            # Positional access: first entry is the lower bound, second the upper.
            ORa_low, ORa_hi = float(np.exp(ci.iloc[0])), float(np.exp(ci.iloc[1]))
            ORa_p = fmt_p(lg.pvalues["pred10"])
    except Exception as e:
        print(f"[WARN] Adjusted OR failed @ {desc}: {e}")

    # Survival-side results default to "not available"; some model folders
    # have no prediction_preds.csv.
    cidx_model = cidx_caide = np.nan
    HR_a = HRa_low = HRa_hi = HRa_p = np.nan
    n_fut_used = 0

    fut_csv = mdir / "prediction_preds.csv"
    if fut_csv.exists():
        try:
            fp = pd.read_csv(
                fut_csv,
                usecols=["idx","pred"],
                dtype={"idx": np.int64, "pred": np.float64}
            )
        except Exception as e:
            print(f"[WARN] skip {desc} (bad prediction_preds.csv): {e}")
            continue

        # NOTE(review): `idx` is applied positionally to surv_all *after*
        # recompute_survival_frame dropped rows without resetting the index --
        # confirm that the inference code numbered rows the same way.
        idx_f = fp["idx"].to_numpy()
        pred_f = fp["pred"].to_numpy(dtype=float)
        pred10_f = pred_f * 10.0
        n_fut_used = int(len(idx_f))

        cidx_model = cindex_from_risk(
            fut_obs_time[idx_f],
            pred_f,
            fut_event[idx_f]
        )

        mask_caide_f = surv_CAIDE_mask.to_numpy()[idx_f]
        cidx_caide = cindex_from_risk(
            fut_obs_time[idx_f][mask_caide_f],
            surv_CAIDE.to_numpy()[idx_f][mask_caide_f],
            fut_event[idx_f][mask_caide_f]
        )

        # Adjusted HR for pred10
        try:
            mask_cox = (
                fut_cov_mask.to_numpy()[idx_f] &
                np.isfinite(pred10_f) &
                np.isfinite(fut_obs_time[idx_f]) &
                np.isfinite(fut_event[idx_f])
            )
            if mask_cox.any():
                Xcox = X_fut_base.iloc[idx_f[mask_cox]].copy()
                df_cox = pd.concat(
                    [
                        pd.Series(fut_obs_time[idx_f][mask_cox], name="obs_time", index=Xcox.index),
                        pd.Series(fut_event[idx_f][mask_cox],    name="event",    index=Xcox.index),
                        pd.Series(pred10_f[mask_cox],            name="pred10",   index=Xcox.index),
                        Xcox
                    ],
                    axis=1
                )
                cph = CoxPHFitter()
                cph.fit(df_cox, duration_col="obs_time", event_col="event", show_progress=False)
                HR_a = float(np.exp(cph.params_["pred10"]))
                ci = cph.confidence_intervals_.loc["pred10"]
                # The CI row is labelled with strings, so integer keys must go
                # through .iloc (plain ci[0] relies on deprecated positional fallback).
                HRa_low, HRa_hi = float(np.exp(ci.iloc[0])), float(np.exp(ci.iloc[1]))
                HRa_p = fmt_p(cph.summary.loc["pred10","p"])
        except Exception as e:
            print(f"[WARN] Adjusted HR failed @ {desc}: {e}")

    # Single row layout for models with and without survival predictions.
    rows.append(dict(
        model_desc = desc,
        Test_AUC_MODEL = _round3(auc_model),
        Test_AUC_CAIDE = _round3(auc_caide),
        OR_adj = OR_a, OR_adj_low = ORa_low, OR_adj_hi = ORa_hi, OR_adj_p = ORa_p,
        Future_cindex_MODEL = _round3(cidx_model),
        Future_cindex_CAIDE = _round3(cidx_caide),
        HR_adj = HR_a, HR_adj_low = HRa_low, HR_adj_hi = HRa_hi, HRa_p = HRa_p,
        n_test_used = int(len(idx_t)),
        n_fut_used  = n_fut_used,
    ))

summary = pd.DataFrame(rows, columns=SUMMARY_COLUMNS).sort_values("model_desc").reset_index(drop=True)

# Estimates and bounds are written as fixed 3-decimal strings ("" when missing).
for col in ["OR_adj","OR_adj_low","OR_adj_hi","HR_adj","HR_adj_low","HR_adj_hi"]:
    if col in summary.columns:
        summary[col] = summary[col].apply(lambda v: "" if pd.isna(v) else f"{float(v):.3f}")

summary.to_csv(OUT_SUMMARY, index=False, encoding="utf-8")
print(f"[Saved] {OUT_SUMMARY}")

summary

In [None]:
# === Make compact OR/HR table from summary_metrics_pretty.csv ===
import os
import pandas as pd
from pathlib import Path

# Source summary CSV and the output paths of the compact OR/HR table.
WORK_DIR  = "/home/hch/dementia"
INFER_DIR = os.path.join(WORK_DIR, "infer_out")
SRC = os.path.join(INFER_DIR, "summary_metrics_pretty.csv")
OUT_CSV = os.path.join(INFER_DIR, "summary_or_hr_table.csv")
OUT_XLSX = os.path.join(INFER_DIR, "summary_or_hr_table.xlsx")  # save if desired

# ----- helpers -----
def fmt_ci(val, lo, hi):
    """Return ``"est (lo - hi)"`` with three decimals each, or ``""`` if any part is missing."""
    parts = (val, lo, hi)
    if any(pd.isna(part) for part in parts):
        return ""
    est, lower, upper = [f"{float(part):.3f}" for part in parts]
    return f"{est} ({lower} - {upper})"

def fmt_p(p):
    """Return a p-value as display text.

    Missing input gives ``""``, values below 0.001 give ``"<0.001"``, other
    numeric values are printed with three decimals, and anything that cannot
    be converted to float is passed through as ``str(p)``.
    """
    if pd.isna(p):
        return ""
    try:
        number = float(p)
    except Exception:
        # The value may already be a string such as "<0.001".
        return str(p)
    if number < 0.001:
        return "<0.001"
    return f"{number:.3f}"

# ----- load -----
df = pd.read_csv(SRC)


def _column(*names):
    """Return the first of ``names`` that is a column of ``df``, else an all-NaN column.

    ``df.get(name)`` alone returns None for an absent column, which then fails
    inside ``zip`` / iteration; the NaN fallback renders as empty cells instead.
    """
    for name in names:
        if name in df.columns:
            return df[name]
    return pd.Series(float("nan"), index=df.index)


# ----- build compact table -----
out = pd.DataFrame({
    "model_desc": df["model_desc"] if "model_desc" in df.columns else pd.Series([""] * len(df), index=df.index),
    "adjusted OR": [fmt_ci(a, l, h) for a, l, h in zip(_column("OR_adj"), _column("OR_adj_low"), _column("OR_adj_hi"))],
    "adjusted OR p-val": [fmt_p(p) for p in _column("OR_adj_p")],
    "adjusted HR": [fmt_ci(a, l, h) for a, l, h in zip(_column("HR_adj"), _column("HR_adj_low"), _column("HR_adj_hi"))],
    # The HR p-value may be stored as "HRa_p" or, in some summaries, only as
    # "HR_adj_p"; "HRa_p" is tried first.
    "adjusted HR p-val": [fmt_p(p) for p in _column("HRa_p", "HR_adj_p")],
})

cols = ["model_desc", "adjusted OR", "adjusted OR p-val", "adjusted HR", "adjusted HR p-val"]
out = out[cols]

# ----- save -----
Path(INFER_DIR).mkdir(parents=True, exist_ok=True)
out.to_csv(OUT_CSV, index=False, encoding="utf-8")

with pd.ExcelWriter(OUT_XLSX, engine="xlsxwriter") as w:
    out.to_excel(w, index=False, sheet_name="OR_HR")

print(f"[Saved] {OUT_CSV}")
print(f"[Saved] {OUT_XLSX}")

out.head()


In [None]:
# === Append compact OR/HR columns while keeping all original columns ===
import os
import pandas as pd
from pathlib import Path

# Source summary CSV and the path of the copy that gets the extra formatted columns.
WORK_DIR  = "/home/hch/dementia"
INFER_DIR = os.path.join(WORK_DIR, "infer_out")
SRC = os.path.join(INFER_DIR, "summary_metrics_pretty.csv")
OUT = os.path.join(INFER_DIR, "summary_metrics_with_orhr.csv")

# ----- helpers -----
def fmt_ci(val, lo, hi):
    """Return ``"est (lo - hi)"`` with three decimals each, or ``""`` if any part is missing."""
    parts = (val, lo, hi)
    if any(pd.isna(part) for part in parts):
        return ""
    est, lower, upper = [f"{float(part):.3f}" for part in parts]
    return f"{est} ({lower} - {upper})"

def fmt_p(p):
    """Return a p-value as display text.

    Missing input gives ``""``, values below 0.001 give ``"<0.001"``, other
    numeric values are printed with three decimals, and anything that cannot
    be converted to float is passed through as ``str(p)``.
    """
    if pd.isna(p):
        return ""
    try:
        number = float(p)
    except Exception:
        return str(p)
    if number < 0.001:
        return "<0.001"
    return f"{number:.3f}"

# ----- load -----
df = pd.read_csv(SRC)

# The HR p-value may be stored as "HRa_p" or, in some summaries, only as
# "HR_adj_p". Try "HRa_p" first and fall back to an all-NaN column (rendered
# as empty text) instead of failing with a KeyError.
_hr_p_name = next((name for name in ("HRa_p", "HR_adj_p") if name in df.columns), None)
_hr_p = df[_hr_p_name] if _hr_p_name is not None else pd.Series(float("nan"), index=df.index)

# ----- add new formatted columns -----
df["adjusted OR"]       = [fmt_ci(a, l, h) for a, l, h in zip(df["OR_adj"], df["OR_adj_low"], df["OR_adj_hi"])]
df["adjusted OR p-val"] = [fmt_p(p) for p in df["OR_adj_p"]]
df["adjusted HR"]       = [fmt_ci(a, l, h) for a, l, h in zip(df["HR_adj"], df["HR_adj_low"], df["HR_adj_hi"])]
df["adjusted HR p-val"] = [fmt_p(p) for p in _hr_p]

# ----- save -----
Path(INFER_DIR).mkdir(parents=True, exist_ok=True)
df.to_csv(OUT, index=False, encoding="utf-8")

print(f"[Saved] {OUT}")
df.head()


## Unadjusted OR, HR / age- and sex-adjusted OR, HR

In [None]:
import os, warnings
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.metrics import roc_auc_score
import statsmodels.api as sm
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index

# Suppress UserWarning / RuntimeWarning output for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

# ------------------
# Project root, directory holding one sub-directory per model, and the extended summary CSV to write.
WORK_DIR  = "/home/hch/dementia"
INFER_DIR = os.path.join(WORK_DIR, "infer_out")
OUT_SUMMARY = os.path.join(INFER_DIR, "summary_metrics_pretty_extened.csv")

# The relative CSV names read below are resolved against WORK_DIR.
os.chdir(WORK_DIR)

# ------------------
def fmt_p(p):
    """Format a p-value for the summary table.

    Returns NaN for missing input, the string "<0.001" for values below
    0.001, and otherwise the value rounded to three decimals (as a float).
    Input that cannot be converted to float is returned unchanged.
    """
    if pd.isna(p):
        return np.nan
    try:
        value = float(p)
    except Exception:
        # e.g. an already formatted "<0.001" string
        return p
    if value < 0.001:
        return "<0.001"
    return float(f"{value:.3f}")

def safe_auc(y, s):
    """Return the ROC AUC of scores ``s`` against labels ``y``, or NaN.

    NaN is returned when ``y`` is empty, when it does not contain exactly two
    distinct values, or when the computation raises for any reason.
    """
    auc = np.nan
    try:
        is_binary = len(y) > 0 and pd.Series(y).nunique() == 2
        if is_binary:
            auc = float(roc_auc_score(y, s))
    except Exception:
        # Best effort: any failure is reported as "no AUC available".
        auc = np.nan
    return auc

def cindex_from_risk(times, risk, events):
    """Return the concordance index for a risk score, or NaN on any failure.

    The risk is negated before it is handed to ``concordance_index`` as
    ``predicted_scores``.
    """
    try:
        durations = np.asarray(times)
        negated_risk = -np.asarray(risk)
        observed = np.asarray(events)
        return concordance_index(
            event_times=durations,
            predicted_scores=negated_risk,
            event_observed=observed,
        )
    except Exception:
        return np.nan

# ------------------
# Detection cohort: keep only rows whose `test` column is True and renumber them from 0.
det_all = pd.read_csv("dementia_detection.csv", low_memory=False)
det_test = det_all[det_all["test"] == True].reset_index(drop=True)

# Prediction (survival) cohort, loaded in full.
surv_all = pd.read_csv("dementia_prediction.csv", low_memory=False).reset_index(drop=True)

def add_common_covs(df):
    """Return a copy of ``df`` with the covariate columns used by the analysis.

    ``STDY_DT`` is parsed to datetime when present. ``SEXINT`` (when absent)
    is 1 for ``SEX == "M"``, 0 for other non-missing values and NaN when
    ``SEX`` is missing or not a column. ``EXERCISE_STATUS_bin`` (when absent)
    is 1 for ``EXERCISE_STATUS >= 2``, 0 for ``<= 1`` and NaN otherwise.
    ``STDY_AGE``, ``cholesterol_updated``, ``sbp``, ``bmi`` and ``days_diff``
    are coerced to numeric, or created as NaN when absent.
    """
    out = df.copy()
    present = out.columns

    if "STDY_DT" in present:
        out["STDY_DT"] = pd.to_datetime(out["STDY_DT"], errors="coerce")

    if "SEXINT" not in out.columns:
        if "SEX" in out.columns:
            sex = out["SEX"]
        else:
            sex = pd.Series(np.nan, index=out.index)
        male_flag = (sex == "M").astype(int)
        out["SEXINT"] = np.where(sex.isna(), np.nan, male_flag)

    if "EXERCISE_STATUS_bin" not in out.columns:
        exercise_bin = np.nan
        if "EXERCISE_STATUS" in out.columns:
            level = pd.to_numeric(out["EXERCISE_STATUS"], errors="coerce")
            exercise_bin = np.select([level >= 2, level <= 1], [1.0, 0.0], default=np.nan)
        out["EXERCISE_STATUS_bin"] = exercise_bin

    numeric_cols = ("STDY_AGE", "cholesterol_updated", "sbp", "bmi", "days_diff")
    for name in numeric_cols:
        out[name] = pd.to_numeric(out[name], errors="coerce") if name in out.columns else np.nan

    return out

# Add the derived / numerically coerced covariate columns to both cohorts.
det_test = add_common_covs(det_test)
surv_all = add_common_covs(surv_all)

def recompute_survival_frame(frame: pd.DataFrame) -> pd.DataFrame:
    """Build the landmark survival frame from ``days_diff`` and ``STDY_DT``.

    Rows with ``days_diff >= 730`` are events; rows with a missing
    ``days_diff`` are kept as censored; all other rows are dropped (the index
    is not reset). ``obs_time`` is ``days_diff - 730`` for events and
    ``(2019-01-01 - STDY_DT) in days - 730`` for censored rows. Times above
    3650 are capped at 3650, and events sitting at 3650 are turned into
    censored observations.
    """
    landmark, horizon = 730, 3650
    study_end = pd.Timestamp("2019-01-01")

    work = frame.copy()
    work["event"] = work["days_diff"].ge(landmark).astype(int)
    keep = (work["event"] == 1) | work["days_diff"].isna()
    work = work.loc[keep].copy()

    is_event = work["event"] == 1
    time_if_event = work["days_diff"].sub(landmark).clip(lower=0)
    time_if_censored = (study_end - work["STDY_DT"]).dt.days - landmark
    work["obs_time"] = np.where(is_event, time_if_event, time_if_censored)

    # Administrative censoring at the horizon.
    work.loc[work["obs_time"] > horizon, "obs_time"] = horizon
    work.loc[is_event & (work["obs_time"] == horizon), "event"] = 0
    return work

# Replace surv_all with its survival frame; rows are dropped there and the index is not reset.
surv_all = recompute_survival_frame(surv_all)

def calc_caide_napoe(frame: pd.DataFrame) -> pd.Series:
    """Compute the point score named ``CAIDE_noAPOE`` for every row of ``frame``.

    Points: age <47 -> 0, 47-53 -> 3, >53 -> 4 (NaN age -> NaN score);
    SEXINT == 1 -> 1; sbp >= 140 -> 2; bmi >= 30 -> 2;
    cholesterol_updated * 0.02586 >= 6.5 -> 2; EXERCISE_STATUS >= 2 -> 0,
    otherwise 1. The education term is fixed at 0. Missing inputs other than
    age fall into the "condition not met" branch, so callers should combine
    the score with a validity mask. Absent columns are treated as all-NaN.
    """
    needed = ("STDY_AGE", "SEXINT", "sbp", "bmi", "cholesterol_updated", "EXERCISE_STATUS")
    vals = {}
    for name in needed:
        if name in frame.columns:
            vals[name] = pd.to_numeric(frame[name], errors="coerce")
        else:
            vals[name] = pd.Series(np.nan, index=frame.index)

    age = vals["STDY_AGE"]
    age_pts = np.select(
        [age < 47, (age >= 47) & (age <= 53), age > 53],
        [0, 3, 4],
        default=np.nan,
    )

    def flag(condition):
        # 0/1 integer array for a boolean Series (NaN comparisons are False).
        return condition.to_numpy().astype(int)

    sex_pts = flag(vals["SEXINT"] == 1)
    edu_pts = np.zeros(len(frame), dtype=float)
    sbp_pts = 2 * flag(vals["sbp"] >= 140)
    bmi_pts = 2 * flag(vals["bmi"] >= 30)
    chol_pts = 2 * flag(vals["cholesterol_updated"] * 0.02586 >= 6.5)
    pa_pts = 1 - flag(vals["EXERCISE_STATUS"] >= 2)

    total = age_pts + sex_pts + edu_pts + sbp_pts + bmi_pts + chol_pts + pa_pts
    return pd.Series(total, index=frame.index, name="CAIDE_noAPOE")

def caide_valid_mask(frame: pd.DataFrame) -> pd.Series:
    """Return a boolean Series that is True where no CAIDE input column is missing."""
    required = [
        "STDY_AGE", "SEXINT", "sbp", "bmi", "cholesterol_updated", "EXERCISE_STATUS",
    ]
    has_gap = frame[required].isna().any(axis=1)
    return ~has_gap

# CAIDE score (APOE-free variant) and the rows on which every CAIDE input is present.
det_CAIDE = calc_caide_napoe(det_test)
det_CAIDE_mask = caide_valid_mask(det_test)

surv_CAIDE = calc_caide_napoe(surv_all)
surv_CAIDE_mask = caide_valid_mask(surv_all) & surv_all["obs_time"].notna() & surv_all["event"].notna()

# Covariate sets: full adjustment, and age/sex only.
COVARS_ALL = ["STDY_AGE","SEXINT","cholesterol_updated","sbp","bmi","EXERCISE_STATUS_bin"]
COVARS_AS  = ["STDY_AGE","SEXINT"]

# Test
X_test_all = det_test[COVARS_ALL]
X_test_as  = det_test[COVARS_AS]
y_test_ref = pd.to_numeric(det_test.get("label", det_test.get("label_det", np.nan)), errors="coerce") \
             if "label" in det_test.columns or "label_det" in det_test.columns else None

mask_test_all = X_test_all.notna().all(axis=1)
mask_test_as  = X_test_as.notna().all(axis=1)

# Future
X_fut_all = surv_all[COVARS_ALL]
X_fut_as  = surv_all[COVARS_AS]

mask_fut_all = X_fut_all.notna().all(axis=1)
mask_fut_as  = X_fut_as.notna().all(axis=1)

fut_obs_time = surv_all["obs_time"].to_numpy()
fut_event    = surv_all["event"].to_numpy()

# (estimate, lower bound, upper bound, p-value) placeholder for "no result".
_NO_ESTIMATE = (np.nan, np.nan, np.nan, np.nan)
_EFFECT_SUFFIXES = ("", "_low", "_hi", "_p")

# (column prefix, label used in warnings, covariate frame, complete-covariate mask)
OR_SPECS = (
    ("OR_unadj",   "Unadj OR",       None,       None),
    ("OR_age_sex", "Age/Sex-adj OR", X_test_as,  mask_test_as),
    ("OR_adj",     "Fully-adj OR",   X_test_all, mask_test_all),
)
HR_SPECS = (
    ("HR_unadj",   "Unadj HR",       None,      None),
    ("HR_age_sex", "Age/Sex-adj HR", X_fut_as,  mask_fut_as),
    ("HR_adj",     "Fully-adj HR",   X_fut_all, mask_fut_all),
)

# Fixed column layout; passing it to the DataFrame constructor lets the sort
# below work even when no row was produced.
SUMMARY_COLUMNS = (
    ["model_desc", "Test_AUC_MODEL", "Test_AUC_CAIDE"]
    + [prefix + suffix for prefix, *_ in OR_SPECS for suffix in _EFFECT_SUFFIXES]
    + ["Future_cindex_MODEL", "Future_cindex_CAIDE"]
    + [prefix + suffix for prefix, *_ in HR_SPECS for suffix in _EFFECT_SUFFIXES]
    + ["n_test_used", "n_fut_used"]
)


def _round3(value):
    """Return ``value`` rounded to three decimals, or None when it is missing."""
    return None if pd.isna(value) else float(f"{value:.3f}")


def _effect_columns(prefix, stats):
    """Map an (estimate, low, high, p) tuple onto its four summary column names."""
    return {prefix + suffix: value for suffix, value in zip(_EFFECT_SUFFIXES, stats)}


def _exp_effect(coef, ci_row, p_value):
    """Exponentiate a coefficient and its CI row; the p-value goes through fmt_p."""
    # .iloc: the CI row of lifelines is labelled with strings, so integer keys
    # must be positional (first entry lower bound, second entry upper bound).
    return (
        float(np.exp(coef)),
        float(np.exp(ci_row.iloc[0])),
        float(np.exp(ci_row.iloc[1])),
        fmt_p(p_value),
    )


def _pred10_or(pred10, label, idx, covars=None, covar_mask=None):
    """Fit a logistic model of ``label`` on ``pred10`` (plus optional covariates).

    ``idx`` holds row positions into ``covars`` / ``covar_mask``. Rows with a
    non-finite prediction or label, or with incomplete covariates, are
    excluded. Returns (OR, lower, upper, p) for ``pred10``; all NaN when no
    usable row remains.
    """
    usable = np.isfinite(pred10) & np.isfinite(label)
    if covar_mask is not None:
        usable &= covar_mask.to_numpy()[idx]
    if not usable.any():
        return _NO_ESTIMATE

    if covars is None:
        design = pd.DataFrame({"pred10": pred10[usable]})
    else:
        design = covars.iloc[idx[usable]].reset_index(drop=True)
        design.insert(0, "pred10", pred10[usable])
    design = sm.add_constant(design)
    outcome = pd.Series(label[usable], name="label_det")

    fit = sm.Logit(outcome, design).fit(disp=False)
    return _exp_effect(fit.params["pred10"], fit.conf_int().loc["pred10"], fit.pvalues["pred10"])


def _pred10_hr(pred10, obs_time, event, idx, covars=None, covar_mask=None):
    """Fit a Cox model on ``pred10`` (plus optional covariates).

    ``idx`` holds row positions into ``obs_time``, ``event``, ``covars`` and
    ``covar_mask``. Rows with non-finite prediction, time or event, or with
    incomplete covariates, are excluded. Returns (HR, lower, upper, p) for
    ``pred10``; all NaN when no usable row remains.
    """
    times = obs_time[idx]
    events = event[idx]
    usable = np.isfinite(pred10) & np.isfinite(times) & np.isfinite(events)
    if covar_mask is not None:
        usable &= covar_mask.to_numpy()[idx]
    if not usable.any():
        return _NO_ESTIMATE

    data = pd.DataFrame({
        "obs_time": times[usable],
        "event":    events[usable],
        "pred10":   pred10[usable],
    })
    if covars is not None:
        data = pd.concat([data, covars.iloc[idx[usable]].reset_index(drop=True)], axis=1)

    cph = CoxPHFitter()
    cph.fit(data, duration_col="obs_time", event_col="event", show_progress=False)
    return _exp_effect(
        cph.params_["pred10"],
        cph.confidence_intervals_.loc["pred10"],
        cph.summary.loc["pred10", "p"],
    )


# ------------------
# One sub-directory of INFER_DIR per model; only those with test predictions are used.
model_dirs = sorted([p for p in Path(INFER_DIR).glob("*") if p.is_dir() and (p/"test_preds.csv").exists()])

rows = []
for mdir in model_dirs:
    desc = mdir.name

    test_csv = mdir / "test_preds.csv"
    try:
        tp = pd.read_csv(
            test_csv,
            usecols=["idx","label","pred"],
            dtype={"idx": np.int64, "label": np.float64, "pred": np.float64}
        )
    except Exception as e:
        print(f"[WARN] skip {desc} (bad test_preds.csv): {e}")
        continue

    # `idx` is used as a positional index into det_test.
    idx_t = tp["idx"].to_numpy()
    label_det = tp["label"].to_numpy(dtype=float)
    pred_t    = tp["pred"].to_numpy(dtype=float)
    pred10_t  = pred_t * 10.0   # one unit of pred10 equals 0.1 of `pred`

    # AUROC (model)
    auc_model = safe_auc(label_det, pred_t)

    # CAIDE (test)
    mask_caide_t = det_CAIDE_mask.to_numpy()[idx_t]
    auc_caide = safe_auc(label_det[mask_caide_t], det_CAIDE.to_numpy()[idx_t][mask_caide_t])

    # ---- OR set (Test): unadjusted, age/sex-adjusted, fully adjusted
    or_stats = {}
    for prefix, what, covars, cov_mask in OR_SPECS:
        try:
            or_stats[prefix] = _pred10_or(pred10_t, label_det, idx_t, covars, cov_mask)
        except Exception as e:
            print(f"[WARN] {what} failed @ {desc}: {e}")
            or_stats[prefix] = _NO_ESTIMATE

    # Survival-side results default to "not available" for model folders
    # without a prediction_preds.csv.
    cidx_model = cidx_caide = np.nan
    hr_stats = {prefix: _NO_ESTIMATE for prefix, *_ in HR_SPECS}
    n_fut_used = 0

    fut_csv = mdir / "prediction_preds.csv"
    if fut_csv.exists():
        try:
            fp = pd.read_csv(
                fut_csv,
                usecols=["idx","pred"],
                dtype={"idx": np.int64, "pred": np.float64}
            )
        except Exception as e:
            print(f"[WARN] skip {desc} (bad prediction_preds.csv): {e}")
            continue

        # NOTE(review): `idx` is applied positionally to surv_all *after*
        # recompute_survival_frame dropped rows without resetting the index --
        # confirm that the inference code numbered rows the same way.
        idx_f = fp["idx"].to_numpy()
        pred_f = fp["pred"].to_numpy(dtype=float)
        pred10_f = pred_f * 10.0
        n_fut_used = int(len(idx_f))

        # C-index (model)
        cidx_model = cindex_from_risk(
            fut_obs_time[idx_f],
            pred_f,
            fut_event[idx_f]
        )

        # CAIDE (future)
        mask_caide_f = surv_CAIDE_mask.to_numpy()[idx_f]
        cidx_caide = cindex_from_risk(
            fut_obs_time[idx_f][mask_caide_f],
            surv_CAIDE.to_numpy()[idx_f][mask_caide_f],
            fut_event[idx_f][mask_caide_f]
        )

        # ---- HR set (Future): unadjusted, age/sex-adjusted, fully adjusted
        for prefix, what, covars, cov_mask in HR_SPECS:
            try:
                hr_stats[prefix] = _pred10_hr(pred10_f, fut_obs_time, fut_event, idx_f, covars, cov_mask)
            except Exception as e:
                print(f"[WARN] {what} failed @ {desc}: {e}")

    # Single row layout for models with and without survival predictions.
    row = {
        "model_desc": desc,
        "Test_AUC_MODEL": _round3(auc_model),
        "Test_AUC_CAIDE": _round3(auc_caide),
    }
    for prefix, *_ in OR_SPECS:
        row.update(_effect_columns(prefix, or_stats[prefix]))
    row["Future_cindex_MODEL"] = _round3(cidx_model)
    row["Future_cindex_CAIDE"] = _round3(cidx_caide)
    for prefix, *_ in HR_SPECS:
        row.update(_effect_columns(prefix, hr_stats[prefix]))
    row["n_test_used"] = int(len(idx_t))
    row["n_fut_used"] = n_fut_used
    rows.append(row)

summary = pd.DataFrame(rows, columns=SUMMARY_COLUMNS).sort_values("model_desc").reset_index(drop=True)

# Estimates and bounds are written as fixed 3-decimal strings ("" when missing).
# The p-value columns were already passed through fmt_p when they were computed.
for prefix, *_ in OR_SPECS + HR_SPECS:
    for suffix in ("", "_low", "_hi"):
        col = prefix + suffix
        summary[col] = summary[col].apply(lambda v: "" if pd.isna(v) else f"{float(v):.3f}")

summary.to_csv(OUT_SUMMARY, index=False, encoding="utf-8")
print(f"[Saved] {OUT_SUMMARY}")

summary


In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

# Extended summary CSV (input) and the formatted table to write (output).
INFER_DIR = Path("/home/hch/dementia/infer_out")
IN_PATH = INFER_DIR / "summary_metrics_pretty_extened.csv"
OUT_PATH = INFER_DIR / "summary_metrics_formatted.csv"

df = pd.read_csv(IN_PATH)

def combine_ci(est, low, hi):
    """Combine an estimate and its bounds into the form ``1.100 (1.050 - 1.150)``.

    Returns ``""`` when any of the three values is missing or cannot be
    converted to float.
    """
    parts = (est, low, hi)
    if any(pd.isna(part) for part in parts):
        return ""
    try:
        numbers = [float(part) for part in parts]
    except Exception:
        return ""
    return "{:.3f} ({:.3f} - {:.3f})".format(*numbers)

# Effect-size column prefixes, in the order they appear in the output table.
_OR_PREFIXES = ("OR_unadj", "OR_age_sex", "OR_adj")
_HR_PREFIXES = ("HR_unadj", "HR_age_sex", "HR_adj")

# "<prefix>_fmt" holds "estimate (low - high)" built from the three numeric columns.
for _prefix in _OR_PREFIXES + _HR_PREFIXES:
    df[f"{_prefix}_fmt"] = df.apply(
        lambda r, p=_prefix: combine_ci(r[p], r[f"{p}_low"], r[f"{p}_hi"]),
        axis=1,
    )


def _fmt_and_p(prefixes):
    """Return the formatted-estimate and p-value column names for each prefix."""
    return [name for p in prefixes for name in (f"{p}_fmt", f"{p}_p")]


cols_show = (
    ["model_desc", "Test_AUC_MODEL", "Test_AUC_CAIDE"]
    + _fmt_and_p(_OR_PREFIXES)
    + ["Future_cindex_MODEL", "Future_cindex_CAIDE"]
    + _fmt_and_p(_HR_PREFIXES)
)

# Any display column absent from the input is added as empty text.
for _missing in [c for c in cols_show if c not in df.columns]:
    df[_missing] = ""

df_out = df[cols_show].copy()

df_out.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")
print(f"[Saved] {OUT_PATH}")
df_out.head()
