In [None]:
from pathlib import Path
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Seed passed as random_state to the negative sampling and to both StratifiedShuffleSplit calls below.
RS = 373777

In [None]:
# CSV file that is read into the master table `df`.
path = '20250518_master4_merged2.csv'
# Directory listed for .png files; it is also string-prepended to every pngfilename,
# so the trailing slash is required.
# NOTE(review): machine-specific absolute path - consider making it configurable.
IMG_DIR = "/home/hch/opportunistic/AutoMorph_Data/Results/M0/images/"

In [None]:
# Load the master table; low_memory=False parses the file in one pass instead of in chunks,
# which avoids mixed-dtype columns caused by chunk-wise type inference.
df = pd.read_csv(path, low_memory=False)

In [None]:
def sanitize(fn):
    """Return ``fn`` with surrounding whitespace removed and dots in the stem turned into underscores.

    The name is split with ``os.path.splitext``; the extension part is kept
    unchanged and every "." in the remaining part becomes "_".
    """
    stem, extension = os.path.splitext(fn.strip())
    return "_".join(stem.split(".")) + extension

# ---------- CAIDE score computation (APOE not included) ----------
def calc_caide_napoe(frame: pd.DataFrame) -> pd.Series:
    """Compute a CAIDE score without the APOE component for every row of ``frame``.

    Columns read: STDY_AGE, SEXINT, sbp, bmi, cholesterol_updated and
    (optionally) EXERCISE_STATUS. Each one is coerced to numeric, with
    unparseable values becoming NaN; a column that is absent from ``frame``
    is treated as all-NaN. ``frame`` itself is not modified.

    Returns:
        A float Series named "CAIDE_noAPOE" carrying ``frame.index``. The
        score is NaN where STDY_AGE is missing. A missing value in any other
        input adds 0 points, except EXERCISE_STATUS, where a missing value
        counts as inactive (+1).
    """
    def as_numeric(col):
        # A column that does not exist behaves like a column full of NaN.
        if col not in frame.columns:
            return pd.Series(np.nan, index=frame.index)
        return pd.to_numeric(frame[col], errors="coerce")

    def threshold_points(values, cutoff, points):
        # `points` where values >= cutoff, otherwise 0 (NaN compares False -> 0).
        return np.where(values >= cutoff, points, 0)

    age = as_numeric("STDY_AGE")
    sex = as_numeric("SEXINT")
    systolic = as_numeric("sbp")
    body_mass = as_numeric("bmi")
    cholesterol_mgdl = as_numeric("cholesterol_updated")
    # Missing exercise status is handled as 0, i.e. inactive.
    exercise = as_numeric("EXERCISE_STATUS").fillna(0)

    components = (
        # Age: <47 -> 0, 47..53 -> 3, >53 -> 4, missing -> NaN.
        np.select([age < 47, age.between(47, 53), age > 53], [0, 3, 4], default=np.nan),
        # Sex: SEXINT == 1 (male) -> 1 point.
        np.where(sex == 1, 1, 0),
        # Education: everyone is assumed to have 10+ years -> 0 points.
        np.zeros(len(frame), dtype=float),
        # Systolic blood pressure >= 140 -> 2 points.
        threshold_points(systolic, 140, 2),
        # BMI >= 30 -> 2 points.
        threshold_points(body_mass, 30, 2),
        # Total cholesterol: mg/dL converted to mmol/L, then >= 6.5 -> 2 points.
        threshold_points(cholesterol_mgdl * 0.02586, 6.5, 2),
        # Physical activity: EXERCISE_STATUS >= 1 -> active (0), otherwise inactive (1).
        # The meaning of the codes 0 / 1 / 2 still needs to be confirmed; for now any
        # value of 1 or more is treated as physically active.
        np.where(exercise >= 1, 0, 1),
    )

    return pd.Series(sum(components), index=frame.index, name="CAIDE_noAPOE")

# ---------- Shared: mask of rows with complete CAIDE inputs ----------
def get_caide_valid_mask(frame: pd.DataFrame) -> pd.Series:
    """Return a boolean Series that is True where all required CAIDE inputs are present.

    Required columns: STDY_AGE, SEXINT, sbp, bmi, cholesterol_updated.
    EXERCISE_STATUS is not checked, because a missing value there is scored
    as inactive rather than treated as invalid.
    """
    required = ["STDY_AGE", "SEXINT", "sbp", "bmi", "cholesterol_updated"]
    has_gap = frame[required].isna().any(axis=1)
    return ~has_gap

In [None]:
# Normalise the file names and turn them into paths under IMG_DIR.
# An IMG_DIR prefix that is already present is dropped first, so executing this cell a
# second time does not stack the directory twice (sanitize leaves an already-sanitised
# name unchanged).
png_names = df["pngfilename"].apply(
    lambda p: p[len(IMG_DIR):] if p.startswith(IMG_DIR) else p
).apply(sanitize)
df["pngfilename"] = IMG_DIR + png_names
# PNG files actually present in IMG_DIR (extension matched case-insensitively).
valid_png = {f for f in os.listdir(IMG_DIR) if f.lower().endswith(".png")}
# Keep only the rows whose image exists on disk; the index is renumbered from 0.
df = df[df["pngfilename"].apply(os.path.basename).isin(valid_png)].reset_index(drop=True)

## Detection dataset

In [None]:
# Parse the study date column into datetimes.
df["STDY_DT"] = pd.to_datetime(df["STDY_DT"])
# 1 where SEX is "M", otherwise 0.
df["SEXINT"] = df["SEX"].eq("M").astype(int)
# 1 where days_diff is at most 730; a missing days_diff compares False and gives 0.
df["label"] = df["days_diff"].le(730).astype(int)

In [None]:
# Positive rows: label == 1.
pos = df.loc[df["label"] == 1]
# Candidate negatives: rows without a days_diff whose patient has no positive row.
neg_pool = df.loc[df["days_diff"].isna() & ~df["PAT_ID"].isin(pos["PAT_ID"])]
# Draw four negative rows per positive row, reproducibly.
neg = neg_pool.sample(n=4 * len(pos), random_state=RS)
# Detection set: positives followed by the sampled negatives, with a fresh 0..n-1 index.
df_bin = pd.concat([pos, neg], ignore_index=True)

In [None]:
# CAIDE score (without APOE) for every row of the detection set.
df_bin['CAIDE_noAPOE'] = calc_caide_napoe(df_bin)
# True where all inputs required by get_caide_valid_mask are present.
df_bin['CAIDE_noAPOE_valid'] = get_caide_valid_mask(df_bin)

In [None]:
# One row per patient; a patient counts as positive if any of their rows is positive.
g = df_bin.groupby("PAT_ID", as_index=False)["label"].max()
# First split: 60 % of patients for training, 40 % held out, stratified on the patient label.
tr_i, tmp_i = next(
    StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=RS).split(g["PAT_ID"], g["label"])
)
tr_ids = g["PAT_ID"].iloc[tr_i]
tmp = g.iloc[tmp_i].reset_index(drop=True)
# Second split: the held-out patients are halved into validation and test, again stratified.
va_i, te_i = next(
    StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=RS).split(tmp["PAT_ID"], tmp["label"])
)

In [None]:
# Split-membership flags; every row starts as False and is switched on from the patient-level split.
df_bin['train'] = False
df_bin['valid'] = False
df_bin['test'] = False

In [None]:
# Patient ids of the validation and test halves of the held-out patients.
va_ids = tmp["PAT_ID"].iloc[va_i]
te_ids = tmp["PAT_ID"].iloc[te_i]
# Flag every row of a patient according to the split that patient was assigned to.
df_bin.loc[df_bin["PAT_ID"].isin(tr_ids), "train"] = True
df_bin.loc[df_bin["PAT_ID"].isin(va_ids), "valid"] = True
df_bin.loc[df_bin["PAT_ID"].isin(te_ids), "test"] = True

In [None]:
# Row subsets per split, selected through the boolean flag columns.
train_df = df_bin[df_bin["train"]]
valid_df = df_bin[df_bin["valid"]]
test_df = df_bin[df_bin["test"]]

In [None]:
def pct(d):
    """Percentage of rows in ``d`` whose label is 1."""
    return 100*d.label.mean()

# Row count and positive rate of each split on a single line.
print(" | ".join([
    f"TRAIN {len(train_df)} (pos {pct(train_df):.2f}%)",
    f"VAL {len(valid_df)} (pos {pct(valid_df):.2f}%)",
    f"TEST {len(test_df)} (pos {pct(test_df):.2f}%)",
]))

In [None]:
# Write the detection set (including the CAIDE columns and split flags) to CSV without the index column.
df_bin.to_csv('dementia_detection.csv', index=False)

## Future dataset

In [None]:
# Every patient id that appears in the train, validation or test split of the detection set.
used = set().union(*(part["PAT_ID"] for part in (train_df, valid_df, test_df)))

In [None]:
# Rows of patients that are not part of the detection set; .copy() gives an independent
# frame so the column assignments below do not act on a slice of df.
fut = df[~df.PAT_ID.isin(used)].copy()

In [None]:
# Display the latest and earliest study date in fut.
fut.STDY_DT.max(), fut.STDY_DT.min()

In [None]:
# Event indicator: 1 where days_diff is greater than 730, otherwise 0
# (a missing days_diff compares False and therefore gives 0).
fut["event"] = (fut["days_diff"] > 730).astype(int)
# Fixed reference date used to measure the observation time of rows without an event.
# NOTE(review): presumably the end of data collection / censoring date - confirm.
ref = pd.Timestamp("2017-01-01")
# Observation time in days: days_diff minus 730 for event rows,
# otherwise the number of days from STDY_DT to ref.
# NOTE(review): a STDY_DT later than ref gives a negative obs_time - confirm this cannot occur.
fut["obs_time"] = np.where(fut.event==1, 
                           fut["days_diff"] - 730,
                           (ref - fut["STDY_DT"]).dt.days)
# NOTE(review): this cast is expected to fail if any obs_time is NaN
# (e.g. a missing STDY_DT on a non-event row) - confirm there are none.
fut['obs_time'] = fut['obs_time'].astype('int32')
# Rows with obs_time above 3650 days are marked as non-events; obs_time itself is left unchanged.
# NOTE(review): if a fixed 3650-day horizon is intended, obs_time may also need capping at 3650 - confirm.
fut.loc[fut.obs_time > 3650, 'event'] = 0

In [None]:
# CAIDE score (without APOE) for every row of the future-prediction set.
fut['CAIDE_noAPOE'] = calc_caide_napoe(fut)
# True where all inputs required by get_caide_valid_mask are present.
fut['CAIDE_noAPOE_valid'] = get_caide_valid_mask(fut)

In [None]:
# Write the future-prediction set (with event, obs_time and CAIDE columns) to CSV without the index column.
fut.to_csv('dementia_prediction.csv', index=False)