age.csv

In [None]:
import pandas as pd
from pathlib import Path

ROOT = Path("./data")
ADMIT_CSV = ROOT / "hosp" / "admissions.csv"
PAT_CSV = ROOT / "hosp" / "patients.csv"
OUT_CSV = "age.csv"

# Load admissions (one row per hadm_id) and patients (one row per subject_id).
adm = pd.read_csv(
    ADMIT_CSV,
    usecols=["subject_id", "hadm_id", "admittime"],
    parse_dates=["admittime"],
    dtype={"subject_id": "int32", "hadm_id": "int32"},
)

pat = pd.read_csv(
    PAT_CSV,
    usecols=["subject_id", "anchor_age", "anchor_year"],
    dtype={"subject_id": "int32", "anchor_age": "int16", "anchor_year": "int16"},
)

# Merge and compute age at admission: anchor_age + (year(admittime) - anchor_year).
# validate="many_to_one" fails loudly if patients.csv repeats a subject_id, which
# would otherwise silently duplicate admission rows in the output.
df = adm.merge(pat, on="subject_id", how="left", validate="many_to_one")
age_years = df["anchor_age"] + (df["admittime"].dt.year - df["anchor_year"])

# The left join keeps admissions without a matching patient (and admittime may be
# missing), so the age can be NaN. A plain "int16" cast raises on NaN; the nullable
# "Int16" dtype keeps those rows and writes them as empty cells in the CSV.
df["age"] = age_years.astype("Int16")

n_missing = int(df["age"].isna().sum())
if n_missing:
    print(f"Warning – {n_missing:,} admissions have no computable age")

# Save output
cols = ["subject_id", "hadm_id", "age"]
df[cols].to_csv(OUT_CSV, index=False)
print(f"Done – {len(df):,} rows → {OUT_CSV}")


Done – 431,088 rows → age.csv


charlson.csv

In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

ROOT      = Path("./data")
DIAG_CSV  = ROOT / "hosp" / "diagnoses_icd.csv"
ADM_CSV   = ROOT / "hosp" / "admissions.csv"
AGE_CSV   = ROOT / "derived" / "age.csv"
OUT_CSV   = "charlson.csv"

# The age cell writes "age.csv" into the working directory, not into data/derived.
# Prefer the derived copy when it exists (unchanged behaviour), otherwise fall back
# to the working-directory file so a fresh Restart & Run All does not fail here.
if not AGE_CSV.exists() and Path("age.csv").exists():
    AGE_CSV = Path("age.csv")

# Load tables
diag = pd.read_csv(
    DIAG_CSV,
    usecols=["hadm_id", "icd_code", "icd_version"],
    dtype={"hadm_id": "int32", "icd_code": "string", "icd_version": "int8"},
)
adm = pd.read_csv(
    ADM_CSV,
    usecols=["subject_id", "hadm_id"],
    dtype={"subject_id": "int32", "hadm_id": "int32"},
)
# age is read as float32 so that empty cells (missing ages) load as NaN.
age = pd.read_csv(
    AGE_CSV,
    usecols=["hadm_id", "age"],
    dtype={"hadm_id": "int32", "age": "float32"},
)

# Split the code column by ICD version: each row keeps its code in exactly one of
# icd9 / icd10 and is missing (<NA>) in the other, so ICD-9 patterns are never
# matched against ICD-10 codes and vice versa.
diag["icd9"]  = diag["icd_code"].where(diag["icd_version"] == 9)
diag["icd10"] = diag["icd_code"].where(diag["icd_version"] == 10)

# Helpers
def starts(col, *prefixes):
    """Flag the entries of ``col`` that begin with any of ``prefixes``.

    ``col`` is a string Series; ``prefixes`` are one or more literal prefixes.
    The tuple of prefixes is handed to ``Series.str.startswith`` in a single call.
    """
    string_ops = col.str
    return string_ops.startswith(prefixes)

def between(col, lo, hi):
    """Flag the entries of ``col`` that lie in the closed interval ``[lo, hi]``.

    Both bounds are inclusive. For string columns the comparison is
    lexicographic, which is how the ICD code ranges in this notebook are tested.
    """
    at_least_lo = col.ge(lo)
    at_most_hi = col.le(hi)
    return at_least_lo & at_most_hi

# Compute 17 Charlson comorbidities by admission
def agg_bool(mask):
    """Collapse a boolean mask to a single 0/1 value: 1 if any entry is True.

    The mask is cast to int8 and reduced with ``max``.
    """
    as_int = mask.astype("int8")
    return as_int.max()

# --- Per-diagnosis comorbidity flags -------------------------------------------
# Each flag is computed once over the whole diagnoses table and then reduced per
# admission with groupby().max(). This gives the same result as evaluating every
# pattern inside a per-admission groupby.apply, but without re-running the string
# operations for each admission.
icd9 = diag["icd9"].fillna("")     # "" never matches any prefix, list or range below
icd10 = diag["icd10"].fillna("")

# Truncated codes shared by the set-membership and range tests.
icd9_3, icd9_4, icd9_5 = icd9.str[:3], icd9.str[:4], icd9.str[:5]
icd10_3, icd10_4 = icd10.str[:3], icd10.str[:4]

flags = pd.DataFrame({
    'myocardial_infarct': (
        starts(icd9, '410', '412')
        | starts(icd10, 'I21', 'I22')
        | starts(icd10, 'I252')
    ),
    'congestive_heart_failure': (
        starts(icd9, '428')
        | icd9_5.isin(['39891', '40201', '40211', '40291', '40401', '40403',
                       '40411', '40413', '40491', '40493'])
        | between(icd9_4, '4254', '4259')
        | starts(icd10, 'I43', 'I50')
        | icd10_4.isin(['I099', 'I110', 'I130', 'I132', 'I255', 'I420',
                        'I425', 'I426', 'I427', 'I428', 'I429', 'P290'])
    ),
    'peripheral_vascular_disease': (
        starts(icd9, '440', '441')
        | icd9_4.isin(['0930', '4373', '4471', '5571', '5579', 'V434'])
        | between(icd9_4, '4431', '4439')
        | starts(icd10, 'I70', 'I71')
        | icd10_4.isin(['I731', 'I738', 'I739', 'I771', 'I790',
                        'I792', 'K551', 'K558', 'K559', 'Z958', 'Z959'])
    ),
    'cerebrovascular_disease': (
        between(icd9_3, '430', '438')
        | starts(icd9, '36234')
        | starts(icd10, 'G45', 'G46')
        | between(icd10_3, 'I60', 'I69')
        | starts(icd10, 'H340')
    ),
    'dementia': (
        starts(icd9, '290')
        | icd9_4.isin(['2941', '3312'])
        | starts(icd10, 'F00', 'F01', 'F02', 'F03', 'G30')
        | icd10_4.isin(['F051', 'G311'])
    ),
    'chronic_pulmonary_disease': (
        between(icd9_3, '490', '505')
        | icd9_4.isin(['4168', '4169', '5064', '5081', '5088'])
        | between(icd10_3, 'J40', 'J47')
        | between(icd10_3, 'J60', 'J67')
        | icd10_4.isin(['I278', 'I279', 'J684', 'J701', 'J703'])
    ),
    'rheumatic_disease': (
        starts(icd9, '725')
        | icd9_4.isin(['4465', '7100', '7101', '7102', '7103',
                       '7104', '7140', '7141', '7142', '7148'])
        | starts(icd10, 'M05', 'M06', 'M32', 'M33', 'M34')
        | icd10_4.isin(['M315', 'M351', 'M353', 'M360'])
    ),
    'peptic_ulcer_disease': (
        starts(icd9, '531', '532', '533', '534')
        | starts(icd10, 'K25', 'K26', 'K27', 'K28')
    ),
    'mild_liver_disease': (
        starts(icd9, '570', '571')
        | icd9_4.isin(['0706', '0709', '5733', '5734', '5738', '5739', 'V427'])
        | icd9_5.isin(['07022', '07023', '07032', '07033', '07044', '07054'])
        | starts(icd10, 'B18', 'K73', 'K74')
        | icd10_4.isin(['K700', 'K701', 'K702', 'K703', 'K709', 'K713', 'K714',
                        'K715', 'K717', 'K760', 'K762', 'K763', 'K764',
                        'K768', 'K769', 'Z944'])
    ),
    'diabetes_without_cc': (
        icd9_4.isin(['2500', '2501', '2502', '2503', '2508', '2509'])
        # 'E101' was previously misspelled 'E10l' (lowercase L), so E10.1 codes
        # were never flagged.
        | icd10_4.isin(['E100', 'E101', 'E106', 'E108', 'E109', 'E110', 'E111',
                        'E116', 'E118', 'E119', 'E120', 'E121', 'E126', 'E128',
                        'E129', 'E130', 'E131', 'E136', 'E138', 'E139', 'E140',
                        'E141', 'E146', 'E148', 'E149'])
    ),
    'diabetes_with_cc': (
        icd9_4.isin(['2504', '2505', '2506', '2507'])
        | icd10_4.isin(['E102', 'E103', 'E104', 'E105', 'E107', 'E112', 'E113',
                        'E114', 'E115', 'E117', 'E122', 'E123', 'E124', 'E125',
                        'E127', 'E132', 'E133', 'E134', 'E135', 'E137', 'E142',
                        'E143', 'E144', 'E145', 'E147'])
    ),
    'paraplegia': (
        starts(icd9, '342', '343')
        | icd9_4.isin(['3341', '3440', '3441', '3442', '3443', '3444',
                       '3445', '3446', '3449'])
        | starts(icd10, 'G81', 'G82')
        | icd10_4.isin(['G041', 'G114', 'G801', 'G802', 'G830', 'G831', 'G832',
                        'G833', 'G834', 'G839'])
    ),
    'renal_disease': (
        starts(icd9, '582', '585', '586', 'V56')
        | icd9_4.isin(['5880', 'V420', 'V451'])
        | between(icd9_4, '5830', '5837')
        | icd9_5.isin(['40301', '40311', '40391', '40402', '40403', '40412',
                       '40413', '40492', '40493'])
        | starts(icd10, 'N18', 'N19')
        | icd10_4.isin(['I120', 'I131', 'N032', 'N033', 'N034', 'N035', 'N036',
                        'N037', 'N052', 'N053', 'N054', 'N055', 'N056', 'N057',
                        'N250', 'Z490', 'Z491', 'Z492', 'Z940', 'Z992'])
    ),
    'malignant_cancer': (
        between(icd9_3, '140', '172')
        | between(icd9_4, '1740', '1958')
        | between(icd9_3, '200', '208')
        | starts(icd9, '2386')
        | starts(icd10, 'C43', 'C88')
        | between(icd10_3, 'C00', 'C26')
        | between(icd10_3, 'C30', 'C34')
        | between(icd10_3, 'C37', 'C41')
        | between(icd10_3, 'C45', 'C58')
        | between(icd10_3, 'C60', 'C76')
        | between(icd10_3, 'C81', 'C85')
        | between(icd10_3, 'C90', 'C97')
    ),
    'severe_liver_disease': (
        icd9_4.isin(['4560', '4561', '4562'])
        | between(icd9_4, '5722', '5728')
        | icd10_4.isin(['I850', 'I859', 'I864', 'I982', 'K704', 'K711',
                        'K721', 'K729', 'K765', 'K766', 'K767'])
    ),
    'metastatic_solid_tumor': (
        starts(icd9, '196', '197', '198', '199')
        | starts(icd10, 'C77', 'C78', 'C79', 'C80')
    ),
    'aids': (
        starts(icd9, '042', '043', '044')
        | starts(icd10, 'B20', 'B21', 'B22', 'B24')
    ),
})

# An admission has a comorbidity if any of its diagnoses is flagged: max of 0/1.
# Result: one row per hadm_id present in diag, "hadm_id" column first.
com = (
    flags.astype("int8")
         .groupby(diag["hadm_id"])
         .max()
         .reset_index()
)

# Compute age score: <=40 -> 0, 41-50 -> 1, 51-60 -> 2, 61-70 -> 3, >70 -> 4.
# Kept as float here so a missing age stays NaN instead of raising on an integer
# cast; it is filled with 0 and cast to int8 after the merge below.
age["age_score"] = pd.cut(
    age["age"],
    bins=[-1, 40, 50, 60, 70, 999],
    labels=[0, 1, 2, 3, 4],
    right=True
).astype("float32")

# Assemble Charlson table: one row per admission; admissions without any
# diagnosis rows (or without an age row) get 0 for the corresponding columns.
charlson = (
    adm.merge(com, on="hadm_id", how="left")
       .merge(age[["hadm_id", "age_score"]], on="hadm_id", how="left")
)
ind_cols = com.columns.drop("hadm_id")
charlson[ind_cols] = charlson[ind_cols].fillna(0).astype("int8")
charlson["age_score"] = charlson["age_score"].fillna(0).astype("int8")

# Calculate Charlson comorbidity index.
# Within each paired condition (liver, diabetes, cancer) only the more severe
# form is counted, hence the element-wise maximum of the two weighted flags.
charlson["charlson_comorbidity_index"] = (
    charlson["age_score"]
  + charlson["myocardial_infarct"]
  + charlson["congestive_heart_failure"]
  + charlson["peripheral_vascular_disease"]
  + charlson["cerebrovascular_disease"]
  + charlson["dementia"]
  + charlson["chronic_pulmonary_disease"]
  + charlson["rheumatic_disease"]
  + charlson["peptic_ulcer_disease"]
  + np.maximum(charlson["mild_liver_disease"], 3*charlson["severe_liver_disease"])
  + np.maximum(2*charlson["diabetes_with_cc"], charlson["diabetes_without_cc"])
  + np.maximum(2*charlson["malignant_cancer"], 6*charlson["metastatic_solid_tumor"])
  + 2*charlson["paraplegia"]
  + 2*charlson["renal_disease"]
  + 6*charlson["aids"]
).astype("int16")

# Save output
charlson.to_csv(OUT_CSV, index=False)
print(f"Done – {len(charlson):,} rows → {OUT_CSV}")


Done – 431,088 rows → charlson.csv
