In [85]:
import pandas as pd

In [87]:

# Load both datasets
diag_events = pd.read_csv("diagnoses_icd.csv")        # columns: subject_id, hadm_id, seq_num, icd_code, icd_version
icd_dict = pd.read_csv("d_icd_diagnoses.csv")         # columns: icd_code, icd_version, long_title



In [88]:

# Normalize codes to handle leading zeros (so '135' == '0135')
diag_events['icd_code_norm'] = diag_events['icd_code'].astype(str).str.lstrip('0')
icd_dict['icd_code_norm'] = icd_dict['icd_code'].astype(str).str.lstrip('0')



In [91]:
# Keep only dictionary rows where icd_code appears in diagnoses_icd.csv
subset_icd = icd_dict[icd_dict['icd_code_norm'].isin(diag_events['icd_code_norm'])]

# Optional: drop the helper column
subset_icd = subset_icd.drop(columns=['icd_code_norm'])

# Save the subset for Neo4j import
subset_icd.to_csv("subset_d_icd_diagnoses.csv", index=False)
print(f"{len(subset_icd)} ICD codes retained")

29049 ICD codes retained


In [93]:
diag_events['icd_code'].nunique()


28562

In [95]:
# Normalize codes
diag_events['icd_code_norm'] = diag_events['icd_code'].astype(str).str.strip().str.lstrip('0')
icd_dict['icd_code_norm'] = icd_dict['icd_code'].astype(str).str.strip().str.lstrip('0')

# Which codes in diagnoses aren't in dictionary
missing_codes = set(diag_events['icd_code_norm']) - set(icd_dict['icd_code_norm'])
print(len(missing_codes), "codes missing from dictionary")
print(list(missing_codes)[:50])  # show first 50


0 codes missing from dictionary
[]


In [97]:
# Before normalization
n_raw = diag_events['icd_code'].nunique()

# After normalization
n_norm = diag_events['icd_code'].astype(str).str.strip().str.lstrip('0').nunique()

print(f"Raw unique ICD codes: {n_raw}")
print(f"Normalized unique ICD codes: {n_norm}")


Raw unique ICD codes: 28562
Normalized unique ICD codes: 28482


In [99]:
import pandas as pd
df = pd.read_csv("diagnoses_icd.csv")
df[df["icd_code"] == "V1302"]


Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
3013,10003502,21671572,15,V1302,9
5504,10007174,20280072,29,V1302,9
12966,10019517,27896418,15,V1302,9
24455,10037928,23721604,21,V1302,9
24593,10037928,29802992,26,V1302,9
...,...,...,...,...,...
6349448,19978119,24233127,22,V1302,9
6351706,19981610,20359638,4,V1302,9
6355831,19987152,21229906,9,V1302,9
6358532,19991805,23646288,25,V1302,9


In [3]:
diag

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
0,10000032,22595853,1,5723,9
1,10000032,22595853,2,78959,9
2,10000032,22595853,3,5715,9
3,10000032,22595853,4,07070,9
4,10000032,22595853,5,496,9
...,...,...,...,...,...
6364483,19999987,23865745,7,41401,9
6364484,19999987,23865745,8,78039,9
6364485,19999987,23865745,9,0413,9
6364486,19999987,23865745,10,36846,9


In [48]:

patients = pd.read_csv("patiens.csv")
admissions = pd.read_csv("admissions.csv")

patients_keep = ["subject_id", "gender", "anchor_age"]

admissions_keep_requested = [
    "subject_id",
    "hadm_id",
    "admittime",
    "dischtime",
    "deathtime",
    "admission_type",
    "admit_provider_id",
    "admission_location",
    "discharge_location",
    "edregtime",
    "edouttime",
    "hospital_expire_flag",
    "insurance",
    "language",
    "marital_status",
    "race",
]

admissions_keep = [c for c in admissions_keep_requested if c in admissions.columns]

patients = patients[patients_keep]
admissions = admissions[admissions_keep]

merged = admissions.merge(patients, on="subject_id", how="left")

merged.to_csv("merged_admissions_patients.csv", index=False)

patient_cols_requested = [
    "subject_id",
    "gender",
    "anchor_age",
    "insurance",
    "language",
    "marital_status",
    "race",
    "hospital_expire_flag",
]

patient_cols = [c for c in patient_cols_requested if c in merged.columns]

sort_cols = [c for c in ["subject_id", "admittime"] if c in merged.columns]
if sort_cols:
    merged = merged.sort_values(sort_cols)

patients_clean = (
    merged[patient_cols]
    .drop_duplicates(subset="subject_id", keep="first")
    .copy()
)

rename_map = {"subject_id": "subject_id:ID(Patient)"}

if "anchor_age" in patients_clean.columns:
    rename_map["anchor_age"] = "anchor_age:int"

if "hospital_expire_flag" in patients_clean.columns:
    rename_map["hospital_expire_flag"] = "hospital_expire_flag:int"

patients_clean = patients_clean.rename(columns=rename_map)

patients_clean.to_csv("patients_clean.csv", index=False)

print("Created patients_clean.csv with", len(patients_clean), "patients")

Created patients_clean.csv with 223452 patients
