In [7]:
import os
import pandas as pd

# Build the LabItem node file for Neo4j admin import from the MIMIC-IV
# lab-item dictionary (d_labitems.csv): one node per distinct itemid.

# Your paths
DLABITEMS_PATH = "/Users/fariham/Downloads/mimic-iv-3.1/hosp/d_labitems.csv"

# Output (change if you want)
OUT_DIR = "/Users/fariham/mimic-iv-kg/data/processed/nodes"
OUT_LABITEM_NODES = os.path.join(OUT_DIR, "labitem_nodes.csv")

os.makedirs(OUT_DIR, exist_ok=True)

# Read d_labitems (small file, safe to read all at once)
d = pd.read_csv(DLABITEMS_PATH)

# itemid is the node key, so unlike the other columns it is not optional.
# Fail here with a clear message instead of deeper inside pandas.
if "itemid" not in d.columns:
    raise KeyError(
        f"'itemid' column not found in {DLABITEMS_PATH}; columns present: {list(d.columns)}"
    )

# Keep common useful columns (only keep those that exist)
keep = [c for c in ["itemid", "label", "fluid", "category", "loinc_code"] if c in d.columns]

# Drop rows without a key first: a null itemid would otherwise be rendered as
# the node ID "LABITEM_<NA>". Then keep one row per itemid.
d = d[keep].dropna(subset=["itemid"]).drop_duplicates(subset=["itemid"]).copy()

# Admin-import IDs. Casting through Int64 makes the ID "LABITEM_50801" rather
# than "LABITEM_50801.0" if pandas parsed itemid as float.
d[":ID(LabItem)"] = "LABITEM_" + d["itemid"].astype("Int64").astype(str)
d[":LABEL"] = "LabItem"

# Put ID + LABEL first, then the rest
cols = [":ID(LabItem)", ":LABEL"] + keep
d = d[cols]

# Write
d.to_csv(OUT_LABITEM_NODES, index=False)

print("Wrote:", OUT_LABITEM_NODES)
print("Rows:", len(d))
d.head()


Wrote: /Users/fariham/mimic-iv-kg/data/processed/nodes/labitem_nodes.csv
Rows: 1650


Unnamed: 0,:ID(LabItem),:LABEL,itemid,label,fluid,category
0,LABITEM_50801,LabItem,50801,Alveolar-arterial Gradient,Blood,Blood Gas
1,LABITEM_50802,LabItem,50802,Base Excess,Blood,Blood Gas
2,LABITEM_50803,LabItem,50803,"Calculated Bicarbonate, Whole Blood",Blood,Blood Gas
3,LABITEM_50804,LabItem,50804,Calculated Total CO2,Blood,Blood Gas
4,LABITEM_50805,LabItem,50805,Carboxyhemoglobin,Blood,Blood Gas


In [9]:
import os
import pandas as pd

# Stream labevents.csv in chunks and write the Admission-[:HAS_LAB]->LabItem
# relationship file for Neo4j admin import. Rows without hadm_id or itemid
# cannot form an edge and are skipped.

LABEVENTS_PATH = "/Users/fariham/Downloads/mimic-iv-3.1/hosp/labevents.csv"

OUT_DIR = "/Users/fariham/mimic-iv-kg/data/processed/relationships"
OUT_LAB_RELS = os.path.join(OUT_DIR, "admission_has_lab_rels.csv")

os.makedirs(OUT_DIR, exist_ok=True)

# Only read columns we will use (big speedup)
usecols = [
    "hadm_id", "itemid",
    "labevent_id", "specimen_id",
    "charttime", "storetime",
    "value", "valuenum", "valueuom",
    "ref_range_lower", "ref_range_upper",
    "flag", "priority"
]
# If the extract is missing some columns, detect the available ones first
sample = pd.read_csv(LABEVENTS_PATH, nrows=5)
usecols = [c for c in usecols if c in sample.columns]

# Both key columns are required to build an edge; stop early if either is absent.
missing_keys = [c for c in ("hadm_id", "itemid") if c not in usecols]
if missing_keys:
    raise KeyError(f"required column(s) {missing_keys} not found in {LABEVENTS_PATH}")

# Property typing (Neo4j admin import supports prop:type headers).
# Computed once here because it is the same for every chunk.
float_props = [c for c in ["valuenum", "ref_range_lower", "ref_range_upper"] if c in usecols]
text_props = [
    c for c in ["charttime", "storetime", "value", "valueuom", "flag", "priority"]
    if c in usecols
]
rename_map = {c: f"{c}:float" for c in float_props}
rename_map.update({c: f"{c}:string" for c in text_props})

# Read text/time columns as strings so their content is written back verbatim
# and does not depend on what dtype pandas would infer for a particular chunk.
text_dtypes = {c: str for c in text_props}

chunksize = 1_000_000
first = True
rows_written = 0

for chunk in pd.read_csv(
    LABEVENTS_PATH,
    usecols=usecols,
    dtype=text_dtypes,
    chunksize=chunksize,
    low_memory=False
):
    # Keep only rows that can create an edge
    chunk = chunk.dropna(subset=["hadm_id", "itemid"]).copy()

    # Match your admin-import node ID format (change if your Admission IDs differ)
    chunk[":START_ID(Admission)"] = "ADM_" + chunk["hadm_id"].astype("Int64").astype(str)
    chunk[":END_ID(LabItem)"] = "LABITEM_" + chunk["itemid"].astype("Int64").astype(str)
    chunk[":TYPE"] = "HAS_LAB"

    # Coerce numeric props; anything unparseable becomes missing
    for c in float_props:
        chunk[c] = pd.to_numeric(chunk[c], errors="coerce")

    # Times stay as strings (fast). They are deliberately NOT passed through
    # .astype(str): that turns a missing timestamp into the literal text "nan",
    # which would be imported as a real property value. Left as NaN, to_csv
    # writes an empty field instead.
    chunk = chunk.rename(columns=rename_map)

    # Final column order for relationship import: ID/type columns first, then
    # every remaining property. The order is the same for every chunk.
    base = [":START_ID(Admission)", ":END_ID(LabItem)", ":TYPE"]
    prop_cols = [c for c in chunk.columns if c not in base and c not in ["hadm_id", "itemid"]]
    out = chunk[base + prop_cols]

    # First chunk creates the file with a header; later chunks append rows only
    out.to_csv(OUT_LAB_RELS, mode="w" if first else "a", header=first, index=False)
    first = False
    rows_written += len(out)

    # progress print every chunk
    print(f"wrote chunk: {len(out):,}   total written: {rows_written:,}")

print("DONE. Output:", OUT_LAB_RELS)


wrote chunk: 549,369   total written: 549,369
wrote chunk: 523,601   total written: 1,072,970
wrote chunk: 524,449   total written: 1,597,419
wrote chunk: 539,805   total written: 2,137,224
wrote chunk: 523,747   total written: 2,660,971
wrote chunk: 529,558   total written: 3,190,529
wrote chunk: 518,422   total written: 3,708,951
wrote chunk: 543,296   total written: 4,252,247
wrote chunk: 518,735   total written: 4,770,982
wrote chunk: 540,073   total written: 5,311,055
wrote chunk: 539,172   total written: 5,850,227
wrote chunk: 533,894   total written: 6,384,121
wrote chunk: 546,737   total written: 6,930,858
wrote chunk: 556,190   total written: 7,487,048
wrote chunk: 550,894   total written: 8,037,942
wrote chunk: 513,526   total written: 8,551,468
wrote chunk: 532,356   total written: 9,083,824
wrote chunk: 526,588   total written: 9,610,412
wrote chunk: 577,912   total written: 10,188,324
wrote chunk: 529,140   total written: 10,717,464
wrote chunk: 531,941   total written: 11

In [11]:
import os, glob

# Diagnostic cell: show where the notebook is running and list every copy of
# the two generated CSVs found by a recursive glob under /Users/fariham.
print("Notebook working directory (cwd):")
print(os.getcwd())

print("\nLooking for labitem_nodes.csv and admission_has_lab_rels.csv under your home folder...\n")

search_patterns = (
    "/Users/fariham/**/labitem_nodes.csv",
    "/Users/fariham/**/admission_has_lab_rels.csv",
)

# Matches are collected pattern by pattern, in the order glob returns them.
hits = [
    found_path
    for search_pattern in search_patterns
    for found_path in glob.glob(search_pattern, recursive=True)
]

for found_path in hits:
    print(found_path)

print("\nTotal found:", len(hits))


Notebook working directory (cwd):
/Users/fariham/Downloads/mimic-iv-3.1

Looking for labitem_nodes.csv and admission_has_lab_rels.csv under your home folder...

/Users/fariham/mimic-iv-kg/data/processed/nodes/labitem_nodes.csv
/Users/fariham/mimic-iv-kg/data/processed/relationships/admission_has_lab_rels.csv

Total found: 2
