bg.csv

In [None]:
"""
Pivot blood-gas / chemistry results from the first 24h of each ICU stay.

Inputs:  ./data/icu/icustays.csv, ./data/hosp/labevents.csv
Output:  ./data/derived/first_day_blood_gas.csv
"""

from pathlib import Path
import pandas as pd
import numpy as np
import polars as pl

# --- File locations -----------------------------------------------------------
ROOT = Path("./data")
ICU_F = ROOT.joinpath("icu", "icustays.csv")
LAB_F = ROOT.joinpath("hosp", "labevents.csv")
OUT_F = ROOT.joinpath("derived", "first_day_blood_gas.csv")
CHUNK = 10**6  # number of labevents rows read per chunk

# --- labevents ITEMID -> output column label ----------------------------------
ID2LBL = {
    50800: 'specimen',
    50801: 'aado2',
    50802: 'baseexcess',
    50803: 'bicarbonate',
    50804: 'totalco2',
    50805: 'carboxyhemoglobin',
    50806: 'chloride',
    50808: 'calcium',
    50809: 'glucose',
    50810: 'hematocrit',
    50811: 'hemoglobin',
    50812: 'intubated',
    50813: 'lactate',
    50814: 'methemoglobin',
    50815: 'o2flow',
    50816: 'fio2',
    50817: 'so2',
    50818: 'pco2',
    50819: 'peep',
    50820: 'ph',
    50821: 'po2',
    50822: 'potassium',
    50823: 'requiredo2',
    50824: 'sodium',
    50825: 'temperature',
    50826: 'tidalvolume',
    50827: 'ventilationrate',
    50828: 'ventilator',
}

# Item ids kept while scanning labevents.
# NOTE(review): 51545 ("extra old view id") has no entry in ID2LBL, so rows with
# that id end up with a missing label downstream — confirm it is still wanted.
KEEP_IDS = {*ID2LBL, 51545}

# Output column order: the labels in mapping order, without repeats.
LABELS = list(dict.fromkeys(ID2LBL.values()))

# --- Per-item upper limits: readings above the limit are discarded -------------
UPPER = {
    50810: 100,  # hematocrit
    50816: 100,  # fio2
    50817: 100,  # so2
    50815: 70,   # o2flow
    50821: 800,  # po2
}

# Read the ICU stays (ids plus admission time) and key the frame by stay_id.
print("Loading icustays …")
icu = (
    pd.read_csv(
        ICU_F,
        usecols=["stay_id", "subject_id", "hadm_id", "intime"],
        parse_dates=["intime"],
    )
    .set_index("stay_id")
)
# ICU admission time per stay_id.
intime = icu["intime"]

print("Scanning labevents …")
rows = []
usecols = ["subject_id", "hadm_id", "itemid", "charttime", "value", "valuenum"]

# Item ids that get special treatment below (labels as in ID2LBL).
SPECIMEN_ID = 50800     # text-valued: the result lives in `value`, not `valuenum`
BASEEXCESS_ID = 50802   # the only item allowed to be zero or negative
FIO2_ID = 50816
SO2_ID = 50817

# Loop-invariant: ICU stays with stay_id as a regular column for merging.
icu_flat = icu.reset_index()

# Stream labevents in chunks, keeping only blood-gas rows that fall inside the
# [-6 h, +24 h] window around the ICU admission of the matching stay.
for chunk in pd.read_csv(LAB_F, usecols=usecols, parse_dates=["charttime"],
                         chunksize=CHUNK, low_memory=False):
    wanted = chunk["itemid"].isin(KEEP_IDS)
    # Keep rows with a numeric result, plus specimen rows: the specimen column is
    # built from the text `value` further down, so requiring a numeric value for
    # them would throw every specimen row away and leave that column empty.
    usable = chunk["valuenum"].notna() | (chunk["itemid"] == SPECIMEN_ID)
    chunk = chunk[wanted & usable]
    if chunk.empty:
        continue

    # Attach stay_id / intime via subject_id + hadm_id (rows without a stay drop out).
    chunk = chunk.merge(icu_flat, on=["subject_id", "hadm_id"], how="inner")
    if chunk.empty:
        continue

    # Limit charttime to [-6, +24] hours from ICU intime.
    dt = (chunk["charttime"] - chunk["intime"]).dt.total_seconds() / 3600
    chunk = chunk[(dt >= -6) & (dt <= 24)]
    if chunk.empty:
        continue

    # Drop rows whose value exceeds the per-item upper limit. Items without a
    # limit map to NaN, and a comparison against NaN is False, so they are kept.
    limit = chunk["itemid"].map(UPPER)
    # .copy() so the .loc assignments below write to an independent frame.
    chunk = chunk[~(chunk["valuenum"] > limit)].copy()
    if chunk.empty:
        continue

    # Blank out implausible FiO2 and oxygen-saturation readings.
    mask_fio2 = chunk["itemid"] == FIO2_ID
    chunk.loc[mask_fio2 & (chunk["valuenum"] < 20), "valuenum"] = np.nan
    chunk.loc[mask_fio2 & (chunk["valuenum"] > 100), "valuenum"] = np.nan

    mask_so2 = chunk["itemid"] == SO2_ID
    chunk.loc[mask_so2 & (chunk["valuenum"] > 100), "valuenum"] = np.nan

    # Zero / negative values are not allowed, except for base excess.
    neg_mask = (chunk["valuenum"] <= 0) & (chunk["itemid"] != BASEEXCESS_ID)
    chunk.loc[neg_mask, "valuenum"] = np.nan

    # Replace itemid with label (ids missing from ID2LBL get a missing label).
    chunk["label"] = chunk["itemid"].map(ID2LBL)

    # For specimen, use the text value instead of the numeric one.
    chunk.loc[chunk["label"] == "specimen", "valuenum"] = np.nan

    rows.append(chunk[["stay_id", "charttime", "label", "value", "valuenum"]])

if not rows:
    raise RuntimeError("No blood-gas rows found!")

df = pd.concat(rows, ignore_index=True)

print("Pivoting …")
# One row per (stay, charttime) and one column per label; when the same label
# occurs more than once at the same time, the largest value wins.
wide = (
    df.pivot_table(index=["stay_id", "charttime"],
                   columns="label",
                   values="valuenum",
                   aggfunc="max")
      .reset_index()
)

# Specimen is text, so it is attached from `value` rather than pivoted: take the
# first specimen entry per (stay, charttime) and left-join it onto the wide table.
spec = (
    df.loc[df["label"] == "specimen", ["stay_id", "charttime", "value"]]
      .drop_duplicates(subset=["stay_id", "charttime"])
      .rename(columns={"value": "specimen"})
)
wide = (
    wide.drop(columns="specimen", errors="ignore")
        .merge(spec, on=["stay_id", "charttime"], how="left")
)

# Merge icu IDs back
wide = wide.merge(icu.reset_index()[["stay_id", "subject_id", "hadm_id"]],
                  on="stay_id", how="left")

# Lowest pCO2 of each stay, repeated on every row of that stay.
wide = wide.sort_values(["stay_id", "charttime"])
pco2_min_by_stay = (
    wide.groupby("stay_id", as_index=False)["pco2"].min()
        .rename(columns={"pco2": "pco2_min"})
)
wide = wide.merge(pco2_min_by_stay, on="stay_id", how="left")

# Fixed column order; labels that never occurred become empty columns.
cols = ["subject_id", "hadm_id", "stay_id", "charttime", "specimen", "pco2_min"] + LABELS
wide = wide.reindex(columns=cols)

print("Writing CSV …")
# Create the output directory if needed so a run on a clean tree does not fail.
OUT_F.parent.mkdir(parents=True, exist_ok=True)
wide.to_csv(OUT_F, index=False)
print(f"Wrote {len(wide):,} rows ➜ {OUT_F}")


Loading icustays …
Scanning labevents …
Pivoting …
Writing CSV …
Wrote 222,866 rows ➜ data/derived/first_day_blood_gas.csv


first_day_bg.csv

In [None]:
import pandas as pd
from pathlib import Path

ROOT = Path("./data")
ICU_CSV = ROOT / "icu" / "icustays.csv"
BG_CSV  = ROOT / "derived" / "bg.csv"
if not BG_CSV.exists():
    # No cell in this notebook writes derived/bg.csv; the blood-gas pivot cell
    # writes derived/first_day_blood_gas.csv, so fall back to that file.
    BG_CSV = ROOT / "derived" / "first_day_blood_gas.csv"
# NOTE(review): written to the current working directory, not to ROOT/derived.
OUT_CSV = "first_day_blood_gas.csv"

# Variables summarised as <name>_min / <name>_max per stay.
VARS = ["lactate", "ph", "so2", "po2", "pco2",
    "aado2", "aado2_calc", "pao2fio2ratio",
    "baseexcess", "bicarbonate", "totalco2",
    "hematocrit", "hemoglobin", "carboxyhemoglobin", "methemoglobin",
    "temperature", "chloride", "calcium", "glucose",
    "potassium", "sodium"]

# Load ICU stays
icu = pd.read_csv(
    ICU_CSV,
    usecols=["subject_id", "stay_id", "intime"],
    parse_dates=["intime"]
).astype({"subject_id": "int32", "stay_id": "int32"})

# Load blood-gas data. Only request the variables the file actually contains:
# asking read_csv for a missing column raises, and the pivot cell's output has
# no aado2_calc / pao2fio2ratio columns.
available = set(pd.read_csv(BG_CSV, nrows=0).columns)
present = [v for v in VARS if v in available]
absent = [v for v in VARS if v not in available]
if not present:
    raise ValueError(f"{BG_CSV} contains none of the expected blood-gas columns")
if absent:
    print(f"Note – not in {BG_CSV}, left empty: {', '.join(absent)}")

bg_cols = ["subject_id", "charttime"] + present
bg = pd.read_csv(BG_CSV, usecols=bg_cols, parse_dates=["charttime"])
bg["subject_id"] = bg["subject_id"].astype("int32")

# Pair every measurement with every ICU stay of the same subject, then keep the
# pairs whose charttime lies within -6h to +24h of that stay's admission.
bg = bg.merge(icu, on="subject_id", how="left", validate="many_to_many")
lo = bg["intime"] - pd.Timedelta(hours=6)
hi = bg["intime"] + pd.Timedelta(days=1)
bg = bg[(bg["charttime"] >= lo) & (bg["charttime"] <= hi)]

# Aggregate min and max for each available variable per stay
agg_map = {}
for v in present:
    agg_map[f"{v}_min"] = (v, "min")
    agg_map[f"{v}_max"] = (v, "max")

result = bg.groupby(["subject_id", "stay_id"], as_index=False).agg(**agg_map)

# Keep the full output schema: variables missing from the input become empty columns.
out_cols = ["subject_id", "stay_id"] + [f"{v}_{s}" for v in VARS for s in ("min", "max")]
result = result.reindex(columns=out_cols)

# Save the results
result.to_csv(OUT_CSV, index=False)
print(f"Done – {len(result):,} rows → {OUT_CSV}")


FileNotFoundError: [Errno 2] No such file or directory: 'data/derived/bg.csv'

vitalsign.csv

In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

# Input / output locations
ROOT = Path("./data")
CE_CSV = ROOT.joinpath("icu", "chartevents.csv")
# NOTE(review): written to the working directory, while the first-day vital-sign
# cell reads ROOT/derived/vitalsign.csv — confirm how the file gets there.
OUT_CSV = "vitalsign.csv"

# (lo, hi) plausibility bounds shared by several items; None means "no bound".
_SYSTOLIC = (0, 400)
_PRESSURE = (0, 300)
_RESP     = (0, 70)
_GLUCOSE  = (0, None)

# chartevents itemid -> (output variable, bounds). The bounds are applied by
# clean_row; a rule of None marks a text-valued item.
ID_RULE = {
    220045: ("heart_rate", (0, 300)),
    220050: ("sbp", _SYSTOLIC),
    220051: ("dbp", _PRESSURE),
    220052: ("mbp", _PRESSURE),
    225309: ("sbp", _SYSTOLIC),
    225310: ("dbp", _PRESSURE),
    225312: ("mbp", _PRESSURE),
    220179: ("sbp_ni", _SYSTOLIC),
    220180: ("dbp_ni", _PRESSURE),
    220181: ("mbp_ni", _PRESSURE),
    220210: ("resp_rate", _RESP),
    224690: ("resp_rate", _RESP),
    220277: ("spo2", (0, 100)),
    225664: ("glucose", _GLUCOSE),
    220621: ("glucose", _GLUCOSE),
    226537: ("glucose", _GLUCOSE),
    223761: ("temp_F", (70, 120)),          # Fahrenheit
    223762: ("temp_C", (10, 50)),           # Celsius
    224642: ("temperature_site", None),     # string value
}

# Item ids to keep and the chartevents columns to read.
KEEP_IDS = list(ID_RULE.keys())
USE = ["subject_id", "stay_id", "charttime", "itemid", "valuenum", "value"]

# Read chartevents, then keep only rows that belong to a stay and whose itemid
# is one of the vital-sign items in KEEP_IDS.
ce = (
    pd.read_csv(
        CE_CSV,
        usecols=USE,
        parse_dates=["charttime"],
        dtype={"subject_id": "int32", "stay_id": "Int32"},
    )
    .loc[lambda d: d["stay_id"].notna() & d["itemid"].isin(KEEP_IDS)]
    .copy()
)

# Clean values and convert temperature if needed
def clean_row(row):
    """Return the cleaned value for one chartevents row.

    Parameters
    ----------
    row : object with ``itemid``, ``valuenum`` and ``value`` attributes
        (a row Series as passed by ``DataFrame.apply(axis=1)``).

    Returns
    -------
    The text ``value`` for temperature_site rows; NaN when ``valuenum`` is
    missing or outside the item's bounds in ``ID_RULE``; otherwise the numeric
    value, with Fahrenheit temperatures converted to Celsius.
    """
    name, rng = ID_RULE[row.itemid]
    if name == "temperature_site":
        return row.value
    val = row.valuenum
    if pd.isna(val):
        return np.nan

    # Range-check in the item's own unit BEFORE any conversion: the temp_F
    # bounds are in Fahrenheit, so testing the converted Celsius value against
    # them would reject every Fahrenheit temperature.
    lo, hi = rng
    too_low = lo is not None and val <= lo
    if name == "spo2":
        # A saturation of exactly 100 is a valid reading, so the upper bound
        # is inclusive here.
        too_high = hi is not None and val > hi
    else:
        too_high = hi is not None and val >= hi
    if too_low or too_high:
        return np.nan

    if name == "temp_F":
        return (val - 32) / 1.8
    return val

# Cleaned value per row (text for temperature_site, float or NaN otherwise) and
# the output variable name; Fahrenheit readings are filed under temp_C.
ce["clean"] = ce.apply(clean_row, axis=1)
ce["var"]   = ce["itemid"].map(lambda i: ID_RULE[i][0].replace("temp_F","temp_C"))

is_site = ce["var"] == "temperature_site"

# Pivot numeric values. `clean` mixes strings and numbers, so the numeric rows
# are cast to float first; the mean then runs on a real numeric column.
num_df = ce[~is_site].dropna(subset=["clean"]).copy()
num_df["clean"] = pd.to_numeric(num_df["clean"])
num_piv = (
    num_df.pivot_table(index=["subject_id","stay_id","charttime"],
                       columns="var",
                       values="clean",
                       aggfunc="mean")
)

# Temperature site is text: keep the last non-missing entry per timestamp.
site = (
    ce[is_site]
      .dropna(subset=["clean"])
      .groupby(["subject_id","stay_id","charttime"])["clean"]
      .last()
      .rename("temperature_site")
)

# Numeric table plus the site column; temp_C is published as "temperature".
vital = (
    num_piv.join(site, how="left")
           .reset_index()
           .rename(columns={"temp_C":"temperature"})
)

# Save result
vital.sort_values(["subject_id","stay_id","charttime"]).to_csv(OUT_CSV, index=False)
print(f"Done – {len(vital):,} rows → {OUT_CSV}")


Done – 9,586,051 rows → vitalsign.csv


first_day_vitalsign.csv

In [None]:
import pandas as pd
from pathlib import Path

ROOT = Path("./data")
ICU_CSV = ROOT.joinpath("icu", "icustays.csv")
VS_CSV = ROOT.joinpath("derived", "vitalsign.csv")
OUT_CSV = "first_day_vitalsign.csv"

# Vital signs to summarise and the statistics computed for each of them.
VITALS = ["heart_rate", "sbp", "dbp", "mbp",
          "resp_rate", "temperature", "spo2", "glucose"]
STATS = ("min", "max", "mean")

# ICU stays: ids plus admission time.
icu = pd.read_csv(
    ICU_CSV,
    usecols=["subject_id", "stay_id", "intime"],
    parse_dates=["intime"],
)
icu = icu.astype({"subject_id": "int32", "stay_id": "int32"})

# Vital-sign rows, one per (stay, charttime).
vs_cols = ["stay_id", "charttime", *VITALS]
vs = pd.read_csv(VS_CSV, usecols=vs_cols, parse_dates=["charttime"])
vs["stay_id"] = vs["stay_id"].astype("int32")

# Attach each stay's admission time and keep charttimes from 6 h before to
# 24 h after it (both ends inclusive).
vs = vs.merge(icu, on="stay_id", how="left", validate="many_to_one")
lo = vs["intime"] - pd.Timedelta(hours=6)
hi = vs["intime"] + pd.Timedelta(days=1)
vs = vs[vs["charttime"].between(lo, hi)]

# One output column per (vital, statistic), named <vital>_<stat>.
agg_map = {f"{vital}_{stat}": (vital, stat) for vital in VITALS for stat in STATS}
out = vs.groupby(["subject_id", "stay_id"], as_index=False).agg(**agg_map)

# Final column order: ids first, then min / max / mean of each vital.
ordered = ["subject_id", "stay_id", *agg_map]
out = out.reindex(columns=ordered)

# Save result
out.to_csv(OUT_CSV, index=False)
print(f"Done – {len(out):,} rows → {OUT_CSV}")


Done – 73,127 rows → first_day_vitalsign.csv


sirs.csv

In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

ROOT = Path("./data")
ICU  = ROOT / "icu" / "icustays.csv"
VS   = ROOT / "derived" / "first_day_vitalsign.csv"
BG   = ROOT / "derived" / "first_day_blood_gas.csv"
LAB  = ROOT / "derived" / "first_day_lab.csv"
OUT  = "sirs.csv"

# Load base tables
icu  = pd.read_csv(ICU,  usecols=["subject_id", "hadm_id", "stay_id"])
vs   = pd.read_csv(VS,   usecols=["stay_id", "temperature_min", "temperature_max",
                                  "heart_rate_max", "resp_rate_max"])
# The blood-gas file holds one row per (stay, charttime) with pco2_min repeated
# on each row. Collapse it to one row per stay before merging, otherwise the
# left join below repeats every ICU stay once per blood-gas timestamp.
bg   = (
    pd.read_csv(BG, usecols=["stay_id", "pco2_min"])
      .groupby("stay_id", as_index=False)["pco2_min"]
      .min()
)
lab  = pd.read_csv(LAB,  usecols=["stay_id", "wbc_min_x", "wbc_max_x", "bands_max"])

# Merge tables: every ICU stay is kept, with whatever summaries exist for it.
comp = (
    icu.merge(vs, on="stay_id", how="left")
       .merge(bg, on="stay_id", how="left")
       .merge(lab, on="stay_id", how="left")
)

# One row per ICU stay is expected; say so loudly if an input still has
# several rows per stay_id.
if len(comp) != len(icu):
    print(f"Warning – {len(comp):,} merged rows for {len(icu):,} ICU stays; "
          "an input table has duplicate stay_id rows")

def _sirs_component(hits, inputs):
    """Score one SIRS criterion per row.

    Parameters
    ----------
    hits : list of boolean Series
        Threshold tests; the criterion is met when any of them is True.
    inputs : DataFrame
        The measurement columns the tests are based on.

    Returns
    -------
    float32 array: 1 where the criterion is met, NaN where it is not met and
    every input is missing, 0 otherwise.
    """
    met = np.logical_or.reduce([h.to_numpy(dtype=bool) for h in hits])
    unknown = inputs.isna().all(axis=1).to_numpy()
    return np.where(met, 1.0, np.where(unknown, np.nan, 0.0)).astype("float32")


# Temperature score: below 36.0 or above 38.0
comp["temp_score"] = _sirs_component(
    [comp["temperature_min"] < 36.0, comp["temperature_max"] > 38.0],
    comp[["temperature_min", "temperature_max"]],
)

# Heart rate score: above 90
comp["heart_rate_score"] = _sirs_component(
    [comp["heart_rate_max"] > 90],
    comp[["heart_rate_max"]],
)

# Respiratory score: respiratory rate above 20 or pCO2 below 32
comp["resp_score"] = _sirs_component(
    [comp["resp_rate_max"] > 20, comp["pco2_min"] < 32],
    comp[["resp_rate_max", "pco2_min"]],
)

# WBC score: WBC below 4 or above 12, or bands above 10. The missingness check
# covers all three inputs, so a stay with only wbc_max_x recorded scores 0
# instead of being reported as unknown.
comp["wbc_score"] = _sirs_component(
    [comp["wbc_min_x"] < 4, comp["wbc_max_x"] > 12, comp["bands_max"] > 10],
    comp[["wbc_min_x", "wbc_max_x", "bands_max"]],
)

# Total SIRS points; a component that could not be assessed counts as 0.
score_cols = ["temp_score", "heart_rate_score", "resp_score", "wbc_score"]
comp["sirs"] = comp[score_cols].fillna(0).sum(axis=1).astype("int8")

# Reorder and save
final_cols = ["subject_id", "hadm_id", "stay_id", "sirs"] + score_cols
comp[final_cols].to_csv(OUT, index=False)
print(f"Done – {len(comp):,} rows → {OUT}")

Done – 247,293 rows → sirs.csv
