In [1]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
print("Using device:", device)

Using device: cuda


  scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())


In [2]:
# Robust ventilation detection (<= 12 h) using d_items to select vent-specific itemids
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

ICU_PATH = "/kaggle/input/neuro-icu/icu/icu/"

# -------------------------
# 1) load icu stays (neuro)
# -------------------------
icu = pd.read_csv(
    ICU_PATH + "icustays.csv",
    usecols=["stay_id", "hadm_id", "subject_id", "first_careunit", "intime", "outtime", "los"]
)
neuro_units = ["Neuro Intermediate", "Neuro Stepdown", "Neuro Surgical Intensive Care Unit"]
neuro_icu = icu[icu["first_careunit"].isin(neuro_units)].copy()
neuro_icu["intime"] = pd.to_datetime(neuro_icu["intime"])
neuro_stays = set(neuro_icu["stay_id"].unique())
print(f"Total Neuro-ICU stays: {len(neuro_stays)}")

# -------------------------
# 2) inspect d_items for vent-specific labels
# -------------------------
d_items = pd.read_csv(ICU_PATH + "d_items.csv", usecols=["itemid","label","category","param_type"], low_memory=False)
d_items["label_l"] = d_items["label"].str.lower().fillna("")

# ventilation-identifying keywords (conservative)
vent_keywords = [
    r"ventil",        # ventilator, ventilation, ventilatory
    r"endotracheal",  # endotracheal tube / ETT
    r"ett\b",         # ETT abbreviation (word boundary)
    r"intubat",       # intubation / intubated
    r"tracheostom",   # tracheostomy
    r"vent mode",     # ventilator mode
    r"ventilator",    # explicit ventilator labels
    r"airway device", # some datasets
    r"mechanical vent" # mechanical ventilation phrase
]

# build regex
vent_regex = re.compile("|".join(vent_keywords), flags=re.IGNORECASE)

# conservative filter: keep entries where label strongly indicates ventilator/device/intubation
d_items["is_vent_label"] = d_items["label"].str.contains(vent_regex, na=False)
vent_itemids_from_dict = set(d_items.loc[d_items["is_vent_label"], "itemid"].unique())
print(f"Vent-related itemids found in d_items (conservative): {len(vent_itemids_from_dict)}")

# show the matched labels to inspect (top 50)
matched_labels = d_items.loc[d_items["is_vent_label"], ["itemid","label"]].drop_duplicates().sort_values("itemid")
print("Sample matched vent-related d_items rows (inspect these):")
print(matched_labels.head(50).to_string(index=False))

# -------------------------
# 3) procedureevents (explicit)
# -------------------------
proc_cols = ["stay_id","ordercategoryname","starttime"]
procedures = pd.read_csv(ICU_PATH + "procedureevents.csv", usecols=proc_cols, low_memory=False)
procedures = procedures[procedures["stay_id"].isin(neuro_stays)].copy()
procedures["starttime"] = pd.to_datetime(procedures["starttime"], errors="coerce")
procedures = procedures.merge(neuro_icu[["stay_id","intime"]], on="stay_id", how="left")
procedures["hours_from_admit"] = (procedures["starttime"] - procedures["intime"]).dt.total_seconds()/3600

proc_mask = procedures["ordercategoryname"].str.contains(r"Ventilation|Intubation|Extubation", flags=re.IGNORECASE, regex=True, na=False)
vent_stays_proc = set(procedures.loc[proc_mask & (procedures["hours_from_admit"] <= 12), "stay_id"].unique())
print(f"Ventilation within 12 h (procedureevents): {len(vent_stays_proc)}")

# -------------------------
# 4) chartevents â€” only vent-specific itemids from d_items
# -------------------------
# If no vent itemids found (rare), fall back to small known safe list; else use discovered list
if len(vent_itemids_from_dict) == 0:
    # fallback safe list (very conservative; likely small)
    vent_itemids = {220339, 224685}  # ventilator mode/type - fallback
    print("WARNING: no vent itemids found in d_items; using fallback list:", vent_itemids)
else:
    vent_itemids = vent_itemids_from_dict

vent_stays_chart = set()
# also track which itemids contributed most so we can inspect
chart_itemid_counts = {}

usecols_chart = ["stay_id","charttime","itemid","value","valueuom"]
for chunk in tqdm(pd.read_csv(ICU_PATH + "chartevents.csv", usecols=usecols_chart, chunksize=1_000_000, low_memory=False)):
    chunk = chunk[chunk["stay_id"].isin(neuro_stays)]
    if chunk.empty:
        continue
    # filter to vent-specific itemids
    chunk = chunk[chunk["itemid"].isin(vent_itemids)]
    if chunk.empty:
        continue
    chunk["charttime"] = pd.to_datetime(chunk["charttime"], errors="coerce")
    chunk = chunk.merge(neuro_icu[["stay_id","intime"]], on="stay_id", how="left")
    chunk["hours_from_admit"] = (chunk["charttime"] - chunk["intime"]).dt.total_seconds()/3600
    # keep only <= 12 hrs
    chunk = chunk[chunk["hours_from_admit"].between(0, 12)]
    if chunk.empty:
        continue
    # update stay ids
    vent_stays_chart.update(chunk["stay_id"].unique())
    # count itemid contributions
    counts = chunk["itemid"].value_counts().to_dict()
    for k,v in counts.items():
        chart_itemid_counts[k] = chart_itemid_counts.get(k,0) + int(v)

print(f"Ventilation indicators (chartevents <=12 h) using vent-specific d_items: {len(vent_stays_chart)}")

# Print top contributing itemids + labels for inspection
top_items = sorted(chart_itemid_counts.items(), key=lambda x: x[1], reverse=True)[:30]
if top_items:
    print("\nTop chartevent itemids that matched (itemid -> count -> label):")
    for itemid,count in top_items:
        label = d_items.loc[d_items["itemid"]==itemid, "label"].iloc[0] if any(d_items["itemid"]==itemid) else ""
        print(f"  {itemid} -> {count} times -> {label}")
# -------------------------
# 5) inputevents â€” airway / paralytic / sedation orders (proxies, optimized with d_items)
# -------------------------

# Define patterns for sedation / airway / ventilation drugs and procedures
vent_input_keywords = [
    r"ventil", r"airway", r"intubat", r"ett", r"endotracheal", r"tracheostom",
    r"respirat", r"sedation", r"sedative", r"anesth", r"paralytic",
    r"propofol", r"midazolam", r"lorazepam", r"diazepam",
    r"fentanyl", r"morphine", r"rocuronium", r"vecuronium",
    r"succinylcholine", r"etomidate"
]
vent_input_regex = re.compile("|".join(vent_input_keywords), flags=re.IGNORECASE)

# --- Step 1: Identify relevant input itemids from d_items ---
d_items["label_l"] = d_items["label"].str.lower().fillna("")
vent_input_itemids = set(
    d_items.loc[d_items["label_l"].str.contains(vent_input_regex, na=False), "itemid"].unique()
)
print(f"Found {len(vent_input_itemids)} potential vent/sedation itemids in d_items")

# --- Step 2: Process inputevents efficiently ---
input_cols = [
    "stay_id","itemid","starttime","ordercategoryname",
    "secondaryordercategoryname","ordercategorydescription",
    "ordercomponenttypedescription","statusdescription"
]

vent_stays_inputs = set()
chart_itemid_counts = {}

for chunk in tqdm(pd.read_csv(ICU_PATH + "inputevents.csv", usecols=input_cols, chunksize=500_000, low_memory=False)):
    chunk = chunk[chunk["stay_id"].isin(neuro_stays)]
    if chunk.empty:
        continue

    # Filter by itemid first (fast path)
    chunk = chunk[chunk["itemid"].isin(vent_input_itemids)]
    if chunk.empty:
        # fallback to text search (slow path)
        combined_text = (
            chunk["ordercategoryname"].fillna("") + " " +
            chunk["secondaryordercategoryname"].fillna("") + " " +
            chunk["ordercategorydescription"].fillna("") + " " +
            chunk["ordercomponenttypedescription"].fillna("") + " " +
            chunk["statusdescription"].fillna("")
        )
        chunk = chunk[combined_text.str.contains(vent_input_regex, na=False)]
    if chunk.empty:
        continue

    # Time filter (<= 12 h from ICU admit)
    chunk["starttime"] = pd.to_datetime(chunk["starttime"], errors="coerce")
    chunk = chunk.merge(neuro_icu[["stay_id","intime"]], on="stay_id", how="left")
    chunk["hours_from_admit"] = (chunk["starttime"] - chunk["intime"]).dt.total_seconds() / 3600
    valid_stays = chunk.loc[chunk["hours_from_admit"].between(0,12), "stay_id"].unique()

    vent_stays_inputs.update(valid_stays)

    # track which itemids contributed (for debugging)
    counts = chunk["itemid"].value_counts().to_dict()
    for k,v in counts.items():
        chart_itemid_counts[k] = chart_itemid_counts.get(k,0) + int(v)

print(f"âœ… Ventilation-related inputevents (<=12 h): {len(vent_stays_inputs)}")

# Optional: inspect top contributing itemids
if chart_itemid_counts:
    print("\nTop contributing input itemids:")
    for itemid,count in sorted(chart_itemid_counts.items(), key=lambda x: x[1], reverse=True)[:20]:
        label = d_items.loc[d_items["itemid"] == itemid, "label"].iloc[0] if any(d_items["itemid"] == itemid) else ""
        print(f"  {itemid} -> {count} times -> {label}")



# -------------------------
# 6) Combine and save
# -------------------------
vent_all = vent_stays_proc.union(vent_stays_chart).union(vent_stays_inputs)
print("\n============================")
print(f"Total unique ventilated stays (<=12 h): {len(vent_all)} / {len(neuro_stays)}")
print(f"Class ratio: {len(vent_all)/len(neuro_stays):.2%}")
print("============================")

# Save CSV with label = 1 for ventilated within 12h
pd.DataFrame({"stay_id": list(vent_all), "ventilated_12h":1}).to_csv("ventilation_labels_12h.csv", index=False)
print("Saved ventilation_labels_12h.csv")

# Also save per-source stay lists for inspection
pd.DataFrame({"stay_id": list(vent_stays_proc)}).to_csv("vent_proc_12h.csv", index=False)
pd.DataFrame({"stay_id": list(vent_stays_chart)}).to_csv("vent_chart_12h.csv", index=False)
pd.DataFrame({"stay_id": list(vent_stays_inputs)}).to_csv("vent_inputs_12h.csv", index=False)
print("Saved vent_proc_12h.csv, vent_chart_12h.csv, vent_inputs_12h.csv")


Total Neuro-ICU stays: 7197
Vent-related itemids found in d_items (conservative): 81
Sample matched vent-related d_items rows (inspect these):
 itemid                                                label
 222871               Mallampati classification (Intubation)
 223059                                 Intubation - Details
 223837                                        ETT Size (ID)
 223838                                         ETT Location
 223840                                         ETT Re-taped
 223848                                      Ventilator Type
 223849                                      Ventilator Mode
 224385                                           Intubation
 224391                                  ETT Mark (location)
 224392                                  ETT Position Change
 224415                                        ETT Mark (cm)
 224832                                             ETT Type
 225267                                Difficult to Intubate
 22

433it [16:12,  2.25s/it]


Ventilation indicators (chartevents <=12 h) using vent-specific d_items: 703

Top chartevent itemids that matched (itemid -> count -> label):
  223848 -> 2180 times -> Ventilator Type
  224415 -> 2016 times -> ETT Mark (cm)
  229314 -> 2002 times -> Ventilator Mode (Hamilton)
  223837 -> 1966 times -> ETT Size (ID)
  223838 -> 1965 times -> ETT Location
  224832 -> 1954 times -> ETT Type
  226814 -> 1434 times -> Known difficult intubation
  224391 -> 1203 times -> ETT Mark (location)
  227566 -> 521 times -> Ventilator Tank #2
  227565 -> 519 times -> Ventilator Tank #1
  223849 -> 459 times -> Ventilator Mode
  227809 -> 192 times -> ETT Position Change
  227810 -> 40 times -> ETT Re-taped
  230045 -> 32 times -> Intellivent (Hamilton Vent Mode)
Found 132 potential vent/sedation itemids in d_items


22it [01:04,  2.91s/it]

âœ… Ventilation-related inputevents (<=12 h): 803

Top contributing input itemids:
  222168 -> 16014 times -> Propofol
  221744 -> 3797 times -> Fentanyl
  225942 -> 2161 times -> Fentanyl (Concentrate)
  225154 -> 1847 times -> Morphine Sulfate
  221668 -> 1647 times -> Midazolam (Versed)
  221385 -> 960 times -> Lorazepam (Ativan)
  229233 -> 100 times -> Rocuronium
  221623 -> 61 times -> Diazepam (Valium)
  222062 -> 1 times -> Vecuronium

Total unique ventilated stays (<=12 h): 993 / 7197
Class ratio: 13.80%
Saved ventilation_labels_12h.csv
Saved vent_proc_12h.csv, vent_chart_12h.csv, vent_inputs_12h.csv





In [3]:
# ============================================================
# 1. Preprocess ICU Vitals, Inputs, Outputs (first 6h)
# ============================================================
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

ICU_PATH = "/kaggle/input/neuro-icu/icu/icu/"

# neuro_icu and neuro_stays should already exist from Cell 2
# ------------------------------------------------------------

# ============================================================
# 2. Chunked Load for Chartevents (Vitals, first 6h)
# ============================================================
vital_chunks = []
for chunk in tqdm(pd.read_csv(
    ICU_PATH + "chartevents.csv",
    usecols=["stay_id", "charttime", "itemid", "valuenum"],
    chunksize=1_000_000, low_memory=False
)):
    chunk = chunk[chunk["stay_id"].isin(neuro_stays)]
    if chunk.empty:
        continue
    chunk["charttime"] = pd.to_datetime(chunk["charttime"], errors="coerce")
    chunk = chunk.merge(neuro_icu[["stay_id", "intime"]], on="stay_id", how="left")
    chunk["hours_from_admit"] = (chunk["charttime"] - chunk["intime"]).dt.total_seconds() / 3600
    chunk = chunk[chunk["hours_from_admit"].between(0, 6)]
    vital_chunks.append(chunk)

chart = pd.concat(vital_chunks, ignore_index=True)
print(f"Vitals records (first 6h): {len(chart):,}")

# ============================================================
# 3. Chunked Load for Inputevents (Meds/Fluids)
# ============================================================
input_chunks = []
for chunk in tqdm(pd.read_csv(
    ICU_PATH + "inputevents.csv",
    usecols=["stay_id", "starttime", "amount", "ordercategoryname"],
    chunksize=500_000, low_memory=False
)):
    chunk = chunk[chunk["stay_id"].isin(neuro_stays)]
    if chunk.empty:
        continue
    chunk["starttime"] = pd.to_datetime(chunk["starttime"], errors="coerce")
    chunk = chunk.merge(neuro_icu[["stay_id", "intime"]], on="stay_id", how="left")
    chunk["hours_from_admit"] = (chunk["starttime"] - chunk["intime"]).dt.total_seconds() / 3600
    chunk = chunk[chunk["hours_from_admit"].between(0, 6)]
    input_chunks.append(chunk)

inputs = pd.concat(input_chunks, ignore_index=True)
print(f"Inputs (first 6h): {len(inputs):,}")

# ============================================================
# 4. Chunked Load for Outputevents
# ============================================================
output_chunks = []
for chunk in tqdm(pd.read_csv(
    ICU_PATH + "outputevents.csv",
    usecols=["stay_id", "charttime", "itemid", "value"],
    chunksize=500_000, low_memory=False
)):
    chunk = chunk[chunk["stay_id"].isin(neuro_stays)]
    if chunk.empty:
        continue
    chunk["charttime"] = pd.to_datetime(chunk["charttime"], errors="coerce")
    chunk = chunk.merge(neuro_icu[["stay_id", "intime"]], on="stay_id", how="left")
    chunk["hours_from_admit"] = (chunk["charttime"] - chunk["intime"]).dt.total_seconds() / 3600
    chunk = chunk[chunk["hours_from_admit"].between(0, 6)]
    output_chunks.append(chunk)

outputs = pd.concat(output_chunks, ignore_index=True)
print(f"Outputs (first 6h): {len(outputs):,}")

# ============================================================
# 6. Save processed files
# ============================================================
chart.to_csv("processed_vitals_6h.csv", index=False)
inputs.to_csv("processed_inputs_6h.csv", index=False)
outputs.to_csv("processed_outputs_6h.csv", index=False)

print("\nâœ… Memory-safe preprocessing complete! Files saved:")
print("- processed_vitals_6h.csv")
print("- processed_inputs_6h.csv")
print("- processed_outputs_6h.csv")


433it [14:25,  2.00s/it]


Vitals records (first 6h): 2,982,925


22it [00:53,  2.42s/it]


Inputs (first 6h): 26,978


11it [00:09,  1.13it/s]


Outputs (first 6h): 14,974

âœ… Memory-safe preprocessing complete! Files saved:
- processed_vitals_6h.csv
- processed_inputs_6h.csv
- processed_outputs_6h.csv


In [5]:
# ============================================================
# 1. Import Libraries
# ============================================================
import pandas as pd
import numpy as np

# ============================================================
# 2. Load Preprocessed Files (6-hour data)
# ============================================================
chart_6h = pd.read_csv("processed_vitals_6h.csv")       # vitals from first 6h
inputs = pd.read_csv("processed_inputs_6h.csv")          # meds/fluids
outputs = pd.read_csv("processed_outputs_6h.csv")        # urine/drains

print(f"Vitals (6h): {len(chart_6h):,}")
print(f"Inputs (6h): {len(inputs):,}")
print(f"Outputs (6h): {len(outputs):,}")

# ============================================================
# 3. Aggregate Vitals into Hourly Trends per Patient
# ============================================================
chart_6h["hour_bin"] = chart_6h["hours_from_admit"].astype(int)
vital_agg = (
    chart_6h.groupby(["stay_id", "hour_bin"])["valuenum"]
    .agg(["mean", "std", "min", "max", "count"])
    .reset_index()
)

# Aggregate across the 6-hour window per patient
vital_summary = (
    vital_agg.groupby("stay_id")
    .agg({
        "mean": "mean",
        "std": "mean",
        "min": "min",
        "max": "max",
        "count": "sum"
    })
    .reset_index()
)
vital_summary.columns = [
    "stay_id",
    "vital_mean",
    "vital_std",
    "vital_min",
    "vital_max",
    "vital_count"
]

# ============================================================
# 4. Aggregate Inputs and Outputs (Meds, Fluids, Urine)
# ============================================================
input_summary = (
    inputs.groupby("stay_id")
    .agg(
        total_input_ml=("amount", "sum"),
    )
    .reset_index()
)

output_summary = (
    outputs.groupby("stay_id")
    .agg(total_output_ml=("value", "sum"))
    .reset_index()
)

# ============================================================
# Add ventilation labels (<=12h)
# ============================================================
vent_labels = pd.read_csv("ventilation_labels_12h.csv")

# Merge with neuro_icu or your main patient summary to create label
neuro_icu = neuro_icu.merge(vent_labels, on="stay_id", how="left")
neuro_icu["ventilation_within_12h"] = neuro_icu["ventilated_12h"].fillna(0).astype(int)
neuro_icu.drop(columns=["ventilated_12h"], inplace=True)

# ============================================================
# 5. Merge All Modalities into Unified Modeling Dataset
# ============================================================
model_df = (
    neuro_icu.merge(vital_summary, on="stay_id", how="left")
             .merge(input_summary, on="stay_id", how="left")
             .merge(output_summary, on="stay_id", how="left")
)

print("Label (ventilation_within_12h) positive count:",
      model_df["ventilation_within_12h"].sum())

# Replace impossible values for vitals with median (more realistic)
for col in ["vital_mean", "vital_std", "vital_min", "vital_max"]:
    model_df[col] = model_df[col].fillna(model_df[col].median())

# Replace event-style features with 0 (absence = none)
for col in ["vital_count", "total_input_ml", "total_output_ml"]:
    model_df[col] = model_df[col].fillna(0)


# ============================================================
# 6. Sanity Check and Label Distribution
# ============================================================
print("\nâœ… Aggregated 6-hour modeling dataset created.")
print("Shape:", model_df.shape)
print("Columns:", list(model_df.columns))
print("Label (ventilation_within_12h) positive count:",
      model_df["ventilation_within_12h"].sum())

# ============================================================
# 7. Save Final Modeling Dataset
# ============================================================
model_df.to_csv("model_ready_neuro_6h.csv", index=False)
print("\nðŸ’¾ Saved as: model_ready_neuro_6h.csv")


Vitals (6h): 2,982,925
Inputs (6h): 26,978
Outputs (6h): 14,974
Label (ventilation_within_12h) positive count: 993

âœ… Aggregated 6-hour modeling dataset created.
Shape: (7197, 15)
Columns: ['subject_id', 'hadm_id', 'stay_id', 'first_careunit', 'intime', 'outtime', 'los', 'ventilation_within_12h', 'vital_mean', 'vital_std', 'vital_min', 'vital_max', 'vital_count', 'total_input_ml', 'total_output_ml']
Label (ventilation_within_12h) positive count: 993

ðŸ’¾ Saved as: model_ready_neuro_6h.csv


In [13]:
# ============================================================
# 1. Import dependencies
# ============================================================
import pandas as pd
import re
from tqdm import tqdm
tqdm.pandas()

# ============================================================
# 2. Define text cleaning function
# ============================================================
def clean_text(text):
    """
    Cleans raw clinical note text:
    - Lowercases
    - Removes non-alphanumeric chars (except key punctuation)
    - Normalizes spaces
    """
    text = str(text).lower()
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^a-z0-9.,;:()/%\- ]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


# ============================================================
# 3. Helper: Process notes in chunks (memory-safe)
# ============================================================
def process_notes_in_chunks(path, neuro_hadm_ids, id_col="hadm_id", text_col=None, chunksize=50000):
    """
    Streams large CSV files in chunks:
    - Filters early by relevant hadm_ids
    - Cleans text incrementally
    - Returns combined dataframe of hadm_id + clean_text
    """
    out = []
    for chunk in tqdm(pd.read_csv(path, low_memory=False, chunksize=chunksize), desc=f"Processing {path}"):
        if text_col is None:
            text_col = chunk.columns[-1]  # assume last column contains text
        chunk = chunk[chunk[id_col].isin(neuro_hadm_ids)]
        if chunk.empty:
            continue
        chunk["clean_text"] = chunk[text_col].apply(clean_text)
        out.append(chunk[[id_col, "clean_text"]])
    if not out:
        return pd.DataFrame(columns=[id_col, "clean_text"])
    return pd.concat(out, ignore_index=True)


# ============================================================
# 4. Load Neuro ICU cohort info
# ============================================================
icu = pd.read_csv("/kaggle/input/neuro-icu/icu/icu/icustays.csv", usecols=["stay_id", "hadm_id", "first_careunit"])
neuro_units = ["Neuro Intermediate", "Neuro Stepdown", "Neuro Surgical Intensive Care Unit"]
neuro_icu = icu[icu["first_careunit"].isin(neuro_units)]
neuro_hadm_ids = neuro_icu["hadm_id"].unique().tolist()


# ============================================================
# 5. Process Discharge and Radiology Notes
# ============================================================
NOTE_DIR = "/kaggle/input/neuro-icu/note/note/"

discharge_clean = process_notes_in_chunks(f"{NOTE_DIR}discharge.csv", neuro_hadm_ids)
radiology_clean = process_notes_in_chunks(f"{NOTE_DIR}radiology.csv", neuro_hadm_ids, text_col="text")

notes = pd.concat([discharge_clean, radiology_clean], ignore_index=True)
notes = notes.drop_duplicates(subset=["hadm_id", "clean_text"])
print(f"ðŸ§¾ Total Neuro ICU admissions with notes: {notes['hadm_id'].nunique()}")


# ============================================================
# 6. Condition Tagging (regex patterns)
# ============================================================
conditions = {
    "Ischemic_Stroke": r"\bstroke\b|\bcerebral infarct\b|\bischemic\b",
    "Hemorrhage": r"\bhemorrhage\b|\bbleed\b|\bICH\b|\bSAH\b",
    "Seizure": r"\bseizure\b|\bconvulsion\b|\bstatus epilepticus\b",
    "Brain_Tumor": r"\btumor\b|\bneoplasm\b|\bmeningioma\b|\bglioblastoma\b",
    "Hydrocephalus": r"\bhydrocephalus\b|\bventriculomegaly\b",
}

for cond, pattern in conditions.items():
    notes[cond] = notes["clean_text"].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False).astype(int)


# ============================================================
# 7. Collapse to per-admission level (avoid double counting)
# ============================================================
# Aggregate to 1 row per hadm_id â†’ any mention in any note = 1
adm_conditions = notes.groupby("hadm_id")[list(conditions.keys())].max().reset_index()

# Merge with neuro ICU reference
notes_merged = neuro_icu.merge(adm_conditions, on="hadm_id", how="left").fillna(0)

# Ensure integer dtype
for c in conditions.keys():
    notes_merged[c] = notes_merged[c].astype(int)


# ============================================================
# 8. Save Cleaned + Tagged Notes (per admission)
# ============================================================
notes_merged.to_csv("clean_neuro_notes_tagged.csv", index=False)
print("âœ… Saved clean + tagged Neuro ICU notes â†’ 'clean_neuro_notes_tagged.csv'")


# ============================================================
# 9. Summary Statistics
# ============================================================
avg_len = notes["clean_text"].str.len().mean()
condition_counts = notes_merged[list(conditions.keys())].sum().sort_values(ascending=False)

print(f"\nðŸ“Š Average note length (chars): {avg_len:.0f}")
print("ðŸ§© Condition distribution (unique admissions):\n", condition_counts)
print(f"\nðŸ§¾ Total Neuro ICU admissions: {notes_merged['hadm_id'].nunique()}")


Processing /kaggle/input/neuro-icu/note/note/discharge.csv: 7it [00:33,  4.82s/it]
Processing /kaggle/input/neuro-icu/note/note/radiology.csv: 47it [00:27,  1.74it/s]


ðŸ§¾ Total Neuro ICU admissions with notes: 3780
âœ… Saved clean + tagged Neuro ICU notes â†’ 'clean_neuro_notes_tagged.csv'

ðŸ“Š Average note length (chars): 2838
ðŸ§© Condition distribution (unique admissions):
 Hemorrhage         3147
Ischemic_Stroke    2287
Seizure            1853
Brain_Tumor        1019
Hydrocephalus      1002
dtype: int64

ðŸ§¾ Total Neuro ICU admissions: 6884


In [14]:
# ============================================================
# 8. Merge condition tags + Feature Scaling + Train/Test Split
# ============================================================
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# ----------------------------------------
# 1. Load base structured dataset
# ----------------------------------------
model_df = pd.read_csv("model_ready_neuro_6h.csv")

# ----------------------------------------
# 2. Load condition tags (from notes)
# ----------------------------------------
cond_df = pd.read_csv("clean_neuro_notes_tagged.csv")

# Keep only required columns for merge
condition_cols = [
    "hadm_id",
    "Ischemic_Stroke", "Hemorrhage", "Seizure", "Brain_Tumor", "Hydrocephalus"
]

cond_df = cond_df[condition_cols].drop_duplicates(subset="hadm_id")

# Merge using hadm_id
model_df = model_df.merge(cond_df, on="hadm_id", how="left")

# Fill missing condition flags with 0 (patients without matching notes)
for c in condition_cols[1:]:
    model_df[c] = model_df[c].fillna(0).astype(int)

print(f"âœ… After merging notes: {model_df.shape}")
print(f"Columns now include: {list(model_df.columns)}")

# ----------------------------------------
# 3. Select feature columns
# ----------------------------------------
feature_cols = [
    "vital_mean", "vital_std", "vital_min", "vital_max", "vital_count",
    "total_input_ml", "total_output_ml", "los"
]
neuro_flags = [
    "Ischemic_Stroke", "Hemorrhage", "Seizure", "Brain_Tumor", "Hydrocephalus"
]

# ----------------------------------------
# 4. Define X and y
# ----------------------------------------
X = model_df[feature_cols + neuro_flags].copy()
y = model_df["ventilation_within_12h"].astype(int)

# Replace inf/nan values
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median())

# ----------------------------------------
# 5. Train-test split (stratified)
# ----------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ----------------------------------------
# 6. Feature scaling (StandardScaler)
# ----------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nâœ… Data ready for modeling!")
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print(f"Positive label proportion (train): {y_train.mean():.3f}")
print(f"Positive label proportion (test):  {y_test.mean():.3f}")

# ----------------------------------------
# 7. Save Scaled Datasets
# ----------------------------------------
train_df = pd.DataFrame(X_train_scaled, columns=X.columns)
train_df["label"] = y_train.values

test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
test_df["label"] = y_test.values

train_df.to_csv("train_neuro6h_scaled.csv", index=False)
test_df.to_csv("test_neuro6h_scaled.csv", index=False)

print("\nðŸ’¾ Saved:")
print("- train_neuro6h_scaled.csv")
print("- test_neuro6h_scaled.csv")


âœ… After merging notes: (7197, 20)
Columns now include: ['subject_id', 'hadm_id', 'stay_id', 'first_careunit', 'intime', 'outtime', 'los', 'ventilation_within_12h', 'vital_mean', 'vital_std', 'vital_min', 'vital_max', 'vital_count', 'total_input_ml', 'total_output_ml', 'Ischemic_Stroke', 'Hemorrhage', 'Seizure', 'Brain_Tumor', 'Hydrocephalus']

âœ… Data ready for modeling!
Train shape: (5757, 13), Test shape: (1440, 13)
Positive label proportion (train): 0.138
Positive label proportion (test):  0.138

ðŸ’¾ Saved:
- train_neuro6h_scaled.csv
- test_neuro6h_scaled.csv
