In [6]:
import pandas as pd
import json
import ast

# ---------------- paths ----------------
mapping_path = "raw_data/lab_test_mapping.csv"
seq_path = "data/extracted_sequences_with_lab_classification_text.csv"

# 1) Build the exact itemid -> label lookup from the mapping table, in one chain:
#    - rows missing itemid or label are discarded (noted by the author as panel/group rows like BMP/CMP)
#    - itemid is cast to int and label to a whitespace-stripped string
#    - when an itemid repeats, the first row wins (switch keep= to "last" if preferred)
map_df = (
    pd.read_csv(mapping_path)
    .dropna(subset=["itemid", "label"])
    .assign(
        itemid=lambda frame: frame["itemid"].astype(int),
        label=lambda frame: frame["label"].astype(str).str.strip(),
    )
    .drop_duplicates(subset=["itemid"], keep="first")
)

# Exact mapping: itemid -> label
id_to_label = dict(zip(map_df["itemid"], map_df["label"]))

# 2) Load extracted sequences
df_lab_tests = pd.read_csv(seq_path)

def parse_lab_dict(x):
    """Parse one cell holding a serialized lab-test mapping into a dict.

    Strict JSON is tried first, then a Python literal as a fallback.

    Returns:
        The parsed dict, or an empty dict when the cell is missing, cannot be
        parsed, or parses to something that is not a dict (a list, a number,
        a bare string, ...). Callers can therefore always use `.items()` on
        the result.
    """
    if pd.isna(x):
        return {}
    s = str(x)
    try:
        parsed = json.loads(s)          # preferred
    except Exception:
        try:
            parsed = ast.literal_eval(s)  # fallback for Python-repr style cells
        except Exception:
            return {}
    # Valid JSON is not necessarily an object; only a dict is usable downstream.
    return parsed if isinstance(parsed, dict) else {}

def replace_ids_with_exact_names(d):
    """Return a new dict whose item-id keys are replaced by labels from `id_to_label`.

    `id_to_label` is the module-level itemid -> label lookup. A key that cannot
    be converted to an int, or whose int is not in the lookup, is kept as
    `str(key)`.

    When several source keys resolve to the same new key, their values are
    collected, in encounter order, into one list created by this function.
    The input dict and its values are never mutated, so a value that is itself
    a list stays a single element of the merged list.
    """
    new_d = {}
    # Keys whose value is a merge list built here, as opposed to a list that
    # was simply the original value of a single source key.
    merged_keys = set()
    for k, v in d.items():
        try:
            # exact mapping only; keep original id string if not found
            new_key = id_to_label.get(int(k), str(k))
        except (TypeError, ValueError, OverflowError):
            # Only a failed int() conversion is expected here; anything else
            # (for example a missing lookup table) should surface.
            new_key = str(k)

        if new_key not in new_d:
            new_d[new_key] = v
        elif new_key in merged_keys:
            new_d[new_key].append(v)
        else:
            # First collision: wrap both values in a fresh list.
            new_d[new_key] = [new_d[new_key], v]
            merged_keys.add(new_key)
    return new_d

# 3) Replace keys in lab_tests_classified
def _dump_json(mapping):
    """Serialize one mapped lab dict to a JSON string, leaving non-ASCII characters unescaped."""
    return json.dumps(mapping, ensure_ascii=False)


lab_text_cells = df_lab_tests["lab_tests_classified_text"]
parsed = lab_text_cells.apply(parse_lab_dict)
mapped = parsed.apply(replace_ids_with_exact_names)

# the JSON-encoded result takes the place of the original column
df_lab_tests["lab_tests_classified_text"] = mapped.apply(_dump_json)


In [7]:
import pandas as pd
import json
import ast
from collections import defaultdict

# ---------------- paths ----------------
# Lab-test sequences are not loaded from disk here: this cell works on the
# `df_lab_tests` frame that is already in memory from the id -> name mapping cell.
radio_path = "raw_data/cholecystitis_hadm_info_first_diag.csv"
# Written under data/ because the trajectory-denoising cell reads data/state_text.json.
out_path = "data/state_text.json"

# ---------------- helpers ----------------
def parse_obj(x):
    """Turn a JSON-ish cell into a Python object.

    Missing values, empty strings and the literal text "nan" (any case) give
    None. Otherwise the stripped text is decoded as JSON, then as a Python
    literal; if both fail the stripped text itself is returned.
    """
    if pd.isna(x):
        return None
    text = str(x).strip()
    if text.lower() in ("", "nan"):
        return None
    for loader in (json.loads, ast.literal_eval):
        try:
            return loader(text)
        except Exception:
            continue
    return text  # not parseable: hand back the raw text

def build_radiology_sequence(radiology_obj):
    """
    Convert a parsed radiology payload into a list of events, preserving order.

    Accepted inputs:
      - None             -> []
      - list             -> one event per element, in the original order
      - dict             -> a single event
      - anything else    -> a single {"modality": "other", "raw": value} event

    A dict element becomes
      {"modality": ..., "note_id": ..., "region": ..., "exam_name": ..., "report": ...}
    read from its "Modality", "Note ID", "Region", "Exam Name" and "Report"
    keys. The modality is stripped of surrounding whitespace and falls back to
    "other" when the key is absent, null, or blank. A non-dict element becomes
    {"modality": "other", "raw": element}.
    """

    def normalize_item(item):
        if not isinstance(item, dict):
            return {"modality": "other", "raw": item}

        raw_modality = item.get("Modality")
        # A null Modality must not be stringified into the literal text "None".
        if raw_modality is None:
            modality = "other"
        else:
            modality = str(raw_modality).strip() or "other"

        return {
            "modality": modality,
            "note_id": item.get("Note ID"),
            "region": item.get("Region"),
            "exam_name": item.get("Exam Name"),
            "report": item.get("Report"),
        }

    if radiology_obj is None:
        return []
    if isinstance(radiology_obj, list):
        return [normalize_item(item) for item in radiology_obj]
    # A lone dict or any other scalar is treated as a one-event sequence.
    return [normalize_item(radiology_obj)]

# ---------------- load ----------------
rad_df = pd.read_csv(radio_path)

# hadm_id becomes a nullable integer in both tables; values that cannot be
# read as numbers are coerced to missing
for frame in (df_lab_tests, rad_df):
    frame["hadm_id"] = pd.to_numeric(frame["hadm_id"], errors="coerce").astype("Int64")

# rows left without a hadm_id are removed from both tables
df_lab_tests = df_lab_tests.dropna(subset=["hadm_id"]).copy()
rad_df = rad_df.dropna(subset=["hadm_id"]).copy()

# narrow each table to the columns used below; for a repeated hadm_id the
# first row wins (change keep= if you need different behavior)
seq_columns = ["hadm_id", "hpi", "lab_tests_classified_text"]
rad_columns = ["hadm_id", "Radiology"]
seq_small = df_lab_tests[seq_columns].drop_duplicates(subset=["hadm_id"], keep="first")
rad_small = rad_df[rad_columns].drop_duplicates(subset=["hadm_id"], keep="first")

# left join: every row of seq_small is kept, Radiology is missing where unmatched
merged = seq_small.merge(rad_small, how="left", on="hadm_id")

# ---------------- build output ----------------
# state maps str(hadm_id) -> {"hpi", "lab_tests", "radiology"}
state = {}

row_values = zip(
    merged["hadm_id"],
    merged["hpi"],
    merged["lab_tests_classified_text"],
    merged["Radiology"],
)
for hadm_id, hpi, lab_text, radiology_text in row_values:
    hadm = str(int(hadm_id))

    # lab tests must end up as a dict: nothing parsed -> {}, any other
    # non-dict result is kept under a "raw" key
    lab_obj = parse_obj(lab_text)
    if lab_obj is None:
        lab_obj = {}
    elif not isinstance(lab_obj, dict):
        lab_obj = {"raw": lab_obj}

    state[hadm] = {
        "hpi": str(hpi) if not pd.isna(hpi) else "",
        "lab_tests": lab_obj,
        "radiology": build_radiology_sequence(parse_obj(radiology_text)),
    }

# save
with open(out_path, "w", encoding="utf-8") as out_file:
    json.dump(state, out_file, ensure_ascii=False, indent=2)

print(f"Saved {len(state)} patients to {out_path}")

Saved 648 patients to state_text.json


In [3]:
import ast
import pandas as pd

# Column whose cells are parsed below as list literals of diagnoses;
# change this if your real column name differs.
col = "diagnoses"

# load data
df = pd.read_csv("data/extracted_sequences.csv")

def first_diagnosis_as_lower(value):
    """
    Parse a list-like diagnosis cell and return its normalized first diagnosis.

    The cell is parsed as a Python literal. The text is tried unchanged first;
    only if that does not yield a list is it retried with doubled quotes
    collapsed (""Mirizzi's syndrome"" -> "Mirizzi's syndrome"). Collapsing
    unconditionally would corrupt well-formed cells that contain an empty
    double-quoted string.

    Returns:
        The first element, stripped and lower-cased, or None when the cell is
        missing, does not parse to a non-empty list, or its first element is
        None or blank.
    """
    if pd.isna(value):
        return None

    s = str(value).strip()

    arr = None
    for candidate in (s, s.replace('""', '"')):
        try:
            parsed = ast.literal_eval(candidate)
        except Exception:
            # Best effort: an unparseable candidate just moves on to the next.
            continue
        if isinstance(parsed, list):
            arr = parsed
            break

    if not arr:
        return None

    # A null first entry is a missing diagnosis, not the text "none".
    if arr[0] is None:
        return None

    first = str(arr[0]).strip().lower()  # unify letter case
    return first if first else None

# get unique first diagnoses (cells that yield no diagnosis are dropped first)
first_dx_per_row = df[col].map(first_diagnosis_as_lower).dropna()
first_dx_set = set(first_dx_per_row)

print(f"Unique first diagnoses: {len(first_dx_set)}")
print(sorted(first_dx_set))

Unique first diagnoses: 109
['acalculous cholecystitis', 'acalculus cholecystitis', 'active diagnoses', 'acue cholecystitis', 'acute acalculous cholecystitis', 'acute and chronic cholecystitis', 'acute and chronic cholecystitis multiple gallstones', 'acute calculous cholecystitis', 'acute calculus cholecystitis, cholangitis', 'acute cholecystitis', 'acute cholecystitis (gallbladder infection)', 'acute cholecystitis (gangrenous)', 'acute cholecystitis (infected gallbladder)', 'acute cholecystitis (necrosis of gall bladder)', 'acute cholecystitis and cholangitis', 'acute cholecystitis and choledocholithiasis', 'acute cholecystitis and cholelithiasis', 'acute cholecystitis and hydrops of the gallbladder', 'acute cholecystitis complicated by bacteremia and septic shock', 'acute cholecystitis s/p ___ percutaneous cholecystostomy', 'acute cholecystitis s/p lap cholecystectomy', 'acute cholecystitis s/p lap cholecystitis', 'acute cholecystitis s/p laparoscopic cholecystectomy', 'acute cholecy

In [1]:
import ast
import pandas as pd

# Column whose cells are parsed below as list literals.
col = "modality_region_sequence"

df = pd.read_csv("data/extracted_sequences_with_lab_classification_text.csv")

def parse_list_cell(x):
    """Parse a cell holding a Python list literal; return [] when that is not possible.

    The stripped text is tried unchanged first. Only if that does not produce
    a list is it retried with doubled quotes collapsed ('""' -> '"'), because
    collapsing unconditionally would break well-formed lists that contain an
    empty double-quoted string.

    Returns:
        The parsed list, or [] for missing cells, unparseable text, and
        literals that are not lists.
    """
    if pd.isna(x):
        return []
    s = str(x).strip()
    for candidate in (s, s.replace('""', '"')):
        try:
            v = ast.literal_eval(candidate)
        except Exception:
            # Best effort: fall through to the cleaned-up candidate, then to [].
            continue
        if isinstance(v, list):
            return v
    return []

# Walk every parsed list, strip each element's text, skip blanks, and collect
# the lower-cased survivors into one set.
all_modalities = set()
for seq in df[col].map(parse_list_cell):
    for item in seq:
        label = str(item).strip()
        if label:
            all_modalities.add(label.lower())

print(f"Unique modality_region_sequence elements: {len(all_modalities)}")
print(sorted(all_modalities))

Unique modality_region_sequence elements: 23
['carotid ultrasound_neck', 'ct_abdomen', 'ct_chest', 'ct_head', 'ct_spine', 'ctu_abdomen', 'drainage_abdomen', 'ercp_abdomen', 'fluoroscopy_chest', 'mrcp_abdomen', 'mre_abdomen', 'mri_abdomen', 'mri_head', 'mri_spine', 'radiograph_abdomen', 'radiograph_ankle', 'radiograph_chest', 'radiograph_knee', 'radiograph_venous', 'ultrasound_abdomen', 'ultrasound_neck', 'ultrasound_venous', 'upper gi series_abdomen']


In [6]:
import json
from pathlib import Path

path = Path("data/state_trajectories_denoised.json")

# Read the file as UTF-8 text and decode it as JSON in one step.
data = json.loads(path.read_text(encoding="utf-8"))

# Everything below looks only at the LAST state of each patient's trajectory.
# pid -> modality history stored on that last state
patient_to_last_history = {}
# Option A: pid -> last entry of that history (one modality_region per patient)
final_modality_per_patient = {}
# Option B: union of every entry across all patients' final histories
all_modalities_from_final_histories = set()

for pid, traj in data.items():
    if traj:
        last_state = traj[-1]

        # accept either key spelling, preferring "Modality_History"
        if "Modality_History" in last_state:
            history = last_state["Modality_History"]
        else:
            history = last_state.get("modality_history", [])

        # patients with an empty history are left out of all three views
        if history:
            patient_to_last_history[pid] = history
            final_modality_per_patient[pid] = history[-1]
            all_modalities_from_final_histories |= set(history)

# Collapse Option A across patients into the distinct final modality_region values.
unique_final_modalities = {*final_modality_per_patient.values()}

print("Num patients with non-empty final modality history:", len(patient_to_last_history))
print("Unique FINAL modality_region types (one per patient):")
print(sorted(unique_final_modalities))

print("\nUnique modality_region types from FINAL histories (all entries in each patient's final history):")
print(sorted(all_modalities_from_final_histories))

Num patients with non-empty final modality history: 648
Unique FINAL modality_region types (one per patient):
['CT_Abdomen', 'CT_Chest', 'CT_Head', 'ERCP_Abdomen', 'MRCP_Abdomen', 'MRI_Abdomen', 'MRI_Head', 'Radiograph_Abdomen', 'Radiograph_Chest', 'Radiograph_Venous', 'Ultrasound_Abdomen', 'Ultrasound_Neck', 'Ultrasound_Venous', 'Upper_GI_Series_Abdomen']

Unique modality_region types from FINAL histories (all entries in each patient's final history):
['CTU_Abdomen', 'CT_Abdomen', 'CT_Chest', 'CT_Head', 'CT_Spine', 'Drainage_Abdomen', 'ERCP_Abdomen', 'Fluoroscopy_Chest', 'Lab_Panel', 'MRCP_Abdomen', 'MRE_Abdomen', 'MRI_Abdomen', 'MRI_Head', 'MRI_Spine', 'Radiograph_Abdomen', 'Radiograph_Ankle', 'Radiograph_Chest', 'Radiograph_Knee', 'Radiograph_Venous', 'Ultrasound_Abdomen', 'Ultrasound_Neck', 'Ultrasound_Venous', 'Upper_GI_Series_Abdomen']


In [3]:
import json
from pathlib import Path

# All inputs and the output live under BASE/data.
BASE = Path(".")
TEXT_PATH = BASE.joinpath("data", "state_text.json")
TRAJ_PATH = BASE.joinpath("data", "state_trajectories.json")
# Separate output file, so the original trajectories stay unchanged.
OUT_PATH = BASE.joinpath("data", "state_trajectories_denoised.json")


def norm(x: str, default: str = "Other") -> str:
    """Join the whitespace-separated words of `x` with underscores.

    A falsy `x` (None, "") or one made only of whitespace yields `default`.
    """
    words = (x or "").split()
    return "_".join(words) if words else default


def infer_modality(modality_key: str, region: str) -> str:
    """Build the modality label as norm(modality_key) + "_" + norm(region).

    The author notes this is the same rule as encode.py.
    """
    return "{}_{}".format(norm(modality_key), norm(region))


# Both inputs are UTF-8 JSON files; read each as text and decode it.
state_text = json.loads(TEXT_PATH.read_text(encoding="utf-8"))
traj = json.loads(TRAJ_PATH.read_text(encoding="utf-8"))

n_patients = 0
n_states_updated = 0
mismatch_patients = []

for pid, states in traj.items():
    patient = state_text.get(pid)
    if patient is None:
        # no text record for this patient: its states are left as they are
        continue

    # Canonical order: the lab panel first, then one entry per radiology event.
    canonical_seq = ["Lab_Panel"]
    for r in patient.get("radiology", []):
        canonical_seq.append(
            infer_modality(r.get("modality", "other"), r.get("region", "Other"))
        )

    # Expected from your generation logic: num_states == 1 + len(radiology)
    n_states = len(states)
    n_expected = len(canonical_seq)
    if n_states != n_expected:
        mismatch_patients.append(
            dict(patient_id=pid, num_states=n_states, expected_states=n_expected)
        )

    # State number k (1-based) gets the first k canonical entries; slicing
    # past the end simply returns the whole sequence.
    for position, st in enumerate(states, start=1):
        st["Modality_History"] = canonical_seq[:position]

    n_states_updated += n_states
    n_patients += 1

with OUT_PATH.open("w", encoding="utf-8") as out_file:
    json.dump(traj, out_file, ensure_ascii=False, indent=2)

print(f"Updated patients: {n_patients}")
print(f"Updated states: {n_states_updated}")
print(f"Patients with length mismatch: {len(mismatch_patients)}")
if mismatch_patients:
    print("First 10 mismatches:")
    for m in mismatch_patients[:10]:
        print(m)

Updated patients: 648
Updated states: 2331
Patients with length mismatch: 0
