In [7]:
import pandas as pd
import json
import ast

# ---------------- paths ----------------
# NOTE(review): out_path is assigned here but nothing in the visible part of
# this cell writes to it — confirm whether a to_csv step is intended.
mapping_path = "data/lab_test_mapping.csv"
seq_path = "extracted_sequences_with_lab_classification_text.csv"
out_path = "extracted_sequences_exact_test_names.csv"

# 1) Build the item-level mapping table in one pass:
#    - rows lacking an itemid or a label are discarded (panel/group rows such as BMP/CMP),
#    - itemid is cast to int and label to a whitespace-trimmed string,
#    - when an itemid occurs more than once, the first label wins
#      (switch keep= to "last" if the later label is preferred).
map_df = (
    pd.read_csv(mapping_path)
    .dropna(subset=["itemid", "label"])
    .assign(
        itemid=lambda frame: frame["itemid"].astype(int),
        label=lambda frame: frame["label"].astype(str).str.strip(),
    )
    .drop_duplicates(subset=["itemid"], keep="first")
)

# Exact lookup table: itemid -> label
id_to_label = {
    item_id: test_name
    for item_id, test_name in zip(map_df["itemid"], map_df["label"])
}

# 2) Read the extracted sequences whose lab keys will be renamed
df_lab_tests = pd.read_csv(seq_path)

def parse_lab_dict(x):
    """Parse one cell of serialized lab results into a dict.

    The text is first decoded as JSON; if that fails it is evaluated as a
    Python literal (this covers single-quoted, ``repr``-style dicts).

    Parameters
    ----------
    x : object
        Raw cell value, typically a string or NaN as read from a CSV.

    Returns
    -------
    dict
        The parsed mapping. An empty dict is returned when the value is
        missing, cannot be parsed, or parses to something that is not a dict
        (e.g. a list, number, string or ``null``), so callers can always
        iterate ``.items()`` on the result.
    """
    if pd.isna(x):
        return {}
    s = str(x)
    try:
        obj = json.loads(s)              # preferred
    except Exception:
        try:
            obj = ast.literal_eval(s)    # fallback for Python-literal text
        except Exception:
            # best-effort parsing: unparseable text counts as "no lab data"
            return {}
    # Valid JSON / literals are not necessarily dicts; returning them as-is
    # would break callers that expect a mapping.
    return obj if isinstance(obj, dict) else {}

def replace_ids_with_exact_names(d, mapping=None):
    """Return a new dict whose item-id keys are replaced by exact test labels.

    Parameters
    ----------
    d : dict
        Lab results keyed by item id (an int or a string that ``int()`` accepts).
    mapping : dict, optional
        Lookup of ``int itemid -> label``. When omitted, the module-level
        ``id_to_label`` table is used.

    Returns
    -------
    dict
        Same values as ``d`` under the mapped keys. A key that is not an
        integer, or whose id is absent from the mapping, is kept as
        ``str(key)``. When several input keys resolve to the same name their
        values are collected, in input order, into one list under that name.
    """
    if mapping is None:
        mapping = id_to_label

    new_d = {}
    # Names whose entry in new_d is a list created here by merging. Tracking
    # this explicitly (rather than testing isinstance(value, list)) keeps a
    # value that is itself a list from being mistaken for the merge container
    # and mutated in place.
    merged_keys = set()

    for k, v in d.items():
        try:
            # exact mapping only; keep original id string if not found
            new_key = mapping.get(int(k), str(k))
        except (TypeError, ValueError, OverflowError):
            # key is not convertible to an integer id
            new_key = str(k)

        # avoid overwriting if same name appears multiple times
        if new_key not in new_d:
            new_d[new_key] = v
        elif new_key in merged_keys:
            new_d[new_key].append(v)
        else:
            new_d[new_key] = [new_d[new_key], v]
            merged_keys.add(new_key)
    return new_d

# 3) Rename the keys stored in the lab_tests_classified_text column.
#    Each cell is parsed to a dict, its item ids are swapped for exact test
#    names, and the result is written back to the same column as a JSON string
#    (ensure_ascii=False keeps non-ASCII characters unescaped).
# NOTE(review): the updated frame is not written to disk in the visible part
# of this cell, although out_path is defined above — confirm that is intended.
lab_col = "lab_tests_classified_text"

parsed = df_lab_tests[lab_col].map(parse_lab_dict)
mapped = parsed.map(replace_ids_with_exact_names)

df_lab_tests[lab_col] = mapped.map(
    lambda labs: json.dumps(labs, ensure_ascii=False)
)


In [10]:
import pandas as pd
import json
import ast
from collections import defaultdict

# ---------------- paths ----------------
# Input CSV for this cell; the code below reads its "hadm_id" and "Radiology" columns.
# NOTE(review): the previous comment here spoke of pointing a path at the file
# whose lab test IDs were already replaced with exact names, but no lab file
# path is defined in this cell — the lab data is taken from the in-memory
# `df_lab_tests` frame instead. Confirm whether a lab path was intended.
radio_path = "data/cholecystitis_hadm_info_first_diag.csv"
# Destination of the JSON written at the end of this cell.
# Note: this rebinds `out_path`, which an earlier cell set to a CSV file name.
out_path = "state_text.json"

# ---------------- helpers ----------------
def parse_obj(x):
    """Best-effort decode of a JSON-like cell value.

    Returns None for a missing value, a blank string, or the text "nan" in
    any letter case. Otherwise the stripped text is decoded as JSON, then as
    a Python literal; if neither succeeds, the stripped text itself is
    returned unchanged.
    """
    if pd.isna(x):
        return None

    text = str(x).strip()
    if text == "" or text.lower() == "nan":
        return None

    # Try the decoders in order of preference; the first success wins.
    for decode in (json.loads, ast.literal_eval):
        try:
            return decode(text)
        except Exception:
            continue

    # keep raw text if not parseable
    return text

def _radiology_entry(item):
    """Split one raw radiology dict into ``(modality, normalized_record)``."""
    raw_modality = item.get("Modality")
    # A missing, null (None) or NaN modality belongs in "other". Calling str()
    # on it first would produce the non-empty text "None"/"nan" and create a
    # bogus bucket of that name.
    if raw_modality is None or (
        isinstance(raw_modality, float) and raw_modality != raw_modality
    ):
        modality = "other"
    else:
        modality = str(raw_modality).strip() or "other"

    record = {
        "note_id": item.get("Note ID"),
        "region": item.get("Region"),
        "exam_name": item.get("Exam Name"),
        "report": item.get("Report"),
    }
    return modality, record


def build_radiology_dict(radiology_obj):
    """
    Convert radiology data into dict format:
    {
      "CT": [{"note_id":..., "region":..., "exam_name":..., "report":...}, ...],
      "Ultrasound": [...],
      "other": [...]
    }

    Parameters
    ----------
    radiology_obj : list | dict | object | None
        Parsed radiology payload. A list is processed item by item, a single
        dict is treated as one report, and anything else is kept verbatim.

    Returns
    -------
    dict
        Reports grouped by the stripped "Modality" value. Reports whose
        modality is absent, null, NaN or blank go under "other". Items that
        are not dicts are stored under "other" as ``{"raw": item}``.
        ``None`` input (and an empty list) yields ``{}``.
    """
    if radiology_obj is None:
        return {}

    out = defaultdict(list)

    # most rows are list[dict]; a lone dict or raw value is handled as a
    # one-element list so every shape goes through the same path
    items = radiology_obj if isinstance(radiology_obj, list) else [radiology_obj]

    for item in items:
        if isinstance(item, dict):
            modality, record = _radiology_entry(item)
            out[modality].append(record)
        else:
            out["other"].append({"raw": item})

    return dict(out)

# ---------------- load ----------------
rad_df = pd.read_csv(radio_path)

# Put hadm_id on a common nullable-integer dtype in both frames; values that
# cannot be read as numbers become missing and those rows are then dropped.
# Note: this updates the df_lab_tests frame created in the earlier cell.
df_lab_tests["hadm_id"] = pd.to_numeric(df_lab_tests["hadm_id"], errors="coerce").astype("Int64")
rad_df["hadm_id"] = pd.to_numeric(rad_df["hadm_id"], errors="coerce").astype("Int64")

df_lab_tests = df_lab_tests.dropna(subset=["hadm_id"]).copy()
rad_df = rad_df.dropna(subset=["hadm_id"]).copy()

# Narrow each frame to the columns used below and keep one row per hadm_id
# (the first one encountered; change keep= if different behavior is needed).
seq_small = (
    df_lab_tests[["hadm_id", "hpi", "lab_tests_classified_text"]]
    .drop_duplicates(subset=["hadm_id"], keep="first")
)
rad_small = (
    rad_df[["hadm_id", "Radiology"]]
    .drop_duplicates(subset=["hadm_id"], keep="first")
)

# Left join: every sequence row is kept, with or without a radiology match.
merged = seq_small.merge(rad_small, on="hadm_id", how="left")

# ---------------- build output ----------------
# state maps the hadm_id (as a string) to that admission's hpi text,
# lab tests dict and radiology reports grouped by modality.
state = {}

for _, record in merged.iterrows():
    labs = parse_obj(record["lab_tests_classified_text"])
    if labs is None:
        labs = {}
    elif not isinstance(labs, dict):
        # parsed to something other than a dict: keep it under "raw"
        labs = {"raw": labs}

    state[str(int(record["hadm_id"]))] = {
        "hpi": str(record["hpi"]) if not pd.isna(record["hpi"]) else "",
        "lab_tests": labs,
        "radiology": build_radiology_dict(parse_obj(record["Radiology"])),
    }

# Write the collected state as indented UTF-8 JSON
with open(out_path, "w", encoding="utf-8") as fh:
    json.dump(state, fh, ensure_ascii=False, indent=2)

print(f"Saved {len(state)} patients to {out_path}")

Saved 648 patients to state_text.json
