## hpi

In [1]:
import pandas as pd
import re

# Read the data
cholecystitis_df = pd.read_csv('raw_data/diverticulitis_hadm_info_first_diag.csv')


def parse_patient_history(text):
    """Extract structured sections from patient history text"""
    if pd.isna(text):
        return {
            'hpi': None,
            'past_medical_history': None,
            'past_surgical_history': None,
            'social_history': None,
            'family_history': None
        }
    
    # Initialize dict
    sections = {}
    
    # Extract Past Medical History (two patterns: "PMH:" and "Past Medical History:")
    pmh_match = re.search(r'Past Medical History:\s*(.+?)(?=\s*PSH:|Social History:|Family History:|$)', text, re.DOTALL)
    if not pmh_match:
        pmh_match = re.search(r'PMH:\s*(.+?)(?=\s*PSH:|Social History:|Family History:|$)', text, re.DOTALL)
    sections['past_medical_history'] = pmh_match.group(1).strip() if pmh_match else None
    
    # Extract Past Surgical History
    psh_match = re.search(r'PSH:\s*(.+?)(?=\s*Social History:|Family History:|$)', text, re.DOTALL)
    sections['past_surgical_history'] = psh_match.group(1).strip() if psh_match else None
    
    # Extract Social History
    social_match = re.search(r'Social History:\s*(.+?)(?=\s*Family History:|$)', text, re.DOTALL)
    sections['social_history'] = social_match.group(1).strip() if social_match else None
    
    # Extract Family History
    family_match = re.search(r'Family History:\s*(.+?)$', text, re.DOTALL)
    sections['family_history'] = family_match.group(1).strip() if family_match else None
    
    # Extract HPI (everything before first section header)
    hpi_match = re.search(r'^(.+?)(?=\s*Past Medical History:|PMH:|Social History:|Family History:|$)', text, re.DOTALL)
    sections['hpi'] = hpi_match.group(1).strip() if hpi_match else text.strip()
    
    return sections

# Apply parsing
history_parsed = cholecystitis_df['Patient History'].apply(parse_patient_history)
history_df = pd.DataFrame(history_parsed.tolist())
history_df['hadm_id'] = cholecystitis_df['hadm_id']

In [2]:
history_df.head()

Unnamed: 0,past_medical_history,past_surgical_history,social_history,family_history,hpi,hadm_id
0,Neuropathy Insomnia Hypercholesteremia Hyperte...,,___,Mother had a large MI at age ___ and died from...,Patient is a ___ M with PMHx of atrial fibrill...,24911566
1,1. Coronary artery disease status post coronar...,,___ ___ History: Mother - CHF Father - died ...,,This is a ___ yo M with late stage Alzheimer's...,24600926
2,Past Medical History: Headaches,,___,Family History: Mother with HTN. Father died a...,"___ presenting with 5 days of LLQ, subjective ...",28694648
3,Hand surgery,,___,___ contributory,This is a ___ year old male who presented to t...,20410636
4,"PMH: IBS type symptoms, recently improved. Po...","lap ccy, c-scope ___ (Multiple mild non-bleedi...",___,Noncontributory,"___ female with h/o IBS (chronic diarrhea, blo...",23877579


## Lab test (textual)

In [3]:
import pandas as pd
import json
import re

def extract_numeric_value(value_str):
    """
    Extract numeric value from string like "62.0 IU/L", "1.0 #/hpf", etc.
    Returns None if value is text-only (NEG., NONE, etc.)
    """
    if pd.isna(value_str) or value_str is None:
        return None, None
    
    value_str = str(value_str).strip()
    
    # Check if it's a text-only result (NEG., NONE, HOLD., etc.)
    text_only_patterns = ['NEG', 'NONE', 'HOLD', 'DISCARD', 'FEW', 'RARE', 
                          'MODERATE', 'MANY', 'Clear', 'Yellow', 'NotDone',
                          'RANDOM', 'Using this', '<', '>']
    
    for pattern in text_only_patterns:
        if pattern in value_str:
            return None, value_str  # Return original text
    
    # Try to extract numeric value
    match = re.search(r'([\d.]+)', value_str)
    if match:
        try:
            numeric_val = float(match.group(1))
            return numeric_val, value_str
        except ValueError:
            return None, value_str
    
    return None, value_str

def classify_lab_result(test_value, ref_lower, ref_upper):
    """
    Classify lab result as:
    - Normal: Normal (within range)
    - Decreased: Too low (below lower limit)
    - Elevated: Too high (above upper limit)
    - N/A: Keep original text if non-numeric
    """
    numeric_val, original_text = extract_numeric_value(test_value)
    
    # If no numeric value extracted, return original text
    if numeric_val is None:
        return original_text if original_text else "N/A"
    
    # If no reference ranges available, return "unknown"
    if pd.isna(ref_lower) and pd.isna(ref_upper):
        return "unknown"
    
    # Compare with reference ranges
    if not pd.isna(ref_lower) and numeric_val < ref_lower:
        return "Decreased"  # Too low
    elif not pd.isna(ref_upper) and numeric_val > ref_upper:
        return "Elevated"   # Too high
    else:
        return "Normal"   # Normal

def process_lab_tests(row):
    """
    Process laboratory tests for one patient
    Returns a dictionary of test_id: classification
    """
    try:
        lab_tests = json.loads(row['Laboratory Tests']) if isinstance(row['Laboratory Tests'], str) else row['Laboratory Tests']
        ref_lower = json.loads(row['Reference Range Lower']) if isinstance(row['Reference Range Lower'], str) else row['Reference Range Lower']
        ref_upper = json.loads(row['Reference Range Upper']) if isinstance(row['Reference Range Upper'], str) else row['Reference Range Upper']
    except:
        return {}
    
    if not isinstance(lab_tests, dict):
        return {}
    
    classified_results = {}
    
    for test_id, test_value in lab_tests.items():
        lower = ref_lower.get(test_id) if isinstance(ref_lower, dict) else None
        upper = ref_upper.get(test_id) if isinstance(ref_upper, dict) else None
        
        classification = classify_lab_result(test_value, lower, upper)
        classified_results[test_id] = classification
    
    return classified_results


In [4]:
# Example usage:
df = pd.read_csv('raw_data/diverticulitis_hadm_info_first_diag.csv')

# Apply processing
df['lab_tests_classified'] = df.apply(process_lab_tests, axis=1)

# Convert to JSON string for storage
df['lab_tests_classified_json'] = df['lab_tests_classified'].apply(
    lambda x: json.dumps(x) if isinstance(x, dict) else "{}")


lab_classification_df = df[['hadm_id', 'lab_tests_classified_json']].copy()
lab_classification_df.columns = ['hadm_id', 'lab_tests_classified_text']

In [5]:
lab_classification_df.head()

Unnamed: 0,hadm_id,lab_tests_classified_text
0,24911566,"{""51146"": ""Normal"", ""51006"": ""Elevated"", ""5098..."
1,24600926,"{""50861"": ""Normal"", ""51200"": ""Normal"", ""51221""..."
2,28694648,"{""50933"": ""HOLD. DISCARD GREATER THAN 4 HOURS..."
3,20410636,"{""50861"": ""Normal"", ""51233"": ""NORMAL."", ""51244..."
4,23877579,"{""50861"": ""Normal"", ""51279"": ""Normal"", ""51277""..."


### Replace test code by name

In [7]:
import pandas as pd
import json
import ast

# ---------------- paths ----------------
mapping_path = "raw_data/lab_test_mapping.csv"

# 1) Load mapping and keep only exact item-level rows
map_df= pd.read_csv(mapping_path)

# Drop rows without itemid or label (these are panel/group rows like BMP/CMP)
map_df = map_df.dropna(subset=["itemid", "label"]).copy()
map_df["itemid"] = map_df["itemid"].astype(int)
map_df["label"] = map_df["label"].astype(str).str.strip()

# If duplicate itemid exists, keep first label (you can change to 'last' if preferred)
map_df = map_df.drop_duplicates(subset=["itemid"], keep="first")

# Exact mapping: itemid -> label
id_to_label = dict(zip(map_df["itemid"], map_df["label"]))

# 2) Load extracted sequences
df_lab_tests= lab_classification_df

def parse_lab_dict(x):
    if pd.isna(x):
        return {}
    s = str(x)
    try:
        return json.loads(s)          # preferred
    except Exception:
        try:
            return ast.literal_eval(s) # fallback
        except Exception:
            return {}

def replace_ids_with_exact_names(d):
    new_d = {}
    for k, v in d.items():
        try:
            itemid = int(k)
            # exact mapping only; keep original id string if not found
            new_key = id_to_label.get(itemid, str(k))
        except Exception:
            new_key = str(k)

        # avoid overwriting if same name appears multiple times
        if new_key in new_d:
            if not isinstance(new_d[new_key], list):
                new_d[new_key] = [new_d[new_key]]
            new_d[new_key].append(v)
        else:
            new_d[new_key] = v
    return new_d

# 3) Replace keys in lab_tests_classified
parsed = df_lab_tests["lab_tests_classified_text"].apply(parse_lab_dict)
mapped = parsed.apply(replace_ids_with_exact_names)

# overwrite the original column with mapped JSON string
df_lab_tests["lab_tests_classified_text"] = mapped.apply(lambda d: json.dumps(d, ensure_ascii=False))


In [9]:
df_lab_tests

Unnamed: 0,hadm_id,lab_tests_classified_text
0,24911566,"{""Basophils"": ""Normal"", ""Urea Nitrogen"": ""Elev..."
1,24600926,"{""Alanine Aminotransferase (ALT)"": ""Normal"", ""..."
2,28694648,"{""Green Top Hold, plasma"": ""HOLD. DISCARD GRE..."
3,20410636,"{""Alanine Aminotransferase (ALT)"": ""Normal"", ""..."
4,23877579,"{""Alanine Aminotransferase (ALT)"": ""Normal"", ""..."
...,...,...
252,26360008,"{""Alanine Aminotransferase (ALT)"": ""Normal"", ""..."
253,21055588,"{""Green Top Hold, plasma"": ""HOLD. DISCARD GRE..."
254,22255808,"{""INR(PT)"": ""Normal"", ""Alanine Aminotransferas..."
255,24921121,"{""Alanine Aminotransferase (ALT)"": ""Normal"", ""..."


## Radiology sequences

In [10]:
df_hpi_lab = df_lab_tests.merge(history_df, on='hadm_id', how='left')

In [11]:
df_hpi_lab

Unnamed: 0,hadm_id,lab_tests_classified_text,past_medical_history,past_surgical_history,social_history,family_history,hpi
0,24911566,"{""Basophils"": ""Normal"", ""Urea Nitrogen"": ""Elev...",Neuropathy Insomnia Hypercholesteremia Hyperte...,,___,Mother had a large MI at age ___ and died from...,Patient is a ___ M with PMHx of atrial fibrill...
1,24600926,"{""Alanine Aminotransferase (ALT)"": ""Normal"", ""...",1. Coronary artery disease status post coronar...,,___ ___ History: Mother - CHF Father - died ...,,This is a ___ yo M with late stage Alzheimer's...
2,28694648,"{""Green Top Hold, plasma"": ""HOLD. DISCARD GRE...",Past Medical History: Headaches,,___,Family History: Mother with HTN. Father died a...,"___ presenting with 5 days of LLQ, subjective ..."
3,20410636,"{""Alanine Aminotransferase (ALT)"": ""Normal"", ""...",Hand surgery,,___,___ contributory,This is a ___ year old male who presented to t...
4,23877579,"{""Alanine Aminotransferase (ALT)"": ""Normal"", ""...","PMH: IBS type symptoms, recently improved. Po...","lap ccy, c-scope ___ (Multiple mild non-bleedi...",___,Noncontributory,"___ female with h/o IBS (chronic diarrhea, blo..."
...,...,...,...,...,...,...,...
252,26360008,"{""Alanine Aminotransferase (ALT)"": ""Normal"", ""...",PAST MEDICAL HISTORY: IBS HA Psoriasis Osteoar...,,___,"Father- colon cancer Mother- arthritis, asthma...",Pt has several months of severe abdominal pain...
253,21055588,"{""Green Top Hold, plasma"": ""HOLD. DISCARD GRE...",PMH: ___'s thyroiditis,,___,"No first or second degree FH of GI cancers, or...",___ male past medical history significant for ...
254,22255808,"{""INR(PT)"": ""Normal"", ""Alanine Aminotransferas...",PAST MEDICAL HISTORY: right trochanteric bursi...,,___,Non-contributory,___ with history of IBS who presents with 3-da...
255,24921121,"{""Alanine Aminotransferase (ALT)"": ""Normal"", ""...",-Non-ischemic cardiomyopathy -Mitral and tricu...,,___,+CAD. Both parents died in fire age <___.,Ms. ___ is a ___ year old female with PMH Grav...


In [12]:
import pandas as pd
import json
import ast
from collections import defaultdict

# ---------------- paths ----------------
# If your lab test IDs are already replaced with exact names, point this to that file.
radio_path = "raw_data/diverticulitis_hadm_info_first_diag.csv"
out_path = "state_text_diver.json"

# ---------------- helpers ----------------
def parse_obj(x):
    """Parse JSON-ish string to python object."""
    if pd.isna(x):
        return None
    s = str(x).strip()
    if s == "" or s.lower() == "nan":
        return None
    try:
        return json.loads(s)
    except Exception:
        try:
            return ast.literal_eval(s)
        except Exception:
            return s  # keep raw text if not parseable

def build_radiology_sequence(radiology_obj):
    """
    Preserve original radiology test order as a list of events.
    Output example:
    [
      {"modality": "CT", "note_id": ..., "region": ..., "exam_name": ..., "report": ...},
      {"modality": "Ultrasound", ...},
      ...
    ]
    """
    out = []

    if radiology_obj is None:
        return out

    def normalize_item(item):
        if isinstance(item, dict):
            modality = str(item.get("Modality", "other")).strip() or "other"
            return {
                "modality": modality,
                "note_id": item.get("Note ID"),
                "region": item.get("Region"),
                "exam_name": item.get("Exam Name"),
                "report": item.get("Report"),
            }
        return {"modality": "other", "raw": item}

    if isinstance(radiology_obj, list):
        for item in radiology_obj:
            out.append(normalize_item(item))
    elif isinstance(radiology_obj, dict):
        out.append(normalize_item(radiology_obj))
    else:
        out.append({"modality": "other", "raw": radiology_obj})

    return out

# ---------------- load ----------------
rad_df = pd.read_csv(radio_path)

# normalize hadm_id
df_hpi_lab["hadm_id"] = pd.to_numeric(df_hpi_lab["hadm_id"], errors="coerce").astype("Int64")
rad_df["hadm_id"] = pd.to_numeric(rad_df["hadm_id"], errors="coerce").astype("Int64")

df_hpi_lab = df_hpi_lab.dropna(subset=["hadm_id"]).copy()
rad_df = rad_df.dropna(subset=["hadm_id"]).copy()

# keep only needed columns
seq_small = df_hpi_lab[["hadm_id", "hpi", "lab_tests_classified_text"]].copy()
rad_small = rad_df[["hadm_id", "Radiology"]].copy()

# if duplicate hadm_id exists, keep first row (change if you need different behavior)
seq_small = seq_small.drop_duplicates(subset=["hadm_id"], keep="first")
rad_small = rad_small.drop_duplicates(subset=["hadm_id"], keep="first")

# merge
merged = seq_small.merge(rad_small, on="hadm_id", how="left")

# ---------------- build output ----------------
state = {}

for _, row in merged.iterrows():
    hadm = str(int(row["hadm_id"]))

    hpi_text = "" if pd.isna(row["hpi"]) else str(row["hpi"])

    lab_obj = parse_obj(row["lab_tests_classified_text"])
    if not isinstance(lab_obj, dict):
        lab_obj = {} if lab_obj is None else {"raw": lab_obj}

    radiology_obj = parse_obj(row["Radiology"])
    radiology_seq = build_radiology_sequence(radiology_obj)

    state[hadm] = {
        "hpi": hpi_text,
        "lab_tests": lab_obj,
        "radiology": radiology_seq
    }

# save
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(state, f, ensure_ascii=False, indent=2)

print(f"Saved {len(state)} patients to {out_path}")

Saved 257 patients to state_text_diver.json
