In [27]:
import pandas as pd
import json

# Read the CSV
df = pd.read_csv('data/cholecystitis_hadm_info_first_diag.csv')

# For each patient, extract modality_region sequence
modality_region_sequences = {}

for idx, row in df.iterrows():
    hadm_id = row['hadm_id']
    radiology_data = row['Radiology']
    
    # Parse the JSON string
    if pd.notna(radiology_data):
        exams = json.loads(radiology_data)
        # Extract modality_region in order (preserving temporal sequence)
        sequence = [f"{exam['Modality']}_{exam['Region']}" for exam in exams]
        modality_region_sequences[hadm_id] = sequence

# Convert to DataFrame for easier manipulation
sequences_df = pd.DataFrame([
    {'hadm_id': hadm_id, 'modality_region_sequence': seq, 'sequence_length': len(seq)}
    for hadm_id, seq in modality_region_sequences.items()
])

In [32]:
full_df = pd.read_csv('extracted_sequences.csv')
sequences_df = sequences_df[['hadm_id', 'modality_region_sequence']]
result_df = full_df.merge(sequences_df, on='hadm_id', how='left')

In [33]:
result_df.to_csv('extracted_sequences_full.csv', index=False)

In [11]:
import re
def extract_diseases(text):
    # Remove numbering (1., 2., -, etc.)
    text = re.sub(r'^\s*[\d\-\•]+[\.\)]\s*', '', text, flags=re.MULTILINE)
    
    # Split by newlines
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    
    # Filter out empty lines and section headers
    diseases = []
    for line in lines:
        # Skip if it's a header/label
        if re.match(r'^(primary|secondary|diagnosis)', line, re.IGNORECASE):
            continue
        diseases.append(line)
    
    return diseases

def parse_discharge_diagnosis(text):
    if pd.isna(text):
        return []
    
    text = str(text).strip()
    
    # Split by primary/secondary markers
    primary_match = re.search(r'PRIMARY DIAGNOSIS[:\s]+(.*?)(?:SECONDARY|$)', 
                             text, re.IGNORECASE | re.DOTALL)
    
    if primary_match:
        primary_text = primary_match.group(1)
    else:
        # If no explicit label, treat entire text as primary
        primary_text = text
    
    # Clean and extract disease list
    diseases = extract_diseases(primary_text)
    return diseases

In [12]:
def extract_modality_sequence(radiology_json):
    """Extract ordered list of modalities from Radiology column"""
    if pd.isna(radiology_json):
        return []
    
    try:
        exams = json.loads(radiology_json)
        # Keep order - extract Modality field from each exam
        sequence = [exam['Modality'] for exam in exams]
        return sequence
    except:
        return []
df = pd.read_csv('data/cholecystitis_hadm_info_first_diag.csv')
# 4. APPLY TO DATAFRAME
df['modality_sequence'] = df['Radiology'].apply(extract_modality_sequence)
df['primary_diagnoses'] = df['Discharge Diagnosis'].apply(parse_discharge_diagnosis)
output_df = df[['hadm_id', 'modality_sequence', 'primary_diagnoses']].copy()
output_df.to_csv('extracted_sequences.csv', index=False)

In [20]:
import pandas as pd

# Read and fix the entire file
with open('extracted_sequences.csv', 'r', encoding='utf-8') as f:
    content = f.read()

# Replace ALL types of problematic quotes
content = content.replace(''', "'")  # Left single curly quote
content = content.replace(''', "'")  # Right single curly quote  
content = content.replace('"', '"')  # Left double curly quote
content = content.replace('"', '"')  # Right double curly quote

# Write back
with open('extracted_sequences.csv', 'w', encoding='utf-8') as f:
    f.write(content)

print("Fixed! Now reload your dataframe:")
print("df = pd.read_csv('extracted_sequences.csv')")

Fixed! Now reload your dataframe:
df = pd.read_csv('extracted_sequences.csv')


In [14]:
import pandas as pd
import ast

df = pd.read_csv('extracted_sequences.csv')
all_diagnoses = set()
for diag_str in df['diagnoses']:
    diag_list = [d.lower() for d in ast.literal_eval(diag_str)]
    all_diagnoses.update(diag_list)

print(sorted(all_diagnoses))

['a-fib', 'abdominal pain', 'acalculous cholecystitis', 'acalculus cholecystitis', 'active diagnoses', 'acue cholecystitis', 'acute acalculous cholecystitis', 'acute and chronic cholecystitis', 'acute and chronic cholecystitis multiple gallstones', 'acute blood loss anemia', 'acute blood loss anemia with lower gi bleed', 'acute blood stream infection; h. influenzae', 'acute calculous cholecystitis', 'acute calculus cholecystitis, cholangitis', 'acute cholangitis', 'acute cholecystitis', 'acute cholecystitis (gallbladder infection)', 'acute cholecystitis (gangrenous)', 'acute cholecystitis (infected gallbladder)', 'acute cholecystitis (necrosis of gall bladder)', 'acute cholecystitis and cholangitis', 'acute cholecystitis and choledocholithiasis', 'acute cholecystitis and cholelithiasis', 'acute cholecystitis and hydrops of the gallbladder', 'acute cholecystitis complicated by bacteremia and septic shock', 'acute cholecystitis s/p ___ percutaneous cholecystostomy', 'acute cholecystitis 

In [15]:
import pandas as pd
import ast

# Your diagnosis labels dictionary
diagnosis_labels = {
    # 第一组：胆囊及胆道相关疾病 (Primary Diagnosis)
    'gallbladder_biliary_related': [
        'acalculous cholecystitis', 'acalculus cholecystitis', 'acue cholecystitis', 
        'acute acalculous cholecystitis', 'acute and chronic cholecystitis', 
        'acute and chronic cholecystitis multiple gallstones', 'acute calculous cholecystitis', 
        'acute calculus cholecystitis, cholangitis', 'acute cholangitis', 'acute cholecystitis', 
        'acute cholecystitis (gallbladder infection)', 'acute cholecystitis (gangrenous)', 
        'acute cholecystitis (infected gallbladder)', 'acute cholecystitis (necrosis of gall bladder)', 
        'acute cholecystitis and cholangitis', 'acute cholecystitis and choledocholithiasis', 
        'acute cholecystitis and cholelithiasis', 'acute cholecystitis and hydrops of the gallbladder', 
        'acute cholecystitis complicated by bacteremia and septic shock', 
        'acute cholecystitis s/p ___ percutaneous cholecystostomy', 
        'acute cholecystitis s/p lap cholecystectomy', 'acute cholecystitis s/p lap cholecystitis', 
        'acute cholecystitis s/p laparoscopic cholecystectomy', 
        'acute cholecystitis s/p laparoscopic cholecystectomy converted', 
        'acute cholecystitis s/p laparoscopic converted to open', 
        'acute cholecystitis s/p percutaneous cholecystostomy tube', 
        'acute cholecystitis sp laparascopic cholecystectomy', 
        'acute cholecystitis sp laparascopic cholecystitis', 
        'acute cholecystitis sp laparoscopic cholecystectomy', 'acute cholecystitis with cholangitis', 
        'acute cholecystitis with gangrenous cholecystitis', 
        'acute cholecystitis with gangrenous gallbladder', 'acute cholecystitis with hydrops of', 
        'acute cholecystitis with hydrops of the gallbladder', 
        'acute cholecystitis with hydrops of the gallbladder and probable', 
        'acute cholecystitis with liver abscess', 'acute cholecystitis with necrosis of the fundus of the', 
        'acute cholecystitis with obstruction', 'acute cholecystitis with obstruction and hydrops of the', 
        'acute cholecystitis with obstruction of the cystic duct by a', 
        'acute cholecystitis with possible mirizzi syndrome, with', 'acute cholecystitis, choledocholithiasis', 
        'acute cholecystitis, cholelithiasis', 'acute cholecystitis, gallstone pancreatitis', 
        'acute cholecystitis, s/p laparoscopic cholecystectomy', 'acute cholecystitis, sepsis', 
        'acute cholecystitis, with hydrops of the gallbladder', 'acute cholecystitis/ cholelithiasis', 
        'acute cholecystitis/cholangitis', 'acute cholecystitis/choledocholithiasis', 
        'acute gangrenous cholecystitis', 'acute gangrenous cholecystitis with cholelithiasis', 
        'acute gangrenous cholecystitis with empyema of the gallbladder', 
        'acute gangrenous cholecystitis with perforation at the', 'acute necrotizing cholecystitis', 
        'acute on chronic cholecystitis', 'acute pancreatitis secondary to gallstones', 
        'acute perforated calculous cholecystitis', 'acute perforated cholecystitis', 
        'acute perforated cholecystitis s/p open cholecystectomy on ___', 'acute purulent cholecystitis', 
        'acute-on-chronic cholecystitis', 'ascending cholangitis', 'bile duct obstruction', 
        'biliary obstruction due to cholangiocarcinoma', 'biliary stricture', 
        'calculus of gallbladder with acute cholecystitis with', 'cholangitis, acute', 
        'cholangitis/acute cholecystitis', 'cholangitis/cholecystitis', 'cholecystectomy', 
        'cholecystitis', 'cholecystitis and choledocholithiasis', 
        'cholecystitis and possible choledocholithiasis', 'cholecystitis in setting of cholangiocarcinoma', 
        'cholecystitis s/p lap->open cholecystectomy', 'cholecystitis s/p percutaneous cholecystostomy tube', 
        'cholecystitis s/p percutaneous cholecytostomy and percutaneous', 
        'cholecystitis status post lap cholecystectomy', 'cholecystitis status post laparoscopic cholecsytectomy', 
        'cholecystitis status post laparoscopic cholecystectomy', 'cholecystitis with gallbladder perferation', 
        'cholecystitis with history of common bile duct obstruction', 'cholecystitis, choledocolithiasis', 
        'cholecystitis, common bile duct stone', 'cholecystitis, lower extremity ischemia', 
        'cholecystitis, perforated', 'cholecystitis, sp laparascopic cholecystectomy', 
        'cholecystostomy tube', 'cholecytostomy tube placement', 'choledocholithiasis', 
        'choledocolithiasis with acute cholecystitis', 'cholelithiasis', 
        'cholelithiasis with acute on chronic cholecystitis', 
        'cholelithiasis with acute on chronic cholecystitis with hydrops', 
        'cholelithiasis with cholecystitis', 'chronic biliary colic', 'chronic cholecystitis', 
        'chronic cholecystitis with acute biliary colic', 'common bile duct obstruction', 
        'common bile duct stone status post laparoscopic cholecystectomy', 
        'cystic stump leak status post laparoscopic cholecystectomy', 'early acute cholecystitis', 
        'early cholecystitis', 'e.coli and bacteroides bacteremia/biliary sepsis', 
        'emphysematous cholecystitis', 'gall stone', 'gallbladder', 'gallbladder and pericholecystic abscess', 
        'gallstone ileus', 'gallstone pancreatitis', 'gangrenous acute cholecystitis', 
        'gangrenous cholecystitis', 'gangrenous cholecystitis and large umbilical hernia', 
        'gangrenous cholecystitis with hydrops of the gallbladder', 
        'gangrenous perforated cholecystitis, choledocholithiasis, &', 'hydrops of gallbladder', 
        'hydrops of the gallbladder', 'metastatic neuroendocrine tumor w/obstructive cholangitis', 
        "mirizzi's syndrome", 'of the gallbladder', 'pancreatic dilatationsecondary to stricture', 
        'pancreatitis', 'perforated cholecystitis', 'perforated cholecystitis with abscess', 
        'perforated gallbladder', 'perforated gangrenous cholecystitis', 
        'recurrent acute cholecystitis growing mrsa', 
        's/p avr/mvr/cabg post-op acute cholecystitis s/p percutaneous', 
        's/p laparoscopic cholecystectomy for acute cholecystitis', 'subacute cholecystitis', 
        'the gallbladder', 'to extrinsic compression of common bile duct, atrial', 
        'transient cholangitis - passage of stone through cbd'
    ],

    # 第二组：慢性疾病 (Chronic / Baseline Diseases)
    'chronic_diseases': [
        'a-fib', 'abdominal pain', 'active diagnoses', 'acute myelofibrosis', 
        'acute on chronic encephalopathy/alzheimers dementia', 'acute on chronic gait dysfunction', 
        'adrenal insufficiency', 'afib', 'alcohol abuse', 'alcohol abuse, complicated by alcohol withdrawal', 
        'anemia', 'anemia: chronic', 'anxiety', 'asthma', 'asthma/copd', 'atrial fibrillation', 
        'atrial tachycardia', 'autoimmune neutropenia', 
        'back pain: secondary to lumbar stenosis/degenerative disease', 
        'benign essential hypertension - controlled', 'bilateral dvts', 'bilateral parotitis', 
        'bipolar disorder', 'cad', 'cad s/p cabg', 'candidiasis', 'carpal tunnel syndrome', 
        'chemotherapy treatment', 'cholangiocarcinoma', 'chronic afib with runs of rvr', 
        'chronic back pain', 'chronic hbv infection', 'chronic issues', 'chronic kidney disease', 
        'chronic obstructive pulmonary disease exacerbation', 'chronic systolic heart failure', 
        'cirrhosis related to alcohol use', 'cll', 'coagulopathy', 'coagulopathy with elevated inr', 
        'copd', 'copd exacerbation', 'coronary artery', 'coronary artery disease', 
        'coronary artery disease s/p cabg', 'cva with dysphagia and "locked in syndrome", dementia,', 
        'delirium', 'dementia', 'depression, s/p orif, compression fxs, osteoporosis, lytic', 
        'diabetes', 'diabetes mellitus', 'diabetes type ii', 'diabetic peripheraly neuropathy', 
        'drug rash', 'duodenal ulcers', 'dysphagia', 'e. coli uti', 'early necrosis of the fundus', 
        'ejection fraction', 'elevated lfts and total bilirubin', 'elevated t bilirubin', 
        'esrd on hemodialysis', 'fall / dementia', 'fevers', 'fibromyalgia', 
        'fsgs (focal segmental glomerulonephritis)', 'gastric cancer', 'gastroesophageal reflux disease', 
        'glaucoma', 'gout', 'granulomatosis, gout', 'h. pylori positve gerd on treatment', 
        'h/o iv drug use', 'h/o kidney and pancreas transplant', 'hemolytic anemia', 
        'heparin induced thrombocytopenia', 'hepatic steatosis', 'hepatitis c', 'hepatocellular carcinoma', 
        'hepc', 'hfref ___ lvad placement', 'history of chronic hepatitis c infection', 
        'history of recurrent dvt/pe', 'hiv', 'hld', "hodgkin's disease", 'htn', 'hyperlipidemia', 
        'hypertension', 'hypocalcemia', 'hypogammaglobulinemia', 'hypothyroidism', 
        'hypovolemic, hypotonic, hyponatremia', 'in-grown toe nail infection', 'incidental finding:', 
        'infundibulum', 'injury of right hepatic duct', 'irritable bowel syndrome', 'leukocytosis', 
        'liver disease', 'malnutrition', 'mechanical mitral valve', 'metastatic breast cancer', 
        'metastatic prostate cancer', 'migraine', 'mononeuritic multiplex from vasculitis', 
        'morbid obesity', 'multiple burn surgeries', 'multiple myeloma', 
        'multiple myeloma s/p bmt, treated with radiation, revlimid,', 'myelodysplastic syndrome', 
        'neuroendocrine tumor with cbd stricture', 'neutropenia/eosinophilia', 
        'new onset atrial fibrillation (self-resolved)', 'no intraabdominal surgery', 'obesity', 
        'obstruction', 'open fasciotomy sites', 'osa', 'oxygen dependence', 'pancreatic adenocarcinoma', 
        'pancreatic ampullary mass', 'pancreatic cyst (pathology pending)', 
        'pancreatic insufficiency', 'pancreatic pseudocyst, chronic', 'parapelvic cysts bilaterally', 
        'paraproteinemia', 'paroxysmal atrial fibrillation', 'perigastric fluid collection', 
        'perihepatic collection', 'pmh: as, mr, cad, htn, ^chol, pvd, oa b/l shoulder, bursitis l', 
        'post op delirium', 'post-op low urine output', 'post-operative urinary retention', 
        'postoperative elevation in liver function tests sp ercp', 
        'prostate ca s/p resection with bladder neck stricture', 
        'psh: s/p r thr, renal lithotripsy, tonsillectomy', 'pulmonary hypertension', 
        'restenosis of gj anastomosis', 'retroperitoneal hematoma', 'rib fractures', 
        'right adrenal adenoma measuring 1.2 cm', 'right diaphragm paralysis', 'right hepatic duct injury', 
        'right lower lobe of the lung collapse', 'rt. forearm superficial thrombophlebitis', 
        's/p abcess drainage', 's/p percutaneous cholecystotomy drainage', 
        'severe protein calorie malnutrition', 'shoulder, chf, afib', 
        'simple left renal cyst measuring 2.7 cm', 'sinus tachycardia', 'sleep apnea', 
        'stage 2 sacral decubitus', 'stone', 'suspected nash', 't2dm', 't9 compression fracture', 
        'thrombocytopenia', 'thrombocytopenia (low platelets)', 'thrombocytosis', 'to open', 
        'transverse diverticulitis', 'tube placement', 'type ii diabetes mellitus', 'type ii dm', 
        'umbilical hernia', 'urinary retention', 'urinary retention/urethral stricture s/p dilation', 
        'urinary tract infection', 'uterine prolapse', 'volume overload'
    ],

    # 第三组：非胆囊类严重急性疾病 (Serious Acute Diseases, Non-Gallbladder)
    'serious_acute_non_gallbladder': [
        'acute blood loss anemia', 'acute blood loss anemia with lower gi bleed', 
        'acute blood stream infection; h. influenzae', 'acute diastolic congestive heart failure', 
        'acute encephalopathy', 'acute exacerbation of hfpef', 'acute kidney injury', 
        'acute kindey injury', 'acute on chronic blood loss anemia', 
        'acute on chronic diastolic heart failure', 'acute on chronic systolic chf', 
        'acute on chronic systolic congestive heart failure', 'acute pulmonary embolism', 
        'aneurysm , recurrent urinary tract infections', 
        'atrial fibrillation with rapid ventricular response', 'bacteremia, due to contaminant', 
        'blood loss anemia', 'by ileus', 'c.diff colitis', 'clostridium difficile infection', 
        'community acquired pneumonia', 'complete heart block', 
        'decadron; cad s/p nstemi with stent, hx dvt, gerd, htn,', 
        'demand myocardial ischemia', 'depression, aspiration pneumonia , constipation, recurrent skin', 
        'e coli bacteremia', 'e. coli bacteremia', 'ecoli bacteremia', 'embolic stroke', 
        'few episodes of complete heart block with ___ second pauses', 
        'fibrillation, history of pulmonary embolism, wegeners', 'gram negative sepsis', 
        'heart failure with a preserved ejection fraction', 'hyperkalemia', 'hypertensive urgency', 
        'hypoxemia', 'ileus', 'klebsiella bacteremia', 'left intracranial internal carotid artery aneurysm', 
        'left leg cellulitis', 'lle cellulitis', 'metabolic acidosis', 'mrsa bacteremia', 
        'nonspecific interstitial pneumonia', 'placement, ileus (resolved)', 'pneumonia', 
        'post op ileus', 'post-operative hypotension and bradycardia', 'post-operative ileus', 
        'postoperative hemorrhage/acute blood loss anemia', 'prior stroke', 
        'right sided heart failure, with normal left ventricular', 'sepsis', 
        'sepsis due to acute cholecystitis with choledocholithiasis', 
        'septic shock due to acute cholecystitis', 'septic shock secondary to cholecystitis', 
        'severe sepsis', 'severe-complicated clostridium difficile infection complicated', 
        'sinus bradycardia', 'st elevation (stemi) myocardial infarction involving right', 
        'subacute subdural hematoma', 'superior mesenteric artery occlusion', 'systolic heart failure', 
        'toxic-metabolic encephalopathy', 
        'ulcer, atypical psychosis, pulmonary embolism, thoracic aortic'
    ]
}


In [17]:
# Convert lists to sets for faster lookup
diagnosis_sets = {
    category: set(diagnoses) 
    for category, diagnoses in diagnosis_labels.items()
}

# Read the CSV
df = pd.read_csv('extracted_sequences.csv')

# Function to check if patient has diagnoses in a category
def has_diagnosis_in_category(diagnosis_str, category_set):
    """
    Returns 1 if any diagnosis in the patient's list belongs to the category, else 0
    """
    patient_diagnoses = ast.literal_eval(diagnosis_str)
    
    # Check if any diagnosis matches (case-insensitive)
    for diag in patient_diagnoses:
        if diag.lower() in {d.lower() for d in category_set}:
            return 1
    return 0

# Create binary labels for each category
df['gallbladder_biliary_label'] = df['diagnoses'].apply(
    lambda x: has_diagnosis_in_category(x, diagnosis_sets['gallbladder_biliary_related'])
)

df['chronic_diseases_label'] = df['diagnoses'].apply(
    lambda x: has_diagnosis_in_category(x, diagnosis_sets['chronic_diseases'])
)

df['serious_acute_non_gb_label'] = df['diagnoses'].apply(
    lambda x: has_diagnosis_in_category(x, diagnosis_sets['serious_acute_non_gallbladder'])
)

# Display results
print(df[['hadm_id', 'gallbladder_biliary_label', 'chronic_diseases_label', 'serious_acute_non_gb_label']].head(20))

# Summary statistics
print("\nLabel Distribution:")
print(f"Gallbladder/Biliary: {df['gallbladder_biliary_label'].sum()} / {len(df)} ({df['gallbladder_biliary_label'].mean()*100:.1f}%)")
print(f"Chronic Diseases: {df['chronic_diseases_label'].sum()} / {len(df)} ({df['chronic_diseases_label'].mean()*100:.1f}%)")
print(f"Serious Acute Non-GB: {df['serious_acute_non_gb_label'].sum()} / {len(df)} ({df['serious_acute_non_gb_label'].mean()*100:.1f}%)")

# Save to new CSV
df.to_csv('extracted_sequences_labeled.csv', index=False)

     hadm_id  gallbladder_biliary_label  chronic_diseases_label  \
0   29897948                          1                       0   
1   21166109                          1                       0   
2   20535755                          1                       0   
3   27553284                          1                       0   
4   25514003                          1                       0   
5   26024119                          1                       0   
6   20269467                          1                       0   
7   20307822                          1                       0   
8   26013833                          1                       0   
9   22470664                          1                       0   
10  21084833                          1                       0   
11  28446121                          1                       1   
12  20578869                          1                       0   
13  27095133                          1                       

In [18]:
import pandas as pd
import re

# Read the data
cholecystitis_df = pd.read_csv('data/cholecystitis_hadm_info_first_diag.csv')
sequences_df = pd.read_csv('extracted_sequences.csv')

def parse_patient_history(text):
    """Extract structured sections from patient history text"""
    if pd.isna(text):
        return {
            'hpi': None,
            'past_medical_history': None,
            'past_surgical_history': None,
            'social_history': None,
            'family_history': None
        }
    
    # Initialize dict
    sections = {}
    
    # Extract Past Medical History (two patterns: "PMH:" and "Past Medical History:")
    pmh_match = re.search(r'Past Medical History:\s*(.+?)(?=\s*PSH:|Social History:|Family History:|$)', text, re.DOTALL)
    if not pmh_match:
        pmh_match = re.search(r'PMH:\s*(.+?)(?=\s*PSH:|Social History:|Family History:|$)', text, re.DOTALL)
    sections['past_medical_history'] = pmh_match.group(1).strip() if pmh_match else None
    
    # Extract Past Surgical History
    psh_match = re.search(r'PSH:\s*(.+?)(?=\s*Social History:|Family History:|$)', text, re.DOTALL)
    sections['past_surgical_history'] = psh_match.group(1).strip() if psh_match else None
    
    # Extract Social History
    social_match = re.search(r'Social History:\s*(.+?)(?=\s*Family History:|$)', text, re.DOTALL)
    sections['social_history'] = social_match.group(1).strip() if social_match else None
    
    # Extract Family History
    family_match = re.search(r'Family History:\s*(.+?)$', text, re.DOTALL)
    sections['family_history'] = family_match.group(1).strip() if family_match else None
    
    # Extract HPI (everything before first section header)
    hpi_match = re.search(r'^(.+?)(?=\s*Past Medical History:|PMH:|Social History:|Family History:|$)', text, re.DOTALL)
    sections['hpi'] = hpi_match.group(1).strip() if hpi_match else text.strip()
    
    return sections

# Apply parsing
history_parsed = cholecystitis_df['Patient History'].apply(parse_patient_history)
history_df = pd.DataFrame(history_parsed.tolist())
history_df['hadm_id'] = cholecystitis_df['hadm_id']

# Merge with extracted_sequences
result_df = sequences_df.merge(history_df, on='hadm_id', how='left')

# Save result
result_df.to_csv('extracted_sequences_with_history.csv', index=False)

In [55]:
import pandas as pd
import json
import re

def extract_numeric_value(value_str):
    """
    Extract numeric value from string like "62.0 IU/L", "1.0 #/hpf", etc.
    Returns None if value is text-only (NEG., NONE, etc.)
    """
    if pd.isna(value_str) or value_str is None:
        return None, None
    
    value_str = str(value_str).strip()
    
    # Check if it's a text-only result (NEG., NONE, HOLD., etc.)
    text_only_patterns = ['NEG', 'NONE', 'HOLD', 'DISCARD', 'FEW', 'RARE', 
                          'MODERATE', 'MANY', 'Clear', 'Yellow', 'NotDone',
                          'RANDOM', 'Using this', '<', '>']
    
    for pattern in text_only_patterns:
        if pattern in value_str:
            return None, value_str  # Return original text
    
    # Try to extract numeric value
    match = re.search(r'([\d.]+)', value_str)
    if match:
        try:
            numeric_val = float(match.group(1))
            return numeric_val, value_str
        except ValueError:
            return None, value_str
    
    return None, value_str

def classify_lab_result(test_value, ref_lower, ref_upper):
    """
    Classify lab result as:
    - 0: Normal (within range)
    - +1: Too high (above upper limit)
    - -1: Too low (below lower limit)
    - text: Keep original text if non-numeric
    """
    numeric_val, original_text = extract_numeric_value(test_value)
    
    # If no numeric value extracted, return original text
    if numeric_val is None:
        return original_text if original_text else "N/A"
    
    # If no reference ranges available, return "unknown"
    if pd.isna(ref_lower) and pd.isna(ref_upper):
        return "unknown"
    
    # Compare with reference ranges
    if not pd.isna(ref_lower) and numeric_val < ref_lower:
        return "Decreased"  # Too low
    elif not pd.isna(ref_upper) and numeric_val > ref_upper:
        return "Elevated"   # Too high
    else:
        return "Normal"   # Normal

def process_lab_tests(row):
    """
    Process laboratory tests for one patient
    Returns a dictionary of test_id: classification
    """
    try:
        lab_tests = json.loads(row['Laboratory Tests']) if isinstance(row['Laboratory Tests'], str) else row['Laboratory Tests']
        ref_lower = json.loads(row['Reference Range Lower']) if isinstance(row['Reference Range Lower'], str) else row['Reference Range Lower']
        ref_upper = json.loads(row['Reference Range Upper']) if isinstance(row['Reference Range Upper'], str) else row['Reference Range Upper']
    except:
        return {}
    
    if not isinstance(lab_tests, dict):
        return {}
    
    classified_results = {}
    
    for test_id, test_value in lab_tests.items():
        lower = ref_lower.get(test_id) if isinstance(ref_lower, dict) else None
        upper = ref_upper.get(test_id) if isinstance(ref_upper, dict) else None
        
        classification = classify_lab_result(test_value, lower, upper)
        classified_results[test_id] = classification
    
    return classified_results


In [56]:
# Example usage:
df = pd.read_csv('data/cholecystitis_hadm_info_first_diag.csv')
sequences_full = pd.read_csv('extracted_sequences_full.csv')

# Apply processing
df['lab_tests_classified'] = df.apply(process_lab_tests, axis=1)

# Convert to JSON string for storage
df['lab_tests_classified_json'] = df['lab_tests_classified'].apply(
    lambda x: json.dumps(x) if isinstance(x, dict) else "{}")


lab_classification_df = df[['hadm_id', 'lab_tests_classified_json']].copy()
lab_classification_df.columns = ['hadm_id', 'lab_tests_classified_text']
updated_df = sequences_full.merge(
    lab_classification_df, 
    on='hadm_id', 
    how='left'
)
updated_df.to_csv('extracted_sequences_with_lab_classification_text.csv', index=False)

## Relationships

In [16]:
import pandas as pd

filtered_path = "filtered_family_history.csv"
full_path = "extracted_sequences_full.csv"
out_path = "extracted_sequences_full_with_fh_label.csv"

def read_csv_robust(path, dtype=None):
    # Try common encodings used in mixed clinical exports
    for enc in ("utf-8", "utf-8-sig", "cp1252", "latin1"):
        try:
            return pd.read_csv(path, dtype=dtype, encoding=enc)
        except UnicodeDecodeError:
            continue
    raise UnicodeDecodeError("all", b"", 0, 1, f"Could not decode file: {path}")

# Read both files as strings for stable ID matching
df_filtered = read_csv_robust(filtered_path, dtype={"hadm_id": str})
df_full = read_csv_robust(full_path, dtype={"hadm_id": str})


In [21]:
positive_ids = list(df_filtered["hadm_id"])
positive_ids

['29897948',
 '21166109',
 '27553284',
 '26013833',
 '28446121',
 '29044720',
 '20544652',
 '28614442',
 '26549810',
 '20073240',
 '28721835',
 '21557830',
 '23271541',
 '20621538',
 '25360313',
 '23968056',
 '22361306',
 '26286187',
 '25529120',
 '25323331',
 '29810669',
 '21860863',
 '26425634',
 '27749421',
 '29994310',
 '28775769',
 '28501096',
 '25972552',
 '21865534',
 '24453001',
 '22841113',
 '26903994',
 '24636219',
 '27673222',
 '26169546',
 '21513149',
 '21948836',
 '21081277',
 '23199260',
 '27863365',
 '29803347',
 '25582981',
 '28048474',
 '26019177',
 '28710048',
 '28951548',
 '28973441',
 '22969824',
 '23018016',
 '21081638',
 '29321309',
 '24508441',
 '20348954',
 '20369494',
 '20543211',
 '23289786',
 '25656416',
 '29815440',
 '28208232',
 '23444567',
 '21548852',
 '23591858',
 '28772575',
 '26573641',
 '27248191',
 '21269103',
 '24689109',
 '23189672',
 '23852047',
 '26040520',
 '28306018',
 '23625534',
 '22521761',
 '28874227',
 '23688784',
 '23824159',
 '27269012',

In [22]:
# Assign label: 1 if hadm_id in filtered_family_history, else 0
df_full["family_history_label"] = df_full["hadm_id"].isin(positive_ids).astype(int)

# Save
df_full.to_csv(out_path, index=False)

print(f"Saved: {out_path}")
print("Total rows:", len(df_full))
print("Label counts:\n", df_full["family_history_label"].value_counts(dropna=False))

Saved: extracted_sequences_full_with_fh_label.csv
Total rows: 648
Label counts:
 family_history_label
0    353
1    295
Name: count, dtype: int64


In [23]:
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact, chi2_contingency

# ---- 1) Load data ----
# If you already created this file in previous step:
df = pd.read_csv("extracted_sequences_full.csv")

# Labels to analyze
cols = [
    "family_history_label",
    "gallbladder_biliary_label",
    "chronic_diseases_label",
    "serious_acute_non_gb_label",
]

# Keep only needed columns, coerce to numeric 0/1
d = df[cols].apply(pd.to_numeric, errors="coerce")
d = d.dropna()

# Optional safety check: confirm binary
for c in cols:
    bad_vals = sorted(set(d[c].unique()) - {0, 1})
    if bad_vals:
        print(f"[WARN] {c} has non-binary values: {bad_vals}")

# ---- 2) Correlation matrix (phi coefficients) ----
corr = d[cols].corr(method="pearson")
print("Phi/Pearson correlation matrix (binary-binary):")
print(corr.round(3))
print()

# ---- 3) Pairwise Fisher test + odds ratio ----
def association_table(x, y):
    # rows: x=0/1, cols: y=0/1
    ct = pd.crosstab(x, y).reindex(index=[0,1], columns=[0,1], fill_value=0)
    # Fisher exact for 2x2
    odds_ratio, p_fisher = fisher_exact(ct.values)
    # Also chi-square (optional)
    chi2, p_chi2, dof, expected = chi2_contingency(ct.values)
    return ct, odds_ratio, p_fisher, p_chi2

pairs = []
for i in range(len(cols)):
    for j in range(i + 1, len(cols)):
        a, b = cols[i], cols[j]
        ct, or_, p_f, p_c = association_table(d[a], d[b])
        pairs.append({
            "var1": a,
            "var2": b,
            "odds_ratio": or_,
            "p_fisher": p_f,
            "p_chi2": p_c
        })
        print(f"{a} vs {b}")
        print(ct)
        print(f"  odds_ratio={or_:.4f}, p_fisher={p_f:.4g}, p_chi2={p_c:.4g}")
        print("-" * 60)

res = pd.DataFrame(pairs).sort_values("p_fisher")
print("\nPairwise association summary (sorted by Fisher p):")
print(res.to_string(index=False))

Phi/Pearson correlation matrix (binary-binary):
                            family_history_label  gallbladder_biliary_label  \
family_history_label                       1.000                     -0.008   
gallbladder_biliary_label                 -0.008                      1.000   
chronic_diseases_label                     0.122                     -0.085   
serious_acute_non_gb_label                 0.156                     -0.303   

                            chronic_diseases_label  serious_acute_non_gb_label  
family_history_label                         0.122                       0.156  
gallbladder_biliary_label                   -0.085                      -0.303  
chronic_diseases_label                       1.000                       0.366  
serious_acute_non_gb_label                   0.366                       1.000  

family_history_label vs gallbladder_biliary_label
gallbladder_biliary_label  0    1
family_history_label             
0                          3  34

In [41]:
import pandas as pd
import ast

csv_path = "extracted_sequences_full.csv"
col = "modality_region_sequence"

df = pd.read_csv(csv_path)

def parse_list(x):
    if pd.isna(x):
        return []
    if isinstance(x, list):
        return x
    x = str(x).strip()
    if not x:
        return []
    try:
        return ast.literal_eval(x)  # e.g. "['CT_Abdomen', 'Ultrasound_Abdomen']"
    except Exception:
        return []

# Flatten all modality_region entries
all_pairs = []
for seq in df[col].apply(parse_list):
    all_pairs.extend(seq)

unique_pairs = sorted(set(all_pairs))
print(f"Number of unique modality_region pairs: {len(unique_pairs)}")
print("Unique pairs:")
for p in unique_pairs:
    print(p)

Number of unique modality_region pairs: 23
Unique pairs:
CTU_Abdomen
CT_Abdomen
CT_Chest
CT_Head
CT_Spine
Carotid ultrasound_Neck
Drainage_Abdomen
ERCP_Abdomen
Fluoroscopy_Chest
MRCP_Abdomen
MRE_Abdomen
MRI_Abdomen
MRI_Head
MRI_Spine
Radiograph_Abdomen
Radiograph_Ankle
Radiograph_Chest
Radiograph_Knee
Radiograph_Venous
Ultrasound_Abdomen
Ultrasound_Neck
Ultrasound_Venous
Upper GI Series_Abdomen


In [42]:
from collections import Counter
import pandas as pd

# 1) Count raw occurrences across all sequences
all_pairs = []
for seq in df[col].apply(parse_list):
    all_pairs.extend(seq)

freq = Counter(all_pairs)

freq_df = (
    pd.DataFrame(freq.items(), columns=["test_type", "count"])
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)

# percentage among all test events
freq_df["pct_of_all_events"] = (freq_df["count"] / freq_df["count"].sum() * 100).round(2)

print(f"Number of unique test types: {len(freq_df)}")
display(freq_df)

Number of unique test types: 23


Unnamed: 0,test_type,count,pct_of_all_events
0,Ultrasound_Abdomen,609,36.19
1,Radiograph_Chest,503,29.89
2,CT_Abdomen,273,16.22
3,MRCP_Abdomen,88,5.23
4,Radiograph_Abdomen,80,4.75
5,CT_Chest,44,2.61
6,CT_Head,26,1.54
7,ERCP_Abdomen,24,1.43
8,Ultrasound_Venous,9,0.53
9,MRI_Abdomen,7,0.42


### Recommendation

In [37]:
import ast
import random
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

# Optional: use sentence-transformers for better text embeddings
# pip install sentence-transformers
USE_SENTENCE_TRANSFORMER = True


def parse_seq(x):
    """Parse sequence column like "['CT_Abdomen', 'Ultrasound_Abdomen']"."""
    if pd.isna(x):
        return []
    if isinstance(x, list):
        return x
    s = str(x).strip()
    if not s:
        return []
    try:
        v = ast.literal_eval(s)
        return v if isinstance(v, list) else []
    except Exception:
        return []


def split_reference_test(df, test_ratio=0.1, seed=42):
    rng = random.Random(seed)
    idx = list(df.index)
    rng.shuffle(idx)
    n_test = max(1, int(len(idx) * test_ratio))
    test_idx = set(idx[:n_test])
    test_df = df.loc[list(test_idx)].copy()
    ref_df = df.loc[[i for i in idx if i not in test_idx]].copy()
    return ref_df, test_df


def build_text_encoder(texts, use_sentence_transformer=True, model_name="all-MiniLM-L6-v2"):
    """
    Returns embeddings and a function encode(new_texts)->embeddings
    """
    if use_sentence_transformer:
        try:
            from sentence_transformers import SentenceTransformer
            model = SentenceTransformer(model_name)
            emb = model.encode(texts, show_progress_bar=False, convert_to_numpy=True)

            def encode_fn(new_texts):
                return model.encode(new_texts, show_progress_bar=False, convert_to_numpy=True)

            return emb, encode_fn
        except Exception as e:
            print(f"[Warning] SentenceTransformer unavailable ({e}), fallback to TF-IDF.")

    # Fallback: TF-IDF
    from sklearn.feature_extraction.text import TfidfVectorizer
    vec = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
    emb = vec.fit_transform(texts)

    def encode_fn(new_texts):
        return vec.transform(new_texts)

    return emb, encode_fn


def build_transition_matrix(sequences):
    """
    Build:
      - start_prob: P(first_test = x)
      - trans_prob: P(next = j | current = i)
    """
    start_counts = defaultdict(int)
    trans_counts = defaultdict(lambda: defaultdict(int))
    state_set = set()

    for seq in sequences:
        if not seq:
            continue
        start_counts[seq[0]] += 1
        state_set.update(seq)
        for a, b in zip(seq[:-1], seq[1:]):
            trans_counts[a][b] += 1

    states = sorted(state_set)
    if not states:
        return pd.Series(dtype=float), pd.DataFrame()

    # Start distribution
    total_starts = sum(start_counts.values())
    start_prob = pd.Series(
        {s: (start_counts[s] / total_starts if total_starts > 0 else 0.0) for s in states}
    ).sort_values(ascending=False)

    # Transition matrix
    trans_prob = pd.DataFrame(0.0, index=states, columns=states)
    for a in states:
        row_total = sum(trans_counts[a].values())
        if row_total > 0:
            for b, c in trans_counts[a].items():
                trans_prob.loc[a, b] = c / row_total

    return start_prob, trans_prob


def suggest_for_one_patient(test_hpi, ref_hpi_emb, ref_df, encode_fn, top_k=10):
    """
    For one new patient:
      1) similarity by HPI embedding
      2) top_k most similar reference patients
      3) build transition matrix from their sequences
      4) predict first test by start distribution argmax
    """
    test_emb = encode_fn([test_hpi])
    sims = cosine_similarity(test_emb, ref_hpi_emb).flatten()

    # top_k similar reference patients
    top_idx_local = np.argsort(sims)[::-1][:top_k]
    neighbors = ref_df.iloc[top_idx_local].copy()
    neighbors["similarity"] = sims[top_idx_local]

    neighbor_seqs = neighbors["modality_region_sequence"].tolist()
    start_prob, trans_prob = build_transition_matrix(neighbor_seqs)

    predicted_first = start_prob.index[0] if len(start_prob) > 0 else None
    return neighbors, start_prob, trans_prob, predicted_first


In [38]:
def run_demo(
    csv_path="extracted_sequences_full.csv",
    test_ratio=0.1,
    top_k=10,
    seed=42,
    use_sentence_transformer=USE_SENTENCE_TRANSFORMER
):
    # Load
    df = pd.read_csv(csv_path)
    required_cols = ["hpi", "modality_region_sequence"]
    for c in required_cols:
        if c not in df.columns:
            raise ValueError(f"Missing required column: {c}")

    # Clean
    df = df.dropna(subset=["hpi"]).copy()
    df["hpi"] = df["hpi"].astype(str).str.strip()
    df = df[df["hpi"] != ""].copy()
    df["modality_region_sequence"] = df["modality_region_sequence"].apply(parse_seq)

    # Split
    ref_df, test_df = split_reference_test(df, test_ratio=test_ratio, seed=seed)

    # Fit encoder on reference HPI only
    ref_hpi_texts = ref_df["hpi"].tolist()
    ref_hpi_emb, encode_fn = build_text_encoder(ref_hpi_texts, use_sentence_transformer=use_sentence_transformer)

    # Evaluate all test patients
    results = []
    for i, (_, row) in enumerate(test_df.iterrows(), start=1):
        true_seq = row["modality_region_sequence"]
        true_first = true_seq[0] if len(true_seq) > 0 else None

        neighbors, start_prob, trans_prob, pred_first = suggest_for_one_patient(
            test_hpi=row["hpi"],
            ref_hpi_emb=ref_hpi_emb,
            ref_df=ref_df,
            encode_fn=encode_fn,
            top_k=top_k
        )

        correct = (pred_first == true_first) if true_first is not None else False
        results.append({
            "test_row_index": row.name,
            "true_first": true_first,
            "pred_first": pred_first,
            "first_correct": correct
        })

        # ---- step 4 output for each test patient ----
        print(f"\n=== Test patient #{i} (row index {row.name}) ===")
        print(f"Real sequence: {true_seq}")
        print(f"Predicted first test: {pred_first}")
        print(f"First test correct? {correct}")

        print("\nStart distribution (top 10):")
        print(start_prob.head(10))

        print("\nTransition matrix:")
        print(trans_prob)

    # Overall metric
    res_df = pd.DataFrame(results)
    acc = res_df["first_correct"].mean() if len(res_df) else 0.0
    print(f"\nOverall first-test accuracy on test set: {acc:.4f} ({res_df['first_correct'].sum()}/{len(res_df)})")

    return res_df



In [47]:
if __name__ == "__main__":
    _ = run_demo(
        csv_path="extracted_sequences_full.csv",
        test_ratio=0.1,   # 10% test / 90% reference
        top_k=10,         # top-10 similar reference patients
        seed=45
    )


=== Test patient #1 (row index 129) ===
Real sequence: ['Ultrasound_Abdomen', 'Radiograph_Chest', 'CT_Abdomen', 'Radiograph_Chest', 'ERCP_Abdomen', 'Radiograph_Chest']
Predicted first test: Ultrasound_Abdomen
First test correct? True

Start distribution (top 10):
Ultrasound_Abdomen    0.7
CT_Abdomen            0.2
ERCP_Abdomen          0.1
MRCP_Abdomen          0.0
Radiograph_Chest      0.0
dtype: float64

Transition matrix:
                    CT_Abdomen  ERCP_Abdomen  MRCP_Abdomen  Radiograph_Chest  \
CT_Abdomen            0.333333           0.0      0.000000          0.333333   
ERCP_Abdomen          0.000000           0.0      0.000000          1.000000   
MRCP_Abdomen          0.000000           0.0      0.000000          0.000000   
Radiograph_Chest      1.000000           0.0      0.000000          0.000000   
Ultrasound_Abdomen    0.166667           0.0      0.166667          0.666667   

                    Ultrasound_Abdomen  
CT_Abdomen                    0.333333  
ERCP_Ab

In [50]:
import random
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Optional better encoder:
# pip install sentence-transformers
USE_SENTENCE_TRANSFORMER = True


def split_reference_test(df, test_ratio=0.1, seed=42):
    rng = random.Random(seed)
    idx = list(df.index)
    rng.shuffle(idx)
    n_test = max(1, int(len(idx) * test_ratio))
    test_idx = set(idx[:n_test])
    test_df = df.loc[list(test_idx)].copy()
    ref_df = df.loc[[i for i in idx if i not in test_idx]].copy()
    return ref_df, test_df


def build_text_encoder(texts, use_sentence_transformer=True, model_name="all-MiniLM-L6-v2"):
    if use_sentence_transformer:
        try:
            from sentence_transformers import SentenceTransformer
            model = SentenceTransformer(model_name)
            emb = model.encode(texts, show_progress_bar=False, convert_to_numpy=True)

            def encode_fn(new_texts):
                return model.encode(new_texts, show_progress_bar=False, convert_to_numpy=True)

            return emb, encode_fn
        except Exception as e:
            print(f"[Warning] SentenceTransformer unavailable ({e}), fallback to TF-IDF.")

    from sklearn.feature_extraction.text import TfidfVectorizer
    vec = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
    emb = vec.fit_transform(texts)

    def encode_fn(new_texts):
        return vec.transform(new_texts)

    return emb, encode_fn


def prepare_binary_label(series):
    # robust conversion for 0/1-like values
    return pd.to_numeric(series, errors="coerce").fillna(0).clip(0, 1).astype(int)


def predict_label_percentages(
    csv_path="extracted_sequences_full.csv",
    test_ratio=0.1,
    top_k=10,
    seed=42,
    use_sentence_transformer=USE_SENTENCE_TRANSFORMER
):
    df = pd.read_csv(csv_path)

    # NOTE: your file uses 'biliary' (one l after bi-), not 'billiary'
    label_cols = [
        "gallbladder_biliary_label",
        "chronic_diseases_label",
        "serious_acute_non_gb_label",
        "family_history_label",
    ]

    needed = ["hpi"] + label_cols
    missing = [c for c in needed if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    # clean
    df = df.dropna(subset=["hpi"]).copy()
    df["hpi"] = df["hpi"].astype(str).str.strip()
    df = df[df["hpi"] != ""].copy()
    for c in label_cols:
        df[c] = prepare_binary_label(df[c])

    # split
    ref_df, test_df = split_reference_test(df, test_ratio=test_ratio, seed=seed)

    # encode reference hpi
    ref_emb, encode_fn = build_text_encoder(
        ref_df["hpi"].tolist(),
        use_sentence_transformer=use_sentence_transformer
    )

    rows = []
    for idx, row in test_df.iterrows():
        test_emb = encode_fn([row["hpi"]])
        sims = cosine_similarity(test_emb, ref_emb).flatten()

        top_idx = np.argsort(sims)[::-1][:top_k]
        neigh = ref_df.iloc[top_idx]
        neigh_sims = sims[top_idx]

        # shift weights to non-negative for stability (if any negatives)
        w = neigh_sims - neigh_sims.min() + 1e-8
        w = w / w.sum() if w.sum() > 0 else np.ones_like(w) / len(w)

        out = {"test_index": idx}

        for c in label_cols:
            # unweighted percentage
            p_unweighted = neigh[c].mean()
            # weighted percentage by similarity
            p_weighted = np.average(neigh[c].values, weights=w)

            out[f"{c}_pct_topk"] = p_unweighted * 100
            out[f"{c}_pct_topk_weighted"] = p_weighted * 100

            # optional hard prediction at 50%
            out[f"{c}_pred"] = int(p_weighted >= 0.5)
            out[f"{c}_true"] = int(row[c])
            out[f"{c}_correct"] = int(out[f"{c}_pred"] == out[f"{c}_true"])

        rows.append(out)

    pred_df = pd.DataFrame(rows)

    # per-label accuracy (optional evaluation)
    acc = {
        c: pred_df[f"{c}_correct"].mean()
        for c in label_cols
    }

    return pred_df, acc


# Run
pred_df, acc = predict_label_percentages(
    csv_path="extracted_sequences_full.csv",
    test_ratio=0.2,
    top_k=10,
    seed=42
)

print("Per-label accuracy (threshold=50% on weighted percentage):")
for k, v in acc.items():
    print(f"{k}: {v:.3f}")

print("\nExample predictions (percentages):")
display_cols = ["test_index"]
for c in [
    "gallbladder_biliary_label",
    "chronic_diseases_label",
    "serious_acute_non_gb_label",
    "family_history_label",
]:
    display_cols += [
        f"{c}_pct_topk",
        f"{c}_pct_topk_weighted",
        f"{c}_true",
        f"{c}_pred",
    ]

print(pred_df[display_cols].head(10))

Per-label accuracy (threshold=50% on weighted percentage):
gallbladder_biliary_label: 1.000
chronic_diseases_label: 0.775
serious_acute_non_gb_label: 0.915
family_history_label: 0.612

Example predictions (percentages):
   test_index  gallbladder_biliary_label_pct_topk  \
0           2                               100.0   
1         515                                90.0   
2           5                               100.0   
3           8                               100.0   
4          12                               100.0   
5         525                               100.0   
6          14                               100.0   
7         526                               100.0   
8          15                               100.0   
9          13                               100.0   

   gallbladder_biliary_label_pct_topk_weighted  \
0                                   100.000000   
1                                    90.677004   
2                                   100.000000

In [54]:
import random
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

USE_SENTENCE_TRANSFORMER = True  # set False to use TF-IDF fallback


def split_reference_test(df, test_ratio=0.1, seed=42):
    rng = random.Random(seed)
    idx = list(df.index)
    rng.shuffle(idx)
    n_test = max(1, int(len(idx) * test_ratio))
    test_idx = set(idx[:n_test])
    test_df = df.loc[list(test_idx)].copy()
    ref_df = df.loc[[i for i in idx if i not in test_idx]].copy()
    return ref_df, test_df


def build_text_encoder(texts, use_sentence_transformer=True, model_name="all-MiniLM-L6-v2"):
    if use_sentence_transformer:
        try:
            from sentence_transformers import SentenceTransformer
            model = SentenceTransformer(model_name)
            emb = model.encode(texts, show_progress_bar=False, convert_to_numpy=True)
            def encode_fn(new_texts):
                return model.encode(new_texts, show_progress_bar=False, convert_to_numpy=True)
            return emb, encode_fn
        except Exception as e:
            print(f"[Warning] SentenceTransformer unavailable ({e}), fallback to TF-IDF.")

    from sklearn.feature_extraction.text import TfidfVectorizer
    vec = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
    emb = vec.fit_transform(texts)
    def encode_fn(new_texts):
        return vec.transform(new_texts)
    return emb, encode_fn


def to_binary(s):
    return pd.to_numeric(s, errors="coerce").fillna(0).clip(0, 1).astype(int)


def evaluate_topk_majority(
    csv_path="extracted_sequences_full.csv",
    test_ratio=0.1,
    top_k=10,
    seed=42
):
    df = pd.read_csv(csv_path)

    label_cols = [
        "gallbladder_biliary_label",
        "chronic_diseases_label",
        "serious_acute_non_gb_label",
        "family_history_label",
    ]

    df = df.dropna(subset=["hpi"]).copy()
    df["hpi"] = df["hpi"].astype(str).str.strip()
    df = df[df["hpi"] != ""].copy()
    for c in label_cols:
        df[c] = to_binary(df[c])

    ref_df, test_df = split_reference_test(df, test_ratio=test_ratio, seed=seed)

    ref_emb, encode_fn = build_text_encoder(ref_df["hpi"].tolist(), USE_SENTENCE_TRANSFORMER)

    pred_rows = []
    for idx, row in test_df.iterrows():
        test_emb = encode_fn([row["hpi"]])
        sims = cosine_similarity(test_emb, ref_emb).flatten()
        top_idx = np.argsort(sims)[::-1][:top_k]
        neigh = ref_df.iloc[top_idx]

        out = {"test_index": idx}
        for c in label_cols:
            pct = 100.0 * neigh[c].mean()           # % of label==1 among top-k
            pred = int(pct > 50.0)                  # strict > 50%
            true = int(row[c])

            out[f"{c}_pct1"] = pct
            out[f"{c}_pred"] = pred
            out[f"{c}_true"] = true
            out[f"{c}_correct"] = int(pred == true)
        pred_rows.append(out)

    pred_df = pd.DataFrame(pred_rows)

    # Per-label accuracy
    per_label_acc = {
        c: pred_df[f"{c}_correct"].mean()
        for c in label_cols
    }

    # Overall micro accuracy across all label decisions
    all_correct = []
    for c in label_cols:
        all_correct.extend(pred_df[f"{c}_correct"].tolist())
    micro_acc = float(np.mean(all_correct)) if all_correct else 0.0

    # Exact-match accuracy: all labels must be correct for a patient
    exact_match = []
    for _, r in pred_df.iterrows():
        ok = all(r[f"{c}_correct"] == 1 for c in label_cols)
        exact_match.append(int(ok))
    exact_match_acc = float(np.mean(exact_match)) if exact_match else 0.0

    print("Per-label accuracy:")
    for c, a in per_label_acc.items():
        print(f"  {c}: {a:.4f}")

    print(f"\nMicro accuracy (all label decisions pooled): {micro_acc:.4f}")
    print(f"Exact-match accuracy (all 4 labels correct): {exact_match_acc:.4f}")

    return pred_df, per_label_acc, micro_acc, exact_match_acc


# Run
pred_df, per_label_acc, micro_acc, exact_match_acc = evaluate_topk_majority(
    csv_path="extracted_sequences_full.csv",
    test_ratio=0.1,
    top_k=10,
    seed=2
)

# Inspect first few test patients
show_cols = ["test_index"]
for c in ["gallbladder_biliary_label", "chronic_diseases_label", "serious_acute_non_gb_label", "family_history_label"]:
    show_cols += [f"{c}_pct1", f"{c}_pred", f"{c}_true", f"{c}_correct"]

pred_df[show_cols].head(10)

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1530.66it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Per-label accuracy:
  gallbladder_biliary_label: 1.0000
  chronic_diseases_label: 0.8438
  serious_acute_non_gb_label: 0.9375
  family_history_label: 0.5625

Micro accuracy (all label decisions pooled): 0.8359
Exact-match accuracy (all 4 labels correct): 0.5000


Unnamed: 0,test_index,gallbladder_biliary_label_pct1,gallbladder_biliary_label_pred,gallbladder_biliary_label_true,gallbladder_biliary_label_correct,chronic_diseases_label_pct1,chronic_diseases_label_pred,chronic_diseases_label_true,chronic_diseases_label_correct,serious_acute_non_gb_label_pct1,serious_acute_non_gb_label_pred,serious_acute_non_gb_label_true,serious_acute_non_gb_label_correct,family_history_label_pct1,family_history_label_pred,family_history_label_true,family_history_label_correct
0,256,100.0,1,1,1,0.0,0,0,1,10.0,0,0,1,20.0,0,0,1
1,258,100.0,1,1,1,0.0,0,0,1,0.0,0,0,1,50.0,0,1,0
2,3,100.0,1,1,1,30.0,0,0,1,0.0,0,0,1,60.0,1,1,1
3,388,100.0,1,1,1,20.0,0,0,1,10.0,0,0,1,50.0,0,1,0
4,387,100.0,1,1,1,30.0,0,1,0,0.0,0,1,0,40.0,0,1,0
5,6,100.0,1,1,1,20.0,0,0,1,0.0,0,0,1,40.0,0,0,1
6,515,100.0,1,1,1,0.0,0,0,1,0.0,0,0,1,60.0,1,1,1
7,523,90.0,1,1,1,40.0,0,0,1,40.0,0,0,1,40.0,0,1,0
8,525,100.0,1,1,1,20.0,0,0,1,10.0,0,0,1,60.0,1,0,0
9,141,100.0,1,1,1,0.0,0,0,1,10.0,0,0,1,40.0,0,1,0


In [1]:
import json

with open("state_trajectories.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print("num_ids:", len(data))  # top-level records
print("total_states:", sum(len(v) for v in data.values()))  # all nested items

num_ids: 648
total_states: 2331
