In [2]:
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# Database connection.
# The connection URL is read from the OMOP_PG_URL environment variable when it
# is set, so real credentials do not have to be stored in the notebook.
# The fallback is the original local-sandbox URL, kept so the notebook still
# runs unchanged where that variable is not exported.
# NOTE(review): the fallback still embeds a password; remove it once every
# environment that runs this notebook exports OMOP_PG_URL.
PG_URL = os.environ.get(
    "OMOP_PG_URL",
    "postgresql+psycopg2://postgres:4030@localhost:5432/omop_sandbox",
)
engine = create_engine(PG_URL)

In [3]:
# Reload the results that llm_final.ipynb previously saved to disk.

# Evaluation table, sentence-level evidence, note sample and patient-level
# aggregate (contents presumed from the file names -- verify against
# llm_final.ipynb).
df_eval, ev, notes, agg = [
    pd.read_csv(csv_path)
    for csv_path in (
        "df_evaluation_final.csv",
        "ev_sentence_level_final.csv",
        "notes_sample_final.csv",
        "agg_patient_level_final.csv",
    )
]

# One file per confusion-matrix bucket.
tp, tn, fp, fn = [
    pd.read_csv(csv_path)
    for csv_path in (
        "true_positives_final.csv",
        "true_negatives_final.csv",
        "false_positives_final.csv",
        "false_negatives_final.csv",
    )
]

In [4]:

# Information completeness: of the patients flagged by the gold standard
# (any_gold == 1), how many are also flagged from the notes (any_text == 1),
# and which evidence flags are set on their sentences in `ev`?

print("Start anlysis")

is_gold_positive = df_eval["any_gold"] == 1
gold_positives = df_eval[is_gold_positive]
detected_by_both = df_eval[is_gold_positive & (df_eval["any_text"] == 1)]
missed_by_llm = df_eval[is_gold_positive & (df_eval["any_text"] == 0)]

# Legacy names from the earlier version of this cell. They are kept as aliases
# in case other cells still reference them; note that `llm_only` actually holds
# patients detected by BOTH sources and `both` holds ALL gold positives.
llm_only, icd9_only, both = detected_by_both, missed_by_llm, gold_positives

n_gold = len(gold_positives)
# Guard the ratio so an empty gold-standard set reports 0.0% instead of raising
# ZeroDivisionError.
missed_pct = len(missed_by_llm) / n_gold * 100 if n_gold else 0.0

print(f"\nTotal gold standard positives: {n_gold}")
print(f"Detected by both ICD-9 and LLM: {len(detected_by_both)}")
print(f"Missed by LLM (ICD-9 only): {len(missed_by_llm)}")
print(f"Percentage missed by LLM: {missed_pct:.1f}%")

# Check what information was found in clinical notes (first 10 detected patients)
print("From clinical notes")
for subject_id in detected_by_both["subject_id"].head(10):
    patient_ev = ev[ev["subject_id"] == subject_id]
    print(f"\nPatient {subject_id}:")

    # Check what criteria were met
    if patient_ev["is_sleep"].any() and patient_ev["is_impair"].any():
        print("  Found: Sleep difficulty + Daytime impairment (Rule A)")
    if patient_ev["is_primary"].any():
        print("  Found: Primary insomnia medications (Rule B)")
    if patient_ev["is_secondary"].any():
        print("  Found: Secondary insomnia medications (Rule C)")

    # Show the first sentence that carries at least one evidence flag
    positive_sentences = patient_ev[
        patient_ev["is_sleep"] | patient_ev["is_impair"] |
        patient_ev["is_primary"] | patient_ev["is_secondary"]
    ]
    if len(positive_sentences) > 0:
        print(f"  Sample evidence: '{positive_sentences.iloc[0]['text_span'][:100]}...'")

# Printed once, after the loop. It used to sit inside the loop body and was
# therefore emitted after every patient.
print("End of anlysis")

Start anlysis

Total gold standard positives: 66
Detected by both ICD-9 and LLM: 58
Missed by LLM (ICD-9 only): 8
Percentage missed by LLM: 12.1%
From clinical notes

Patient 10029649:
  Found: Sleep difficulty + Daytime impairment (Rule A)
  Found: Secondary insomnia medications (Rule C)
  Sample evidence: 'Although she 
endorses fatigue and diminished motivation, she is unable to 
cite specific stressors ...'
End of anlysis

Patient 10194804:
  Found: Sleep difficulty + Daytime impairment (Rule A)
  Found: Secondary insomnia medications (Rule C)
  Sample evidence: 'Reported poor energy, fatigue, and difficulty concentrating,
although admits she has been performing...'
End of anlysis

Patient 10241958:
  Found: Primary insomnia medications (Rule B)
  Found: Secondary insomnia medications (Rule C)
  Sample evidence: 'Gabapentin 600 mg PO 6A AND 1P 
3....'
End of anlysis

Patient 10325512:
  Found: Secondary insomnia medications (Rule C)
  Sample evidence: '#) CAD: repeat echocardiogram

In [5]:
# Information expressiveness: print the first few raw evidence spans so the
# level of detail carried by the notes can be read directly.

print("Start of anlysis in information expressiveness")

print("\nSymptom Details Available in Notes")
# First five spans among rows whose is_sleep flag is set.
symptom_evidence = ev.loc[ev["is_sleep"], "text_span"].head(5)
for rank, span in enumerate(symptom_evidence, start=1):
    preview = span[:300]
    print(f"\n{rank}. {preview}...")

print("\nMedication Context Available in Notes")
# First five spans among rows flagged as primary or secondary medication.
is_medication_row = ev["is_primary"] | ev["is_secondary"]
med_evidence = ev.loc[is_medication_row, "text_span"].head(5)
for rank, span in enumerate(med_evidence, start=1):
    preview = span[:300]
    print(f"\n{rank}. {preview}...")

print("End of anlysis in information expressiveness")


Start of anlysis in information expressiveness

Symptom Details Available in Notes

1. He was brought to the ED by ambulance
and was highly agitated on presentation, chanting loudly in
gibberish which he called "speaking in tongues."  The patient 
was
given Zyprexa 10mg and became more sedate, but was still unable
to cooperate with an interview and responded to questions with
gibberis...

2. Medical records obtained from pt's ___ hospitalization
indicate that he was started on Lithium 300 mg 2 po AM and 3 po
___, as well as Lunesta 3 mg po qhs, and Valium 5 mg bid prn....

3. Mood is "fine." Affect irritable....

4. He also received Ativan 1 mg q6 PRN anxiety and poor 
sleep which he used regularly....

5. Through discussions 
about his presentation and literature review, it was thought 
___ might benefit from initiation of Mirtazapine to help with 
negative sxs....

Medication Context Available in Notes

1. He was brought to the ED by ambulance
and was highly agitated on presentation,

In [6]:
print("\n--- Analysis of Granularity Issues ---")


def _pct(count, total):
    """Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is 0."""
    return count / total * 100 if total else 0.0


def _print_examples(tag, spans, width=200):
    """Print every span as ``[<tag>-<n>] <first ``width`` characters>...``."""
    for i, text in enumerate(spans, 1):
        print(f"[{tag}-{i}] {text[:width]}...")


n_ev = len(ev)

# Temporal counts
count_curr = (ev["temporality"] == "current").sum()
count_hist = (ev["temporality"] == "historical").sum()
count_uncertain = (ev["temporality"] == "uncertain").sum()
count_neg = ev["negated"].sum()

# _pct guards against an empty evidence table (the raw division would raise).
print("\nTemporal Granularity in Notes:")
print(f"  Current conditions: {count_curr} ({_pct(count_curr, n_ev):.1f}%)")
print(f"  Historical conditions: {count_hist} ({_pct(count_hist, n_ev):.1f}%)")
print(f"  Uncertain timing: {count_uncertain} ({_pct(count_uncertain, n_ev):.1f}%)")

print("\nNegation in Notes:")
print(f"  Negated statements: {count_neg} ({_pct(count_neg, n_ev):.1f}%)")
print(f"  Affirmed statements: {n_ev - count_neg} ({_pct(n_ev - count_neg, n_ev):.1f}%)")

# Shared text masks, computed once and reused by the categories below.
# na=False makes a missing text_span count as "no match"; without it the
# result contains NaN and the `~` negations below raise TypeError.
text_spans = ev["text_span"]
is_historical = ev["temporality"] == "historical"
mentions_sleep = text_spans.str.contains(
    r"sleep|insomnia", case=False, regex=True, na=False
)

# -------------------------
# CATEGORY A: Explicit negation
# -------------------------
print("\n\n### NEGATION EXAMPLES ###")
neg_ex = ev[ev["negated"]]["text_span"].head(10)
_print_examples("NEG", neg_ex)

# -------------------------
# CATEGORY B: Ambiguous daytime symptoms (fatigue but no sleep mention)
# -------------------------
print("\n\n### AMBIGUOUS DAYTIME SYMPTOMS (NOT sleep-related) ###")

# The alternatives are wrapped in a non-capturing group so that the \b word
# boundaries apply to every term. Previously \b bound only to the first and
# last alternative, so e.g. "tired" also matched inside "retired".
# "fatigued" and "tiredness" are listed explicitly to keep matching them.
daytime_symptom_regex = (
    r"\b(?:fatigued?|tired(?:ness)?|weakness|malaise|confusion|poor concentration)\b"
)
mask_ambiguous = (
    text_spans.str.contains(daytime_symptom_regex, case=False, regex=True, na=False)
    & ~mentions_sleep
)

ambiguous_examples = ev[mask_ambiguous]["text_span"].head(10)
_print_examples("AMB", ambiguous_examples)

# -------------------------
# CATEGORY C: Historical insomnia (correctly included)
# -------------------------
print("\n\n### HISTORICAL INSOMNIA EXAMPLES (CORRECT BEHAVIOR) ###")

mask_hist_insomnia = is_historical & mentions_sleep

hist_insomnia = ev[mask_hist_insomnia]["text_span"].head(10)
_print_examples("HIST-INS", hist_insomnia)

# -------------------------
# CATEGORY D: Historical NON-insomnia symptoms (should NOT impact classification)
# -------------------------
print("\n\n### HISTORICAL NON-INSOMNIA SYMPTOMS ###")

mask_hist_noninsomnia = is_historical & ~mentions_sleep

hist_noninsomnia = ev[mask_hist_noninsomnia]["text_span"].head(10)
_print_examples("HIST-NON", hist_noninsomnia)

# -------------------------
# CATEGORY E: Uncertain temporality examples
# -------------------------
print("\n\n### UNCERTAIN TEMPORALITY EXAMPLES ###")

uncertain_examples = ev[ev["temporality"] == "uncertain"]["text_span"].head(10)
_print_examples("UNC", uncertain_examples)

# -------------------------
# CATEGORY F: Medication mentions without explicit sleep context
# -------------------------
print("\n\n### MEDICATION WITHOUT SLEEP CONTEXT ###")

sleep_meds_regex = r"trazodone|mirtazapine|lunesta|ambien|benadryl|zyprexa|ativan|valium"
mentions_sleep_med = text_spans.str.contains(
    sleep_meds_regex, case=False, regex=True, na=False
)
mask_med_ambiguous = mentions_sleep_med & ~mentions_sleep

med_no_sleep = ev[mask_med_ambiguous]["text_span"].head(10)
_print_examples("MED-NO-SLEEP", med_no_sleep)

# -------------------------
# CATEGORY G: Medication explicitly tied to sleep (clear insomnia evidence)
# -------------------------
print("\n\n### MEDICATION WITH SLEEP CONTEXT ###")

mask_med_sleep = mentions_sleep_med & mentions_sleep

med_sleep = ev[mask_med_sleep]["text_span"].head(10)
_print_examples("MED-SLEEP", med_sleep)



--- Analysis of Granularity Issues ---

Temporal Granularity in Notes:
  Current conditions: 556 (82.2%)
  Historical conditions: 24 (3.6%)
  Uncertain timing: 96 (14.2%)

Negation in Notes:
  Negated statements: 77 (11.4%)
  Affirmed statements: 599 (88.6%)


### NEGATION EXAMPLES ###
[NEG-1] # T-cell lymphoma / Fever and malaise: on arrival objective temp 
of 101.0 reported in ED at ___ w/ hx of low grade temps of 
nursing home....
[NEG-2] You were 
admitted to the hospital for fevers and fatigue....
[NEG-3] Chief Complaint:
abdominal pain, weakness, n/v

 
Major Surgical or Invasive Procedure:
___ EGD (upper endoscopy)
___ Colonoscopy
___ PRBC transfusion x 1 unit
___ PRBC transfusion x 2 units
EGD ___
C...
[NEG-4] ___ is an ___ year old with CAD, a remote history of MI, 
DMII, CKD, who presented with three days of nausea abdominal 
pain and fatigue....
[NEG-5] HELD- LORazepam 1 mg PO ASDIR  This medication was held....
[NEG-6] Do 
not restart LORazepam until no longer taking opioi

In [9]:
# Rule breakdown analysis: compare the gold-standard flag with the note-based
# flag for each rule separately.

print("start of rule breakdown")

# (display label, gold-standard column, note-based column) for each rule
rules = [
    ("Rule A (Sleep + Impairment)", "rule_a_gold", "rule_a_text"),
    ("Rule B (Primary Meds)", "rule_b_gold", "rule_b_text"),
    ("Rule C (Secondary Meds)", "rule_c_gold", "rule_c_text")
]

for rule_name, gold_col, text_col in rules:
    print(f"\n--- {rule_name} ---")

    # Per-rule frames use rule_* names on purpose: assigning to tp / fn / fp
    # here would overwrite the notebook-level frames of the same names that
    # were loaded from the *_final.csv files.
    # True positives: Gold=1, Text=1
    rule_tp = df_eval[(df_eval[gold_col] == 1) & (df_eval[text_col] == 1)]
    # False negatives: Gold=1, Text=0
    rule_fn = df_eval[(df_eval[gold_col] == 1) & (df_eval[text_col] == 0)]
    # False positives: Gold=0, Text=1
    rule_fp = df_eval[(df_eval[gold_col] == 0) & (df_eval[text_col] == 1)]

    total_gold = len(df_eval[df_eval[gold_col] == 1])
    # A rule with no gold-standard positives reports 0.0% instead of raising
    # ZeroDivisionError.
    detected_pct = len(rule_tp) / total_gold * 100 if total_gold else 0.0
    missed_pct = len(rule_fn) / total_gold * 100 if total_gold else 0.0

    print(f"Total ICD-9 positive patients: {total_gold}")
    print(f"Detected by LLM: {len(rule_tp)} ({detected_pct:.1f}%)")
    print(f"Missed by LLM (False Negatives): {len(rule_fn)} ({missed_pct:.1f}%)")
    print(f"Extra detected by LLM (False Positives): {len(rule_fp)}")

    if len(rule_fn) > 0:
        print(f"\nFalse Negative Patient IDs: {rule_fn['subject_id'].tolist()}")

start of rule breakdown

--- Rule A (Sleep + Impairment) ---
Total ICD-9 positive patients: 29
Detected by LLM: 11 (37.9%)
Missed by LLM (False Negatives): 18 (62.1%)
Extra detected by LLM (False Positives): 11

False Negative Patient IDs: [10241958, 10497294, 10608904, 10793407, 11415136, 11712892, 12924398, 13077594, 13158370, 13383915, 16384274, 16704898, 18465811, 18826385, 19043184, 19219440, 19276983, 19848209]

--- Rule B (Primary Meds) ---
Total ICD-9 positive patients: 52
Detected by LLM: 16 (30.8%)
Missed by LLM (False Negatives): 36 (69.2%)
Extra detected by LLM (False Positives): 4

False Negative Patient IDs: [10194804, 10325512, 10451947, 10608904, 11598228, 12201330, 12379543, 12513126, 12747844, 12924398, 13157375, 13689390, 13777455, 14244279, 14349210, 14432757, 15090960, 15106894, 15229355, 16362820, 16384274, 16493581, 16704898, 16866064, 16868992, 17691205, 17813103, 17842866, 18139850, 18465811, 18510965, 18732942, 19219440, 19391478, 19456875, 19500277]

--- Rule

In [10]:
# False negative analysis: for every patient that is gold-standard positive
# (any_gold == 1) but not flagged from the notes (any_text == 0), summarise the
# sentence-level evidence that was extracted for them.
print("(Patients with ICD-9 codes but missed by LLM)")

# A share of negated / historical sentences above this percentage is reported
# as a likely reason for the miss.
DOMINANT_SHARE_PCT = 50

false_neg_patients = df_eval[(df_eval["any_gold"] == 1) & (df_eval["any_text"] == 0)]

print(f"\nTotal False Negatives: {len(false_neg_patients)}")
print("\nAnalyzing each false negative case:\n")

# itertuples keeps each column's own dtype; iterrows builds one Series per row
# and may upcast values (e.g. turn an integer subject_id into a float).
fn_columns = ["subject_id", "rule_a_gold", "rule_b_gold", "rule_c_gold"]
for row in false_neg_patients[fn_columns].itertuples(index=False):
    subject_id = row.subject_id
    print(f"\n{'='*60}")
    print(f"Patient ID: {subject_id}")
    print(f"ICD-9 Rules Met: Rule A={row.rule_a_gold}, Rule B={row.rule_b_gold}, Rule C={row.rule_c_gold}")

    # Here we extract the notes
    patient_notes = notes[notes["subject_id"] == subject_id]
    print(f"Num of discharge summaries: {len(patient_notes)}")

    # LLM evidence
    patient_ev = ev[ev["subject_id"] == subject_id]
    print(f"Num of candidate sentences extracted: {len(patient_ev)}")

    if len(patient_ev) > 0:
        print("\nEvidence found in notes:")
        print(f"  Sleep difficulty mentions: {patient_ev['asserts_sleep_difficulty'].sum()}")
        print(f"  Daytime impairment mentions: {patient_ev['asserts_daytime_impairment'].sum()}")
        print(f"  Primary med mentions: {patient_ev['asserts_primary_med'].sum()}")
        print(f"  Secondary med mentions: {patient_ev['asserts_secondary_med'].sum()}")

        # Show why it failed
        print("\n  Reasons for exclusion:")
        negated_pct = patient_ev["negated"].sum() / len(patient_ev) * 100
        hist_pct = (patient_ev["temporality"] == "historical").sum() / len(patient_ev) * 100

        if negated_pct > DOMINANT_SHARE_PCT:
            print(f"    - High negation rate: {negated_pct:.1f}%")
        if hist_pct > DOMINANT_SHARE_PCT:
            print(f"    - High historical rate: {hist_pct:.1f}%")
        if negated_pct <= DOMINANT_SHARE_PCT and hist_pct <= DOMINANT_SHARE_PCT:
            # Previously the header above was left with nothing under it.
            print("    - No dominant negation or historical pattern")

        # Show sample sentences
        print("\n  Sample sentences found:")
        sample_rows = patient_ev[["temporality", "negated", "text_span"]].head(3)
        for i, sent in enumerate(sample_rows.itertuples(index=False), 1):
            print(f"    {i}. [{sent.temporality}, negated={sent.negated}]")
            print(f"       '{sent.text_span[:300]}...'")
    else:
        print("\n No candidate sentences extracted - likely no relevant keywords found in notes")


(Patients with ICD-9 codes but missed by LLM)

Total False Negatives: 8

Analyzing each false negative case:


Patient ID: 11598228
ICD-9 Rules Met: Rule A=0, Rule B=1, Rule C=1
Num of discharge summaries: 1
Num of candidate sentences extracted: 1

Evidence found in notes:
  Sleep difficulty mentions: 0
  Daytime impairment mentions: 0
  Primary med mentions: 0
  Secondary med mentions: 0

  Reasons for exclusion:
    - High negation rate: 100.0%

  Sample sentences found:
    1. [uncertain, negated=True]
       'There were no paraphasic 
errors....'

Patient ID: 12201330
ICD-9 Rules Met: Rule A=0, Rule B=1, Rule C=1
Num of discharge summaries: 1
Num of candidate sentences extracted: 2

Evidence found in notes:
  Sleep difficulty mentions: 1
  Daytime impairment mentions: 0
  Primary med mentions: 0
  Secondary med mentions: 0

  Reasons for exclusion:

  Sample sentences found:
    1. [current, negated=False]
       'He felt very tired....'
    2. [uncertain, negated=True]
       'The