In [1]:
"""
Insomnia Phenotyping Algorithm - SENTENCE-BASED APPROACH
=========================================================
Final version without strict temporal filtering to match ICD-based gold standard.
Comprehensive analysis of all classification outcomes.
"""

import sys
import os, json, re
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime
import ollama
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# =============================================================================
# DATABASE CONNECTION
# =============================================================================

PG_URL = "postgresql+psycopg2://postgres:4030@localhost:5432/omop_sandbox"
engine = create_engine(PG_URL)

# =============================================================================
# DATA LOADING
# =============================================================================

# Load patients with notes from gold standard
patients = pd.read_sql("""
    SELECT DISTINCT c.subject_id
    FROM mimic_omop.insomnia_cohort c
    JOIN mimic_omop.notes_norm n
      ON c.subject_id = n.subject_id
    WHERE n.text IS NOT NULL AND LENGTH(n.text) > 50
""", engine)

# Sample 50 patients
sample_patients = patients["subject_id"].sample(70, random_state=42)
print(f"Using {len(sample_patients)} gold-standard patients WITH notes")

# Load all notes for these patients
notes = pd.read_sql(f"""
    SELECT subject_id, hadm_id, text AS note_text
    FROM mimic_omop.notes_norm
    WHERE subject_id IN ({",".join(map(str, sample_patients.tolist()))})
      AND text IS NOT NULL AND LENGTH(text) > 50;
""", engine)

notes = notes.reset_index().rename(columns={"index": "note_rowid"})
print(f"Loaded {len(notes)} notes from {len(sample_patients)} patients")

# Load gold standard
gold = pd.read_sql("""
    SELECT subject_id, rule_a, rule_b, rule_c, any_rule AS any_gold
    FROM mimic_omop.insomnia_cohort;
""", engine)

gold = gold.rename(columns={
    "rule_a": "rule_a_gold",
    "rule_b": "rule_b_gold",
    "rule_c": "rule_c_gold"
})

gold = gold[gold["subject_id"].isin(sample_patients)]

for col in ["rule_a_gold", "rule_b_gold", "rule_c_gold", "any_gold"]:
    gold[col] = gold[col].astype(int)

# =============================================================================
# VOCABULARY DEFINITIONS
# =============================================================================

SLEEP_TERMS = [
    "insomnia", "sleep onset", "sleep maintenance", "early awakening",
    "trouble sleeping", "difficulty sleeping", "can't sleep", "cant sleep",
    "sleep latency", "sleeplessness", "not sleeping", "poor sleep",
    "restless sleep", "hard to fall asleep", "sleep problem"
]

IMPAIR_TERMS = [
    "fatigue", "tired", "daytime sleepiness", "somnolence", "malaise",
    "irritable", "irritability", "poor concentration", "attention",
    "memory", "impaired performance", "decreased motivation",
    "errors", "accidents", "dissatisfaction with sleep",
    "low energy", "hard to concentrate", "sleepy", "tiredness"
]

PRIMARY_MED_TERMS = [
    "zolpidem", "ambien", "zaleplon", "sonata", "eszopiclone", "lunesta",
    "temazepam", "restoril", "triazolam", "halcion",
    "ramelteon", "rozerem", "suvorexant", "belsomra",
    "lemborexant", "dayvigo"
]

SECONDARY_MED_TERMS = [
    "trazodone", "mirtazapine", "melatonin", "hydroxyzine",
    "doxepin", "gabapentin", "quetiapine", "seroquel", "olanzapine", "zyprexa",
    "clonazepam", "klonopin", "lorazepam", "ativan", "diazepam", "valium"
]

# =============================================================================
# SENTENCE PROCESSING
# =============================================================================

def split_sentences(t):
    """Split text into sentences"""
    sents = re.split(r'(?<=[.!?])\s+', t.strip())
    return [s.strip()[:1000] for s in sents if 5 < len(s) < 1000]

def is_candidate(sent):
    """Check if sentence contains relevant keywords"""
    s = sent.lower()
    return (
        any(w in s for w in SLEEP_TERMS) or
        any(w in s for w in IMPAIR_TERMS) or
        any(w in s for w in PRIMARY_MED_TERMS) or
        any(w in s for w in SECONDARY_MED_TERMS) 
    )

# =============================================================================
# EXTRACT CANDIDATE SENTENCES
# =============================================================================

print("\nExtracting candidate sentences...")

rows = []
for _, r in notes.iterrows():
    sents = split_sentences(r["note_text"])
    for i, s in enumerate(sents):
        if is_candidate(s):
            rows.append({
                "subject_id": r["subject_id"],
                "hadm_id": r["hadm_id"],
                "note_rowid": r["note_rowid"],
                "sent_id": i,
                "text_span": s
            })

cands = pd.DataFrame(rows)
print(f"Extracted {len(cands)} candidate sentences")

# =============================================================================
# LLM PROMPT AND CLASSIFICATION
# =============================================================================

SYSTEM_PROMPT = """
You are a clinical NLP assistant identifying insomnia evidence.

Primary insomnia medications:
zolpidem (Ambien), zaleplon (Sonata), eszopiclone (Lunesta), temazepam (Restoril),
triazolam (Halcion), suvorexant (Belsomra), lemborexant (Dayvigo), ramelteon (Rozerem).

Secondary insomnia medications:
trazodone, mirtazapine, melatonin, hydroxyzine,
doxepin, gabapentin, quetiapine (Seroquel), olanzapine (Zyprexa),
clonazepam (Klonopin), lorazepam (Ativan), diazepam (Valium).

Return strict JSON:
{
 "asserts_sleep_difficulty": bool,
 "asserts_daytime_impairment": bool,
 "asserts_primary_med": bool,
 "asserts_secondary_med": bool,
 "negated": bool,
 "temporality": "current|historical|uncertain"
}
Be inclusive; lean toward True and 'current'.
"""

def extract_json(text):
    """Extract JSON from LLM response"""
    m = re.search(r"\{.*\}", text, re.DOTALL)
    if not m:
        return {"error": "no JSON", "raw": text[:200]}
    try:
        return json.loads(m.group(0))
    except:
        return {"error": "bad json", "raw": m.group(0)}

def classify_sentence_ollama(text):
    """Classify a single sentence"""
    prompt = f"{SYSTEM_PROMPT}\nSentence: \"{text}\""
    resp = ollama.chat(
        model="llama3:8b",
        messages=[{"role": "user", "content": prompt}]
    )
    return extract_json(resp["message"]["content"])

# =============================================================================
# CLASSIFY ALL SENTENCES
# =============================================================================

print(f"\nClassifying {len(cands)} sentences with LLM...")

out = []
for idx, r in cands.iterrows():
    if (idx + 1) % 50 == 0:
        print(f"  Progress: {idx + 1}/{len(cands)} sentences")
    
    y = classify_sentence_ollama(r["text_span"])
    out.append({
        **r.to_dict(),
        "asserts_sleep_difficulty": y.get("asserts_sleep_difficulty", False),
        "asserts_daytime_impairment": y.get("asserts_daytime_impairment", False),
        "asserts_primary_med": y.get("asserts_primary_med", False),
        "asserts_secondary_med": y.get("asserts_secondary_med", False),
        "negated": y.get("negated", False),
        "temporality": y.get("temporality", "uncertain")
    })

ev = pd.DataFrame(out)
print(f"Classified {len(ev)} sentences")

# =============================================================================
# PATIENT-LEVEL AGGREGATION (NO STRICT TEMPORAL FILTER)
# =============================================================================

print("\nAggregating to patient level...")
print("NOTE: Only filtering negated items, NOT filtering by temporality")
print("      This matches ICD-based gold standard (any history of insomnia)")

# Apply filters: Remove only negated items
# Include current, historical, and uncertain temporality
ev["is_sleep"] = ev["asserts_sleep_difficulty"] & ~ev["negated"]
ev["is_impair"] = ev["asserts_daytime_impairment"] & ~ev["negated"]
ev["is_primary"] = ev["asserts_primary_med"] & ~ev["negated"]
ev["is_secondary"] = ev["asserts_secondary_med"] & ~ev["negated"]

agg = ev.groupby("subject_id").agg({
    "is_sleep": "max",
    "is_impair": "max",
    "is_primary": "max",
    "is_secondary": "max"
}).reset_index()

agg["rule_a_text"] = (agg["is_sleep"] & agg["is_impair"]).astype(int)
agg["rule_b_text"] = agg["is_primary"].astype(int)
agg["rule_c_text"] = agg["is_secondary"].astype(int)
agg["any_text"] = agg[["rule_a_text", "rule_b_text", "rule_c_text"]].any(axis=1).astype(int)

# =============================================================================
# MERGE WITH GOLD STANDARD
# =============================================================================

df = gold.merge(agg, on="subject_id", how="inner").fillna(0)

for col in ["rule_a_text", "rule_b_text", "rule_c_text", "any_text"]:
    df[col] = df[col].astype(int)

print(f"\nFinal merged dataframe shape: {df.shape}")

# =============================================================================
# EVALUATION
# =============================================================================

print("\n" + "="*70)
print("EVALUATION RESULTS - SENTENCE-BASED APPROACH (NO TEMPORAL FILTER)")
print("="*70)

def evaluate(true, pred, label):
    print(f"\n=== {label} ===")
    cm = confusion_matrix(true, pred)
    print("Confusion Matrix:")
    print(cm)
    
    prec = precision_score(true, pred, zero_division=0)
    rec = recall_score(true, pred, zero_division=0)
    f1 = f1_score(true, pred, zero_division=0)
    
    print(f"Precision: {prec:.3f}")
    print(f"Recall:    {rec:.3f}")
    print(f"F1 Score:  {f1:.3f}")
    
    return {"precision": prec, "recall": rec, "f1": f1, "cm": cm}

results = {}
results["Rule A"] = evaluate(df["rule_a_gold"], df["rule_a_text"], "Rule A (Symptoms)")
results["Rule B"] = evaluate(df["rule_b_gold"], df["rule_b_text"], "Rule B (Primary Meds)")
results["Rule C"] = evaluate(df["rule_c_gold"], df["rule_c_text"], "Rule C (Secondary Meds)")
results["Any Rule"] = evaluate(df["any_gold"], df["any_text"], "Any Rule (Insomnia)")

# =============================================================================
# COMPREHENSIVE CLASSIFICATION ANALYSIS
# =============================================================================

print("\n" + "="*70)
print("COMPREHENSIVE CLASSIFICATION ANALYSIS")
print("="*70)

def analyze_classification_outcomes(gold_col, pred_col, rule_name):
    """Detailed analysis of all four classification outcomes"""
    
    print(f"\n{'='*70}")
    print(f"{rule_name.upper()}")
    print(f"{'='*70}")
    
    # Calculate all four outcomes
    true_pos = df[(df[gold_col] == 1) & (df[pred_col] == 1)]
    true_neg = df[(df[gold_col] == 0) & (df[pred_col] == 0)]
    false_pos = df[(df[gold_col] == 0) & (df[pred_col] == 1)]
    false_neg = df[(df[gold_col] == 1) & (df[pred_col] == 0)]
    
    total = len(df)
    
    print(f"\nOVERALL DISTRIBUTION:")
    print(f"  Total patients: {total}")
    print(f"  Gold standard positive: {df[gold_col].sum()}")
    print(f"  Gold standard negative: {(df[gold_col] == 0).sum()}")
    print(f"  LLM predicted positive: {df[pred_col].sum()}")
    print(f"  LLM predicted negative: {(df[pred_col] == 0).sum()}")
    
    print(f"\nCLASSIFICATION OUTCOMES:")
    print(f"  True Positives:  {len(true_pos):2d} ({len(true_pos)/total*100:5.1f}%) - Correctly identified cases")
    print(f"  True Negatives:  {len(true_neg):2d} ({len(true_neg)/total*100:5.1f}%) - Correctly identified non-cases")
    print(f"  False Positives: {len(false_pos):2d} ({len(false_pos)/total*100:5.1f}%) - Incorrectly flagged as cases")
    print(f"  False Negatives: {len(false_neg):2d} ({len(false_neg)/total*100:5.1f}%) - Missed actual cases")
    
    # True Positives Analysis
    if len(true_pos) > 0:
        print(f"\n--- TRUE POSITIVES ({len(true_pos)} patients) ---")
        print("Patients correctly identified as having the condition")
        print(f"Patient IDs: {true_pos['subject_id'].tolist()}")
        
        for idx, (_, patient) in enumerate(true_pos.iterrows(), 1):
            pid = patient["subject_id"]
            patient_ev = ev[ev["subject_id"] == pid]
            
            print(f"\n  [{idx}] Patient {pid}:")
            print(f"      Evidence found in {len(patient_ev)} sentences")
            
            if "rule_a" in rule_name.lower():
                sleep_sents = patient_ev[patient_ev["is_sleep"]].head(2)
                impair_sents = patient_ev[patient_ev["is_impair"]].head(2)
                print(f"      Sleep difficulty mentions: {patient_ev['is_sleep'].sum()}")
                print(f"      Daytime impairment mentions: {patient_ev['is_impair'].sum()}")
                if len(sleep_sents) > 0:
                    print(f"      Example sleep: '{sleep_sents.iloc[0]['text_span'][:100]}...'")
                if len(impair_sents) > 0:
                    print(f"      Example impairment: '{impair_sents.iloc[0]['text_span'][:100]}...'")
            
            elif "rule_b" in rule_name.lower():
                med_sents = patient_ev[patient_ev["is_primary"]].head(2)
                print(f"      Primary medication mentions: {patient_ev['is_primary'].sum()}")
                if len(med_sents) > 0:
                    print(f"      Example: '{med_sents.iloc[0]['text_span'][:100]}...'")
            
            elif "rule_c" in rule_name.lower():
                med_sents = patient_ev[patient_ev["is_secondary"]].head(2)
                print(f"      Secondary medication mentions: {patient_ev['is_secondary'].sum()}")
                if len(med_sents) > 0:
                    print(f"      Example: '{med_sents.iloc[0]['text_span'][:100]}...'")
    
    # True Negatives Analysis
    if len(true_neg) > 0:
        print(f"\n--- TRUE NEGATIVES ({len(true_neg)} patients) ---")
        print("Patients correctly identified as NOT having the condition")
        print(f"Patient IDs: {true_neg['subject_id'].tolist()}")
        print("These patients had no evidence in gold standard and LLM found none")
    
    # False Positives Analysis
    if len(false_pos) > 0:
        print(f"\n--- FALSE POSITIVES ({len(false_pos)} patients) ---")
        print("PROBLEM: LLM incorrectly identified these patients as cases")
        print(f"Patient IDs: {false_pos['subject_id'].tolist()}")
        
        for idx, (_, patient) in enumerate(false_pos.iterrows(), 1):
            pid = patient["subject_id"]
            patient_ev = ev[ev["subject_id"] == pid]
            
            print(f"\n  [{idx}] Patient {pid}:")
            print(f"      Gold standard: NEGATIVE")
            print(f"      LLM prediction: POSITIVE (INCORRECT)")
            
            if "rule_a" in rule_name.lower():
                print(f"      Sleep mentions: {patient_ev['is_sleep'].sum()}")
                print(f"      Impairment mentions: {patient_ev['is_impair'].sum()}")
                
                # Show what triggered false positive
                if patient_ev['is_sleep'].sum() > 0:
                    example = patient_ev[patient_ev["is_sleep"]].iloc[0]
                    print(f"      Triggered by sleep: '{example['text_span'][:150]}...'")
                if patient_ev['is_impair'].sum() > 0:
                    example = patient_ev[patient_ev["is_impair"]].iloc[0]
                    print(f"      Triggered by impairment: '{example['text_span'][:150]}...'")
            
            elif "rule_b" in rule_name.lower():
                print(f"      Primary med mentions: {patient_ev['is_primary'].sum()}")
                if patient_ev['is_primary'].sum() > 0:
                    example = patient_ev[patient_ev["is_primary"]].iloc[0]
                    print(f"      Triggered by: '{example['text_span'][:150]}...'")
                    print(f"      Temporality: {example['temporality']}")
            
            elif "rule_c" in rule_name.lower():
                print(f"      Secondary med mentions: {patient_ev['is_secondary'].sum()}")
                if patient_ev['is_secondary'].sum() > 0:
                    example = patient_ev[patient_ev["is_secondary"]].iloc[0]
                    print(f"      Triggered by: '{example['text_span'][:150]}...'")
                    print(f"      Temporality: {example['temporality']}")
            
            print(f"      LIKELY CAUSE: May be documented in notes but not coded in ICD")
    
    # False Negatives Analysis
    if len(false_neg) > 0:
        print(f"\n--- FALSE NEGATIVES ({len(false_neg)} patients) ---")
        print("PROBLEM: LLM missed these actual cases")
        print(f"Patient IDs: {false_neg['subject_id'].tolist()}")
        
        for idx, (_, patient) in enumerate(false_neg.iterrows(), 1):
            pid = patient["subject_id"]
            patient_ev = ev[ev["subject_id"] == pid]
            
            print(f"\n  [{idx}] Patient {pid}:")
            print(f"      Gold standard: POSITIVE")
            print(f"      LLM prediction: NEGATIVE (MISSED)")
            
            if len(patient_ev) > 0:
                print(f"      LLM processed {len(patient_ev)} candidate sentences")
                
                if "rule_a" in rule_name.lower():
                    print(f"      Sleep detected: {patient_ev['asserts_sleep_difficulty'].sum()} (after filter: {patient_ev['is_sleep'].sum()})")
                    print(f"      Impairment detected: {patient_ev['asserts_daytime_impairment'].sum()} (after filter: {patient_ev['is_impair'].sum()})")
                    
                    # Check if detected but filtered
                    detected_sleep = patient_ev[patient_ev['asserts_sleep_difficulty'] & patient_ev['negated']]
                    if len(detected_sleep) > 0:
                        print(f"      NOTE: Sleep difficulty detected but negated in {len(detected_sleep)} sentences")
                
                elif "rule_b" in rule_name.lower():
                    print(f"      Primary meds detected: {patient_ev['asserts_primary_med'].sum()} (after filter: {patient_ev['is_primary'].sum()})")
                    
                    detected_but_negated = patient_ev[patient_ev['asserts_primary_med'] & patient_ev['negated']]
                    if len(detected_but_negated) > 0:
                        print(f"      NOTE: Medication detected but negated in {len(detected_but_negated)} sentences")
                        example = detected_but_negated.iloc[0]
                        print(f"      Example: '{example['text_span'][:150]}...'")
                
                elif "rule_c" in rule_name.lower():
                    print(f"      Secondary meds detected: {patient_ev['asserts_secondary_med'].sum()} (after filter: {patient_ev['is_secondary'].sum()})")
                    
                    detected_but_negated = patient_ev[patient_ev['asserts_secondary_med'] & patient_ev['negated']]
                    if len(detected_but_negated) > 0:
                        print(f"      NOTE: Medication detected but negated in {len(detected_but_negated)} sentences")
                
                print(f"      LIKELY CAUSE: Medication/symptom in ICD but not explicitly documented in notes,")
                print(f"                    or mentioned in non-candidate sentences")
            else:
                print(f"      NO candidate sentences extracted")
                print(f"      LIKELY CAUSE: Relevant information not captured by keyword filter")

# Run comprehensive analysis for each rule
analyze_classification_outcomes("rule_a_gold", "rule_a_text", "Rule A (Sleep + Impairment)")
analyze_classification_outcomes("rule_b_gold", "rule_b_text", "Rule B (Primary Medications)")
analyze_classification_outcomes("rule_c_gold", "rule_c_text", "Rule C (Secondary Medications)")
analyze_classification_outcomes("any_gold", "any_text", "Any Rule (Overall Insomnia)")

# =============================================================================
# COMPARISON TABLE
# =============================================================================

print("\n" + "="*70)
print("SUMMARY COMPARISON TABLE")
print("="*70 + "\n")

comparison = pd.DataFrame({
    'Rule': ['A: Symptoms', 'B: Primary Meds', 'C: Secondary Meds', 'Any Rule'],
    'Gold +': [
        df['rule_a_gold'].sum(),
        df['rule_b_gold'].sum(),
        df['rule_c_gold'].sum(),
        df['any_gold'].sum()
    ],
    'LLM +': [
        df['rule_a_text'].sum(),
        df['rule_b_text'].sum(),
        df['rule_c_text'].sum(),
        df['any_text'].sum()
    ],
    'TP': [
        ((df['rule_a_gold']==1) & (df['rule_a_text']==1)).sum(),
        ((df['rule_b_gold']==1) & (df['rule_b_text']==1)).sum(),
        ((df['rule_c_gold']==1) & (df['rule_c_text']==1)).sum(),
        ((df['any_gold']==1) & (df['any_text']==1)).sum()
    ],
    'TN': [
        ((df['rule_a_gold']==0) & (df['rule_a_text']==0)).sum(),
        ((df['rule_b_gold']==0) & (df['rule_b_text']==0)).sum(),
        ((df['rule_c_gold']==0) & (df['rule_c_text']==0)).sum(),
        ((df['any_gold']==0) & (df['any_text']==0)).sum()
    ],
    'FP': [
        ((df['rule_a_gold']==0) & (df['rule_a_text']==1)).sum(),
        ((df['rule_b_gold']==0) & (df['rule_b_text']==1)).sum(),
        ((df['rule_c_gold']==0) & (df['rule_c_text']==1)).sum(),
        ((df['any_gold']==0) & (df['any_text']==1)).sum()
    ],
    'FN': [
        ((df['rule_a_gold']==1) & (df['rule_a_text']==0)).sum(),
        ((df['rule_b_gold']==1) & (df['rule_b_text']==0)).sum(),
        ((df['rule_c_gold']==1) & (df['rule_c_text']==0)).sum(),
        ((df['any_gold']==1) & (df['any_text']==0)).sum()
    ]
})

comparison['Precision'] = comparison['TP'] / (comparison['TP'] + comparison['FP'])
comparison['Recall'] = comparison['TP'] / (comparison['TP'] + comparison['FN'])
comparison['F1'] = 2 * (comparison['Precision'] * comparison['Recall']) / (comparison['Precision'] + comparison['Recall'])
comparison['Accuracy'] = (comparison['TP'] + comparison['TN']) / len(df)

print(comparison.round(3).to_string(index=False))

# =============================================================================
# TEMPORALITY IMPACT ANALYSIS
# =============================================================================

print("\n" + "="*70)
print("TEMPORALITY IMPACT ANALYSIS")
print("="*70)

print("\nDistribution of temporality classifications:")
print(ev['temporality'].value_counts())

print("\nMedication detection by temporality:")
primary_by_temp = ev[ev['asserts_primary_med']].groupby('temporality').size()
secondary_by_temp = ev[ev['asserts_secondary_med']].groupby('temporality').size()

print("\nPrimary medications:")
for temp, count in primary_by_temp.items():
    print(f"  {temp}: {count}")

print("\nSecondary medications:")
for temp, count in secondary_by_temp.items():
    print(f"  {temp}: {count}")

print("\nNOTE: Without temporal filtering, all non-negated detections are included.")
print("      This better matches ICD-based gold standard which captures any history.")

# =============================================================================
# SAVE RESULTS
# =============================================================================

print("\n" + "="*70)
print("SAVING RESULTS")
print("="*70)

notes.to_csv("notes_sample_final.csv", index=False)
ev.to_csv("ev_sentence_level_final.csv", index=False)
agg.to_csv("agg_patient_level_final.csv", index=False)
df.to_csv("df_evaluation_final.csv", index=False)

# Save classification subgroups
df[df['any_gold']==1][df['any_text']==1].to_csv("true_positives_final.csv", index=False)
df[df['any_gold']==0][df['any_text']==0].to_csv("true_negatives_final.csv", index=False)
df[df['any_gold']==0][df['any_text']==1].to_csv("false_positives_final.csv", index=False)
df[df['any_gold']==1][df['any_text']==0].to_csv("false_negatives_final.csv", index=False)

print("\nAll results saved")
print("\nFiles created:")
print("  - notes_sample_final.csv")
print("  - ev_sentence_level_final.csv")
print("  - agg_patient_level_final.csv")
print("  - df_evaluation_final.csv")
print("  - true_positives_final.csv")
print("  - true_negatives_final.csv")
print("  - false_positives_final.csv")
print("  - false_negatives_final.csv")

print("\n" + "="*70)
print("ANALYSIS COMPLETE")
print("="*70)

Using 70 gold-standard patients WITH notes
Loaded 249 notes from 70 patients

Extracting candidate sentences...
Extracted 676 candidate sentences

Classifying 676 sentences with LLM...
  Progress: 50/676 sentences
  Progress: 100/676 sentences
  Progress: 150/676 sentences
  Progress: 200/676 sentences
  Progress: 250/676 sentences
  Progress: 300/676 sentences
  Progress: 350/676 sentences
  Progress: 400/676 sentences
  Progress: 450/676 sentences
  Progress: 500/676 sentences
  Progress: 550/676 sentences
  Progress: 600/676 sentences
  Progress: 650/676 sentences
Classified 676 sentences

Aggregating to patient level...
NOTE: Only filtering negated items, NOT filtering by temporality
      This matches ICD-based gold standard (any history of insomnia)

Final merged dataframe shape: (66, 13)

EVALUATION RESULTS - SENTENCE-BASED APPROACH (NO TEMPORAL FILTER)

=== Rule A (Symptoms) ===
Confusion Matrix:
[[26 11]
 [18 11]]
Precision: 0.500
Recall:    0.379
F1 Score:  0.431

=== Rule B 

  df[df['any_gold']==0][df['any_text']==0].to_csv("true_negatives_final.csv", index=False)
  df[df['any_gold']==0][df['any_text']==1].to_csv("false_positives_final.csv", index=False)
