In [1]:
!pip install seaborn

import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


In [None]:

# ------------------------------------------------------------------
# Load data
# ------------------------------------------------------------------

# Read the three parquet inputs from the working directory, in this order:
# the evaluation table, the sentence-level table, and the notes sample.
df, ev, notes = (
    pd.read_parquet(parquet_file)
    for parquet_file in (
        "df_evaluation_balanced.parquet",
        "ev_sentence_level_balanced.parquet",
        "notes_sample_balanced.parquet",
    )
)

# ------------------------------------------------------------------
# Metrics
# ------------------------------------------------------------------

def calculate_metrics(y_true, y_pred, label):
    """Compute confusion-matrix counts and summary metrics for one binary rule.

    Parameters
    ----------
    y_true : array-like
        Reference labels, encoded as 0 (negative) / 1 (positive).
    y_pred : array-like
        Predicted labels, same encoding and length as ``y_true``.
    label : str
        Display name stored under the ``'label'`` key of the result.

    Returns
    -------
    dict
        Keys ``label``, ``TP``, ``FP``, ``TN``, ``FN``, ``accuracy``,
        ``precision``, ``recall``, ``f1``, ``specificity`` and ``npv``.
        Precision, recall and F1 fall back to 0 when undefined
        (``zero_division=0``); specificity and NPV fall back to 0 when
        their denominator is 0.
    """
    # Pin the label order: without `labels`, a column that contains only one
    # class produces a 1x1 matrix and the four-way unpacking below raises.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)

    # Not provided by the sklearn helpers imported here; derived from the counts.
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0

    return {
        'label': label,
        'TP': tp,
        'FP': fp,
        'TN': tn,
        'FN': fn,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'specificity': specificity,
        'npv': npv
    }

# (column prefix, display label) for every rule that is scored; each prefix has
# a `<prefix>_gold` reference column and a `<prefix>_text` prediction column.
rule_specs = [
    ("rule_a", "Rule A"),
    ("rule_b", "Rule B"),
    ("rule_c", "Rule C"),
    ("any", "Any Rule"),
]

results = [
    calculate_metrics(df[f"{prefix}_gold"], df[f"{prefix}_text"], display_label)
    for prefix, display_label in rule_specs
]

# One row per rule, one column per metric.
results_df = pd.DataFrame(results)

# ------------------------------------------------------------------
# Information expressiveness
# ------------------------------------------------------------------

# Per-subject totals of each sentence-level flag; named aggregation assigns the
# output column names directly instead of relabelling them afterwards.
feature_stats = (
    ev.groupby('subject_id')
      .agg(
          sleep_mentions=('asserts_sleep_difficulty', 'sum'),
          impairment_mentions=('asserts_daytime_impairment', 'sum'),
          primary_med_mentions=('asserts_primary_med', 'sum'),
          secondary_med_mentions=('asserts_secondary_med', 'sum'),
          negated_mentions=('negated', 'sum'),
      )
      .reset_index()
)

# Attach the gold labels; the default inner join keeps only subjects present in both tables.
gold_label_columns = ['subject_id', 'rule_a_gold', 'rule_b_gold', 'rule_c_gold', 'any_gold']
feature_analysis = pd.merge(feature_stats, df[gold_label_columns], on='subject_id')

# Subjects whose `any_gold` label is positive.
icd_pos = feature_analysis.loc[feature_analysis['any_gold'].eq(1)]

# ------------------------------------------------------------------
# Completeness
# ------------------------------------------------------------------

def _count_sentences(note_text):
    """Count the non-blank period-delimited fragments in one note.

    Splitting on '.' leaves an empty fragment after a trailing period (and
    between consecutive periods); those are not sentences and are skipped.
    Non-string values (e.g. a missing note) count as zero.
    """
    if not isinstance(note_text, str):
        return 0
    return sum(1 for fragment in note_text.split('.') if fragment.strip())


# Approximate sentence total across all notes (period-based heuristic).
total_sentences = notes['note_text'].apply(_count_sentences).sum()
# Number of rows in the sentence-level table.
candidate_sentences = len(ev)

patients_with_candidates = ev['subject_id'].nunique()
total_patients = df['subject_id'].nunique()

# Rows of `df` that are gold-positive and whose subject appears in `ev`.
icd_pos_with_features = df[
    (df['any_gold'] == 1) &
    (df['subject_id'].isin(ev['subject_id']))
].shape[0]

# ------------------------------------------------------------------
# Granularity
# ------------------------------------------------------------------

# Column names for the reference ("gold") and predicted ("text") labels,
# listed in the same order as the entries of `results`.
gold_label_cols = ['rule_a_gold', 'rule_b_gold', 'rule_c_gold', 'any_gold']
text_label_cols = ['rule_a_text', 'rule_b_text', 'rule_c_text', 'any_text']

# Side-by-side view per rule: headline metrics plus positive counts on each side.
rule_comparison = pd.DataFrame({
    'Rule': ['A', 'B', 'C', 'Any'],
    'Precision': [metrics['precision'] for metrics in results],
    'Recall': [metrics['recall'] for metrics in results],
    'F1': [metrics['f1'] for metrics in results],
    'Gold_Positive': [df[col].sum() for col in gold_label_cols],
    'LLM_Positive': [df[col].sum() for col in text_label_cols],
})

# Number of individual rules (A, B, C) flagged per row, on each side.
df['gold_rule_count'] = df['rule_a_gold'].add(df['rule_b_gold']).add(df['rule_c_gold'])
df['llm_rule_count'] = df['rule_a_text'].add(df['rule_b_text']).add(df['rule_c_text'])

# ------------------------------------------------------------------
# Error analysis
# ------------------------------------------------------------------

# Overall ("any rule") misses and false alarms.
missed_mask = df['any_gold'].eq(1) & df['any_text'].eq(0)
false_alarm_mask = df['any_gold'].eq(0) & df['any_text'].eq(1)
fn_patients = df.loc[missed_mask]
fp_patients = df.loc[false_alarm_mask]

# Per-rule disagreements between the gold label and the text-derived label.
rule_a_errors = df.loc[df['rule_a_gold'].ne(df['rule_a_text'])]
rule_b_errors = df.loc[df['rule_b_gold'].ne(df['rule_b_text'])]
rule_c_errors = df.loc[df['rule_c_gold'].ne(df['rule_c_text'])]

# ------------------------------------------------------------------
# Save outputs
# ------------------------------------------------------------------

# Metric tables.
results_df.to_csv("performance_metrics_summary.csv", index=False)
rule_comparison.to_csv("rule_comparison.csv", index=False)

# Error summary: one row per error category, with its count and its share of all rows in `df`.
error_groups = [
    ('False Negatives', fn_patients),
    ('False Positives', fp_patients),
    ('Rule A Errors', rule_a_errors),
    ('Rule B Errors', rule_b_errors),
    ('Rule C Errors', rule_c_errors),
]
error_counts = [len(group) for _, group in error_groups]

error_summary = pd.DataFrame({
    'Error_Type': [error_name for error_name, _ in error_groups],
    'Count': error_counts,
    'Rate': [count / len(df) for count in error_counts],
})

error_summary.to_csv("error_analysis_summary.csv", index=False)

# Sentence-level detail for the overall misses / false alarms; a file is only
# written when the corresponding group is non-empty.
detail_exports = (
    (fn_patients, "false_negatives_detailed.csv"),
    (fp_patients, "false_positives_detailed.csv"),
)
for error_patients, detail_path in detail_exports:
    if len(error_patients) > 0:
        detail_rows = ev[ev['subject_id'].isin(error_patients['subject_id'])]
        detail_rows.to_csv(detail_path, index=False)


LOADING EVALUATION DATA

Dataset sizes:
  - Patients: 60
  - Sentences: 505
  - Notes: 212

1. OVERALL PERFORMANCE METRICS

Rule A: Sleep Difficulty + Daytime Impairment
---------------------------------------------
Confusion Matrix:
  TN:  38  |  FP:   9
  FN:   4  |  TP:   9

Performance Metrics:
  Accuracy:    0.783
  Precision:   0.500
  Recall:      0.692
  F1-Score:    0.581
  Specificity: 0.809
  NPV:         0.905

Rule B: Primary Insomnia Medications
------------------------------------
Confusion Matrix:
  TN:  35  |  FP:  10
  FN:   6  |  TP:   9

Performance Metrics:
  Accuracy:    0.733
  Precision:   0.474
  Recall:      0.600
  F1-Score:    0.529
  Specificity: 0.778
  NPV:         0.854

Rule C: Secondary Medications + Symptoms
----------------------------------------
Confusion Matrix:
  TN:  36  |  FP:   6
  FN:   5  |  TP:  13

Performance Metrics:
  Accuracy:    0.817
  Precision:   0.684
  Recall:      0.722
  F1-Score:    0.703
  Specificity: 0.857
  NPV:         0.