In [8]:
import os

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import bindparam, text

# Database connection
# The connection URL embeds the database password, so it is read from the
# environment instead of being written into the notebook (where it would be
# committed and shared along with the analysis).
# Expected form: postgresql+psycopg2://<user>:<password>@<host>:<port>/<database>
PG_URL = os.environ.get("OMOP_PG_URL")
if not PG_URL:
    raise RuntimeError(
        "Environment variable OMOP_PG_URL is not set; export the PostgreSQL "
        "connection URL before running this notebook."
    )
engine = create_engine(PG_URL)

# Load data
# Evaluation artefacts, read from CSV files in the working directory.
df_eval = pd.read_csv("df_evaluation_final.csv")
ev = pd.read_csv("ev_sentence_level_final.csv")
notes = pd.read_csv("notes_sample_final.csv")
agg = pd.read_csv("agg_patient_level_final.csv")

# Confusion-matrix subsets, one CSV per outcome class.
tp = pd.read_csv("true_positives_final.csv")
tn = pd.read_csv("true_negatives_final.csv")
fp = pd.read_csv("false_positives_final.csv")
fn = pd.read_csv("false_negatives_final.csv")


In [9]:


# Get all unique patients with notes
all_patients_with_notes = notes['subject_id'].unique()

# For each patient, check if they have insomnia evidence in notes
print(f"\nAnalyzing all {len(all_patients_with_notes)} patients with discharge summaries...")

# Get ICD-9 gold standard for all patients.
# The ids are sent as a bound "expanding" parameter instead of being pasted
# into the SQL text: CSV-derived values never become part of the statement,
# and an empty id list no longer produces the invalid fragment `IN ()`.
gold_query = text("""
    SELECT subject_id, rule_a, rule_b, rule_c, any_rule
    FROM mimic_omop.insomnia_cohort
    WHERE subject_id IN :subject_ids
""").bindparams(bindparam('subject_ids', expanding=True))
all_gold = pd.read_sql(
    gold_query,
    engine,
    # .tolist() turns numpy scalars into plain Python values for the DB driver.
    params={'subject_ids': all_patients_with_notes.tolist()},
)

# One row per patient with notes; patients absent from the cohort table get 0
# for every gold-standard flag.
all_patients_df = (
    pd.DataFrame({'subject_id': all_patients_with_notes})
    .merge(all_gold, on='subject_id', how='left')
    .fillna(0)
)

# Get LLM results: collapse the sentence-level flags to one row per patient
# (a flag is set for the patient if it is set on any of their sentences).
nlp_results = ev.groupby('subject_id').agg({
    'is_sleep': 'max',
    'is_impair': 'max',
    'is_primary': 'max',
    'is_secondary': 'max'
}).reset_index()

# Rule A needs both sleep difficulty and impairment; rules B and C map
# directly onto the primary / secondary flags.
nlp_results['nlp_rule_a'] = nlp_results['is_sleep'] & nlp_results['is_impair']
nlp_results['nlp_rule_b'] = nlp_results['is_primary']
nlp_results['nlp_rule_c'] = nlp_results['is_secondary']
nlp_results['nlp_any'] = (
    nlp_results['nlp_rule_a'] | nlp_results['nlp_rule_b'] | nlp_results['nlp_rule_c']
).astype(int)

# Attach the NLP flags; patients without any sentence-level evidence get 0.
nlp_columns = ['subject_id', 'nlp_any', 'nlp_rule_a', 'nlp_rule_b', 'nlp_rule_c']
all_patients_df = (
    all_patients_df
    .merge(nlp_results[nlp_columns], on='subject_id', how='left')
    .fillna(0)
)

# Find gaps: NLP-positive patients that the gold standard marks negative.
nlp_found_icd_missed = all_patients_df[(all_patients_df['nlp_any'] == 1) & (all_patients_df['any_rule'] == 0)]

print("\n SUMMARY STATISTICS:")
print(f"  Total patients with notes: {len(all_patients_with_notes)}")
print(f"  Patients with ICD-9 insomnia codes: {all_patients_df['any_rule'].sum():.0f}")
print(f"  Patients with NLP-detected insomnia: {all_patients_df['nlp_any'].sum():.0f}")
print(f"  Patients with BOTH ICD-9 and NLP evidence: {((all_patients_df['any_rule'] == 1) & (all_patients_df['nlp_any'] == 1)).sum()}")
print(f"\n  ⚠️  NLP found evidence but ICD-9 missed: {len(nlp_found_icd_missed)} patients")
print("      (These are potential ICD-9 coding gaps)")

# Per-rule comparison, driven by one table instead of three copy-pasted
# sections: (rule suffix, printed heading, evidence wording in the gap line).
rule_sections = (
    ('a', "\n--- Rule A (Sleep Difficulty + Daytime Impairment) ---", "symptoms"),
    ('b', "\n--- Rule B (Primary Insomnia Medications) ---", "primary meds"),
    ('c', "\n--- Rule C (Secondary Insomnia Medications) ---", "secondary meds"),
)
gaps_by_rule = {}
for rule_key, heading, evidence in rule_sections:
    nlp_col = f"nlp_rule_{rule_key}"
    gold_col = f"rule_{rule_key}"
    print(heading)
    gaps_by_rule[rule_key] = all_patients_df[(all_patients_df[nlp_col] == 1) & (all_patients_df[gold_col] == 0)]
    print(f"LLM found {all_patients_df[nlp_col].sum():.0f} cases")
    print(f"ICD-9 coded {all_patients_df[gold_col].sum():.0f} cases")
    print(f"Gap: {len(gaps_by_rule[rule_key])} patients with {evidence} in notes but no ICD-9 code")

# Keep the per-rule frames available under their original names.
rule_a_gap = gaps_by_rule['a']
rule_b_gap = gaps_by_rule['b']
rule_c_gap = gaps_by_rule['c']



Analyzing all 70 patients with discharge summaries...

 SUMMARY STATISTICS:
  Total patients with notes: 70
  Patients with ICD-9 insomnia codes: 70
  Patients with NLP-detected insomnia: 58
  Patients with BOTH ICD-9 and NLP evidence: 58

  ⚠️  NLP found evidence but ICD-9 missed: 0 patients
      (These are potential ICD-9 coding gaps)

--- Rule A (Sleep Difficulty + Daytime Impairment) ---
LLM found 22 cases
ICD-9 coded 31 cases
Gap: 11 patients with symptoms in notes but no ICD-9 code

--- Rule B (Primary Insomnia Medications) ---
LLM found 20 cases
ICD-9 coded 56 cases
Gap: 4 patients with primary meds in notes but no ICD-9 code

--- Rule C (Secondary Insomnia Medications) ---
LLM found 54 cases
ICD-9 coded 56 cases
Gap: 12 patients with secondary meds in notes but no ICD-9 code


In [10]:


# Rule-by-rule comparison of NLP-derived flags against the gold-standard flags.
# Each entry: (rule suffix, printed heading, evidence wording in the gap line).
rule_sections = (
    ('a', "\n--- Rule A (Sleep Difficulty + Daytime Impairment) ---", "symptoms"),
    ('b', "\n--- Rule B (Primary Insomnia Medications) ---", "primary meds"),
    ('c', "\n--- Rule C (Secondary Insomnia Medications) ---", "secondary meds"),
)

gaps_by_rule = {}
for rule_key, heading, evidence in rule_sections:
    nlp_col = f"nlp_rule_{rule_key}"
    gold_col = f"rule_{rule_key}"

    print(heading)
    # Gap = flagged by NLP (== 1) while the gold-standard flag is 0.
    nlp_positive = all_patients_df[nlp_col] == 1
    gold_negative = all_patients_df[gold_col] == 0
    gaps_by_rule[rule_key] = all_patients_df[nlp_positive & gold_negative]

    print(f"NLP found {all_patients_df[nlp_col].sum():.0f} cases")
    print(f"ICD-9 coded {all_patients_df[gold_col].sum():.0f} cases")
    print(f"Gap: {len(gaps_by_rule[rule_key])} patients with {evidence} in notes but no ICD-9 code")

# Expose the per-rule gap frames under the names the notebook already uses.
rule_a_gap = gaps_by_rule['a']
rule_b_gap = gaps_by_rule['b']
rule_c_gap = gaps_by_rule['c']


--- Rule A (Sleep Difficulty + Daytime Impairment) ---
NLP found 22 cases
ICD-9 coded 31 cases
Gap: 11 patients with symptoms in notes but no ICD-9 code

--- Rule B (Primary Insomnia Medications) ---
NLP found 20 cases
ICD-9 coded 56 cases
Gap: 4 patients with primary meds in notes but no ICD-9 code

--- Rule C (Secondary Insomnia Medications) ---
NLP found 54 cases
ICD-9 coded 56 cases
Gap: 12 patients with secondary meds in notes but no ICD-9 code
