In [2]:
import sys
!"{sys.executable}" -m pip install sqlalchemy psycopg2-binary ollama




In [3]:
import os, json, re
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime
import os
import ollama


In [4]:
#connect to database
PG_URL = "postgresql+psycopg2://postgres:4030@localhost:5432/omop_sandbox"
engine = create_engine(PG_URL)


In [5]:
#Pull batch of notes 
NOTE_TEXT_COLUMN = "text"  # change if your column is different
notes = pd.read_sql(f"""
    SELECT subject_id, hadm_id, {NOTE_TEXT_COLUMN} AS note_text
    FROM mimic_omop.notes_norm
    WHERE {NOTE_TEXT_COLUMN} IS NOT NULL AND LENGTH({NOTE_TEXT_COLUMN}) > 50
    LIMIT 20;
""", engine)


In [6]:
#assign note_id
notes = notes.reset_index().rename(columns={"index": "note_rowid"})
print("notes succesfully loaded")

#Sentence splitter
def split_sentences(t):
    # Moght refine later
    sents = re.split(r'(?<=[\.\!\?])\s+', t.strip())
    # trim very long/short
    return [s.strip()[:1000] for s in sents if 3 < len(s) < 1000]

notes succesfully loaded


In [7]:
#important key words: sleeo/impair

SLEEP_TERMS = [
    "insomnia","sleep onset","sleep maintenance","early awakening",
    "trouble sleeping","difficulty sleeping","can't sleep","cant sleep",
    "sleep latency","sleeplessness","not sleeping","poor sleep",
    "restless sleep","hard to fall asleep","sleep problem"
]
IMPAIR_TERMS = [
    "fatigue","tired","daytime sleepiness","somnolence","malaise","irritable",
    "irritability","poor concentration","attention","memory",
    "impaired performance","decreased motivation","errors","accidents",
    "dissatisfaction with sleep","low energy","hard to concentrate","sleepy","tiredness"
]

In [16]:
def is_candidate(sent):
    s = sent.lower()
    return any(w in s for w in SLEEP_TERMS) or any(w in s for w in IMPAIR_TERMS)

In [18]:
#Candidate sentence dataframe
rows = []
for _, r in notes.iterrows():
    sents = split_sentences(r["note_text"])
    for i, s in enumerate(sents):
        if is_candidate(s):
            rows.append({
                "subject_id": r["subject_id"],
                "hadm_id": r["hadm_id"],
                "note_rowid": r["note_rowid"],
                "sent_id": i,
                "text_span": s
            })

cands = pd.DataFrame(rows)
print("Candidates:", cands.shape)
print("Number of candidate sentences:", len(cands))
pd.set_option('display.max_colwidth', None)

cands.head(10)

Candidates: (20, 5)
Number of candidate sentences: 20


Unnamed: 0,subject_id,hadm_id,note_rowid,sent_id,text_span
0,10333122,25650366,1,10,Since then her family reports a \nprogressive\nworsening in her executive function and short-term memory.
1,10333122,25650366,1,47,Unable to\nremember events from this morning but long term memory intact.
2,10333122,25650366,1,92,"#Somnolence, resolved: Patient triggered for acute change in \nconsciousness that self-resolved."
3,10333127,21112678,2,29,"GEN: WA woman in NAD, tired, comfortable, appropriate."
4,10333385,28930333,7,9,"She took a \nhydroxyzine to help with anxiety but felt weak, tired, \nlight-headed/faint as she was getting ready and even in the \noffice."
5,10333385,28930333,7,23,"(Of note, patient has also been having financial \ndifficulties, so in order to afford her rent and food, she \nhasn't been going to some medical appointment because she \ncouldn't afford co-pay, and she stopped taking most of her \nmedications including stopping fluoxetine abruptly about 2 weeks \nago also because she couldn't afford co-pay.) Last ___, it \ntook patient 4 hours to leave her apartment for her appointment, \nand she also took hydroxyzine (which, she reports, helps with \nanxiety and insomnia somewhat)."
6,10333385,28930333,7,27,"not eating at all, feelings of worthlessness and \nhopelessness, poor concentration (patient noted that she hadn't \nbeen able to read books recently until her arrival to the ED \nwhere she was actually able to start reading a book), loss of \ninterest in anything outside work, social isolation, fatigue."
7,10333385,28930333,7,92,"On discharge patient reports a subjective improvement in mood \nand anxiety, absence of suicidal ideation, improved sleep \nduration and quality, concentration/attention, and future \norientation."
8,10333909,27924784,10,0,"Name: ___ Unit No: ___\n \nAdmission Date: ___ Discharge Date: ___\n \nDate of Birth: ___ Sex: M\n \nService: MEDICINE\n \nAllergies: \nNo Known Allergies / Adverse Drug Reactions\n \nAttending: ___\n \nChief Complaint:\nchest pain and SOB \n \nMajor Surgical or Invasive Procedure:\nNone\n\n \nHistory of Present Illness:\n___ y/o M with hx of morbid obesity who presents after recent \ndischarge from OSH with new diagnosis of heart failure presents \ntoday with continued fatigue, malaise, shortness of breath, \nchest pain and lower extremity swelling."
9,10333909,27924784,10,5,"Since recent discharge still experiencing worsening fatigue, \nchest tightness, increasing ___ swelling."


We will now aim to classify each sentence with a JSON schema using OLLAMA-based classification

In [19]:
SYSTEM_PROMPT = """You are a clinical NLP assistant identifying mentions of insomnia or sleep difficulty
and their daytime consequences in clinical notes.

Return only valid JSON with these keys:
{"asserts_sleep_difficulty":bool,"asserts_daytime_impairment":bool,"negated":bool,"temporality":"current|historical|uncertain"}.

Be inclusive and context-sensitive. If unsure, lean toward True and "current".
"""

USER_TEMPLATE = """Examples:

Sentence: "Patient reports difficulty staying asleep and feels tired during the day."
â†’ {{"asserts_sleep_difficulty": true, "asserts_daytime_impairment": true, "negated": false, "temporality": "current"}}

Sentence: "He denies any sleep problems or fatigue."
â†’ {{"asserts_sleep_difficulty": false, "asserts_daytime_impairment": false, "negated": true, "temporality": "current"}}

Sentence: "Sleep has been poor lately but improving."
â†’ {{"asserts_sleep_difficulty": true, "asserts_daytime_impairment": false, "negated": false, "temporality": "current"}}

Sentence: "Has a history of insomnia several years ago."
â†’ {{"asserts_sleep_difficulty": true, "asserts_daytime_impairment": false, "negated": false, "temporality": "historical"}}

Now classify this new sentence:
"{s}"
"""



In [21]:
import re, json


def extract_json(text):
    """Extract first {...} JSON block from a model response."""
    m = re.search(r"\{.*\}", text, re.DOTALL)
    if not m:
        return {"error": "no JSON found", "raw": text[:200]}
    try:
        return json.loads(m.group(0))
    except json.JSONDecodeError:
        return {"error": "bad JSON", "raw": m.group(0)}



#Here we run theinsomnia classfier locally through Ollama.
def classify_sentence_ollama(text):
    prompt = f"{SYSTEM_PROMPT}\nSentence: \"{text}\""
    response = ollama.chat(model="llama3:8b", messages=[{"role": "user", "content": prompt}])
    content = response["message"]["content"]
    print("RAW Ollama output:", content)
    parsed = extract_json(content)
    return parsed




In [22]:
examples = [
    "Patient reports poor sleep and fatigue.",
    "He denies any insomnia.",
    "Sleep has been fine lately."
]

for s in examples:
    print("\nTEXT:", s)
    print("OUTPUT:", classify_sentence_ollama(s))




TEXT: Patient reports poor sleep and fatigue.
RAW Ollama output: Here is the output in JSON format:

```
{
  "asserts_sleep_difficulty": true,
  "asserts_daytime_impairment": true,
  "negated": false,
  "temporality": "current"
}
```
OUTPUT: {'asserts_sleep_difficulty': True, 'asserts_daytime_impairment': True, 'negated': False, 'temporality': 'current'}

TEXT: He denies any insomnia.
RAW Ollama output: Here is the JSON output:

{
"asserts_sleep_difficulty": true,
"asserts_daytime_impairment": false,
"negated": true,
"temporality": "current"
}

Note: Since the sentence mentions "denies", which implies that the patient has previously experienced insomnia, but currently does not have it, I'm considering the temporality as "current".
OUTPUT: {'asserts_sleep_difficulty': True, 'asserts_daytime_impairment': False, 'negated': True, 'temporality': 'current'}

TEXT: Sleep has been fine lately.
RAW Ollama output: {"asserts_sleep_difficulty": False, "asserts_daytime_impairment": False, "negated

We now apply classifier to all candidates

In [24]:


out = []
for _, r in cands.iterrows():
    y = classify_sentence_ollama(r["text_span"])
    out.append({
        **r,
        "asserts_sleep_difficulty": bool(y.get("asserts_sleep_difficulty", False)),
        "asserts_daytime_impairment": bool(y.get("asserts_daytime_impairment", False)),
        "negated": bool(y.get("negated", False)),
        "temporality": str(y.get("temporality", "uncertain")),
        "raw_model_output": json.dumps(y)
    })

ev = pd.DataFrame(out)
print("Labeled sentences:", len(ev))


RAW Ollama output: {"asserts_sleep_difficulty": False, "asserts_daytime_impairment": True, "negated": False, "temporality": "historical"}
RAW Ollama output: Here is the output in JSON format:

```
{
  "asserts_sleep_difficulty": true,
  "asserts_daytime_impairment": true,
  "negated": false,
  "temporality": "current"
}
```
RAW Ollama output: {
  "asserts_sleep_difficulty": false,
  "asserts_daytime_impairment": false,
  "negated": true,
  "temporality": "historical"
}

Note: Although the term "Somnolence" implies sleep difficulty, the context indicates that it was resolved and not currently impacting the patient.
RAW Ollama output: {"asserts_sleep_difficulty":True,"asserts_daytime_impairment":True,"negated":False,"temporality":"current"}
RAW Ollama output: Here is the extracted JSON:

{
  "asserts_sleep_difficulty": true,
  "asserts_daytime_impairment": true,
  "negated": false,
  "temporality": "current"
}

Reasoning:
- "felt tired" implies sleep difficulty.
- "weak", "light-headed/f

In [25]:
# drop raw_model_output before writing, since DB table doesn't have it
ev.drop(columns=["raw_model_output"], errors="ignore") \
  .to_sql("note_sent_evidence", engine, schema="kb", if_exists="append", index=False)

sleep_evd = ev[(ev["asserts_sleep_difficulty"]) & (~ev["negated"]) & (ev["temporality"]=="current")]
impair_evd = ev[(ev["asserts_daytime_impairment"]) & (~ev["negated"]) & (ev["temporality"]=="current")]

sleep_evd[["subject_id","hadm_id","note_rowid","sent_id","text_span","negated","temporality"]]\
    .to_sql("evd_note_sleep", engine, schema="kb", if_exists="append", index=False)

impair_evd[["subject_id","hadm_id","note_rowid","sent_id","text_span","negated","temporality"]]\
    .to_sql("evd_note_impair", engine, schema="kb", if_exists="append", index=False)

engine.dispose()
print("âœ… Changes committed and connection closed.")
print("âœ… Stored note evidence to kb.evd_note_sleep / kb.evd_note_impair")

âœ… Changes committed and connection closed.
âœ… Stored note evidence to kb.evd_note_sleep / kb.evd_note_impair


In [26]:
print("\n--- Summary ---")
print("Sleep difficulty sentences:", ev["asserts_sleep_difficulty"].sum())
print("Daytime impairment sentences:", ev["asserts_daytime_impairment"].sum())
print(ev["temporality"].value_counts())


--- Summary ---
Sleep difficulty sentences: 12
Daytime impairment sentences: 9
temporality
current       12
uncertain      7
historical     1
Name: count, dtype: int64


In [27]:
patients = (
    ev[(ev["asserts_sleep_difficulty"]) & (~ev["negated"]) & (ev["temporality"]=="current")]
    .groupby("subject_id")["hadm_id"]
    .nunique()
    .reset_index(name="insomnia_hadm_count")
)
print(patients.head())


   subject_id  insomnia_hadm_count
0    10333122                    1
1    10333385                    1
2    10333909                    1
3    10334165                    1


In [28]:
# --- Final summary -----------------------------------------------------------

print("\n================= INSOMNIA RULE A EVALUATION SUMMARY =================")
ev = pd.DataFrame(out)
# sentence-level stats
total_sents = len(ev)
sleep_sents = ev["asserts_sleep_difficulty"].sum()
impair_sents = ev["asserts_daytime_impairment"].sum()
negated_sents = ev["negated"].sum()
current_sents = (ev["temporality"] == "current").sum()

print(f"Total candidate sentences analyzed: {total_sents}")
print(f"â†’ Sentences asserting sleep difficulty: {sleep_sents}")
print(f"â†’ Sentences asserting daytime impairment: {impair_sents}")
print(f"â†’ Sentences marked as negated: {negated_sents}")
print(f"â†’ Sentences marked as current: {current_sents}")
print("\nTemporality distribution:")
print(ev["temporality"].value_counts(), "\n")

# patient-level aggregation (for Rule A)
patients = (
    ev[
        (ev["asserts_sleep_difficulty"])
        & (ev["asserts_daytime_impairment"])
        & (~ev["negated"])
        & (ev["temporality"] == "current")
    ]
    .groupby("subject_id")["hadm_id"]
    .nunique()
    .reset_index(name="ruleA_hadm_count")
)

n_patients = len(patients)
print(f"Patients meeting Rule A criteria: {n_patients}")
print("\nSample of detected patients:")
print(patients.head(10).to_string(index=False))

print("\nâœ… Results stored to kb.evd_note_sleep and kb.evd_note_impair tables.")
print("âœ… Rule A evidence summary complete.")
print("======================================================================\n")



Total candidate sentences analyzed: 20
â†’ Sentences asserting sleep difficulty: 12
â†’ Sentences asserting daytime impairment: 9
â†’ Sentences marked as negated: 2
â†’ Sentences marked as current: 12

Temporality distribution:
temporality
current       12
uncertain      7
historical     1
Name: count, dtype: int64 

Patients meeting Rule A criteria: 4

Sample of detected patients:
 subject_id  ruleA_hadm_count
   10333122                 1
   10333385                 1
   10333909                 1
   10334165                 1

âœ… Results stored to kb.evd_note_sleep and kb.evd_note_impair tables.
âœ… Rule A evidence summary complete.



In [None]:
# --- Print text spans for patients satisfying Rule A ---------------------

pd.set_option('display.max_colwidth', None)

# Get list of subject_ids that satisfied Rule A
ruleA_subjects = patients["subject_id"].unique().tolist()

# Filter to sentences from those patients that actually met Rule A conditions
ruleA_spans = ev[
    (ev["subject_id"].isin(ruleA_subjects))
    & (ev["asserts_sleep_difficulty"])
    & (ev["asserts_daytime_impairment"])
    & (~ev["negated"])
    & (ev["temporality"] == "current")
]

print("\n================= TEXT SPANS FOR PATIENTS SATISFYING RULE A =================")
for pid, group in ruleA_spans.groupby("subject_id"):
    print(f"\nðŸ©º Patient {pid}")
    print("-" * 60)
    for _, row in group.iterrows():
        print(f"â€¢ {row.text_span}")
    print("-" * 60)

print("\nâœ… Displayed text spans (sentences) for all patients meeting Rule A.")




ðŸ©º Patient 10333122
------------------------------------------------------------
â€¢ Unable to
remember events from this morning but long term memory intact.
------------------------------------------------------------

ðŸ©º Patient 10333385
------------------------------------------------------------
â€¢ She took a 
hydroxyzine to help with anxiety but felt weak, tired, 
light-headed/faint as she was getting ready and even in the 
office.
â€¢ (Of note, patient has also been having financial 
difficulties, so in order to afford her rent and food, she 
hasn't been going to some medical appointment because she 
couldn't afford co-pay, and she stopped taking most of her 
medications including stopping fluoxetine abruptly about 2 weeks 
ago also because she couldn't afford co-pay.) Last ___, it 
took patient 4 hours to leave her apartment for her appointment, 
and she also took hydroxyzine (which, she reports, helps with 
anxiety and insomnia somewhat).
â€¢ not eating at all, feelings 

: 