In [62]:
import sys
!"{sys.executable}" -m pip install sqlalchemy psycopg2-binary ollama




In [63]:
import os, json, re
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime
import os
import ollama


In [64]:
#connect to database
PG_URL = "postgresql+psycopg2://postgres:4030@localhost:5432/omop_sandbox"
engine = create_engine(PG_URL)


In [65]:
#Pull batch of notes 
NOTE_TEXT_COLUMN = "text"  # change if your column is different
notes = pd.read_sql(f"""
    SELECT subject_id, hadm_id, {NOTE_TEXT_COLUMN} AS note_text
    FROM mimic_omop.notes_norm
    WHERE {NOTE_TEXT_COLUMN} IS NOT NULL AND LENGTH({NOTE_TEXT_COLUMN}) > 50
    LIMIT 30;
""", engine)


In [66]:
#assign note_id
notes = notes.reset_index().rename(columns={"index": "note_rowid"})
print("notes succesfully loaded")

#Sentence splitter
def split_sentences(t):
    # Moght refine later
    sents = re.split(r'(?<=[\.\!\?])\s+', t.strip())
    # trim very long/short
    return [s.strip()[:1000] for s in sents if 3 < len(s) < 1000]

notes succesfully loaded


In [67]:
#important key words: sleeo/impair

SLEEP_TERMS = [
    "insomnia","sleep onset","sleep maintenance","early awakening",
    "trouble sleeping","difficulty sleeping","can't sleep","cant sleep",
    "sleep latency","sleeplessness","not sleeping","poor sleep",
    "restless sleep","hard to fall asleep","sleep problem"
]
IMPAIR_TERMS = [
    "fatigue","tired","daytime sleepiness","somnolence","malaise","irritable",
    "irritability","poor concentration","attention","memory",
    "impaired performance","decreased motivation","errors","accidents",
    "dissatisfaction with sleep","low energy","hard to concentrate","sleepy","tiredness"
]

In [68]:
def is_candidate(sent):
    s = sent.lower()
    return any(w in s for w in SLEEP_TERMS) or any(w in s for w in IMPAIR_TERMS)

In [69]:
#Candidate sentence dataframe
rows = []
for _, r in notes.iterrows():
    sents = split_sentences(r["note_text"])
    for i, s in enumerate(sents):
        if is_candidate(s):
            rows.append({
                "subject_id": r["subject_id"],
                "hadm_id": r["hadm_id"],
                "note_rowid": r["note_rowid"],
                "sent_id": i,
                "text_span": s
            })

cands = pd.DataFrame(rows)
print("Candidates:", cands.shape)
print("Number of candidate sentences:", len(cands))
cands.head(5)

Candidates: (27, 5)
Number of candidate sentences: 27


Unnamed: 0,subject_id,hadm_id,note_rowid,sent_id,text_span
0,10333122,25650366,1,10,Since then her family reports a \nprogressive\...
1,10333122,25650366,1,47,Unable to\nremember events from this morning b...
2,10333122,25650366,1,92,"#Somnolence, resolved: Patient triggered for a..."
3,10333127,21112678,2,29,"GEN: WA woman in NAD, tired, comfortable, app..."
4,10333385,28930333,7,9,She took a \nhydroxyzine to help with anxiety ...


We will now aim to classify each sentence with a JSON schema using OLLAMA-based classification

In [70]:
SYSTEM_PROMPT = """You are a clinical NLP assistant identifying mentions of insomnia or sleep difficulty
and their daytime consequences in clinical notes.

Return only valid JSON with these keys:
{"asserts_sleep_difficulty":bool,"asserts_daytime_impairment":bool,"negated":bool,"temporality":"current|historical|uncertain"}.

Be inclusive and context-sensitive. If unsure, lean toward True and "current".
"""

USER_TEMPLATE = """Examples:

Sentence: "Patient reports difficulty staying asleep and feels tired during the day."
â†’ {{"asserts_sleep_difficulty": true, "asserts_daytime_impairment": true, "negated": false, "temporality": "current"}}

Sentence: "He denies any sleep problems or fatigue."
â†’ {{"asserts_sleep_difficulty": false, "asserts_daytime_impairment": false, "negated": true, "temporality": "current"}}

Sentence: "Sleep has been poor lately but improving."
â†’ {{"asserts_sleep_difficulty": true, "asserts_daytime_impairment": false, "negated": false, "temporality": "current"}}

Sentence: "Has a history of insomnia several years ago."
â†’ {{"asserts_sleep_difficulty": true, "asserts_daytime_impairment": false, "negated": false, "temporality": "historical"}}

Now classify this new sentence:
"{s}"
"""



In [71]:
import re, json


def extract_json(text):
    """Extract first {...} JSON block from a model response."""
    m = re.search(r"\{.*\}", text, re.DOTALL)
    if not m:
        return {"error": "no JSON found", "raw": text[:200]}
    try:
        return json.loads(m.group(0))
    except json.JSONDecodeError:
        return {"error": "bad JSON", "raw": m.group(0)}



#Here we run theinsomnia classfier locally through Ollama.
def classify_sentence_ollama(text):
    prompt = f"{SYSTEM_PROMPT}\nSentence: \"{text}\""
    response = ollama.chat(model="llama3:8b", messages=[{"role": "user", "content": prompt}])
    content = response["message"]["content"]
    print("ðŸ§  RAW Ollama output:", content)
    parsed = extract_json(content)
    return parsed




In [72]:
examples = [
    "Patient reports poor sleep and fatigue.",
    "He denies any insomnia.",
    "Sleep has been fine lately."
]

for s in examples:
    print("\nTEXT:", s)
    print("OUTPUT:", classify_sentence(s))




TEXT: Patient reports poor sleep and fatigue.
OUTPUT: {'asserts_sleep_difficulty': True, 'asserts_daytime_impairment': True, 'negated': False, 'temporality': 'current'}

TEXT: He denies any insomnia.
OUTPUT: {'asserts_sleep_difficulty': False, 'asserts_daytime_impairment': False, 'negated': True, 'temporality': 'current'}

TEXT: Sleep has been fine lately.
OUTPUT: {'asserts_sleep_difficulty': False, 'asserts_daytime_impairment': False, 'negated': True, 'temporality': 'historical'}


We now apply classifier to all candidates

In [74]:


out = []
for _, r in cands.iterrows():
    y = classify_sentence_ollama(r["text_span"])
    out.append({
        **r,
        "asserts_sleep_difficulty": bool(y.get("asserts_sleep_difficulty", False)),
        "asserts_daytime_impairment": bool(y.get("asserts_daytime_impairment", False)),
        "negated": bool(y.get("negated", False)),
        "temporality": str(y.get("temporality", "uncertain")),
        "raw_model_output": json.dumps(y)
    })

ev = pd.DataFrame(out)
print("âœ… Labeled sentences:", len(ev))


ðŸ§  RAW Ollama output: {"asserts_sleep_difficulty": False, "asserts_daytime_impairment": True, "negated": False, "temporality": "uncertain"}
ðŸ§  RAW Ollama output: {
"asserts_sleep_difficulty": true,
"asserts_daytime_impairment": true,
"negated": false,
"temporality": "current"
}

Reasoning:

* "Unable to remember events from this morning" implies difficulty with short-term memory, which can be a consequence of sleep difficulty (insomnia).
* Long-term memory being intact is not necessarily relevant to the sentence's meaning, as it doesn't contradict the idea that sleep difficulty occurred.
* The temporality is inferred as "current" because the patient is unable to remember events from this morning, suggesting a recent or current issue.
ðŸ§  RAW Ollama output: {
"asserts_sleep_difficulty": true,
"asserts_daytime_impairment": false,
"negated": true,
"temporality": "historical"
}
ðŸ§  RAW Ollama output: {
"asserts_sleep_difficulty": true,
"asserts_daytime_impairment": true,
"negated": f

In [76]:
# drop raw_model_output before writing, since DB table doesn't have it
ev.drop(columns=["raw_model_output"], errors="ignore") \
  .to_sql("note_sent_evidence", engine, schema="kb", if_exists="append", index=False)

sleep_evd = ev[(ev["asserts_sleep_difficulty"]) & (~ev["negated"]) & (ev["temporality"]=="current")]
impair_evd = ev[(ev["asserts_daytime_impairment"]) & (~ev["negated"]) & (ev["temporality"]=="current")]

sleep_evd[["subject_id","hadm_id","note_rowid","sent_id","text_span","negated","temporality"]]\
    .to_sql("evd_note_sleep", engine, schema="kb", if_exists="append", index=False)

impair_evd[["subject_id","hadm_id","note_rowid","sent_id","text_span","negated","temporality"]]\
    .to_sql("evd_note_impair", engine, schema="kb", if_exists="append", index=False)

engine.dispose()
print("âœ… Changes committed and connection closed.")
print("âœ… Stored note evidence to kb.evd_note_sleep / kb.evd_note_impair")

âœ… Changes committed and connection closed.
âœ… Stored note evidence to kb.evd_note_sleep / kb.evd_note_impair


In [77]:
print("\n--- Summary ---")
print("Sleep difficulty sentences:", ev["asserts_sleep_difficulty"].sum())
print("Daytime impairment sentences:", ev["asserts_daytime_impairment"].sum())
print(ev["temporality"].value_counts())


--- Summary ---
Sleep difficulty sentences: 14
Daytime impairment sentences: 9
temporality
uncertain     14
current       12
historical     1
Name: count, dtype: int64


In [78]:
patients = (
    ev[(ev["asserts_sleep_difficulty"]) & (~ev["negated"]) & (ev["temporality"]=="current")]
    .groupby("subject_id")["hadm_id"]
    .nunique()
    .reset_index(name="insomnia_hadm_count")
)
print(patients.head())


   subject_id  insomnia_hadm_count
0    10333122                    1
1    10333127                    1
2    10333385                    1
3    10333909                    1
4    10333924                    1
