In [None]:
# !pip install transformers accelerate torch sentencepiece

In [None]:
# !pip install psycopg2 transformers accelerate torch

In [None]:
# note = ul.get_clinical_note(subject_id=10000032)
# prompt = ul.sdh_prompt(note)
# response = pipe(prompt, max_new_tokens=400)[0]['generated_text']
# print(response)

In [None]:
import torch
import json
import pandas as pd
import utils_llm as ul
from tqdm import tqdm
from datetime import datetime
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, T5ForConditionalGeneration, MegatronBertForCausalLM, AutoModelForSeq2SeqLM, LlamaForCausalLM

---------------------------------------------------------

# Mistral-7B-Instruct

In [None]:
model_id = 'I:/Mistral-7B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, pad_token_id=tokenizer.eos_token_id, device=0)

In [None]:
notes = ul.get_notes_for_first_n_notes(1)
prompt = ul.sdh_single_prompt(notes[0][3])

In [None]:
# Run the text-generation pipeline and keep the text of the first candidate.
generations = pipe(prompt, max_new_tokens=400)
output = generations[0]['generated_text']

# str.replace leaves the text untouched when the prompt is not present, so a single
# call covers both the "prompt echoed back" and the "prompt not echoed" case.
print(output.replace(prompt, '').strip())

---------------------------------------------------------

# BioMistral-7B

In [None]:
# Load BioMistral-7B from a local directory in bfloat16; device_map="auto" decides weight placement.
model_id = 'I:/BioMistral-7B'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype=torch.bfloat16,
                                             device_map="auto")
# Sampling text-generation pipeline; the generation settings below are passed at construction time.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=50000,  # NOTE(review): far larger than the 400 used by the other cells — confirm this is intended
    do_sample=True,
    temperature=0.7,
    top_k=50
)

In [None]:
notes = ul.get_notes_for_first_n_notes(1)
prompt = ul.sdh_prompt(notes[0][3])

In [None]:
# user_input = input("You: ")

# messages = [
#     {"role": "user", "content": user_input}
# ]

# prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# response = pipe(prompt)

# generated_text = response[0]['generated_text']

# if prompt in generated_text:
#     llm_reply = generated_text.replace(prompt, "").strip()
# else:
#     llm_reply = generated_text.strip()

# print(f"LLM: {llm_reply}")

---------------------------------------------------------

# clinicalt5-large

In [None]:
# Load Clinical-T5-Large from a local directory and wrap it in a text2text pipeline.
model_id = 'I:/Clinical-T5-Large'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = T5ForConditionalGeneration.from_pretrained(model_id)
# Use the first GPU when available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Hand the pipeline the same device the model was moved to; a hard-coded device=0
# would contradict the CPU fallback above on machines without CUDA.
pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, pad_token_id=tokenizer.eos_token_id, device=device)

In [None]:
notes = ul.get_notes_for_first_n_notes(1)
prompt = ul.sdh_single_prompt(notes[0][3])

In [None]:
prompt = 'What drug is used in the following text: He used omeprazol.'

In [None]:
output = pipe(prompt, max_new_tokens=400)[0]['generated_text']

if prompt in output:
    print(output.replace(prompt, '').strip())
else:
    print(output.strip())

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

note = "38 y/o F, single mother of 2 (4 y/o and 6 y/o), h/o HTN and anxiety presents with medication nonadherence due to unstable PT barista job (~20 h/wk) and unreliable public transport causing missed appts and work shifts. Reports 2 mo rent arrears, late-payment notice pending eviction. No personal vehicle, bus cuts limit mobility. Ex-spouse provides no support; sister OOS; one friend for occasional childcare. Limited social support. Plan: continue lisinopril 10 mg daily, add SSRI; refer to housing assistance, workforce development, bus-pass voucher, subsidized childcare, social work, and food pantry."
prompt = f"Answer step by step: \
1. Identify medication.\
2. Evaluate context.\
3. Output medications. Note: {note}"

inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
outputs = model.generate(
    **inputs,
    max_new_tokens=150,
    num_beams=4,
    no_repeat_ngram_size=2,
    early_stopping=True
)
tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = T5ForConditionalGeneration.from_pretrained(model_id)

prompt = 'Q: Can methotrexate be combined with an antibiotic?'

inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

# Beam-search decoding. temperature / top_k / top_p were passed here without
# do_sample=True; sampling parameters are only applied when sampling is enabled, so
# they had no effect (and trigger a transformers warning) and are dropped.
# NOTE(review): this assumes the checkpoint's own generation config does not enable sampling.
output = model.generate(
    **inputs,
    max_new_tokens=600,
    length_penalty=1.6,
    num_beams=4,
    no_repeat_ngram_size=3,
    repetition_penalty=2.1,
    early_stopping=True
)

print(tokenizer.decode(output[0], skip_special_tokens=True))


In [None]:
# input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
# output_ids = model.generate(input_ids, max_length=50, num_beams=4, early_stopping=True)
# generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
# print(generated_text)

In [None]:
from transformers import AutoModel, AutoTokenizer

model_id = 'I:/modernBERT'
model = AutoModel.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)


In [None]:
prompt = 'The patient is healthy'

In [None]:
tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

In [None]:
pipe = pipeline(
    task="text-classification",
    model=model_id,
    torch_dtype=torch.float16,
    device=0
)

In [None]:
pipe("The patient is healthy and happy!")

---------------------------------------------------------

# gatortronS (gatortronGPT -> decoder-only (GPT))

In [None]:
model_id = 'I:/gatortronS'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = MegatronBertForCausalLM.from_pretrained(model_id, is_decoder=True)#.to('cuda:0')
assert tokenizer.vocab_size <= model.config.vocab_size
# pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, pad_token_id=tokenizer.eos_token_id)

In [None]:
# Read the annotation CSV and keep the distinct (patient_id, note_id) pairs as tuples.
subject_and_hadm_ids = pd.read_csv('C:/Users/salazarda/Downloads/SDOH_MIMICIII_physio_release.csv')
subject_and_hadm_ids = list(subject_and_hadm_ids.loc[:, ['patient_id', 'note_id']].drop_duplicates().itertuples(index=False, name=None))

# Fetch the notes for those pairs and keep the first five for a quick run.
notes = ul.get_clinical_notes_mimic3(subject_and_hadm_ids)
notes = notes[0:5]

# One instruction (definition + allowed labels) per SDOH category. The dict does not
# depend on the note or the category being processed, so it is built once here instead
# of on every inner-loop iteration. Its key order is the order the categories are queried in.
instruction = {
    'Employment status': 'Employment status: Whether the patient is currently employed, unemployed, underemployed, disability, retired, student, or unknown. LABELS: [employed, unemployed, underemployed, disability, retired, student, unknown]',
    'Housing issues': 'Housing issues: Any mention of financial status, undomiciled, other. LABELS: [financial status, undomiciled, other, unknown]',
    'Transportation issues': 'Transportation issues: Any reference to transportation difficulties such as distance, resources, other. LABELS: [distance, resources, other, unknown]',
    'Parental status':'Parental status: Whether the patient has a child under 18 years old. LABELS: [yes, no, unknown]',
    'Relationship status': 'Relationship status: Whether the patient is widowed, divorced, single. LABELS: [married, partnered, widowed, divorced, single, unknown]',
    'Social support': 'Social support: It does include informal or emotional support from family members, friends, or romantic partners unless such support is clearly mediated through a formal care plan by a social worker or case manager. LABELS: [presence, absence, unknown]'
}

sdoh_output = []

for note in notes:
    # Tuple positions 0-3 are stored as metadata; position 4 is passed to the prompt builder.
    meta = {
        'subject_id': note[0],
        'hadm_id': note[1],
        'row_id': note[2],
        'charttime': note[3].isoformat() if note[3] else None
    }

    outputs_per_note = meta.copy()  # Start with metadata

    for sdoh in tqdm(list(instruction)):
        prompt = ul.sdh_single_prompt(note[4], sdoh, instruction[sdoh])

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        # Explicit 0..seq_len-1 position ids with a leading batch dimension of 1.
        inputs['position_ids'] = torch.arange(0, inputs['input_ids'].size(1), dtype=torch.long).unsqueeze(0)

        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=600,
                length_penalty=1.6,
                num_beams=10,
                no_repeat_ngram_size=3,
                temperature=0.8,
                do_sample=True,
                top_k=15,
                top_p=0.95,
                repetition_penalty=2.1,
                early_stopping=True
            )

        # Keep the first returned sequence as the answer for this category.
        output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        outputs_per_note[sdoh] = output_text

    sdoh_output.append(outputs_per_note)

    

In [None]:
notes = ul.get_notes_for_first_n_notes(1)
prompt = ul.sdh_prompt_guevara(notes[0][3])

In [None]:
prompt

In [None]:
# Tokenize the prompt built in the cells above. Previously this cell reused whatever
# `inputs` was left over from the last iteration of the SDOH extraction loop, so the
# generation did not correspond to `prompt`.
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
# Explicit position ids, built the same way as in the SDOH extraction loop for this model.
inputs['position_ids'] = torch.arange(0, inputs['input_ids'].size(1), dtype=torch.long).unsqueeze(0)

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=200)

In [None]:
pipe(prompt, max_new_tokens=400)

In [None]:
output = pipe(prompt, max_new_tokens=400)[0]['generated_text']

if prompt in output:
    print(output.replace(prompt, '').strip())
else:
    print(output.strip())

In [None]:
prompt = "The patient presents with abdominal pain and"
# Send the inputs to the device the model actually lives on. The global `device` is
# set by a different model's cell and this model's `.to('cuda:0')` is commented out,
# so using `device` here could place inputs and weights on different devices.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_length=200, do_sample=True, temperature=0.7)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


---------------------------------------------------------

# meditron-7b

In [None]:
model_id = 'I:/meditron-7b'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, pad_token_id=tokenizer.eos_token_id, device=0)

In [None]:
notes = ul.get_notes_for_first_n_notes(1)
prompt = ul.sdh_single_prompt(notes[0][3])

In [None]:
# The pipeline returns a list of dicts; the text lives under 'generated_text'.
output = pipe(prompt, max_new_tokens=400)
generated_text = output[0]['generated_text']

# Compare against the generated string. Testing `prompt in output` checked membership
# in the list of dicts, which is never true, so the echoed prompt was never removed.
if prompt in generated_text:
    print(generated_text.replace(prompt, '').strip())
else:
    print(generated_text.strip())

---------------------------------------------------------

# meditron3-8b

In [None]:
model_id = 'I:/meditron3-8b'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)

---------------------------------------------------------

In [None]:
notes = ul.get_notes_for_first_n_notes(1)
prompt = ul.sdh_prompt(notes[0][3])
# pipe(prompts[0], max_new_tokens=400)

# LlamaCare + MIMIC III

In [None]:
model_id = 'I:/LlamaCare'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = LlamaForCausalLM.from_pretrained(model_id)
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# Local paths: the base checkpoint and the adapter that is applied on top of it.
base_model = "I:/Llama-2-13b-hf"          # Llama-2-13b-hf
adapter_id = "I:/LlamaCare"             # LlamaCare LoRA adapter

# 4-bit NF4 quantization settings with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_quant_type="nf4",
                                bnb_4bit_compute_dtype=torch.bfloat16)

# The tokenizer comes from the base model directory (use_fast=False).
tok = AutoTokenizer.from_pretrained(base_model, use_fast=False)
# Load the quantized base model; device_map="auto" decides weight placement.
model = AutoModelForCausalLM.from_pretrained(base_model,
                                             quantization_config=bnb_config,
                                             device_map="auto")
model = PeftModel.from_pretrained(model, adapter_id)  # apply the LoRA adapter

def chat(prompt, max_new_tokens=256):
    """Generate a completion for ``prompt`` with the notebook-level ``model`` and ``tok``.

    Args:
        prompt: Text to tokenize and feed to the model.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The first generated sequence decoded to a string with special tokens removed.
    """
    # Tokenize and move the tensors to the device the model reports.
    encoded = tok(prompt, return_tensors="pt").to(model.device)
    generated = model.generate(**encoded, max_new_tokens=max_new_tokens)
    first_sequence = generated[0]
    return tok.decode(first_sequence, skip_special_tokens=True)

print(chat("Eres un asistente médico. Explica el manejo inicial de DM2."))


In [None]:
import random

# Read the annotation CSV and keep the distinct (patient_id, note_id) pairs as tuples.
subject_and_hadm_ids = pd.read_csv('C:/Users/salazarda/Downloads/SDOH_MIMICIII_physio_release.csv')
subject_and_hadm_ids = list(subject_and_hadm_ids.loc[:, ['patient_id', 'note_id']].drop_duplicates().itertuples(index=False, name=None))

# Fetch the notes for those pairs and draw five at random (unseeded, so each run differs).
notes = ul.get_clinical_notes_mimic3(subject_and_hadm_ids)
notes = random.sample(notes, 5)
# notes = notes[0:50]

# One instruction (definition + allowed labels) per SDOH category. The dict does not
# depend on the note or the category being processed, so it is built once here instead
# of on every inner-loop iteration. Its key order is the order the categories are queried in.
instruction = {
    'Employment status': 'Employment status: Whether the patient is currently employed, unemployed, underemployed, disability, retired, student, or unknown. LABELS: [employed, unemployed, underemployed, disability, retired, student, unknown]',
    'Housing issues': 'Housing issues: Any mention of financial status, undomiciled, other. LABELS: [financial status, undomiciled, other, unknown]',
    'Transportation issues': 'Transportation issues: Any reference to transportation difficulties such as distance, resources, other. LABELS: [distance, resources, other, unknown]',
    'Parental status':'Parental status: Whether the patient has a child under 18 years old. LABELS: [yes, no, unknown]',
    'Relationship status': 'Relationship status: Whether the patient is widowed, divorced, single. LABELS: [married, partnered, widowed, divorced, single, unknown]',
    'Social support': 'Social support: It does include informal or emotional support from family members, friends, or romantic partners unless such support is clearly mediated through a formal care plan by a social worker or case manager. LABELS: [presence, absence, unknown]'
}

sdoh_output = []

for note in notes:
    # Tuple positions 0-3 are stored as metadata; position 4 is passed to the prompt builder.
    meta = {
        'subject_id': note[0],
        'hadm_id': note[1],
        'row_id': note[2],
        'charttime': note[3].isoformat() if note[3] else None
    }

    outputs_per_note = meta.copy()  # Start with metadata

    for sdoh in tqdm(list(instruction)):
        prompt = ul.sdh_single_prompt(note[4], sdoh, instruction[sdoh])

        # Move the encoded prompt to the model's device, as `chat` does. The model is
        # loaded with device_map="auto", so tensors left on the CPU may not match it.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)

        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=600,
                length_penalty=1.6,
                num_beams=10,
                no_repeat_ngram_size=3,
                temperature=0.8,
                do_sample=True,
                top_k=15,
                top_p=0.95,
                repetition_penalty=2.1,
                early_stopping=True
            )

        # Keep the first returned sequence as the answer for this category.
        output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        outputs_per_note[sdoh] = output_text

    sdoh_output.append(outputs_per_note)

    

In [None]:
import random

# Start time: used for the elapsed-time report and to tag the saved output.
time_start = datetime.now()
timestamp = time_start.strftime("%Y%m%d_%H%M%S")

# Distinct (patient_id, note_id) pairs from the annotation CSV, as plain tuples.
subject_and_hadm_ids = list(
    pd.read_csv('C:/Users/salazarda/Downloads/SDOH_MIMICIII_physio_release.csv')
    .loc[:,['patient_id', 'note_id']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)
# Fetch the notes and draw two at random.
notes = random.sample(ul.get_clinical_notes_mimic3(subject_and_hadm_ids), 2)
# notes = notes[0:50]

# One prompt and one metadata record per note, in the same order.
prompts = [
    ul.sdh_prompt_guevara_v2(note_text)
    for subject_id, hadm_id, row_id, charttime, note_text in notes
]
metadata = [
    {
        "subject_id": subject_id,
        "hadm_id": hadm_id,
        "row_id": row_id,
        "charttime": charttime.isoformat() if charttime else None
    }
    for subject_id, hadm_id, row_id, charttime, note_text in notes
]

batch_size = 16
parsed_list = []
final_outputs = []

# Send the prompts to the pipeline in slices of `batch_size`.
for start in tqdm(range(0, len(prompts), batch_size)):
    stop = start + batch_size
    batch_prompts = prompts[start:stop]
    batch_meta = metadata[start:stop]
    batch_responses = pipe(batch_prompts, max_new_tokens=400)

    for meta, raw, prompt in zip(batch_meta, batch_responses, batch_prompts):
        text = raw[0]['generated_text']
        # Remove the echoed prompt (and surrounding whitespace) only when it is present.
        if prompt in text:
            text = text.replace(prompt, "").strip()
        final_outputs.append({**meta, 'text': text})

ul.save_to_jsonl(final_outputs, model_id, timestamp)

# Number of distinct patients among the processed notes.
n = len({record['subject_id'] for record in final_outputs})

print(f' ... For {n} patients and {len(notes)} notes, it took {datetime.now() - time_start} ... ')


In [None]:
notes = ul.get_notes_for_first_n_notes(1)
prompt = ul.sdh_single_prompt(notes[0][3])

In [None]:
# The pipeline returns a list of dicts; take the text of the first candidate.
# Without the indexing, `output` is a list and .replace()/.strip() raise AttributeError.
output = pipe(prompt, max_new_tokens=400)[0]['generated_text']

if prompt in output:
    print(output.replace(prompt, '').strip())
else:
    print(output.strip())

# Qwen1.5-0.5B

In [None]:
import torch
import utils_llm as ul
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, LlamaForCausalLM

In [None]:
model_id = 'I:/Qwen1.5-0.5B-LoRA-bioinstruct'
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)

In [None]:
notes = ul.get_notes_for_first_n_notes(1)
prompt = ul.sdh_single_prompt(notes[0][3])

In [None]:
prompt

In [None]:
output = pipe(prompt, max_new_tokens=400)[0]['generated_text']

if prompt in output:
    print(output.replace(prompt, '').strip())
else:
    print(output.strip())

# AWS Bedrock

In [None]:
# !pip install boto3

In [None]:
# Imported in this cell because the import cell for this section sits further down the
# notebook, so on a fresh top-to-bottom run `boto3` would otherwise be undefined here.
import boto3

svc = boto3.client("bedrock", region_name="us-east-1")

# List the foundation models and print the ids of those whose output modalities include TEXT.
resp = svc.list_foundation_models()
print("Foundation models:")
for mdl in resp.get("modelSummaries", []):
    if 'TEXT' in mdl.get('outputModalities', []):
        print(" •", mdl["modelId"])
print()

In [None]:
import boto3
import json
import utils_llm as ul

In [None]:
def main(model_id="amazon.titan-text-premier-v1:0", region_name="us-east-1"):
    """Send one SDOH prompt to an Amazon Bedrock model and print the response.

    The prompt is built with ``ul.sdh_prompt`` from element 3 of the first note
    returned by ``ul.get_notes_for_first_n_notes(1)``. The model is invoked through
    the ``bedrock-runtime`` client and the response body is printed: pretty-printed
    when it is valid JSON, otherwise as raw text.

    Args:
        model_id: Bedrock model identifier passed to ``invoke_model``. The request
            body built here uses the ``inputText`` / ``textGenerationConfig`` layout;
            other model families may need one of the alternative payloads kept in
            the comments below (assumption — confirm against the model's request schema).
        region_name: AWS region used to create the client.

    Returns:
        None. The result is printed.
    """
    client = boto3.client("bedrock-runtime", region_name=region_name)

    notes = ul.get_notes_for_first_n_notes(1)
    prompt = ul.sdh_prompt(notes[0][3])

    payload = {
        "inputText": prompt,
        "textGenerationConfig": {
            "maxTokenCount": 512,
            "temperature": 0.7
        }
    }

    # Alternative request bodies kept for reference:
    # payload = {
    #     "prompt": "\n\nHuman: " + prompt + "\n\nAssistant:",
    #     'max_tokens_to_sample': 4000
    # }

    # messages = [
    #     {"role": "system",    "content": "You are a clinical NLP assistant."},
    #     {"role": "assistant", "content": "Ready to extract Employment status_, Housing issues, Transportation needs, Parental status, Relationship status, Social support, and Substance Use from a clinical note."},
    #     {"role": "user",      "content": prompt}
    # ]

    # payload = {
    #     "messages": messages,
    #     "temperature": 0.7,
    #     "max_token_count": 256,
    #     "top_p": 0.5
    # }

    response = client.invoke_model(
        modelId=model_id,
        contentType="application/json",
        accept="application/json",
        body=json.dumps(payload).encode("utf-8")
    )

    raw = response["body"].read().decode("utf-8")
    try:
        out = json.loads(raw)
    except json.JSONDecodeError:
        # Not JSON: show the body as received.
        print("Raw:", raw)
    else:
        print(json.dumps(out, indent=2, ensure_ascii=False))
    
if __name__ == "__main__":
    main()


# Flan-T5 XL + MIMIC III

In [None]:
model_id = 'I:/Flan-t5-xl'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [None]:
# Read the annotation CSV and keep the distinct (patient_id, note_id) pairs as tuples.
subject_and_hadm_ids = pd.read_csv('C:/Users/salazarda/Downloads/SDOH_MIMICIII_physio_release.csv')
subject_and_hadm_ids = list(subject_and_hadm_ids.loc[:, ['patient_id', 'note_id']].drop_duplicates().itertuples(index=False, name=None))

# Fetch the notes for those pairs and keep the first five for a quick run.
notes = ul.get_clinical_notes_mimic3(subject_and_hadm_ids)
notes = notes[0:5]

# One instruction (definition + allowed labels) per SDOH category. The dict does not
# depend on the note or the category being processed, so it is built once here instead
# of on every inner-loop iteration. Its key order is the order the categories are queried in.
instruction = {
    'Employment status': 'Employment status: Whether the patient is currently employed, unemployed, underemployed, disability, retired, student, or unknown. LABELS: [employed, unemployed, underemployed, disability, retired, student, unknown]',
    'Housing issues': 'Housing issues: Any mention of financial status, undomiciled, other. LABELS: [financial status, undomiciled, other, unknown]',
    'Transportation issues': 'Transportation issues: Any reference to transportation difficulties such as distance, resources, other. LABELS: [distance, resources, other, unknown]',
    'Parental status':'Parental status: Whether the patient has a child under 18 years old. LABELS: [yes, no, unknown]',
    'Relationship status': 'Relationship status: Whether the patient is widowed, divorced, single. LABELS: [married, partnered, widowed, divorced, single, unknown]',
    'Social support': 'Social support: It does include informal or emotional support from family members, friends, or romantic partners unless such support is clearly mediated through a formal care plan by a social worker or case manager. LABELS: [presence, absence, unknown]'
}

sdoh_output = []

for note in notes:
    # Tuple positions 0-3 are stored as metadata; position 4 is passed to the prompt builder.
    meta = {
        'subject_id': note[0],
        'hadm_id': note[1],
        'row_id': note[2],
        'charttime': note[3].isoformat() if note[3] else None
    }

    outputs_per_note = meta.copy()  # Start with metadata

    for sdoh in tqdm(list(instruction)):
        prompt = ul.sdh_single_prompt(note[4], sdoh, instruction[sdoh])

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=600,
                length_penalty=1.6,
                num_beams=10,
                no_repeat_ngram_size=3,
                temperature=0.8,
                do_sample=True,
                top_k=15,
                top_p=0.95,
                repetition_penalty=2.1,
                early_stopping=True
            )

        # Keep the first returned sequence as the answer for this category.
        output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        outputs_per_note[sdoh] = output_text

    sdoh_output.append(outputs_per_note)

    

In [None]:
sdoh_output[0]

# ModernBERT + MIMIC III

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

model_id = "I:/modernBERT-Large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)

text = "The capital of France is [MASK]."
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)

# To get predictions for the mask:
masked_index = inputs["input_ids"][0].tolist().index(tokenizer.mask_token_id)
predicted_token_id = outputs.logits[0, masked_index].argmax(axis=-1)
predicted_token = tokenizer.decode(predicted_token_id)
print("Predicted token:", predicted_token)
# Predicted token:  Paris


# Mistral-7B-Instruct + MIMIC III

In [None]:
model_id = 'I:/Mistral-7B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, pad_token_id=tokenizer.eos_token_id, device=0)

In [None]:
import random

# Start time: used for the elapsed-time report and to tag the saved output.
time_start = datetime.now()
timestamp = time_start.strftime("%Y%m%d_%H%M%S")

# Distinct (patient_id, note_id) pairs from the annotation CSV, as plain tuples.
subject_and_hadm_ids = list(
    pd.read_csv('C:/Users/salazarda/Downloads/SDOH_MIMICIII_physio_release.csv')
    .loc[:,['patient_id', 'note_id']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)
# All notes are processed; the lines below are optional sub-sampling switches.
notes = ul.get_clinical_notes_mimic3(subject_and_hadm_ids)
# notes = random.sample(notes, 10)
# notes = notes[0:50]

# One prompt and one metadata record per note, in the same order.
prompts = [
    ul.sdh_prompt_guevara_v2(note_text)
    for subject_id, hadm_id, row_id, charttime, note_text in notes
]
metadata = [
    {
        "subject_id": subject_id,
        "hadm_id": hadm_id,
        "row_id": row_id,
        "charttime": charttime.isoformat() if charttime else None
    }
    for subject_id, hadm_id, row_id, charttime, note_text in notes
]

batch_size = 16
parsed_list = []
final_outputs = []

# Send the prompts to the pipeline in slices of `batch_size`.
for start in tqdm(range(0, len(prompts), batch_size)):
    stop = start + batch_size
    batch_prompts = prompts[start:stop]
    batch_meta = metadata[start:stop]
    batch_responses = pipe(batch_prompts, max_new_tokens=400)

    for meta, raw, prompt in zip(batch_meta, batch_responses, batch_prompts):
        text = raw[0]['generated_text']
        # Remove the echoed prompt (and surrounding whitespace) only when it is present.
        if prompt in text:
            text = text.replace(prompt, "").strip()
        final_outputs.append({**meta, 'text': text})

ul.save_to_jsonl(final_outputs, model_id, timestamp)

# Number of distinct patients among the processed notes.
n = len({record['subject_id'] for record in final_outputs})

print(f' ... For {n} patients and {len(notes)} notes, it took {datetime.now() - time_start} ... ')


In [None]:
prompts[0]

In [None]:
final_outputs

In [None]:
import re
records = []

# Matches `"<category>" : "<label>"` pairs in the generated text. The category and the
# label are captured as separate groups so they can be read directly from the match.
# Splitting the matched text on ':' left any whitespace before the colon inside the
# key (producing a column name with a trailing space) and cut off labels that
# themselves contain a colon.
pattern = re.compile(r'"(Employment status|Housing issues|Transportation issues|Parental status|Relationship status|Social support)"\s*:\s*"([^"]+)"')

for entry in final_outputs:
    base = {"subject_id": entry["subject_id"], "hadm_id": entry["hadm_id"], "charttime": entry["charttime"], "row_id": entry["row_id"]}
    # findall yields (category, label) tuples; when a category occurs more than once
    # the last occurrence wins.
    sdhs = dict(pattern.findall(entry["text"]))
    base.update(sdhs)
    records.append(base)

# One row per generated output; string cells are lower-cased and stripped.
df = pd.DataFrame(records)
df = df.map(lambda x: x.lower().strip() if isinstance(x, str) else x)

In [None]:
df

In [None]:
# import re
# records = []

# pattern = re.compile(r'SDH_([^:]+): \[([^\]]+)\]')

# for entry in final_outputs:
#     base = {"subject_id": entry["subject_id"], "hadm_id": entry["hadm_id"], "charttime": entry["charttime"], "row_id": entry["row_id"]}
#     matches = pattern.findall(entry["text"])
#     sdhs = {label.strip(): value for label, value in matches}
#     base.update(sdhs)
#     records.append(base)
    
# df = pd.DataFrame(records)
# df = df.map(lambda x: x.lower() if isinstance(x, str) else x)

In [None]:
# SDOH columns expected in the parsed predictions frame.
sdh_fields = [
    'Housing issues',
    'Employment status',
    'Transportation issues',
    'Parental status',
    'Relationship status',
    'Social support',
]

# For every field, an identity mapping over the distinct values found in `df`.
all_map = {}
for field in sdh_fields:
    observed = df[field].unique()
    all_map[field] = dict(zip(observed, observed))
all_map

In [None]:
map_sdoh = {
            # [financial status, undomiciled, other, unknown]
            'Housing issues': {'unknown': 'unknown',
              'financial status': 'financial status',
              'other': 'other',
              'subsidized': 'financial status'},
            # [employed, unemployed, underemployed, disability, retired, student, unknown]
            'Employment status': {'unknown': 'unknown',
              'disability': 'disability',
              'retired': 'retired',
              'unemployed': 'unemployed',
              'deferred': 'unemployed',
              'employed': 'employed',
              'student': 'student',
              'employed,disabled': 'underemployed'},
            # [distance, resources, other, unknown]
            'Transportation issues': {'unknown': 'unknown',
              'resources': 'resources',
              'other': 'other'},
            'Parental status': {'unknown': 'unknown', 'yes': 'yes', 'no': 'no'},
            # [married, partnered, widowed, divorced, single, unknown]
            'Relationship status': {'unknown': 'unknown',
              'married': 'married',
              'divorced': 'divorced',
              'partnered': 'partnered',
              'estranged': 'unknown',
              'widowed': 'widowed',
              'family': 'unknown',
              'brother': 'unknown',
              'separated': 'divorced',
              'daughter': 'unknown'},
            'Social support': {'unknown': 'unknown',
              'presence': 'plus',
              'absence': 'minus'}}

In [None]:
# NOTE(review): this cell reassigns `df` and `sdh_fields`, so it is not safe to re-run
# without first re-running the parsing and mapping cells.

# Normalize the predicted labels with the hand-written mapping; values missing from
# the mapping become NaN and are treated as 'unknown' at scoring time.
for field in sdh_fields:
    df[f'{field}'] = df[f'{field}'].map(map_sdoh[f'{field}'])
df = df.map(lambda x: x.replace("'", "") if isinstance(x, str) else x)
df_pred = df.rename(columns={'Employment status': 'EMPLOYMENT', 'Housing issues': 'HOUSING', 'Transportation issues': 'TRANSPORTATION', 'Parental status': 'PARENT', 'Relationship status': 'RELATIONSHIP', 'Social support': 'SUPPORT'})

# Reference annotations: columns from position 5 onward are collapsed per name prefix.
subject_and_hadm_ids = pd.read_csv('C:/Users/salazarda/Downloads/SDOH_MIMICIII_physio_release.csv')
df = subject_and_hadm_ids.iloc[:,5:].copy()

prefixes = set(c.split("_",1)[0] for c in df.columns if "_" in c)

for p in prefixes:
    df[p] = ul.collapse_onehot_group(df, p)
df_real = df.drop(columns=[c for c in df.columns if "_" in c])

# Re-attach the first five columns, drop rows whose collapsed columns are all 0, and
# join with the predictions on (patient, note).
df_real_ = pd.concat([subject_and_hadm_ids.iloc[:,0:5], df_real], axis=1)
df_real_ = df_real_.loc[~(df_real_.iloc[:,5:] == 0).all(axis=1)]
df_real_pred = pd.merge(df_real_, df_pred, left_on=['patient_id', 'note_id'], right_on=['subject_id', 'row_id'], how='inner')

sdh_fields = [
    "EMPLOYMENT",
    "HOUSING",
    "TRANSPORTATION",
    "PARENT",
    "RELATIONSHIP",
    "SUPPORT"
]

for field in sdh_fields:
    # After the merge, `<field>_x` is the reference column and `<field>_y` the prediction.
    truth_col = f'{field}_x'
    pred_col = f'{field}_y'
    df_ = df_real_pred.loc[:,['patient_id', 'note_id', truth_col, pred_col]].drop_duplicates()
    df_ = df_.sort_values(['patient_id', 'note_id', truth_col],ascending=False).groupby(['patient_id', 'note_id'], as_index=False).first()
    # Recode only the reference column. Applying the recode to the whole frame also
    # rewrote the predictions: for PARENT every predicted label (none of which equals 0)
    # was turned into 'yes', which made the report meaningless.
    if field == 'PARENT':
        df_[truth_col] = df_[truth_col].map(lambda x: 'no' if x == 0 else 'yes')
    else:
        df_[truth_col] = df_[truth_col].map(lambda x: 'unknown' if x == 0 else x)
    # Missing or unmapped predictions count as 'unknown' rather than reaching the
    # report as NaN.
    df_[pred_col] = df_[pred_col].fillna('unknown')
    y_true = df_[truth_col]
    y_pred = df_[pred_col]
    # print(f'accuracy for {field}: {accuracy_score(y_true, y_pred)}')
    print(f'{field}... ')
    print(classification_report(y_true, y_pred))
    print('_____________________________________________')

---------------------------------------------------------

# OLD CODE

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

full_prompt = ul.sdh_prompt(notes[0][3])

inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=5000).to(device)
output = model.generate(
    **inputs,
    max_new_tokens=5000,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.2,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id 
)

In [None]:
# Decode with `tokenizer`, the object that encoded the inputs for this generation;
# `tok` is a different tokenizer created in other cells (or undefined on a fresh kernel).
print(tokenizer.decode(output[0], skip_special_tokens=True))

In [None]:
extracted_text = tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
extracted_text

In [None]:
import utils_llm as ul
from tqdm import tqdm
from datetime import datetime

# Wall-clock start; the elapsed time is reported at the end of the cell.
time_start = datetime.now()
n = 5
all_outputs = []
notes = ul.get_notes_for_first_n_notes(n)

# One prompt and one metadata record per note, kept index-aligned.
prompts, metadata = [], []
for subject_id, hadm_id, charttime, note_text in notes:
    prompts.append(ul.sdh_prompt(note_text))
    metadata.append(
        {
            "subject_id": subject_id,
            "hadm_id": hadm_id,
            # Stored as an ISO-8601 string, or None when there is no chart time.
            "charttime": charttime.isoformat() if charttime else None,
        }
    )

batch_size = 16
final_outputs = []

# Walk the prompts in slices of `batch_size`, parse every response and merge it
# with the metadata of the note it belongs to.
for start in tqdm(range(0, len(prompts), batch_size)):
    stop = start + batch_size
    batch_prompts = prompts[start:stop]
    batch_meta = metadata[start:stop]

    batch_responses = pipe(batch_prompts, max_new_tokens=400)

    final_outputs.extend(
        {**meta, **ul.parse_sdh_response(raw[0]['generated_text'])}
        for meta, raw in zip(batch_meta, batch_responses)
    )

ul.save_to_jsonl(final_outputs, model_id)

print(f' ... For {n} patients and {len(notes)} notes, it took {datetime.now() - time_start} ... ')


In [None]:
# Run the pipeline on the first prepared prompt only and display the raw return value.
pipe(prompts[0], max_new_tokens=400)

In [None]:
from transformers import AutoConfig

# Load just the configuration from the local BioMistral-7B directory.
config = AutoConfig.from_pretrained("I:/BioMistral-7B")
# Show the `architectures` entry recorded in that configuration.
print(config.architectures)

In [None]:
from transformers import pipeline

# Let the pipeline load the model straight from the local directory.
generator = pipeline("text-generation", model="I:/BioMistral-7B")

# Free-form smoke-test question.
prompt = "What are the latest treatments for glioblastoma?"
output = generator(prompt, max_new_tokens=512)

# Print the text of the first returned generation.
print(output[0]['generated_text'])


In [None]:
# Rebuild the notebook-level `pipe` from the current `model` / `tokenizer`, pinned to device 0.
# NOTE(review): if `model` was loaded with device_map="auto", passing device=0 as well may
# conflict with that placement — confirm.
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)

In [None]:
# Free-form smoke-test prompt; pass it to the pipeline (the cell previously sent
# prompts[0] instead, leaving this string unused).
prompt = "Explain step-by-step how mRNA vaccines work."
pipe(prompt, max_new_tokens=400)

In [None]:
# Second free-form smoke-test prompt, sent through the current `pipe`.
prompt = "Explain  how are the latest treatments for glioblastoma."
pipe(prompt, max_new_tokens=400)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import torch

# Load tokenizer and model from the local BioMistral-7B directory.
# Note: this rebinds the notebook-level `model`.
name = "I:/BioMistral-7B"
tok = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(
    name,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Two-turn conversation: a system instruction followed by the user question.
msgs = [
    {"role": "system", "content": "You are a helpful biomedical assistant."},
    {"role": "user", "content": "What are the latest treatments for glioblastoma?"},
]

# Render the conversation to a single prompt string (tokenize=False) with the
# generation prompt appended.
prompt = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

# Optional: prints tokens as they are produced.
streamer = TextStreamer(tok)
inputs = tok(prompt, return_tensors="pt").to(model.device)

out = model.generate(
    **inputs,
    streamer=streamer,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    max_new_tokens=400,
)

# Print the full decoded sequence once more after streaming has finished.
print(tok.decode(out[0], skip_special_tokens=True))


In [None]:
# Build the SDH prompt around a short hand-written sample note.
prompt = ul.sdh_prompt("The patient is a 45-year-old female with no fixed address and a history of alcohol use disorder...")  # Short sample

# Tokenize with truncation and move the tensors to the model's device.
# NOTE(review): `inputs` does not appear to be consumed by the following cell, which calls pipe(prompt) directly.
inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)


In [None]:
# Generate up to 256 new tokens for the current `prompt` and display the raw result.
pipe(prompt, max_new_tokens=256)

In [None]:
def build_metadata_log(final_outputs, variables=None):
    """Summarise a batch of parsed SDOH records as a JSON-friendly dict.

    Parameters
    ----------
    final_outputs : sized iterable of dict
        One record per note. Each record must have a ``'subject_id'`` key, may
        have a ``'charttime'`` key, and may have one entry per SDOH variable
        whose value is a mapping with a ``'present'`` key.
    variables : iterable of str, optional
        Names of the SDOH variables to count. Defaults to the notebook-level
        ``SDOH_VARIABLES`` so existing one-argument calls keep working.

    Returns
    -------
    dict
        ``processing_time`` (ISO timestamp of this call), ``num_patients``
        (distinct ``subject_id`` values), ``num_notes`` (``len(final_outputs)``),
        ``variables_used``, ``num_yes`` / ``num_no`` (how many variable entries
        have ``present`` equal to ``"Yes"`` / ``"No"``), and ``charttime_range``
        with the smallest and largest truthy ``charttime`` (``None`` if there
        is none).

    Raises
    ------
    KeyError
        If a record lacks ``'subject_id'`` or a variable entry lacks ``'present'``.
    """
    if variables is None:
        # Backward-compatible default: fall back to the notebook-level list.
        variables = SDOH_VARIABLES

    yes_count = 0
    no_count = 0
    charttimes = []
    unique_subjects = set()

    for record in final_outputs:
        unique_subjects.add(record['subject_id'])

        charttime = record.get('charttime')
        if charttime:
            charttimes.append(charttime)

        for var in variables:
            if var in record:
                present = record[var]['present']
                # Any value other than the exact strings "Yes"/"No" is not counted.
                if present == "Yes":
                    yes_count += 1
                elif present == "No":
                    no_count += 1

    return {
        "processing_time": datetime.now().isoformat(),
        "num_patients": len(unique_subjects),
        "num_notes": len(final_outputs),
        "variables_used": variables,
        "num_yes": yes_count,
        "num_no": no_count,
        # min/max give the same endpoints as sorting, without the full sort.
        "charttime_range": {
            "min": min(charttimes, default=None),
            "max": max(charttimes, default=None),
        },
    }


In [None]:
# Summarise the parsed outputs of the batch run.
metadata_log = build_metadata_log(final_outputs)

# Persist the summary as indented JSON in the working directory.
with open("sdoh_processing_log.json", "w", encoding="utf-8") as log_file:
    json.dump(metadata_log, log_file, indent=2)

In [None]:
# Display the raw pipeline responses of the most recently processed batch.
batch_responses

In [None]:
# Write the parsed outputs via the utils helper.
# NOTE(review): the batch cell calls ul.save_to_jsonl(final_outputs, model_id) with a
# second argument — confirm the helper accepts a one-argument call.
ul.save_to_jsonl(final_outputs)

In [None]:
# Run the SDH analysis helper for one subject.
# NOTE(review): analyze_sdh_for_subject is not defined in the visible part of this
# notebook — confirm it exists before this cell on a fresh run.
analyze_sdh_for_subject(subject_id=15005348)

In [None]:
import utils_llm as ul
# Fetch and display the clinical note returned for id 10000032.
ul.get_clinical_note(10000032)

In [None]:
def sdh_prompt(note_text):
    """Build the SDH-extraction prompt for one clinical note.

    The prompt lists seven social determinants of health, embeds ``note_text``
    between triple-quote delimiters, and ends with the line-per-item answer
    format the model is asked to follow.

    Args:
        note_text: Text of the clinical note to embed in the prompt.

    Returns:
        The full prompt as a string. Every line keeps the indentation of the
        template below.
    """
    prompt = f"""
    You are a clinical NLP assistant. Analyze the following clinical note and indicate whether each of the following seven social determinants of health (SDH) is specifically mentioned:
        
    1. **Employment status**: Whether the patient is currently employed, unemployed, retired, on disability, or has a job title or income source.
    2. **Housing issues**: Any mention of homelessness, unstable housing, living in shelters, or housing concerns (e.g., can't afford rent, frequent moves).
    3. **Transportation needs**: Any reference to transportation difficulties, lack of car access, reliance on public transit, missed appointments due to transportation.
    4. **Parental status**: Whether the patient has children or dependents, or is a caregiver to minors.
    5. **Relationship status**: Whether the patient is married, divorced, single, has a partner, or is widowed.
    6. **Social support**: Whether the patient is receiving formal help or assistance from a **social worker**, also extracts the name of the service.
    7. **Substance Use**: Any mention of alcohol, drug, or tobacco use, including current use, past use, or explicit denial of use.
    
    Answer with **"Yes" or "No"** for each item, and include a **short evidence sentence**. If not mentioned, say: *There is no evidence.* 
    
    ---
    
    Now analyze the following clinical note:
    
    \"\"\"
    {note_text}
    \"\"\"
    
    Respond in this format:
    
    Employment status: [Yes/No] - [short evidence sentence]
    Housing issues: [Yes/No] - [short evidence sentence]
    Transportation needs: [Yes/No] - [short evidence sentence]
    Parental status: [Yes/No] - [short evidence sentence]
    Relationship status: [Yes/No] - [short evidence sentence]
    Social support: [Yes/No] - [short evidence sentence]
    Substance Use: [Yes/No] - [short evidence sentence]
    """
    return prompt

In [None]:
# Build the prompt with the utils_llm version of sdh_prompt from the note fetched for id 10000032.
prompt = ul.sdh_prompt(ul.get_clinical_note(10000032))

In [None]:
# Display the current prompt string.
prompt

In [None]:
# Inspect which keys the first result dict returned by the pipeline exposes.
pipe(prompt, max_new_tokens=400)[0].keys()