In [56]:
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration
import torch
import pandas as pd
import glob
from read_csv_gz import read_csv_gz
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
# Load the six emergency-department tables from gzip-compressed CSV files under ed_data/.
# read_csv_gz is imported from the read_csv_gz module (presumably project-local); it
# appears to return pandas DataFrames, since later cells use .shape and column
# selection followed by .head() on ed_triage_df — TODO confirm.
ed_diagnosis_df = read_csv_gz("ed_data/diagnosis.csv.gz")
ed_edstays_df = read_csv_gz("ed_data/edstays.csv.gz")
ed_medrecon_df = read_csv_gz("ed_data/medrecon.csv.gz")
ed_pyxis_df = read_csv_gz("ed_data/pyxis.csv.gz")
ed_triage_df = read_csv_gz("ed_data/triage.csv.gz")
ed_vitalsign_df = read_csv_gz("ed_data/vitalsign.csv.gz")


In [3]:
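# Disabled alternative prompt builder (kept for reference): asks for a 1-10
# diagnostic-complexity score for the chief complaint instead of a plain sentence.
# def text_gen(x):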
#     return f"You are a medical expert assisting in an Emergency Department (ED). Your task is to assess the **diagnostic complexity** of the following chief complaint on a scale of 1-10:\n- **1-3 (Low Complexity):** Easily diagnosable, requires minimal tests.\n- **4-6 (Moderate Complexity):** Needs some testing, single specialty.\n- **7-10 (High Complexity):** Requires multiple tests, possible admission, specialist consults.\nChief Complaint: **{x}**\nPlease only return an integer of Complexity Score (1-10):"

In [3]:
def text_gen(x):
    """Build the sentence that is fed to the encoder for one chief complaint.

    Args:
        x: The chief-complaint value. It is inserted using Python's default
            formatting, so non-string values (e.g. ``None`` or NaN) appear as
            their string form.

    Returns:
        The string ``"The patient is suffering from "`` followed by ``x``.
    """
    template = "The patient is suffering from {}"
    return template.format(x)

In [4]:
# Number of rows in the triage table (one prompt is later built per row).
print(len(ed_triage_df))

425087


In [5]:
def converter(df, m, text_fn=None):
    """Turn a column of values into ordered batches of prompt strings.

    Every element ``df.iloc[i]`` is passed through ``text_fn`` and the results
    are grouped, in their original order, into lists of ``m`` items. The final
    list holds the remainder and may be shorter than ``m``.

    Args:
        df: A pandas object exposing ``.shape`` and ``.iloc`` (the notebook
            passes a Series of chief complaints).
        m: Batch size; must be positive.
        text_fn: Callable that maps one element to its prompt. Defaults to the
            notebook-level ``text_gen``.

    Returns:
        A list of batches, each batch being a list of ``text_fn`` results.
        Empty when ``df`` has no rows.

    Raises:
        ValueError: If ``m`` is not positive (previously ``m == 0`` failed with
            a ``ZeroDivisionError`` inside the loop and a negative ``m`` was
            silently treated like ``abs(m)``).
    """
    if m <= 0:
        raise ValueError(f"m must be a positive batch size, got {m!r}")
    if text_fn is None:
        # Looked up at call time so a text_gen re-defined in a later cell is honoured.
        text_fn = text_gen

    batches = []
    current = []
    for i in range(df.shape[0]):
        current.append(text_fn(df.iloc[i]))
        # A batch is complete as soon as it holds m prompts.
        if len(current) == m:
            batches.append(current)
            current = []
    # Keep the trailing, partially filled batch (if any).
    if current:
        batches.append(current)
    return batches

In [6]:
# Choose the compute device for the notebook: GPU when CUDA is usable, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [77]:
def inference_prompt(batch_lst, tokenizer, model, device="cuda" if torch.cuda.is_available() else "cpu"):
    """Embed every prompt in ``batch_lst`` and return one vector per prompt.

    Each batch is tokenized with padding and truncation to at most 128 tokens,
    the resulting tensors are moved to ``device``, and the batch is run through
    ``model`` with gradient tracking disabled. For every prompt the hidden
    state at sequence position 0 of ``last_hidden_state`` is kept (the [CLS]
    position for BERT-style tokenizers).

    Args:
        batch_lst: Iterable of batches, each a list of prompt strings.
        tokenizer: Callable tokenizer returning a mapping of tensors.
        model: Encoder whose output exposes ``last_hidden_state``. Only the
            inputs are moved to ``device`` here; the model itself is not moved.
        device: Target device for the inputs. The default is evaluated once,
            when the function is defined.

    Returns:
        A ``pandas.Series`` with one entry per prompt, in input order; each
        entry is a 1-D NumPy array taken from the model's hidden states.
    """
    embeddings = []
    for prompts in tqdm(batch_lst):
        n_prompts = len(prompts)
        encoded = tokenizer(prompts, padding=True, return_tensors="pt", truncation=True, max_length=128)
        on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            hidden_states = model(**on_device).last_hidden_state
            # Keep only the first sequence position and bring it back to host memory.
            first_token = hidden_states[:, 0, :].cpu().numpy()
        embeddings.extend(first_token[row, :] for row in range(n_prompts))

    return pd.Series(embeddings)
        
        


In [78]:
# Trial run on a small subset: the first 234 chief complaints, grouped into
# batches of 10 prompts (the last batch holds the remaining 4).
lst_prompt = converter(
    ed_triage_df["chiefcomplaint"].iloc[:234],
    m=10,
)

In [79]:
# Model identifier shared by both from_pretrained calls so the tokenizer and the
# encoder come from the same checkpoint. Printing the loaded model shows a
# BertModel with 12 encoder layers and a hidden size of 768.
model_name = "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [80]:
# Re-derive the compute device and move the encoder's weights onto it.
# The trailing expression is left bare so the cell still echoes the model structure.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [81]:
# Embed the prepared prompt batches. Reuse the notebook-level `device` (the one the
# model was moved to) rather than re-deriving it, so the inputs and the model
# cannot end up on different devices.
res = inference_prompt(lst_prompt, tokenizer, model, device=device)

100%|██████████| 24/24 [00:01<00:00, 13.49it/s]


In [82]:
# Sanity check: size of one embedding vector, then the number of embedded prompts.
print(res.iloc[0].shape, res.shape, sep="\n")

(768,)
(234,)
