In [None]:
import glob

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

In [None]:
# Language Model Processing Pipeline for 'chiefcomplaint'
# 
# This notebook describes the pipeline for processing the 'chiefcomplaint' column using language models.

In [None]:
# Read the gzip-compressed triage table; it provides the 'chiefcomplaint' column.
ed_triage_df = pd.read_csv(filepath_or_buffer='triage.csv.gz', compression='gzip')

In [None]:
# Helper function to generate prompts for language models.
# We use a simple formulation by prepending "Chief Complaint: " to each entry in 'chiefcomplaint'.
# This aligns with the pretraining format of GatorTron, which was trained on concise clinical notes.

def text_gen(x):
    """Build the language-model prompt for a single chief-complaint value.

    Args:
        x: The raw 'chiefcomplaint' entry. It is rendered with default string
            formatting, so a missing value (float NaN) becomes the literal
            text "nan" and None becomes "None".

    Returns:
        The string "Chief Complaint: " followed by the formatted value.
    """
    prefix = "Chief Complaint: "
    return prefix + format(x)

In [10]:
# Helper function to convert the 'chiefcomplaint' column into batches of prompts.
# This is useful for efficiently feeding data into language models that process inputs in batches.
# 
# Arguments:
# - df: A pandas Series of 'chiefcomplaint' strings (e.g. ed_triage_df["chiefcomplaint"])
# - m: Batch size (number of prompts per batch)
# 
# Returns:
# - A list of lists, where each sublist is a batch of formatted prompt strings.

def converter(df, m):
    """Split chief-complaint values into batches of formatted prompts.

    Args:
        df: The chief complaints to convert. Normally a pandas Series such as
            ``ed_triage_df["chiefcomplaint"]``; any other iterable of values
            (list, tuple, ...) works too. If a DataFrame is passed, its
            'chiefcomplaint' column is used.
        m: Batch size (maximum number of prompts per batch). Must be a
            positive integer.

    Returns:
        A list of lists of prompt strings (each built with ``text_gen``), in
        the original row order. Every batch holds ``m`` prompts except
        possibly the last one, which holds the remainder. Empty input
        yields ``[]``.

    Raises:
        ValueError: If ``m`` is not positive.
    """
    # A zero batch size used to surface as a ZeroDivisionError from the modulo
    # test, and a negative one silently behaved like abs(m); reject both early.
    if m <= 0:
        raise ValueError(f"batch size m must be a positive integer, got {m!r}")

    # Iterating a DataFrame yields column labels, not rows, so pick the column
    # explicitly instead of turning labels (or whole rows) into prompts.
    if isinstance(df, pd.DataFrame):
        df = df["chiefcomplaint"]

    # Iterating a Series yields its values in positional order, which avoids
    # one scalar .iloc lookup per row.
    prompts = [text_gen(value) for value in df]

    # Consecutive slices of length m; the final slice carries the remainder.
    return [prompts[start:start + m] for start in range(0, len(prompts), m)]


In [None]:
# We're now ready to work with language models!
# Pick the device for inference: the GPU when CUDA is available, the CPU otherwise.
# The model we use is not large, so the CPU fallback is workable.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


In [None]:
# Helper function to run inference on batches of prompts and extract embeddings.
# 
# Each prompt is passed through a language model, and the [CLS] token embedding (first token)
# is used as the representation. The function returns a pandas Series of embeddings.
#
# Arguments:
# - batch_lst: List of batches (each batch is a list of prompt strings)
# - tokenizer: Tokenizer corresponding to the language model
# - model: Language model (here we use GatorTron)
# - device: Device to run inference on ("cuda" or "cpu")
#
# Returns:
# - A pandas Series where each entry is a 1D numpy array (embedding vector) for a prompt.

def inference_prompt(batch_lst, tokenizer, model, device="cuda" if torch.cuda.is_available() else "cpu"):
    """Embed every prompt with the language model and return the [CLS] vectors.

    Args:
        batch_lst: List of batches; each batch is a list of prompt strings.
        tokenizer: Callable tokenizer matching ``model``. It is invoked with
            padding enabled, truncation to at most 128 tokens, and PyTorch
            tensors as the return type.
        model: Language model whose output exposes ``last_hidden_state``.
            It is used as-is: this function neither moves it to ``device``
            nor changes its train/eval mode.
        device: Device the tokenized inputs are moved to. The default value
            is evaluated once, when the function is defined.

    Returns:
        A pandas Series with one entry per prompt, in input order. Each entry
        is the hidden state at sequence position 0, as a 1D numpy array.
    """
    embeddings = []
    for prompts in tqdm(batch_lst):
        # Tokenize the whole batch at once, then move every tensor to the target device.
        encoded = tokenizer(prompts, padding=True, return_tensors="pt", truncation=True, max_length=128)
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        # Gradients are not needed for feature extraction.
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
            # Position 0 of each sequence is the [CLS] token; bring it back to the host.
            cls_vectors = hidden_states[:, 0, :].cpu().numpy()

        # Store one row per prompt of this batch.
        embeddings.extend(cls_vectors[row, :] for row in range(len(prompts)))

    return pd.Series(embeddings)

        
        


In [None]:
# All helper functions are defined, so we can put them to work.
# Group the 'chiefcomplaint' column into batches of 500 prompts each.

lst_prompt = converter(df=ed_triage_df["chiefcomplaint"], m=500)

In [None]:
# GatorTron is the language model used for the embeddings.
# The reason behind this choice is discussed in our presentation slides.

model_name = "UFNLP/gatortron-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# .to(device) returns the module it moved, so loading and placement can be chained.
model = AutoModel.from_pretrained(model_name).to(device)
# Keep the model as the cell's last expression so its architecture is displayed.
model

MegatronBertModel(
  (embeddings): MegatronBertEmbeddings(
    (word_embeddings): Embedding(50176, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MegatronBertEncoder(
    (layer): ModuleList(
      (0-23): 24 x MegatronBertLayer(
        (attention): MegatronBertAttention(
          (ln): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
          (self): MegatronBertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): MegatronBertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
  

In [23]:
# Embed every prompt batch with the model; `res` ends up with one embedding per prompt.
res = inference_prompt(batch_lst=lst_prompt, tokenizer=tokenizer, model=model, device=device)

100%|██████████| 851/851 [13:04<00:00,  1.08it/s]


In [24]:
# Save the embeddings to a CSV file for later use.
#
# `res` holds one 1D numpy array per prompt. Writing that Series directly would
# store str(array) in a single column, and numpy abbreviates arrays with more
# than 1000 elements to "[a b c ... x y z]" when printing them. The embeddings
# here have 1024 dimensions, so almost every value would be lost (and the rest
# rounded to print precision).
# Instead, stack the vectors into an (n_prompts, hidden_size) matrix and write
# one numeric column per embedding dimension: emb_0, emb_1, ...
# Row order matches the row order of the prompts.
embedding_matrix = np.stack(res.tolist())
embedding_df = pd.DataFrame(
    embedding_matrix,
    columns=[f"emb_{dim}" for dim in range(embedding_matrix.shape[1])],
)
embedding_df.to_csv("chiefcomplaint_embeddings.csv", index=False)
