#### This code reads a complete clinical note and determines whether a social determinant of health (SDOH) is mentioned anywhere in it. If an SDOH is present, the LLM should return the character positions of the text in the clinical note that mentions that SDOH.

In [1]:
import pandas as pd
import openai
import os
import pickle
from openai import AzureOpenAI
import sys
import json
import time

In [2]:
# Put the parent of the current working directory on the import path.
parent_dir = os.path.join(os.getcwd(), '..')
sys.path.append(parent_dir)

In [3]:
from prompts import *

In [4]:
# Column holding the unique note identifier and column holding the note text.
UNIQUE_ID_COLUMN_NAME = 'ROW_ID'
UNIQUE_TEXT_COLUMN_NAME = 'TEXT'

# SDOH label columns. Both use the same encoding:
#   Economics   -> 0: None, 1: True [Non-Adverse], 2: False [Adverse]
#   Environment -> 0: None, 1: True [Non-Adverse], 2: False [Adverse]
UNIQUE_LABEL_COLUMN_NAMES = [
    'sdoh_economics',
    'sdoh_environment',
]

In [5]:
# Paths to the input CSV files.
# Point the first two at your MIMIC-III folder and the last one at your
# MIMIC-SBDH folder.
MIMIC_ADMISSION_CSV = '../ahsan_data/ADMISSIONS.csv'
MIMIC_NOTEEVENTS_CSV = '../ahsan_data/NOTEEVENTS.csv'
MIMIC_SBDH = '../ahsan_data/MIMIC-SBDH.csv'

In [6]:
# Load the admissions table and the note-events table.
df = pd.read_csv(MIMIC_ADMISSION_CSV)
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns (DtypeWarning).
notes_df = pd.read_csv(MIMIC_NOTEEVENTS_CSV, low_memory=False)

  notes_df = pd.read_csv(MIMIC_NOTEEVENTS_CSV)


In [10]:
# Build `annotated_sh`: the discharge summaries that have MIMIC-SBDH
# annotations, joined with their SDOH label columns.

discharge_df = notes_df[notes_df['CATEGORY'] == 'Discharge summary']

# Pass the path straight to pandas so it opens the file read-only and closes
# it again itself; no file handle is left open.
sbdh_data = pd.read_csv(MIMIC_SBDH, encoding='UTF-8', on_bad_lines='warn')
# Align the annotation id column with the id column name used by the notes.
sbdh_data = sbdh_data.rename(columns={'row_id': UNIQUE_ID_COLUMN_NAME})

# Keep only the discharge summaries whose id appears in the annotation table.
annotated_list = sbdh_data[UNIQUE_ID_COLUMN_NAME].tolist()
annotated_notes = discharge_df[discharge_df[UNIQUE_ID_COLUMN_NAME].isin(annotated_list)]

# Left join: every annotated note is kept and gains the label columns.
annotated_sh = pd.merge(
    annotated_notes,
    sbdh_data[[UNIQUE_ID_COLUMN_NAME] + UNIQUE_LABEL_COLUMN_NAMES],
    on=UNIQUE_ID_COLUMN_NAME,
    how='left',
)

# Rebind the source tables and intermediates to None so that only
# `annotated_sh` keeps data alive from here on.
df = newborn_list = notes_df = discharge_df = non_neonatal = annotated_list = annotated_notes = annotated_subjects = sbdh_data = None

In [11]:
# Read the Azure OpenAI connection settings from a local JSON file.
with open('../azure_credentials.json', 'r') as file:
    azure_data = json.load(file)

# Pull out the four settings used below.
api_key = azure_data['API_KEY']
api_version = azure_data['API_VERSION']
azure_endpoint = azure_data['AZURE_ENDPOINT']
azure_deployment_name = azure_data['AZURE_DEPLOYMENT_NAME']

# Client used for every request in this notebook.
client = AzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint)

# The deployment name is the custom name chosen when the model was deployed.
# NOTE(review): this notebook calls the chat-completions API, so the deployment
# presumably needs a chat-capable model rather than a completions-only one such
# as gpt-35-turbo-instruct — confirm.
deployment_name = azure_deployment_name

# Builds the two-message chat prompt: the system instruction followed by the user query
def create_prompt(system_message, user_message):
    """Return the chat messages for one request.

    Args:
        system_message: Text placed in the ``system`` role message.
        user_message: Text placed in the ``user`` role message.

    Returns:
        A list of two ``{"role": ..., "content": ...}`` dicts, system first.
    """
    role_content_pairs = (("system", system_message), ("user", user_message))
    return [{"role": role, "content": content} for role, content in role_content_pairs]

# This function sends the prompt to the GPT model
def send_message(message, model_name, max_response_tokens=500):
    """Send a chat prompt to the Azure OpenAI deployment and return the reply text.

    Args:
        message: List of role/content dicts, e.g. as built by ``create_prompt``.
        model_name: Name of the deployment to query.
        max_response_tokens: Upper bound on the number of generated tokens.

    Returns:
        The content of the first choice with surrounding whitespace removed,
        or an empty string when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model_name,
        messages=message,
        temperature=0,
        max_tokens=max_response_tokens,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )

    # `content` is optional in the API response; guard against None so the
    # call does not fail with AttributeError on `.strip()`.
    content = response.choices[0].message.content
    return content.strip() if content is not None else ""

In [20]:
# Prompt template for the detection step. The `{free_text}` placeholder is
# filled in with `str.format` before the prompt is sent. The model is asked
# for a YES/NO answer plus Start/End character-position lists.
# NOTE(review): the template asks the model for character offsets into the
# note; confirm the returned offsets really index into the text before
# relying on them.
step1_query_ahsan_sdoh_place_extraction = '''
You are given a free-text clinical note (<<text>>) from electronic health records. For each of the following categories, determine the attribute that accurately describes the status of the patient at the time of <<text>>. Do not infer the impact of <<text>> onto the attributes, instead focus on the factual information only which is present at the time of <<text>>.
The category is shown in double quotes and the attributes are shown in square brackets:

1. "EMPLOYMENT": ['Non-Adverse', 'Adverse'];
2. "HOUSING": ['Non-Adverse', 'Adverse'];

If a valid attribute is determined for any category, you also need to record the starting/ending character position within the <<text>> that signifies the attribute using following rules:
- For no detection, simply return empty lists e.g., Start: [], End: []
- For single/multiple sub-text position, return in a list format separated by commas with each position in the start/end list defining the portion of subtext within <<text>> e.g., Start: [1, 6], End: [10, 15]

In your final response, you must consolidate this information in a brief reply i.e., "YES" if a valid attribute is determined for atleast one category, else "NO". Also return the starting/ending character position lists within <<text>> which signify the attribute.

Input: <<{free_text}>>
Answer: YES/NO
Start: []
End: []
'''

In [27]:
# Run the detection prompt on a single annotated note (the row labelled 2).
# The text column is addressed through the shared constant rather than a
# hard-coded name, matching how the id column is handled elsewhere.
free_text = annotated_sh[UNIQUE_TEXT_COLUMN_NAME][2]
system_message = "You are an information extract tool that follows instructions very well and is specifically trained to extract social determinants of health elements from hospital generated free-text."
# Insert the note into the prompt template, then wrap it as chat messages.
user_message = step1_query_ahsan_sdoh_place_extraction.format(free_text=free_text)
openai_message = create_prompt(system_message, user_message)
response = send_message(openai_message, deployment_name)

In [28]:
# Display the note that was sent to the model.
free_text



In [29]:
# Show the raw model reply (YES/NO plus the Start/End position lists).
print(response)

YES
Start: [1029, 1035]
End: [1039, 1044]


In [30]:
# Spot-check a hard-coded character range of the note.
# NOTE(review): 1079:1093 does not match the Start/End offsets in the recorded
# model reply (1029-1044) — confirm which span was meant to be inspected.
print(free_text[1079:1093])

e pancreatitis


In [31]:
# Total number of characters in the note.
len(free_text)

15696