In [None]:
import os, sys
import openai
import json
import numpy as np
import pickle
import pandas as pd
from time import sleep
import signal
import tiktoken
from sklearn.metrics import classification_report, confusion_matrix, cohen_kappa_score

## Extract Data into Accepted Format
### Unannotated DataFrame Necessary Columns:
- <b>ID</b> Unique identifier number for ease of reference
- <b>TEXT</b> This column should hold all relevant text that should be annotated by AnnotateGPT

##### Example Unannotated Sample
<table>
  <tr>
    <th><center>ROW_ID</center></th>
    <th><center>TEXT</center></th>
  </tr>
  <tr>
    <td><center>1</center></td>
    <td><center>Patient...</center></td>
  </tr>
  <tr>
    <td><center>...</center></td>
    <td><center>...</center></td>
  </tr>
  <tr>
    <td><center>2038</center></td>
    <td><center>The patient comes from...</center></td>
  </tr>
</table>

### Annotated DataFrame Necessary Columns:
- <b>ID</b> Unique identifier number for ease of reference
- <b>TEXT</b> This column should hold all relevant text that should be annotated by AnnotateGPT
- <b>LABEL NAME</b> This column should hold the categorization for the label in question. There may be more than one <i>LABEL NAME</i> column, and each should have its own unique name.

##### Example Annotated Sample
<table>
  <tr>
    <th><center>ROW_ID</center></th>
    <th><center>TEXT</center></th>
    <th><center>sdoh_community_present</center></th>
    <th><center>sdoh_economics</center></th>
    <th><center>behavior_tobacco</center></th>
  </tr>
  <tr>
    <td><center>1</center></td>
    <td><center>The patient...</center></td>
    <td><center>1</center></td>
    <td><center>0</center></td>
    <td><center>1</center></td>
  </tr>
  <tr>
    <td><center>...</center></td>
    <td><center>...</center></td>
    <td><center>...</center></td>
    <td><center>...</center></td>
    <td><center>...</center></td>
  </tr>
    <tr>
    <td><center>233</center></td>
    <td><center>Patient's family...</center></td>
    <td><center>1</center></td>
    <td><center>0</center></td>
    <td><center>0</center></td>
  </tr>
</table>

### Example Extraction of MIMIC-III and MIMIC-SBDH Data

In [None]:
# Column names shared by the annotated and unannotated DataFrames.
UNIQUE_ID_COLUMN_NAME = "ROW_ID"    # unique identifier of each note
UNIQUE_TEXT_COLUMN_NAME = "TEXT"    # text that will be annotated
# One entry per label column carried by the annotated DataFrame.
UNIQUE_LABEL_COLUMN_NAMES = [
    'sdoh_community_present',
    'sdoh_economics',
    'behavior_tobacco',
]

In [None]:
def retrieve_social_history(df, id_column=None, text_column=None, max_chars=500):
    """Extract the "Social History" section from every note in `df`.

    The section starts at the first case-insensitive occurrence of
    'social history:' and ends at the first of the following that comes
    after that start: 'family history:', 'physical exam', 'medications:',
    'hospital course:', 'review of systems:', or `max_chars` characters
    past the start.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ID and text columns.
    id_column, text_column : str, optional
        Column names; default to the notebook-level UNIQUE_ID_COLUMN_NAME
        and UNIQUE_TEXT_COLUMN_NAME.
    max_chars : int, optional
        Upper bound on the length of the extracted section (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per input row with the ID column and the extracted text
        in the text column. Notes without a 'social history:' header get
        an empty string instead of an arbitrary slice of the note.
    """
    if id_column is None:
        id_column = UNIQUE_ID_COLUMN_NAME
    if text_column is None:
        text_column = UNIQUE_TEXT_COLUMN_NAME

    end_markers = (
        'family history:',
        'physical exam',
        'medications:',
        'hospital course:',
        'review of systems:',
    )

    extracted = []
    # Walk both columns in lockstep instead of re-filtering the whole frame
    # for every row ID.
    for row_id, note in zip(df[id_column], df[text_column]):
        lowered = note.lower()  # lowercase once; reused for every search
        start = lowered.find('social history:')
        if start == -1:
            # No section header: there is nothing meaningful to slice.
            extracted.append((row_id, ''))
            continue

        # Search from the section start so that a marker which also occurs
        # earlier in the note does not hide the real end of the section.
        candidate_ends = [
            position
            for position in (lowered.find(marker, start) for marker in end_markers)
            if position > start
        ]
        candidate_ends.append(start + max_chars)
        extracted.append((row_id, note[start:min(candidate_ends)]))

    return pd.DataFrame(extracted, columns=[id_column, text_column])

In [None]:
# Paths to the source CSV files.
# Replace each bare file name below with the full path to that file inside
# your MIMIC-III folder (ADMISSIONS, NOTEEVENTS) or MIMIC-SBDH folder.
MIMIC_ADMISSION_CSV = "ADMISSIONS.csv"
MIMIC_NOTEEVENTS_CSV = "NOTEEVENTS.csv"
MIMIC_SBDH = "MIMIC-SBDH.csv"

In [None]:
# Load the annotated (MIMIC-SBDH) and unannotated MIMIC-III discharge notes.

# Subjects that have a NEWBORN admission are excluded from the unannotated pool.
df = pd.read_csv(MIMIC_ADMISSION_CSV)
newborn_list = df[df["ADMISSION_TYPE"] == "NEWBORN"].SUBJECT_ID.to_list()

notes_df = pd.read_csv(MIMIC_NOTEEVENTS_CSV)
discharge_df = notes_df[notes_df['CATEGORY'] == 'Discharge summary']
non_neonatal = discharge_df[~discharge_df['SUBJECT_ID'].isin(newborn_list)]

# Hand the path to pandas directly: the file is then opened read-only and
# closed by pandas, instead of leaking a handle opened in 'r+' mode.
sbdh_data = pd.read_csv(MIMIC_SBDH, encoding='UTF-8', on_bad_lines='warn')
sbdh_data = sbdh_data.rename(columns={'row_id':UNIQUE_ID_COLUMN_NAME})
annotated_list = sbdh_data[UNIQUE_ID_COLUMN_NAME].tolist()
annotated_notes = discharge_df[discharge_df[UNIQUE_ID_COLUMN_NAME].isin(annotated_list)]
annotated_subjects = annotated_notes.SUBJECT_ID.to_list()

# IDs of notes that contain no "Social History:" header at all.
no_soc_his = [
    row_id
    for row_id, text in zip(non_neonatal[UNIQUE_ID_COLUMN_NAME], non_neonatal[UNIQUE_TEXT_COLUMN_NAME])
    if 'social history:' not in text.lower()
]

final_sdoh_list = non_neonatal[~non_neonatal[UNIQUE_ID_COLUMN_NAME].isin(no_soc_his)]
unnanotated_notes = final_sdoh_list[~final_sdoh_list[UNIQUE_ID_COLUMN_NAME].isin(annotated_list)]

annotated_sh = retrieve_social_history(annotated_notes)
annotated_sh = pd.merge(annotated_sh,sbdh_data[[UNIQUE_ID_COLUMN_NAME] + UNIQUE_LABEL_COLUMN_NAMES],on=UNIQUE_ID_COLUMN_NAME, how='left')
unannotated_sh = retrieve_social_history(unnanotated_notes)

# Drop the references to the large intermediate frames so their memory can be
# reclaimed. (The original list named a non-existent `unnanotated` variable,
# which left `unnanotated_notes` alive.)
df = newborn_list = notes_df = discharge_df = non_neonatal = annotated_list = annotated_subjects = no_soc_his = final_sdoh_list = unnanotated_notes = sbdh_data = None

In [None]:
# Collapse the multi-valued MIMIC-SBDH labels into binary ones:
#   sdoh_economics   -> 1 only when the original value is 2
#   behavior_tobacco -> 1 when the original value is 1 or 2
# NOTE: this cell overwrites the label columns of `annotated_sh`, so running it
# a second time would binarize the already-binary values again.
economics_binary = annotated_sh.sdoh_economics.eq(2).astype(int).to_list()
tobacco_binary = annotated_sh.behavior_tobacco.isin([1, 2]).astype(int).to_list()
annotated_sh = (
    annotated_sh
    .drop(columns=['sdoh_economics','behavior_tobacco'])
    .assign(sdoh_economics=economics_binary, behavior_tobacco=tobacco_binary)
)

In [None]:
# Display the annotated frame: note ID, extracted social-history text and label columns.
annotated_sh

In [None]:
# Display the unannotated frame: note ID and extracted social-history text.
unannotated_sh

## Setting up API access to Microsoft Azure OpenAI Instance
Setup is dependent on your version of the API, as well as the nature of your instance. The example below uses V0.28 of the openai python module, and makes use of the Completion endpoint. Instructions on how to fill in the API access information below can be found in these cookbook examples: (https://github.com/Azure/openai-samples/blob/main/Basic_Samples/Chat/chatGPT_managing_conversation.ipynb) and (https://github.com/Azure/openai-samples/blob/main/Basic_Samples/Chat/config.json)

In [None]:
# API access configuration. Replace it with whatever API you use.
#
# The settings are read from environment variables so that credentials never
# need to be typed into (and saved with) the notebook. A variable that is not
# set leaves the corresponding setting at None, which is what this cell
# assigned before; a value can still be assigned by hand below if preferred.

chatgpt_model_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME")  # passed as `engine` to the Completion endpoint
openai.api_type = os.environ.get("OPENAI_API_TYPE")
openai.api_key = os.environ.get("OPENAI_API_KEY")
openai.api_base = os.environ.get("OPENAI_API_BASE")
openai.api_version = os.environ.get("OPENAI_API_VERSION")

## Setting up prompt messages
There are 3 kinds of prompt messages that must be set:
- <b>Instructions</b> - The instructions consist of three parts: a succinct role-playing instruction that contextualizes the GPT model; a General Task Instruction that explains which kind of information is important to focus on for the task; and an SDoH-Specific Instruction that explicitly states which kinds of information must be extracted.
- <b>Query</b> - The query is a Yes/No question pertaining to the task
- <b>Examples</b> - The examples are constructed using pairs of "Shots" – exemplar responses presented alongside the prompt to help guide GPT. Within the Examples section, the positive shot is always presented before the negative shot. This approach is applied systematically to craft four unique Two-Shot prompts, which are explained in detail in the Methods section of our paper.

## Example prompt setup for MIMIC tasks
Select one of the 3 MIMIC tasks available. Change the variable associated with the task to True for the desired task.

In [None]:
# Task selection: set exactly one of the three flags to True and leave the
# other two False.
# Default: community
community = True
economics = False
tobacco = False

selected_task_count = sum((community, economics, tobacco))
assert selected_task_count == 1, "One and only one must be True, the other two must be False"

In [None]:
# Translate the selected flag into the key used by the prompt file (`task`)
# and the name of the matching label column (`label_column`).
if community:
    task, label_column = 'community', "sdoh_community_present"
elif economics:
    task, label_column = 'economics', "sdoh_economics"
else:
    task, label_column = 'tobacco', "behavior_tobacco"

In [None]:
# Load the stored prompts for every task.
# NOTE: unpickling executes arbitrary code; only load a prompt file you trust.
with open('MIMIC_TASK_PROMPTS.pkl', 'rb') as prompts_file:
    task_prompts = pickle.load(prompts_file)

base_system_message = task_prompts[task]['instructions']
system_message = f"<|im_start|>INSTRUCTIONS:\n{base_system_message.strip()}\n<|im_end|>"
query_message = task_prompts[task]['query']

task_examples = task_prompts[task]['examples']


def _example_text(example_key):
    """Return the annotated note text for the example stored under `example_key`.

    The prompt file stores the note ID; the text is looked up in `annotated_sh`,
    newlines are replaced by spaces and surrounding whitespace is stripped.
    """
    example_rows = annotated_sh[annotated_sh[UNIQUE_ID_COLUMN_NAME] == task_examples[example_key]]
    return example_rows.iloc[0][UNIQUE_TEXT_COLUMN_NAME].replace('\n', ' ').strip()


# "Easy" shot pair (positive first, then negative).
easy_example_pos = _example_text('easy_example_pos')
easy_answer_pos = task_examples['easy_answer_pos']
easy_answer_pos_explained = task_examples['easy_answer_pos_explained']
easy_example_neg = _example_text('easy_example_neg')
easy_answer_neg = task_examples['easy_answer_neg']
easy_answer_neg_explained = task_examples['easy_answer_neg_explained']

# "Hard" shot pair (positive first, then negative).
hard_example_pos = _example_text('hard_example_pos')
hard_answer_pos = task_examples['hard_answer_pos']
hard_answer_pos_explained = task_examples['hard_answer_pos_explained']
hard_example_neg = _example_text('hard_example_neg')
hard_answer_neg = task_examples['hard_answer_neg']
hard_answer_neg_explained = task_examples['hard_answer_neg_explained']

# IDs of the four notes used as shots.
example_ids = [
    task_examples['easy_example_pos'],
    task_examples['easy_example_neg'],
    task_examples['hard_example_pos'],
    task_examples['hard_example_neg'],
]

## Prepare functions
Make sure to run these cells

In [None]:
# Builds the prompt from the instruction system message, the few-shot examples, and the current query.
def create_prompt(system_message, examples, query):
    """Assemble the full completion prompt.

    Parameters
    ----------
    system_message : str
        Instruction block placed at the start of the prompt.
    examples : list of dict or None
        Few-shot examples, each with 'context', 'query' and 'answer' keys, e.g.
        [{"context": "Lives with wife, no tobacco, no alcohol, no drugs",
          "query": "Does the social history present tobacco use?", "answer": "No."}].
        None (or an empty list) produces a zero-shot prompt.
    query : dict
        The question GPT is expected to answer, with 'context' and 'query' keys, e.g.
        {"context": "Lives alone, history of 1 ppd, no alcohol use, no drug use",
         "query": "Does the social history present tobacco use?"}.

    Returns
    -------
    str
        The prompt, ending with an open ANSWER block for the model to complete.
    """
    parts = [system_message]
    # Identity check: `!= None` would invoke a custom/array `__ne__`.
    if examples is not None:
        for example in examples:
            parts.append(f"\n<|im_start|>CONTEXT:\n{example['context']}\n<|im_end|>")
            parts.append(f"\n<|im_start|>QUERY:\n{example['query']}\n<|im_end|>")
            parts.append(f"\n<|im_start|>ANSWER:\n{example['answer']}\n<|im_end|>")
    parts.append(f"\n<|im_start|>CONTEXT:\n{query['context']}\n<|im_end|>")
    parts.append(f"\n<|im_start|>QUERY:\n{query['query']}\n<|im_end|>")
    # The ANSWER block is left open on purpose: the model writes its content.
    parts.append("\n<|im_start|>ANSWER:\n")
    return "".join(parts)

# Sends the prompt to the GPT model and returns its answer.
def send_message(prompt, model_name, max_response_tokens=500):
    """Send `prompt` to the Completion endpoint and return the generated text.

    Parameters
    ----------
    prompt : str
        Full prompt text.
    model_name : str
        Name passed as `engine` to the Completion endpoint. (Previously this
        argument was ignored and the global `chatgpt_model_name` was used.)
    max_response_tokens : int, optional
        Upper bound on the number of generated tokens (default 500).

    Returns
    -------
    str
        Text of the first choice, stripped of surrounding whitespace.
    """
    response = openai.Completion.create(
        engine=model_name,
        prompt=prompt,
        temperature=0.5,
        max_tokens=max_response_tokens,
        frequency_penalty=0,
        presence_penalty=0,
        # Generation stops at the end-of-block marker used by create_prompt.
        stop=['<|im_end|>']
    )
    return response['choices'][0]['text'].strip()

# Timeout handler
def alarm_handler(signum, frame):
    """Signal handler that aborts the interrupted call.

    Prints a short notice and raises TimeoutError (a subclass of Exception,
    so existing `except Exception` handlers still catch it) with a message
    that can be logged by the caller.
    """
    print("Timeout... Retrying.")
    raise TimeoutError("GPT request timed out")

# Estimates the number of tokens in a prompt
def estimate_tokens(prompt):
    """Return the number of tokens `prompt` encodes to.

    Uses the cl100k_base encoding extended with the two chat markers
    "<|im_start|>" and "<|im_end|>" registered as special tokens.
    """
    base_encoding = tiktoken.get_encoding("cl100k_base")

    # Start from the base special tokens and add the chat markers.
    chat_special_tokens = dict(base_encoding._special_tokens)
    chat_special_tokens["<|im_start|>"] = 100264
    chat_special_tokens["<|im_end|>"] = 100265

    chat_encoding = tiktoken.Encoding(
        name="chatgpt",
        pat_str=base_encoding._pat_str,
        mergeable_ranks=base_encoding._mergeable_ranks,
        special_tokens=chat_special_tokens,
    )

    return len(chat_encoding.encode(prompt, allowed_special={"<|im_start|>", "<|im_end|>"}))

In [None]:
def prepare_examples(shots, example_hard, example_explained):
    """Build the two-shot example list for one prompt configuration.

    Returns None when `shots` is falsy (zero-shot). Otherwise returns two
    example dicts (positive first, then negative) with 'context', 'query'
    and 'answer' keys, taken from the hard or easy example pair and using
    the explained or plain answers according to the flags.
    """
    if not shots:
        return None

    if example_hard:
        contexts = (hard_example_pos, hard_example_neg)
        if example_explained:
            answers = (hard_answer_pos_explained, hard_answer_neg_explained)
        else:
            answers = (hard_answer_pos, hard_answer_neg)
    else:
        contexts = (easy_example_pos, easy_example_neg)
        if example_explained:
            answers = (easy_answer_pos_explained, easy_answer_neg_explained)
        else:
            answers = (easy_answer_pos, easy_answer_neg)

    return [
        {"context": context, "query": query_message, "answer": answer}
        for context, answer in zip(contexts, answers)
    ]

In [None]:
# Sends the notes to GPT one by one and gathers a class-balanced set of answers.
# The loop is sequential: an attempt to parallelize it kept producing timeout
# errors from the GPT side, and the sequential version does not get close to
# the rate limit.


def gpt_annotation(dataframe, examples, num_annotations,
                   max_attempts=4, timeout_seconds=5, max_response_tokens=500):
    """Annotate rows of `dataframe` with GPT until a balanced set is collected.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Notes to annotate; must contain the ID and text columns.
    examples : list of dict or None
        Few-shot examples passed on to `create_prompt` (None for zero-shot).
    num_annotations : int
        Total number of annotations wanted; half positive and half negative.
    max_attempts : int, optional
        Number of tries per note before it is skipped (default 4).
    timeout_seconds : int, optional
        Seconds allowed for one request before it is aborted (default 5).
    max_response_tokens : int, optional
        Token limit passed to `send_message` (default 500).

    Returns
    -------
    list of tuple
        (note ID, label) pairs, label 1 for an answer containing 'Yes' and
        0 for an answer containing 'No'. Answers for a class whose quota is
        already full, and answers containing neither word, are discarded.
    """
    responses = []
    per_class_quota = num_annotations / 2

    GPT_positives = 0
    GPT_negatives = 0

    def show_remaining():
        # Rewrites the same console line with the number still to collect.
        sys.stdout.write("\r")
        sys.stdout.write("{} examples remaining.".format((num_annotations)-GPT_positives-GPT_negatives))
        sys.stdout.flush()

    # The handler does not change between rows, so register it once.
    signal.signal(signal.SIGALRM, alarm_handler)

    for _, row in dataframe.iterrows():
        show_remaining()

        context_query = row[UNIQUE_TEXT_COLUMN_NAME].strip()
        query = {"context": context_query, "query": query_message}
        prompt = create_prompt(system_message, examples, query)

        for attempt in range(max_attempts):
            try:
                signal.alarm(timeout_seconds)
                try:
                    response = send_message(prompt, chatgpt_model_name, max_response_tokens)
                finally:
                    # Always cancel the alarm: if the request fails for another
                    # reason, a still-pending alarm would otherwise fire later,
                    # outside this try block, and abort the whole run.
                    signal.alarm(0)
            except Exception as error:
                print("An exception occurred:", error)
                continue

            # Decide the label first and apply the quota second, so a 'Yes'
            # answer that also contains the letters 'No' is never recorded as
            # a negative once the positive quota is full.
            if 'Yes' in response:
                predicted = 1
            elif 'No' in response:
                predicted = 0
            else:
                predicted = None

            if predicted == 1 and GPT_positives < per_class_quota:
                responses.append((row[UNIQUE_ID_COLUMN_NAME],1))
                GPT_positives += 1
            elif predicted == 0 and GPT_negatives < per_class_quota:
                responses.append((row[UNIQUE_ID_COLUMN_NAME],0))
                GPT_negatives += 1
            break

        if GPT_positives >= per_class_quota and GPT_negatives >= per_class_quota:
            show_remaining()
            break

    return responses

In [None]:
# Sends the test set to GPT one by one and gathers the answers next to the gold labels.
# The loop is sequential: an attempt to parallelize it kept producing timeout
# errors from the GPT side, and the sequential version does not get close to
# the rate limit.


def gpt_test_annotation(dataframe, examples, num_annotations,
                        max_attempts=4, timeout_seconds=5, max_response_tokens=500):
    """Annotate human-labelled rows with GPT, keeping the gold label alongside.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Notes to annotate; must contain the ID, text and `label_column` columns.
    examples : list of dict or None
        Few-shot examples passed on to `create_prompt` (None for zero-shot).
    num_annotations : int
        Stop once this many answers have been recorded.
    max_attempts : int, optional
        Number of tries per note before it is skipped (default 4).
    timeout_seconds : int, optional
        Seconds allowed for one request before it is aborted (default 5).
    max_response_tokens : int, optional
        Token limit passed to `send_message` (default 500).

    Returns
    -------
    list of tuple
        (note ID, GPT label, gold label) triples; the GPT label is 1 for an
        answer containing 'Yes' and 0 for an answer containing 'No'. Answers
        containing neither word are discarded.
    """
    responses = []
    GPT_annotations = 0

    def show_remaining():
        # Rewrites the same console line with the number still to collect.
        sys.stdout.write("\r")
        sys.stdout.write("{} examples remaining.".format((num_annotations)-GPT_annotations))
        sys.stdout.flush()

    # The handler does not change between rows, so register it once.
    signal.signal(signal.SIGALRM, alarm_handler)

    for _, row in dataframe.iterrows():
        show_remaining()

        context_query = row[UNIQUE_TEXT_COLUMN_NAME].strip()
        label = row[label_column]
        query = {"context": context_query, "query": query_message}
        prompt = create_prompt(system_message, examples, query)

        for attempt in range(max_attempts):
            try:
                signal.alarm(timeout_seconds)
                try:
                    response = send_message(prompt, chatgpt_model_name, max_response_tokens)
                finally:
                    # Always cancel the alarm: if the request fails for another
                    # reason, a still-pending alarm would otherwise fire later,
                    # outside this try block, and abort the whole run.
                    signal.alarm(0)
            except Exception as error:
                print("An exception occurred:", error)
                continue

            if 'Yes' in response:
                responses.append((row[UNIQUE_ID_COLUMN_NAME],1,label))
                GPT_annotations += 1
            elif 'No' in response:
                responses.append((row[UNIQUE_ID_COLUMN_NAME],0,label))
                GPT_annotations += 1
            break

        if GPT_annotations >= num_annotations:
            show_remaining()
            break

    return responses

In [None]:
def get_metrics_best_threshold(predictions, true_labels):
    """Compute the classification report and confusion matrix for binary predictions.

    Parameters
    ----------
    predictions : iterable of numbers
        Predicted scores or labels; values above 0.5 count as positive.
    true_labels : iterable
        Gold labels (0 = negative, 1 = positive).

    Returns
    -------
    dict
        'clss_report' (text report with 4 digits) and 'confusion_matrix'.
    """
    class_labels = [0, 1]
    target_names = ['negative', 'positive']

    class_preds = [1 if (x > 0.5) else 0 for x in predictions]

    # Name both classes explicitly: without `labels`, data in which only one
    # class occurs yields a 1x1 matrix and makes the report reject the two
    # target names.
    cm = confusion_matrix(true_labels, class_preds, labels=class_labels)
    clss_report = classification_report(
        true_labels,
        class_preds,
        labels=class_labels,
        target_names=target_names,
        digits=4,
    )

    return {'clss_report':clss_report, 'confusion_matrix':cm}

## Annotating MIMIC task training sets with AnnotateGPT
This section performs annotation of unannotated samples with AnnotateGPT for the 3 MIMIC tasks.

In [None]:
# Total number of annotations AnnotateGPT will produce.
# Half of them are positives and half negatives, so the value must be even.
# Default: 2048
num_annotations = 2048

assert not num_annotations % 2, "num_annotations must be even"

In [None]:
# Prompt configurations, one row per entry of `all_annotation_names`, with the
# columns (use shots, hard examples, explained answers). The first row is the
# zero-shot configuration; the meshgrid expands the four two-shot variants.
shots = True
example_hard = [False,True]
example_explained = [False,True]
arr1 = np.array([[False, False, False]])
arr2 = np.array(np.meshgrid(shots, example_hard, example_explained)).T.reshape(-1,3)
all_annotation_types = np.concatenate((arr1,arr2))
all_annotation_names = ['ZeroShot','TwoShot-E','TwoShot-H','TwoShot-E+Ex','TwoShot-H+Ex']

for idx, ann_name in enumerate(all_annotation_names):

    use_shots, use_hard, use_explained = all_annotation_types[idx]
    examples = prepare_examples(use_shots, use_hard, use_explained)

    print('Annotating with',ann_name)
    responses = gpt_annotation(unannotated_sh, examples, num_annotations)
    print('\nAnnotation Complete.')

    resp_df = pd.DataFrame(responses, columns=[UNIQUE_ID_COLUMN_NAME, label_column])

    file_name = f"{ann_name}-{task}-gpt-train.pkl"
    # Context manager so the file is flushed and closed before moving on.
    with open(file_name, 'wb') as out_file:
        pickle.dump(resp_df, out_file)
    print("{} training set saved to {}".format(ann_name, file_name))

## Calculating Inter-Annotator Agreement for MIMIC tasks
In this section, we use AnnotateGPT to annotate a set of human annotated samples in order to calculate the Cohen's Kappa between the human annotation and the AnnotateGPT annotation.

In [None]:
# Load the stored ID lists; the test IDs of each task are read from the key
# '<task>_test_list'. Opened with a context manager so the file is closed.
# NOTE: unpickling executes arbitrary code; only load a file you trust.
with open('train-test-id-lists.pkl', 'rb') as id_lists_file:
    train_test_id_lists = pickle.load(id_lists_file)

In [None]:
# Switch `task` to the short key used by the ID lists and the agreement files.
# NOTE: this rebinds `task`; cells that index the prompt file by the long task
# name must not be re-run after this point without re-running the task mapping.
if community:
    task, label_column = 'com', "sdoh_community_present"
elif economics:
    task, label_column = 'eco', "sdoh_economics"
else:
    task, label_column = 'tob', "behavior_tobacco"

In [None]:
# Human-annotated notes belonging to the selected task's test split.
test_list = train_test_id_lists[f'{task}_test_list']
is_test_note = annotated_sh[UNIQUE_ID_COLUMN_NAME].isin(test_list)
test_sh = annotated_sh[is_test_note]

In [None]:
# Prompt configurations, one row per entry of `all_annotation_names`, with the
# columns (use shots, hard examples, explained answers). The first row is the
# zero-shot configuration; the meshgrid expands the four two-shot variants.
shots = True
example_hard = [False,True]
example_explained = [False,True]
arr1 = np.array([[False, False, False]])
arr2 = np.array(np.meshgrid(shots, example_hard, example_explained)).T.reshape(-1,3)
all_annotation_types = np.concatenate((arr1,arr2))
all_annotation_names = ['ZeroShot','TwoShot-E','TwoShot-H','TwoShot-E+Ex','TwoShot-H+Ex']

for idx, ann_name in enumerate(all_annotation_names):

    use_shots, use_hard, use_explained = all_annotation_types[idx]
    examples = prepare_examples(use_shots, use_hard, use_explained)

    print('Annotating with',ann_name)
    responses = gpt_test_annotation(test_sh, examples, len(test_sh))
    print('\nAnnotation Complete.')

    # Columns: note ID, GPT label, human (gold) label.
    resp_df = pd.DataFrame(responses, columns=[UNIQUE_ID_COLUMN_NAME, f'{label_column}_GPT',label_column])

    file_name = f"{ann_name}-{task}-cohen_calc_set.pkl"
    # Context manager so the file is flushed and closed before moving on.
    with open(file_name, 'wb') as out_file:
        pickle.dump(resp_df, out_file)
    # This loop writes the agreement sets, not the training sets.
    print("{} agreement set saved to {}".format(ann_name, file_name))

## Results

In [None]:
# Report, for every prompt configuration, how well the GPT labels agree with
# the human labels of the selected task.
for ann_name in all_annotation_names:
    print(f'----------------------{ann_name}-----------------------')
    with open(f'{ann_name}-{task}-cohen_calc_set.pkl','rb') as result_file:
        resp_df = pickle.load(result_file)
    # Use the selected task's columns; hard-coding the community column names
    # failed for the economics and tobacco tasks.
    gold = resp_df[label_column].to_list()
    annotategpt = resp_df[f'{label_column}_GPT'].to_list()
    metrics = get_metrics_best_threshold(annotategpt, gold)
    print(metrics['clss_report'])
    print("Confusion Matrix")
    print(metrics['confusion_matrix'])
    print("Cohen's kappa:",cohen_kappa_score(annotategpt, gold))
    print('------------------------------------------------------')
