In [1]:
from openai import AzureOpenAI
import pickle
import os, sys
import openai
import json
import numpy as np
import pickle
import pandas as pd
from time import sleep
import signal
import tiktoken
from sklearn.metrics import classification_report, confusion_matrix, cohen_kappa_score
from tqdm import tqdm

### Reading the Oncology Data

Get the CORAL data from the following link

https://physionet.org/content/curated-oncology-reports/1.0/

In [3]:
# Locations of the annotated CORAL notes inside the local PhysioNet download:
# 'breastca' holds the breast-cancer notes, 'pdac' the pancreatic-cancer notes.
breast_cancer_path, pancreatic_cancer_path = (
    os.path.join('physionet.org/files/curated-oncology-reports/1.0/coral', 'annotated', cohort_dir)
    for cohort_dir in ('breastca', 'pdac')
)

In [8]:
# Read the pancreatic-cancer notes, which are stored as 0.txt ... 19.txt.
pancreatic_cancer_reports = []
for report_idx in range(20):
    report_file = os.path.join(pancreatic_cancer_path, f'{report_idx}.txt')
    with open(report_file, 'r') as report_handle:
        pancreatic_cancer_reports.append(report_handle.read())

In [9]:
# Read the breast-cancer notes, which are stored as 20.txt ... 39.txt.
breast_cancer_reports = []
for report_idx in range(20, 40):
    report_file = os.path.join(breast_cancer_path, f'{report_idx}.txt')
    with open(report_file, 'r') as report_handle:
        breast_cancer_reports.append(report_handle.read())

In [16]:
def retrieve_social_history(report, max_chars=500):
    """Extract the social-history section from a free-text clinical report.

    The section starts at the first case-insensitive occurrence of
    'social history' and ends at whichever comes first: the next occurrence
    of a known section heading after that point, or `max_chars` characters
    after the start.

    Note: this name is redefined further down the notebook with a
    DataFrame-based signature, so this version is only usable until that
    later cell has been executed.

    Args:
        report: Full text of one clinical note.
        max_chars: Upper bound on the length of the returned section.

    Returns:
        The extracted section (heading included), or '' when the note has no
        'social history' heading.
    """
    lowered = report.lower()
    social_history_start = lowered.find('social history')
    if social_history_start == -1:
        # Without this guard the slice below would start at index -1 and could
        # return the last character of a short note instead of nothing.
        return ''

    end_markers = (
        'family history',
        'physical exam',
        'medications',
        'hospital course',
        'prescriptions',
        'review of systems',
    )
    # Search from the section start so that a heading which also occurs earlier
    # in the note still terminates the section at its next occurrence.
    pos_ends = [
        position
        for position in (lowered.find(marker, social_history_start) for marker in end_markers)
        if position > social_history_start
    ]
    pos_ends.append(social_history_start + max_chars)
    social_history_end = min(pos_ends)
    return report[social_history_start:social_history_end]

In [19]:
# One extracted social-history snippet per pancreatic-cancer report (may be '').
pancreatic_cancer_soc_history = list(map(retrieve_social_history, pancreatic_cancer_reports))

In [23]:
pancreatic_cancer_soc_history

["SOCIAL HISTORY:  He smoked about a half a pack of cigarettes per day for about 4 years, quitting in 2006. There is a prior history of alcohol abuse where he drank up to a pint of tequila day, but he has been drinking only socially more recently. He has a past history of heroin abuse, but has not used for several years.   He is accompanied by his mother ***** who works as a *****'***** ***** and his sister ***** who is an administrative *****.      PE:    Vitals:   Vitals: BP 131/90 | Pulse 64 | ",
 'Social History     Socioeconomic History   \x07 Marital status: Married     Spouse name: *****   \x07 Number of children: None   \x07 Years of education: None   \x07 Highest education level: None   Occupational History   \x07 None   Social Needs   \x07 Financial resource strain: None   \x07 Food insecurity:     Worry: None     Inability: None   \x07 Transportation needs:     Medical: None     Non-medical: None   Tobacco Use   \x07 Smoking status: Former Smoker   \x07 Smokeless tobacco: Ne

In [24]:
# Keep only the reports for which a non-empty snippet was extracted.
pancreatic_cancer_soc_history = list(filter(len, pancreatic_cancer_soc_history))

In [20]:
# One extracted social-history snippet per breast-cancer report (may be '').
breast_cancer_soc_history = list(map(retrieve_social_history, breast_cancer_reports))

In [22]:
breast_cancer_soc_history

['Social History     Social History   \x07 Marital status: Married     Spouse name: N/A   \x07 Number of children: N/A   \x07 Years of education: N/A     Occupational History   \x07 Not on file.     Social History Main Topics   \x07 Smoking status: Never Smoker   \x07 Smokeless tobacco: Never Used   \x07 Alcohol use No   \x07 Drug use: No   \x07 Sexual activity: Not on file     Other Topics Concern   \x07 Not on file     Social History Narrative    Married; husband still alive. Lives with her daughter.  Originally from *****. ',
 'SOCIAL HISTORY: She is divorced and lives with her 42-year-old son who  is unemployed.  She worked with the military.  She does not drink  alcohol.  She smoked cigarettes for 75-pack-years stopping more than 12  months ago.    ',
 'Social History:  Social History           Social History   \x07 Marital status: Married     Spouse name: N/A   \x07 Number of children: N/A   \x07 Years of education: N/A           Occupational History   \x07 ***** ***** ***** ****

In [26]:
# Keep only the reports for which a non-empty snippet was extracted.
breast_cancer_soc_history = list(filter(len, breast_cancer_soc_history))

### Reading the MIMIC data to extract the examples

In [40]:
# Column names used throughout the MIMIC cells below.
UNIQUE_ID_COLUMN_NAME = "ROW_ID"    # note identifier column
UNIQUE_TEXT_COLUMN_NAME = "TEXT"    # note free-text column
# Label columns taken from the MIMIC-SBDH annotation file.
UNIQUE_LABEL_COLUMN_NAMES = [
    'sdoh_community_present',
    'sdoh_economics',
    'behavior_tobacco',
]

In [41]:
def retrieve_social_history(df, id_column=None, text_column=None, max_chars=500):
    """Extract the social-history section of every note in a DataFrame.

    Note: this definition replaces the earlier string-based function of the
    same name defined higher up in the notebook.

    Args:
        df: DataFrame holding one note per row.
        id_column: Name of the note-identifier column; defaults to
            UNIQUE_ID_COLUMN_NAME.
        text_column: Name of the free-text column; defaults to
            UNIQUE_TEXT_COLUMN_NAME.
        max_chars: Upper bound on the length of each extracted section.

    Returns:
        A new DataFrame with the same two columns, where the text column holds
        the extracted section ('' when the note has no 'social history:'
        heading). Rows are processed in order, each with its own text.
    """
    if id_column is None:
        id_column = UNIQUE_ID_COLUMN_NAME
    if text_column is None:
        text_column = UNIQUE_TEXT_COLUMN_NAME

    # Walk both columns together instead of re-filtering the whole frame for
    # every row id, which made the original loop quadratic in the row count.
    replace_texts = [
        (row_id, _social_history_section(patient, max_chars))
        for row_id, patient in zip(df[id_column], df[text_column])
    ]
    return pd.DataFrame(replace_texts, columns=[id_column, text_column])


def _social_history_section(patient, max_chars):
    """Return the 'social history:' section of one note, or '' if it has none."""
    lowered = patient.lower()
    social_history_start = lowered.find('social history:')
    if social_history_start == -1:
        # A start of -1 would otherwise produce a slice anchored on the last
        # character of the note.
        return ''

    end_markers = (
        'family history:',
        'physical exam',
        'medications:',
        'hospital course:',
        'review of systems:',
    )
    # Search from the section start so that a heading which also occurs earlier
    # in the note still terminates the section at its next occurrence.
    pos_ends = [
        position
        for position in (lowered.find(marker, social_history_start) for marker in end_markers)
        if position > social_history_start
    ]
    pos_ends.append(social_history_start + max_chars)
    return patient[social_history_start:min(pos_ends)]

In [42]:
# Input files. Replace each value with the full path to the file on your machine:
# the first two come from the MIMIC-III folder, the third from the MIMIC-SBDH folder.
MIMIC_ADMISSION_CSV = "ADMISSIONS.csv"
MIMIC_NOTEEVENTS_CSV = "NOTEEVENTS.csv"
MIMIC_SBDH = "MIMIC-SBDH.csv"

In [43]:
# Load the annotated and unannotated MIMIC discharge summaries.

# Subjects with a NEWBORN admission are excluded from the unannotated pool.
df = pd.read_csv(MIMIC_ADMISSION_CSV)
newborn_list = df[df["ADMISSION_TYPE"] == "NEWBORN"].SUBJECT_ID.to_list()

notes_df = pd.read_csv(MIMIC_NOTEEVENTS_CSV)
discharge_df = notes_df[notes_df['CATEGORY'] == 'Discharge summary']
non_neonatal = discharge_df[~discharge_df['SUBJECT_ID'].isin(newborn_list)]

# Hand the path to pandas directly: the previous open(..., 'r+') handle was
# never closed and needlessly required write permission on the file.
sbdh_data = pd.read_csv(MIMIC_SBDH, encoding='UTF-8', on_bad_lines='warn')
sbdh_data = sbdh_data.rename(columns={'row_id': UNIQUE_ID_COLUMN_NAME})
annotated_list = sbdh_data[UNIQUE_ID_COLUMN_NAME].tolist()
annotated_notes = discharge_df[discharge_df[UNIQUE_ID_COLUMN_NAME].isin(annotated_list)]
annotated_subjects = annotated_notes.SUBJECT_ID.to_list()

# Ids of non-neonatal notes that lack a 'social history:' heading. This is a
# vectorised scan instead of iterrows; a missing text counts as "no heading".
no_soc_his = non_neonatal.loc[
    ~non_neonatal[UNIQUE_TEXT_COLUMN_NAME].str.lower().str.contains('social history:', regex=False, na=False),
    UNIQUE_ID_COLUMN_NAME,
].tolist()

final_sdoh_list = non_neonatal[~non_neonatal[UNIQUE_ID_COLUMN_NAME].isin(no_soc_his)]
unnanotated_notes = final_sdoh_list[~final_sdoh_list[UNIQUE_ID_COLUMN_NAME].isin(annotated_list)]

annotated_sh = retrieve_social_history(annotated_notes)
annotated_sh = pd.merge(annotated_sh, sbdh_data[[UNIQUE_ID_COLUMN_NAME] + UNIQUE_LABEL_COLUMN_NAMES], on=UNIQUE_ID_COLUMN_NAME, how='left')
unannotated_sh = retrieve_social_history(unnanotated_notes)

# Drop the references to the large intermediates so their memory can be reclaimed.
# NOTE(review): `unnanotated` was never assigned before this line; it may have been
# meant to be `unnanotated_notes`. Kept as-is so the resulting namespace is unchanged.
df = newborn_list = notes_df = discharge_df = non_neonatal = annotated_list = annotated_subjects = no_soc_his = final_sdoh_list = unnanotated = sbdh_data = None

  notes_df = pd.read_csv(MIMIC_NOTEEVENTS_CSV)


In [44]:
# Collapse the raw label codes into binary targets:
#   sdoh_economics   -> 1 only when the raw code is 2
#   behavior_tobacco -> 1 when the raw code is 1 or 2
economics_binary = annotated_sh['sdoh_economics'].eq(2).astype(int).tolist()
tobacco_binary = annotated_sh['behavior_tobacco'].isin([1, 2]).astype(int).tolist()
# Dropping and re-adding moves both columns to the end, as before.
annotated_sh = annotated_sh.drop(columns=['sdoh_economics', 'behavior_tobacco']).assign(
    sdoh_economics=economics_binary,
    behavior_tobacco=tobacco_binary,
)

### Extracting the SDoH using AnnotateGPT

In [197]:
# Pick the MIMIC task for this run: set exactly one flag to True and leave the
# other two False. Tobacco is the task currently selected.
community, economics, tobacco = False, False, True

assert sum((community, economics, tobacco)) == 1, "One and only one must be True, the other two must be False"

In [198]:
# Translate the active flag into the task key and its label column;
# tobacco is the fall-through when neither of the other flags is set.
task, label_column = (
    ('community', "sdoh_community_present") if community
    else ('economics', "sdoh_economics") if economics
    else ('tobacco', "behavior_tobacco")
)

In [199]:
# NOTE: unpickling can execute arbitrary code, so only load a prompt file you trust.
# The file is opened in a `with` block so the handle is closed after loading.
with open('MIMIC_TASK_PROMPTS.pkl', 'rb') as prompts_file:
    task_prompts = pickle.load(prompts_file)

base_system_message = task_prompts[task]['instructions']
system_message = f"{base_system_message.strip()}"
query_message = task_prompts[task]['query']

task_examples = task_prompts[task]['examples']


def _example_text(row_id):
    """Return the social-history text of annotated note `row_id` as one stripped line."""
    matching_rows = annotated_sh[annotated_sh[UNIQUE_ID_COLUMN_NAME] == row_id]
    # Raises IndexError when the id is not present in annotated_sh.
    return matching_rows.iloc[0][UNIQUE_TEXT_COLUMN_NAME].replace('\n', ' ').strip()


# "Easy" few-shot pair: note text plus the short and the explained answer.
easy_example_pos = _example_text(task_examples['easy_example_pos'])
easy_answer_pos = task_examples['easy_answer_pos']
easy_answer_pos_explained = task_examples['easy_answer_pos_explained']
easy_example_neg = _example_text(task_examples['easy_example_neg'])
easy_answer_neg = task_examples['easy_answer_neg']
easy_answer_neg_explained = task_examples['easy_answer_neg_explained']

# "Hard" few-shot pair, same layout.
hard_example_pos = _example_text(task_examples['hard_example_pos'])
hard_answer_pos = task_examples['hard_answer_pos']
hard_answer_pos_explained = task_examples['hard_answer_pos_explained']
hard_example_neg = _example_text(task_examples['hard_example_neg'])
hard_answer_neg = task_examples['hard_answer_neg']
hard_answer_neg_explained = task_examples['hard_answer_neg_explained']

# Ids of the notes used as few-shot examples.
example_ids = [
    task_examples['easy_example_pos'],
    task_examples['easy_example_neg'],
    task_examples['hard_example_pos'],
    task_examples['hard_example_neg'],
]

{'API_KEY': '<REDACTED>',
 'API_VERSION': '2024-02-01',
 'AZURE_ENDPOINT': 'https://mimic-openai.openai.azure.com/'}

In [200]:
# Connection settings are read from a local JSON file rather than written in the notebook.
with open('azure_credentials.json', 'r') as credentials_file:
    azure_data = json.load(credentials_file)

api_key, api_version, azure_endpoint, azure_deployment_name = (
    azure_data[setting]
    for setting in ('API_KEY', 'API_VERSION', 'AZURE_ENDPOINT', 'AZURE_DEPLOYMENT_NAME')
)

client = AzureOpenAI(api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint)

# The deployment name is the custom name chosen when the model was deployed.
# The original author's note recommends a gpt-35-turbo-instruct deployment.
deployment_name = azure_deployment_name

def create_prompt(system_message, examples, query):
    """Build the chat messages from the instructions, few-shot examples and query.

    Args:
        system_message: Instruction text placed in the "system" message.
        examples: None, or an iterable of few-shot examples, each a dict with
            'context', 'query' and 'answer' keys, e.g.
            [{"context": "Lives with wife, no tobacco, no alcohol, no drugs",
              "query": "Does the social history present tobacco use?",
              "answer": "No."}]
        query: A single dict (not a list) with 'context' and 'query' keys for
            the case to be answered, e.g.
            {"context": "Lives alone, history of 1 ppd, no alcohol use, no drug use",
             "query": "Does the social history present tobacco use?"}

    Returns:
        A two-element list: the system message followed by one user message in
        which every example and the final query are laid out as
        CONTEXT / QUERY / ANSWER sections, ending with an empty ANSWER section.
    """
    sections = []
    # Identity test rather than `!= None`, which is evaluated element-wise by
    # array-like containers and can then fail as a truth value.
    if examples is not None:
        for example in examples:
            sections.append(f"\nCONTEXT:\n{example['context']}\n")
            sections.append(f"\nQUERY:\n{example['query']}\n")
            sections.append(f"\nANSWER:\n{example['answer']}\n")
    sections.append(f"\nCONTEXT:\n{query['context']}\n")
    sections.append(f"\nQUERY:\n{query['query']}\n")
    sections.append("\nANSWER:\n")
    user_message = "".join(sections)

    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]

def send_message(message, model_name, max_response_tokens=50, temperature=0.7):
    """Send a chat prompt through the module-level `client` and return the reply text.

    Args:
        message: List of chat messages, as produced by `create_prompt`.
        model_name: Value passed as `model` to the chat-completions call.
        max_response_tokens: Value passed as `max_tokens`.
        temperature: Sampling temperature; defaults to the previously
            hard-coded 0.7. Lower values make repeated runs vary less.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    response = client.chat.completions.create(
        model=model_name,
        messages=message,
        temperature=temperature,
        max_tokens=max_response_tokens,
    )
    return response.choices[0].message.content.strip()

In [201]:
def prepare_examples(shots, example_hard, example_explained):
    """Assemble the two few-shot examples (positive, then negative) for a prompt.

    Reads the module-level example texts, answers and `query_message`.

    Args:
        shots: When falsy, no examples are used and None is returned.
        example_hard: Use the "hard" example pair when truthy, else the "easy" pair.
        example_explained: Use the explained answers when truthy, else the short ones.

    Returns:
        None, or a list of two dicts with 'context', 'query' and 'answer' keys.
    """
    if not shots:
        return None

    if example_hard:
        contexts = (hard_example_pos, hard_example_neg)
        answers = (
            (hard_answer_pos_explained, hard_answer_neg_explained)
            if example_explained
            else (hard_answer_pos, hard_answer_neg)
        )
    else:
        contexts = (easy_example_pos, easy_example_neg)
        answers = (
            (easy_answer_pos_explained, easy_answer_neg_explained)
            if example_explained
            else (easy_answer_pos, easy_answer_neg)
        )

    return [
        {"context": context, "query": query_message, "answer": answer}
        for context, answer in zip(contexts, answers)
    ]

In [202]:
# Each row of `all_annotation_types` is (shots, example_hard, example_explained);
# row i corresponds to all_annotation_names[i].
shots = True
example_hard = [False, True]
example_explained = [False, True]
# Zero-shot configuration.
arr1 = np.array([[False, False, False]])
# Two-shot configurations: easy/hard varies fastest, explained/plain slowest.
arr2 = np.array([
    [shots, hard, explained]
    for explained in example_explained
    for hard in example_hard
])
all_annotation_types = np.concatenate((arr1, arr2))
all_annotation_names = ['ZeroShot', 'TwoShot-E', 'TwoShot-H', 'TwoShot-E+Ex', 'TwoShot-H+Ex']

In [203]:
# Row index into all_annotation_types / all_annotation_names (2 is 'TwoShot-H').
idx = 2
examples = prepare_examples(*all_annotation_types[idx])

In [204]:
def gpt_annotation(social_history, examples, max_attempts=4):
    """Label each social-history snippet with the model, one request at a time.

    Requests are sent sequentially; the original author reported timeout
    errors when trying to parallelise them.

    Args:
        social_history: Iterable of social-history strings.
        examples: Few-shot examples as returned by `prepare_examples`, or None.
        max_attempts: Number of times a request is tried before giving up.

    Returns:
        A list with exactly one entry per input snippet, in order: 1 when the
        reply contains 'Yes', 0 when it does not, and None when every attempt
        raised. Recording None keeps the list aligned with the inputs;
        previously such snippets were silently skipped, shifting every later
        label by one position.
    """
    max_response_tokens = 50
    responses = []

    for soc_hist in tqdm(social_history):
        query = {"context": soc_hist.strip(), "query": query_message}
        prompt = create_prompt(system_message, examples, query)

        label = None
        for attempt in range(max_attempts):
            try:
                response = send_message(prompt, deployment_name, max_response_tokens)
            except Exception as error:
                print("An exception occurred:", error)
                continue
            label = 1 if 'Yes' in response else 0
            break
        else:
            # Runs only when no attempt succeeded (the loop never hit `break`).
            print(f"Giving up after {max_attempts} failed attempts; recording None for this note.")
        responses.append(label)

    return responses

In [205]:
pancreatic_response = gpt_annotation(pancreatic_cancer_soc_history, examples)

100%|██████████| 13/13 [02:17<00:00, 10.59s/it]


In [207]:
file_name = "pancreatic_" + task + ".pkl"
file_path = os.path.join("AnnotateGPT_Coral", file_name)
# Create the output folder if needed so the save works on a fresh checkout.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, 'wb') as handle:
    pickle.dump(pancreatic_response, handle)

In [208]:
breast_response = gpt_annotation(breast_cancer_soc_history, examples)

100%|██████████| 18/18 [03:07<00:00, 10.42s/it]


In [210]:
file_name = "breast_" + task + ".pkl"
file_path = os.path.join("AnnotateGPT_Coral", file_name)
# Create the output folder if needed so the save works on a fresh checkout.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, 'wb') as handle:
    pickle.dump(breast_response, handle)

#### Reading the pickle files 

In [263]:
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): the save cells write to the AnnotateGPT_Coral/ folder, whereas
# these files are read from the working directory. Confirm which copy is intended.
pancreatic_community = _read_pickle('pancreatic_community.pkl')
pancreatic_economics = _read_pickle('pancreatic_economics.pkl')
pancreatic_tobacco = _read_pickle('pancreatic_tobacco.pkl')

In [264]:
# Turn each stored response into a 0/1 label. Raw reply strings count as 1 when
# they contain 'Yes'. Labels that are already 0/1 ints (which is what
# gpt_annotation returns) pass through unchanged, so the cell can be re-run.
pancreatic_community = [1 if response == 1 or 'Yes' in str(response) else 0 for response in pancreatic_community]
pancreatic_economics = [1 if response == 1 or 'Yes' in str(response) else 0 for response in pancreatic_economics]
pancreatic_tobacco = [1 if response == 1 or 'Yes' in str(response) else 0 for response in pancreatic_tobacco]

In [265]:
# One row per pancreatic-cancer snippet, with the three predicted labels beside it.
pancreatic_df = pd.DataFrame(
    dict(
        TEXT=pancreatic_cancer_soc_history,
        community_present=pancreatic_community,
        economics=pancreatic_economics,
        tobacco=pancreatic_tobacco,
    )
)

In [266]:
pancreatic_df

Unnamed: 0,TEXT,community_present,economics,tobacco
0,SOCIAL HISTORY: He smoked about a half a pack...,1,1,1
1,Social History Socioeconomic History  M...,1,1,1
2,social history which was performed during a \n...,0,0,0
3,SOCIAL HISTORY: Her first husband died at **...,1,1,1
4,"SOCIAL HISTORY Married, husband recently diag...",1,1,1
5,Social History Socioeconomic History  M...,0,1,0
6,SOCIAL HISTORY Social History Socioeconom...,0,0,0
7,SOCIAL HISTORY Social History Tobacco Use...,0,1,1
8,SOCIAL HISTORY Tobacco Use  Smoking status...,1,0,0
9,Social History Socioeconomic History  M...,1,1,0


In [267]:
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): the save cells write to the AnnotateGPT_Coral/ folder, whereas
# these files are read from the working directory. Confirm which copy is intended.
breast_community = _read_pickle('breast_community.pkl')
breast_economics = _read_pickle('breast_economics.pkl')
breast_tobacco = _read_pickle('breast_tobacco.pkl')

In [268]:
# Turn each stored response into a 0/1 label. Raw reply strings count as 1 when
# they contain 'Yes'. Labels that are already 0/1 ints (which is what
# gpt_annotation returns) pass through unchanged, so the cell can be re-run.
breast_community = [1 if response == 1 or 'Yes' in str(response) else 0 for response in breast_community]
breast_economics = [1 if response == 1 or 'Yes' in str(response) else 0 for response in breast_economics]
breast_tobacco = [1 if response == 1 or 'Yes' in str(response) else 0 for response in breast_tobacco]

In [269]:
# One row per breast-cancer snippet, with the three predicted labels beside it.
breast_df = pd.DataFrame(
    dict(
        TEXT=breast_cancer_soc_history,
        community_present=breast_community,
        economics=breast_economics,
        tobacco=breast_tobacco,
    )
)

In [270]:
breast_df

Unnamed: 0,TEXT,community_present,economics,tobacco
0,Social History Social History  Marital ...,1,0,0
1,SOCIAL HISTORY: She is divorced and lives with...,1,0,1
2,Social History: Social History Soci...,0,1,0
3,SOCIAL HISTORY Past Medical History: Diagno...,0,0,0
4,Social History Social History  Marital ...,0,0,0
5,SOCIAL HISTORY: Social History Social His...,1,1,0
6,SOCIAL HISTORY Past Medical History: Diagno...,0,0,0
7,Social History: She lives in ***** ***** in *...,1,0,1
8,Social History Socioeconomic History  M...,0,1,0
9,Social History Social History Narrative ...,0,0,0


### Calculating the Statistics

#### Pancreatic Cancer

In [244]:
len(pancreatic_df)

13

In [245]:
np.sum(pancreatic_df['community_present'] == 0)

4

In [246]:
(np.sum(pancreatic_df['community_present'] == 0)/len(pancreatic_df))*100

30.76923076923077

In [247]:
np.sum(pancreatic_df['economics'] == 0)

5

In [248]:
(np.sum(pancreatic_df['economics'] == 0)/len(pancreatic_df))*100

38.46153846153847

In [249]:
np.sum(pancreatic_df['tobacco'] == 0)

5

In [250]:
(np.sum(pancreatic_df['tobacco'] == 0)/len(pancreatic_df))*100

38.46153846153847

#### Pancreatic Cancer Examples

In [277]:
pancreatic_df.iloc[0]

TEXT                 SOCIAL HISTORY:  He smoked about a half a pack...
community_present                                                    1
economics                                                            1
tobacco                                                              1
Name: 0, dtype: object

In [278]:
text = pancreatic_df.iloc[0].TEXT
print(text)

SOCIAL HISTORY:  He smoked about a half a pack of cigarettes per day for about 4 years, quitting in 2006. There is a prior history of alcohol abuse where he drank up to a pint of tequila day, but he has been drinking only socially more recently. He has a past history of heroin abuse, but has not used for several years.   He is accompanied by his mother ***** who works as a *****'***** ***** and his sister ***** who is an administrative *****.      PE:    Vitals:   Vitals: BP 131/90 | Pulse 64 | 


In [279]:
# Presumably the manually assigned reference labels for the pancreatic example
# note printed above; confirm with the author.
human_family = 1
human_tobacco = 1
human_economics = 0
# Backward-compatible alias for the original, misspelled variable name.
human_ecnomics = human_economics

#### Breast Cancer

In [251]:
len(breast_df)

18

In [253]:
np.sum(breast_df['community_present'] == 0)

10

In [254]:
(np.sum(breast_df['community_present'] == 0)/len(breast_df))*100

55.55555555555556

In [255]:
np.sum(breast_df['economics'] == 0)

13

In [256]:
(np.sum(breast_df['economics'] == 0)/len(breast_df))*100

72.22222222222221

In [257]:
np.sum(breast_df['tobacco'] == 0)

15

In [258]:
(np.sum(breast_df['tobacco'] == 0)/len(breast_df))*100

83.33333333333334

#### Breast Cancer Examples

In [282]:
breast_df.iloc[1]

TEXT                 SOCIAL HISTORY: She is divorced and lives with...
community_present                                                    1
economics                                                            0
tobacco                                                              1
Name: 1, dtype: object

In [283]:
text = breast_df.iloc[1].TEXT
print(text)

SOCIAL HISTORY: She is divorced and lives with her 42-year-old son who  is unemployed.  She worked with the military.  She does not drink  alcohol.  She smoked cigarettes for 75-pack-years stopping more than 12  months ago.    


In [None]:
# Presumably the manually assigned reference labels for the breast example
# note printed above; confirm with the author.
human_family = 1
human_tobacco = 1
human_economics = 1
# Backward-compatible alias for the original, misspelled variable name.
human_ecnomics = human_economics