In [None]:
import pickle
import pandas as pd
from xgboost import XGBClassifier
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, roc_curve, auc, precision_recall_fscore_support, confusion_matrix
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize
import string

In [None]:
import warnings
# Silence every Python warning for the rest of the session, whichever library
# emits it in the cells below.
# NOTE(review): depending on the installed pandas version this may also hide the
# bad-line warnings requested with on_bad_lines='warn' when the MIMIC-SBDH CSV
# is read further down — confirm before relying on those warnings.
warnings.filterwarnings("ignore")

## Extract Data into Accepted Format
### Unannotated DataFrame Necessary Columns:
- <b>ID</b> Unique identifier number for ease of reference (named <code>ROW_ID</code> in the example below)
- <b>TEXT</b> This column should hold all relevant text that should be annotated by AnnotateGPT

##### Example Unannotated Sample
<table>
  <tr>
    <th><center>ROW_ID</center></th>
    <th><center>TEXT</center></th>
  </tr>
  <tr>
    <td><center>1</center></td>
    <td><center>Patient...</center></td>
  </tr>
  <tr>
    <td><center>...</center></td>
    <td><center>...</center></td>
  </tr>
  <tr>
    <td><center>2038</center></td>
    <td><center>The patient comes from...</center></td>
  </tr>
</table>

### Annotated DataFrame Necessary Columns:
- <b>ID</b> Unique identifier number for ease of reference (named <code>ROW_ID</code> in the example below)
- <b>TEXT</b> This column should hold all relevant text that should be annotated by AnnotateGPT
- <b>LABEL NAME</b> This column should hold the categorization for the label in question. There may be more than one <i>LABEL NAME</i> column, and each should have its own unique name.

##### Example Annotated Sample
<table>
  <tr>
    <th><center>ROW_ID</center></th>
    <th><center>TEXT</center></th>
    <th><center>sdoh_community_present</center></th>
    <th><center>sdoh_economics</center></th>
    <th><center>behavior_tobacco</center></th>
  </tr>
  <tr>
    <td><center>1</center></td>
    <td><center>The patient...</center></td>
    <td><center>1</center></td>
    <td><center>0</center></td>
    <td><center>1</center></td>
  </tr>
  <tr>
    <td><center>...</center></td>
    <td><center>...</center></td>
    <td><center>...</center></td>
    <td><center>...</center></td>
    <td><center>...</center></td>
  </tr>
    <tr>
    <td><center>233</center></td>
    <td><center>Patient's family...</center></td>
    <td><center>1</center></td>
    <td><center>0</center></td>
    <td><center>0</center></td>
  </tr>
</table>

### Example Extraction of MIMIC-III and MIMIC-SBDH Data

In [None]:
# Column names shared by the cells below: the note identifier column, the
# free-text column, and the label columns taken from the annotated set.
UNIQUE_ID_COLUMN_NAME = "ROW_ID"
UNIQUE_TEXT_COLUMN_NAME = "TEXT"
UNIQUE_LABEL_COLUMN_NAMES = [
    'sdoh_community_present',
    'sdoh_economics',
    'behavior_tobacco',
]

In [None]:
def retrieve_social_history(df, id_column=None, text_column=None, max_chars=500):
    """Extract the "Social History" section from every note in ``df``.

    The section starts at the first case-insensitive occurrence of
    ``'social history:'`` and ends at the nearest following section header
    (family history, physical exam, medications, hospital course, review of
    systems) or after ``max_chars`` characters, whichever comes first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one note per row, with an identifier column and a text
        column.
    id_column : str, optional
        Name of the identifier column. Defaults to ``UNIQUE_ID_COLUMN_NAME``.
    text_column : str, optional
        Name of the text column. Defaults to ``UNIQUE_TEXT_COLUMN_NAME``.
    max_chars : int, optional
        Upper bound on the length of the extracted section (default 500).

    Returns
    -------
    pandas.DataFrame
        Two columns (``id_column``, ``text_column``), one row per input row, in
        input order. Notes without a ``'social history:'`` header get an empty
        string.
    """
    id_column = UNIQUE_ID_COLUMN_NAME if id_column is None else id_column
    text_column = UNIQUE_TEXT_COLUMN_NAME if text_column is None else text_column

    section_end_markers = (
        'family history:',
        'physical exam',
        'medications:',
        'hospital course:',
        'review of systems:',
    )

    extracted = []
    # Walk ids and texts together instead of re-filtering the frame per id.
    for row_id, note in zip(df[id_column], df[text_column]):
        lowered = note.lower()
        section_start = lowered.find('social history:')
        if section_start == -1:
            # No header: slicing from -1 would return a meaningless fragment.
            extracted.append((row_id, ''))
            continue

        # Search from the header onward so a marker that also occurs earlier in
        # the note does not hide the occurrence that actually ends the section.
        marker_positions = [lowered.find(marker, section_start) for marker in section_end_markers]
        candidate_ends = [pos for pos in marker_positions if pos > section_start]
        candidate_ends.append(section_start + max_chars)
        extracted.append((row_id, note[section_start:min(candidate_ends)]))

    return pd.DataFrame(extracted, columns=[id_column, text_column])

In [None]:
# Paths to the MIMIC CSV files. The values below are bare file names, so they
# resolve relative to the current working directory; replace each one with the
# full path to the file on your machine if it lives elsewhere.
MIMIC_ADMISSION_CSV = "ADMISSIONS.csv" # ADMISSIONS table in your MIMIC-III folder
MIMIC_NOTEEVENTS_CSV = "NOTEEVENTS.csv" # NOTEEVENTS table in your MIMIC-III folder
MIMIC_SBDH = "MIMIC-SBDH.csv" # annotation file in your MIMIC-SBDH folder

In [None]:
# Load the annotated and unannotated MIMIC discharge summaries.

# Subjects with a NEWBORN admission are excluded from the unannotated pool.
df = pd.read_csv(MIMIC_ADMISSION_CSV)
newborn_list = df[df["ADMISSION_TYPE"] == "NEWBORN"].SUBJECT_ID.to_list()
notes_df = pd.read_csv(MIMIC_NOTEEVENTS_CSV)
discharge_df = notes_df[notes_df['CATEGORY'] == 'Discharge summary']
non_neonatal = discharge_df[~discharge_df['SUBJECT_ID'].isin(newborn_list)]

# Pass the path straight to pandas: it opens the file read-only and closes it
# again, instead of leaving a read/write handle open for the whole session.
sbdh_data = pd.read_csv(MIMIC_SBDH, encoding='UTF-8', on_bad_lines='warn')
sbdh_data = sbdh_data.rename(columns={'row_id':UNIQUE_ID_COLUMN_NAME})
annotated_list = sbdh_data[UNIQUE_ID_COLUMN_NAME].tolist()
annotated_notes = discharge_df[discharge_df[UNIQUE_ID_COLUMN_NAME].isin(annotated_list)]
annotated_subjects = annotated_notes.SUBJECT_ID.to_list()

# Flag the notes that contain a "social history:" header (case-insensitive).
# Mapping over the text column avoids building a Series per row as iterrows does.
has_social_history = (
    non_neonatal[UNIQUE_TEXT_COLUMN_NAME]
    .map(lambda note: isinstance(note, str) and 'social history:' in note.lower())
    .astype(bool)
)
no_soc_his = non_neonatal.loc[~has_social_history, UNIQUE_ID_COLUMN_NAME].tolist()

final_sdoh_list = non_neonatal[~non_neonatal[UNIQUE_ID_COLUMN_NAME].isin(no_soc_his)]
unnanotated_notes = final_sdoh_list[~final_sdoh_list[UNIQUE_ID_COLUMN_NAME].isin(annotated_list)]

annotated_sh = retrieve_social_history(annotated_notes)
annotated_sh = pd.merge(annotated_sh,sbdh_data[[UNIQUE_ID_COLUMN_NAME] + UNIQUE_LABEL_COLUMN_NAMES],on=UNIQUE_ID_COLUMN_NAME, how='left')
unannotated_sh = retrieve_social_history(unnanotated_notes)

# Drop the references to the large intermediates so their memory can be
# reclaimed; only annotated_sh, unannotated_sh and annotated_notes stay alive.
df = newborn_list = notes_df = discharge_df = non_neonatal = annotated_list = annotated_subjects = no_soc_his = has_social_history = final_sdoh_list = unnanotated_notes = sbdh_data = None

In [None]:
# Collapse the multi-valued label codes into binary targets:
#   sdoh_economics   -> 1 only for code 2
#   behavior_tobacco -> 1 for code 1 or 2
# NOTE: this cell overwrites the columns it reads, so running it a second time
# re-maps the already-binary values (sdoh_economics would become all zeros).
# Re-run the loading cell first if this cell has to be executed again.
economics_binary = [int(code == 2) for code in annotated_sh['sdoh_economics'].to_list()]
tobacco_binary = [int(code in (1, 2)) for code in annotated_sh['behavior_tobacco'].to_list()]

# Drop and re-add so the two binary columns end up as the last columns.
annotated_sh = annotated_sh.drop(columns=['sdoh_economics', 'behavior_tobacco'])
for column_name, binary_values in (('sdoh_economics', economics_binary),
                                   ('behavior_tobacco', tobacco_binary)):
    annotated_sh[column_name] = binary_values

## Example XGBoost setup for MIMIC tasks
Select one of the 3 MIMIC tasks available by setting the variable for the desired task to True and leaving the other two set to False. If you wish to train using AnnotateGPT training sets, place the “XX-XX-gpt-train.pkl” files created with the <b>AnnotateGPT - SHARECOPY</b> file in the appropriate task subfolder of the <b>SBDH-Annotated-Sets</b> folder. Pre-annotated Community AnnotateGPT training sets have been provided as a demo.

In [None]:
# Pick a MIMIC task: set exactly one flag to True and leave the other two False.
# Default: community
community = True
economics = False
tobacco = False

# Booleans sum as 0/1, so the total counts how many tasks are switched on.
assert sum([community, economics, tobacco]) == 1, "One and only one must be True, the other two must be False"

In [None]:
# Translate the selected flag into the task folder name and the label column
# that the training and evaluation cells read.
if community:
    task, label_column = 'community', "sdoh_community_present"
elif economics:
    task, label_column = 'economics', "sdoh_economics"
else:
    task, label_column = 'tobacco', "behavior_tobacco"

In [None]:
# Discover the pickled data sets for the selected task. Files whose name
# contains 'test' are test sets; every other .pkl file is a training set.
# Sorting makes the order (and the chosen test set) independent of the
# arbitrary order returned by os.listdir.
annotated_sets = sorted(f for f in os.listdir(f'./SBDH-Annotated-Sets/{task}/') if f.endswith('.pkl'))
TRAINSETS = [x for x in annotated_sets if 'test' not in x]
test_candidates = [x for x in annotated_sets if 'test' in x]
if not test_candidates:
    raise FileNotFoundError(f"No test set (.pkl file with 'test' in its name) found in ./SBDH-Annotated-Sets/{task}/")
# If several test files are present, the alphabetically first one is used.
TESTSET = test_candidates[0]

In [None]:
def get_metrics_best_threshold(predictions, true_labels, threshold=0.5):
    """Compute AUROC, a classification report and a confusion matrix.

    Despite the name, no threshold search is performed: class predictions are
    obtained by comparing the positive-class score with a fixed ``threshold``.

    Parameters
    ----------
    predictions : sequence of pairs
        Per-sample scores; element ``[1]`` of each entry is used as the
        positive-class score (e.g. the output of ``predict_proba``).
    true_labels : sequence
        Ground-truth labels aligned with ``predictions``.
    threshold : float, optional
        A sample is predicted positive when its score is strictly greater than
        this value (default 0.5).

    Returns
    -------
    dict
        ``'clss_report'`` (text report with 4 digits), ``'auroc'`` and
        ``'confusion_matrix'``.
    """
    # Extract the positive-class scores once and reuse them below.
    positive_scores = [n[1] for n in predictions]

    # roc_curve returns (fpr, tpr, thresholds), in that order.
    fpr, tpr, _ = roc_curve(true_labels, positive_scores)
    auroc = auc(fpr, tpr)

    class_preds = [1 if score > threshold else 0 for score in positive_scores]
    cm = confusion_matrix(true_labels, class_preds)
    target_names = ['negative', 'positive']
    clss_report = classification_report(true_labels, class_preds, target_names=target_names,digits=4)

    return {'clss_report':clss_report, 'auroc':auroc, 'confusion_matrix':cm}

In [None]:
def tokenizer_better(text):
    """Lowercase ``text``, blank out punctuation and digits, then tokenize it.

    Every character of ``string.punctuation`` and every ASCII digit is replaced
    by a single space before the text is handed to ``word_tokenize``.

    Returns the token list produced by ``word_tokenize``.
    """
    blanked_chars = string.punctuation + '0123456789'
    # Two equal-length strings: each blanked character maps to one space.
    to_spaces = str.maketrans(blanked_chars, " " * len(blanked_chars))
    return word_tokenize(text.lower().translate(to_spaces))

In [None]:
# Train one XGBoost model per training-set file and per training-set size, and
# score each model on the held-out test set.

SAMPLING_SEED = 20  # fixed seed so the class-balanced sub-samples are repeatable
EXAMPLE_LIST = [16,32,64,128,256,512,1024,2048]
my_stop_words = stopwords.words('english')

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# The test set does not depend on the training set, so it is loaded once.
with open(f'./SBDH-Annotated-Sets/{task}/{TESTSET}','rb') as test_file:
    test_id_list = pickle.load(test_file)[UNIQUE_ID_COLUMN_NAME].to_list()
test_df = annotated_sh[annotated_sh[UNIQUE_ID_COLUMN_NAME].isin(test_id_list)]

# Sets give constant-time membership checks for the id validation below.
unannotated_ids = set(unannotated_sh[UNIQUE_ID_COLUMN_NAME])
annotated_ids = set(annotated_sh[UNIQUE_ID_COLUMN_NAME])

trainset_metrics = []

for trainset in TRAINSETS:
    with open(f'./SBDH-Annotated-Sets/{task}/{trainset}','rb') as train_file:
        train_example = pickle.load(train_file)
    train_id_list = train_example[UNIQUE_ID_COLUMN_NAME].to_list()

    if unannotated_ids.issuperset(train_id_list):
        # Training ids come from the unannotated pool: the labels live in the
        # pickled training set, so merge in the column for the selected task.
        master_df = unannotated_sh
        training_df = master_df[master_df[UNIQUE_ID_COLUMN_NAME].isin(train_id_list)]
        training_df = pd.merge(training_df,train_example[[UNIQUE_ID_COLUMN_NAME, label_column]],on=UNIQUE_ID_COLUMN_NAME, how='left')
    elif annotated_ids.issuperset(train_id_list):
        # Training ids come from the annotated pool, which already has labels.
        master_df = annotated_sh
        training_df = master_df[master_df[UNIQUE_ID_COLUMN_NAME].isin(train_id_list)]
    else:
        raise Exception("Incorrect training sample list, cannot continue")

    full_metrics = []
    for EXAMPLES in EXAMPLE_LIST:
        # Balanced sample: half negatives, half positives.
        per_class = EXAMPLES // 2
        example_train = pd.concat([
            training_df[training_df[label_column] == 0].sample(n=per_class, random_state=SAMPLING_SEED),
            training_df[training_df[label_column] == 1].sample(n=per_class, random_state=SAMPLING_SEED),
        ])

        # The vocabulary is fitted on the training sample only.
        vect = CountVectorizer(max_features = 3000,
                               tokenizer = tokenizer_better,
                               stop_words = my_stop_words)
        train_vect = vect.fit_transform(example_train[UNIQUE_TEXT_COLUMN_NAME].values)
        test_vect = vect.transform(test_df[UNIQUE_TEXT_COLUMN_NAME].values)

        train_label = example_train[label_column]
        test_label = test_df[label_column]

        model_xgb = XGBClassifier(subsample = 0.7, n_estimators = 750, max_depth = 48, learning_rate = 0.016, colsample_bytree = 0.8, colsample_bylevel = 0.5, seed = 20)
        model_xgb = model_xgb.fit(train_vect, train_label)

        predictions = model_xgb.predict_proba(test_vect)

        full_metrics.append([EXAMPLES, get_metrics_best_threshold(predictions,test_label)])
    trainset_metrics.append([trainset, full_metrics])

## Results

In [None]:
# Print, for every training set and every training-set size, the
# classification report, the AUROC (rounded to 4 decimals) and the confusion matrix.
for trainset_name, size_results in trainset_metrics:
    print(f"----------{trainset_name}----------")
    for n_examples, scores in size_results:
        auroc_rounded = round(scores['auroc'],4)
        print(f"-----------------------{n_examples}---------------------------")
        print(scores['clss_report'])
        print(f"AUROC: {auroc_rounded}")
        print("Confusion Matrix")
        print(scores["confusion_matrix"])