In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import re
import textwrap

from sklearn.model_selection import StratifiedShuffleSplit

import tensorflow as tf
from tensorflow import keras
from transformers import TFBertModel, BertTokenizerFast, TFBertForSequenceClassification

In [2]:
# Colab-only: mount Google Drive at /content/drive. The data and checkpoint paths
# used in later cells are written relative to that mount ("drive/MyDrive/...").
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the abbreviation list. The CSV carries a saved index column, which pandas
# surfaces as "Unnamed: 0" next to the "abbreviation" column.
total_abbreviations = pd.read_csv("drive/MyDrive/MIDS/medal_mimic_subset/total_abbreviations.csv")

In [4]:
# Number of rows in the abbreviation list.
print(len(total_abbreviations))

5798


In [5]:
# Preview the first rows to check the column layout and the casing of the entries.
total_abbreviations.head()

Unnamed: 0,abbreviation
0,SFG
1,CTP
2,HSES
3,MMPIs
4,ATP7B


In [6]:
# Read the three pre-split diagnosis note files (train / validation / test) in one pass.
mimic_train, mimic_validation, mimic_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "drive/MyDrive/MIDS/medal_mimic_subset/diagnoses/train.csv",
        "drive/MyDrive/MIDS/medal_mimic_subset/diagnoses/valid.csv",
        "drive/MyDrive/MIDS/medal_mimic_subset/diagnoses/test.csv",
    )
)

In [7]:
# Preview the training notes: one row per note, with the free-text TEXT column and
# the HOSPITAL_EXPIRE_FLAG column used as the label later on.
mimic_train.head()

Unnamed: 0,HADM_ID,SUBJECT_ID,TEXT,HOSPITAL_EXPIRE_FLAG,ICD9_ID
0,146431,31916,"Respiratory failure , acute ( not ARDS / Doc...",1.0,5070;51881;55220;1970;1987;1983;5849;5119;V667...
1,116532,29487,Chief Complaint : \n 24 Hour Events : \n EKG...,1.0,03843;5185;78552;42823;486;2762;5990;5849;7070...
2,116532,29487,No significant events overnight\n Renal fail...,1.0,03843;5185;78552;42823;486;2762;5990;5849;7070...
3,111458,31820,"Sepsis , Severe ( with organ dysfunction ) \...",1.0,1970;5849;1578;5119;5990;0388;5582;2869;51881;...
4,116532,29487,"Chief Complaint : urosepsis , erspiratory fa...",1.0,03843;5185;78552;42823;486;2762;5990;5849;7070...


In [8]:
# Number of rows (notes) in the unfiltered training split.
len(mimic_train)

61079

In [9]:
# Checkpoint identifier used to load the tokenizer. Note that the model itself is
# later built from a different, Drive-hosted checkpoint path.
model_checkpoint = 'NLP4H/ms_bert'
bert_tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/499 [00:00<?, ?B/s]

In [10]:
def make_lower(text):
  """Return ``text`` converted to lower case (``text`` must provide ``.lower()``)."""
  return text.lower()

# Lower-case the abbreviation column so it can be matched against lower-cased note
# tokens. The vectorised string accessor is used instead of .apply(make_lower):
# it yields the same result for strings but leaves missing entries as NaN rather
# than raising AttributeError on a float. make_lower is kept for any other callers.
total_abbreviations['abbreviation'] = total_abbreviations['abbreviation'].str.lower()

In [11]:
# Set of abbreviation values, built once so membership checks per note are cheap.
abbreviations = set(total_abbreviations.abbreviation)

def has_any_abbreviation(text, vocabulary=None):
    """
    Report whether a note contains at least one known abbreviation.

    The note is lower-cased and split on whitespace; a match requires a whole
    token to equal a vocabulary entry.

    Args:
        text: the note text. Anything that is not a ``str`` (for example the NaN
            pandas produces for an empty CSV cell) is treated as "no match"
            instead of raising.
        vocabulary: optional collection of lower-case abbreviations to match
            against. Defaults to the module-level ``abbreviations`` set.

    Returns:
        True if any whitespace-delimited token of ``text`` is in the vocabulary,
        otherwise False.
    """
    if not isinstance(text, str):
        return False
    if vocabulary is None:
        vocabulary = abbreviations
    elif not isinstance(vocabulary, (set, frozenset)):
        vocabulary = set(vocabulary)
    # isdisjoint stops at the first shared token, so no intersection set is built.
    return not vocabulary.isdisjoint(text.lower().split())

# Keep only the notes that mention at least one known abbreviation, split by split.
mimic_train_subset, mimic_validation_subset, mimic_test_subset = (
    split_frame.loc[split_frame.TEXT.apply(has_any_abbreviation)]
    for split_frame in (mimic_train, mimic_validation, mimic_test)
)

In [12]:
# Report how many notes in each split survive the abbreviation filter.
print(f"Train: {len(mimic_train_subset)} out of {len(mimic_train)}")
print(f"Validation: {len(mimic_validation_subset)} out of {len(mimic_validation)}")
print(f"Test: {len(mimic_test_subset)} out of {len(mimic_test)}")

Train: 60682 out of 61079
Validation: 9198 out of 9258
Test: 14191 out of 14287


In [13]:
# Mean of HOSPITAL_EXPIRE_FLAG over the filtered training notes
# (the positive-label rate, assuming the flag only takes the values 0 and 1).
np.average(mimic_train_subset.HOSPITAL_EXPIRE_FLAG)

0.6145150126891006

In [14]:
# Number of distinct SUBJECT_ID values among the filtered training notes
# (far fewer than the number of notes, so subjects contribute multiple rows).
mimic_train_subset.SUBJECT_ID.nunique()

5483

In [15]:
# Maximum number of token ids fed to BERT per note.
MAX_LENGTH = 512

def head_and_tail(tokens, total_size=MAX_LENGTH):
    """
    Shorten a sequence to ``total_size`` items by keeping its head and its tail.

    Args:
        tokens: a sliceable sequence supporting ``len()`` (e.g. a NumPy array or
            a 1-D tensor).
        total_size: number of items to keep. For an odd size the extra item is
            taken from the head.

    Returns:
        ``tokens`` itself, unchanged, when it already has at most ``total_size``
        items; otherwise a NumPy array of exactly ``total_size`` items made of
        the first ``ceil(total_size / 2)`` and the last ``floor(total_size / 2)``
        items.

    Raises:
        ValueError: if ``total_size`` is negative.
    """
    if total_size < 0:
        raise ValueError(f"total_size must be non-negative, got {total_size}")
    if len(tokens) <= total_size:
        return tokens
    head_size = (total_size + 1) // 2
    tail_size = total_size - head_size
    head = tokens[:head_size]
    # Use an explicit start index: tokens[-0:] would return the whole sequence
    # when tail_size is 0.
    tail = tokens[len(tokens) - tail_size:]
    return np.concatenate([head, tail])


def tokenize(texts):
    """
    Encode each text with the BERT tokenizer and trim every encoding to its
    head and tail.

    Each text is tokenized on its own, padded up to MAX_LENGTH, and each of the
    three tokenizer outputs is passed through head_and_tail.

    Returns a tuple of three arrays, in this order: input_ids, token_type_ids,
    attention_mask.
    """
    field_names = ("input_ids", "token_type_ids", "attention_mask")
    collected = {field: [] for field in field_names}

    for text in texts:
        encoded = bert_tokenizer(
            text, padding="max_length", return_tensors="tf", max_length=MAX_LENGTH
        )
        for field in field_names:
            # The tokenizer returns a batch of one; take its single row.
            collected[field].append(head_and_tail(encoded[field][0]))

    return tuple(np.array(collected[field]) for field in field_names)


# Disabled manual check of head_and_tail on a tiny array:
# tester = np.array([1,2,3,4,5,6,7,8,9,0])
# print(head_and_tail(tester, 4))
# Smoke-test the tokenizer on the first three (unfiltered) training notes.
examples = mimic_train.TEXT[0:3].tolist()
input_ids, token_type_ids, attention_mask = tokenize(examples)

# Print the encoded length, the raw note and the decoded ids of the third example
# so the original and the tokenized text can be compared by eye.
print(len(input_ids[2]))
print(examples[2])
print(bert_tokenizer.decode(input_ids[2]))




512
No significant events overnight
   Renal failure ,  Chronic  ( Chronic renal failure ,  CRF ,  Chronic kidney
   disease ) 
   Assessment : 
   U / O remains extremely low  ~  10 cc / hr ,  total body overloaded w /  4 + 
   pitting edema x all 4 extremities
   Action : 
   All meds renally dosed ,  no fluid boluses overnight
   Response : 
   Plan : 
   Cont to trend changes in BUN / CR ,  renally dose all meds ,  nephrology may
   need to re evaluate if urine output does not improve . 
   Pain control  ( acute pain ,  chronic pain ) 
   Assessment : 
   Sedated on fent / midaz grimaces during turns / repositioning
   Action : 
   Fent boluses prior to turning ,  lido patch off  @  00 : 00
   Response : 
   Continues to experience pain
   Plan : 
   Continue w /  current pain / sedation regimen ,  ortho consult to evaluate
   for septic L hip . 

[CLS] no significant events overnight renal failure, chronic ( chronic renal failure, crf, chronic kidney disease ) assessment : u / o r

In [16]:
# Tokenize the filtered notes of every split; each result is the
# (input_ids, token_type_ids, attention_mask) tuple returned by tokenize.
mimic_train_subset_inputs, mimic_validation_subset_inputs, mimic_test_subset_inputs = (
    tokenize(subset_frame.TEXT.tolist())
    for subset_frame in (mimic_train_subset, mimic_validation_subset, mimic_test_subset)
)

In [17]:
# Extract HOSPITAL_EXPIRE_FLAG of every filtered split as an int32 label array.
mimic_train_subset_labels, mimic_validation_subset_labels, mimic_test_subset_labels = (
    subset_frame.HOSPITAL_EXPIRE_FLAG.to_numpy(np.int32)
    for subset_frame in (mimic_train_subset, mimic_validation_subset, mimic_test_subset)
)

In [18]:
def create_bert_ms_model(
    model_checkpoint,
    max_sequence_length=MAX_LENGTH,
    hidden_size=100,
    dropout=0.3,
    learning_rate=0.00005
):
    """
    Build and compile a binary classifier on top of a pretrained BERT encoder.

    The second element of the BERT output (the pooled output) is fed through a
    ReLU hidden layer, dropout, and a single sigmoid unit.

    Args:
        model_checkpoint: name or path handed to ``TFBertModel.from_pretrained``.
        max_sequence_length: length of each of the three integer input sequences
            (input ids, token type ids, attention mask).
        hidden_size: number of units in the hidden dense layer.
        dropout: dropout rate applied after the hidden layer.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` whose inputs are
        ``[input_ids, token_type_ids, attention_mask]`` and whose output is the
        sigmoid activation of the classification layer. It is compiled with
        binary cross-entropy loss and an accuracy metric.

    Note:
        Calls ``tf.keras.backend.clear_session()`` first, which resets global
        Keras state.
    """

    tf.keras.backend.clear_session()

    # Honour the max_sequence_length argument rather than the module constant,
    # so callers can build models for other sequence lengths.
    sequence_shape = (max_sequence_length,)
    input_ids = tf.keras.layers.Input(shape=sequence_shape, dtype=tf.int32, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=sequence_shape, dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=sequence_shape, dtype=tf.int32, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    bert_model = TFBertModel.from_pretrained(model_checkpoint)
    bert_out = bert_model(bert_inputs)
    # Index 1 of the BERT output is the pooled sequence representation.
    pooler_token = bert_out[1]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(pooler_token)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    classification = tf.keras.layers.Dense(1, activation='sigmoid', name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])

    # The classification layer already applies a sigmoid, hence from_logits=False.
    # metrics is passed as a list, the form documented by Keras.
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                                 metrics=['accuracy'])

    return classification_model

In [19]:
# Seed Python, NumPy and TensorFlow before any weights are created, so the
# randomly initialised classification head, dropout and batch shuffling start
# from the same state on every run. GPU kernels may still be non-deterministic.
RANDOM_SEED = 42
tf.keras.utils.set_random_seed(RANDOM_SEED)

# Build the classifier on top of the encoder checkpoint stored on Drive and
# print its layer summary.
mortality_ms_bert_model = create_bert_ms_model(model_checkpoint='drive/MyDrive/MIDS/model_checkpoints/embeddings_ms/msbert_model_pretrained')
mortality_ms_bert_model.summary()

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at drive/MyDrive/MIDS/model_checkpoints/embeddings_ms/msbert_model_pretrained.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask_layer (Inpu  [(None, 512)]                0         []                            
 tLayer)                                                                                          
                                                                                                  
 input_ids_layer (InputLaye  [(None, 512)]                0         []                            
 r)                                                                                               
                                                                                                  
 token_type_ids_layer (Inpu  [(None, 512)]                0         []                            
 tLayer)                                                                                      

In [20]:
# Checkpoints are written under checkpoint_dir. The {epoch} and {val_accuracy}
# placeholders in the file name are left unformatted here so that the callback
# can fill them in when it saves; val_accuracy requires validation data in fit().
checkpoint_dir = 'drive/MyDrive/MIDS/model_checkpoints/mortality_ms_downstream/'
checkpoint_filepath = checkpoint_dir + 'weights.{epoch:02d}-{val_accuracy:.2f}.model.keras'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,  # request whole-model checkpoints, not weights only
)

In [21]:
# Train on the filtered training notes for two epochs, validating on the filtered
# validation notes. validation_data is needed because the checkpoint file name
# template references val_accuracy. The returned History object is kept so the
# per-epoch metrics can be saved afterwards.
mortality_ms_history = mortality_ms_bert_model.fit(
    mimic_train_subset_inputs,
    mimic_train_subset_labels,
    epochs=2,
    batch_size=32,
    validation_data=(mimic_validation_subset_inputs, mimic_validation_subset_labels),
    shuffle = True,
    callbacks=[model_checkpoint_callback],
)

Epoch 1/2



Epoch 2/2


In [22]:
import json

# Persist the per-epoch training metrics next to the model checkpoints.
# Values are coerced to built-in floats first: metric values recorded as NumPy
# scalars would otherwise make json.dump raise "not JSON serializable".
serializable_history = {
    metric_name: [float(value) for value in values]
    for metric_name, values in mortality_ms_history.history.items()
}
with open('drive/MyDrive/MIDS/model_checkpoints/mortality_ms_downstream/history.json', 'w') as f:
    json.dump(serializable_history, f)

In [23]:
# Evaluate on the filtered test notes. results[0] is the loss and results[1] the
# accuracy metric the model was compiled with.
results = mortality_ms_bert_model.evaluate(mimic_test_subset_inputs, mimic_test_subset_labels)
print(f"Test loss: {results[0]}\nTest accuracy: {results[1]}")

Test loss: 0.6675019264221191
Test accuracy: 0.7815516591072083


In [24]:
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [25]:
# Get the model's sigmoid outputs for the test notes; they are thresholded later
# to compute precision, recall and F1.
y_pred = mortality_ms_bert_model.predict(mimic_test_subset_inputs)




In [26]:
# Predictions come back as a column vector, one row per test note.
y_pred.shape

(14191, 1)

In [27]:
# Labels are a flat 1-D array, unlike the (n, 1) predictions.
mimic_test_subset_labels.shape

(14191,)

In [28]:
# Turn the sigmoid outputs into hard 0/1 predictions with a 0.5 cut-off.
# y_pred_classes keeps the (n, 1) shape of y_pred, while y_true is 1-D.
y_pred_classes = (y_pred > 0.5).astype(int)
y_true = mimic_test_subset_labels

# Per-class precision / recall / F1 plus the averaged scores.
report = classification_report(y_true, y_pred_classes)
print(report)

              precision    recall  f1-score   support

           0       0.62      0.86      0.72      4626
           1       0.92      0.74      0.82      9565

    accuracy                           0.78     14191
   macro avg       0.77      0.80      0.77     14191
weighted avg       0.82      0.78      0.79     14191

