# Problem 1: Modeling with ClinicalBERT Embeddings

Note: To avoid spending the GPU allocation Colab gives you on parts that do not need a GPU, use a CPU runtime (Runtime > Change Runtime Type > Hardware accelerator: None) until the notebook tells you to switch.

## Install Libraries

In [None]:
!pip install transformers
!pip install scikit-learn
!pip install umap-learn

## Setting up Google Drive
Copy the data at the [following link](https://drive.google.com/drive/folders/1G5NuAnUSaKzcry-tzgPZKxafG_vcOzX9?usp=sharing) to a folder in your own Google Drive, then set `data_path` in the cell below to that folder.

In [None]:
# Path to saved data
# Folder on the mounted Google Drive (mounted under /content/drive) that
# holds the copied data. Later cells read text_and_hypertension_data.h5 from
# this folder and read/write embeds.npy in it.
#------YOUR CODE HERE--------
data_path = "/content/drive/MyDrive/path_to_your_folder"
#------YOUR CODE ENDS--------

In [None]:
from google.colab import auth, drive
# Mount Google Drive at /content/drive so that data_path (which points below
# /content/drive) can be read and written like a local folder
drive.mount('/content/drive')
# Authenticate the current Colab user with Google
# NOTE(review): nothing in this notebook visibly depends on this call - confirm it is needed
auth.authenticate_user()

In [None]:
import numpy as np
import pandas as pd
import os 
import random
import sklearn
import importlib
import pickle
import math
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import time

from pathlib import Path
from torch.utils import data
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel, BertConfig, BertTokenizer, BertForMaskedLM, InputExample

# Display up to 50 columns when printing a DataFrame
pd.set_option('display.max_columns', 50)
# Turn off pandas' chained-assignment warning
pd.options.mode.chained_assignment = None

# Seed Python's and NumPy's random number generators for reproducibility
# (PyTorch is not seeded here; fill_blank seeds it on every call)
random.seed(456)
np.random.seed(456)

(a) You can read more about ClinicalBERT [here](https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT)

In [None]:
#------YOUR CODE HERE--------
# Both objects are used as globals by the later cells: fill_blank reads the
# model's logits and get_sent_rep requests its hidden states.
# Initialize the tokenizer
tokenizer = 

# Initialize the model
model = 
#------YOUR CODE ENDS--------

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
# fill_blank and get_sent_rep read this global to decide where tensors live.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

(b)

In [None]:
def fill_blank(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer) -> str:
    '''
    Given a sentence with a single blank (denoted by an underscore), queries the BERT model to 
        fill in the missing token.
        
    Inputs:
        - text: sentence containing a single underscore corresponding to the missing token
        - model: pytorch ClinicalBERT model, of type BertForMaskedLM
        - tokenizer: BertTokenizer object
    
    Output:
        - string corresponding to the sentence where the underscore is replaced with the most likely token
    '''
    random.seed(456)
    np.random.seed(456)
    torch.manual_seed(456)
    
    #------YOUR CODE HERE--------
    # Replace the underscore by [MASK] and store the result in masked_str
    masked_str = 

    # Tokenize the masked string and store the tokens in inputs
    inputs = 
    #------YOUR CODE ENDS--------

    inputs = inputs.to(device)
    with torch.no_grad():
        #------YOUR CODE HERE--------
        # Compute the logits (log probabilities) from the model
        logits = 
        #------YOUR CODE ENDS--------

    mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

    predicted_logits = logits[0, mask_token_index] 
    #------YOUR CODE HERE--------
    # Select the most likely token in predicted_logits
    predicted_token_id = 

    # Use the tokenizer to decode the token id into a string
    pred = 
    #------YOUR CODE ENDS--------

    return text.replace('_', pred)

In [None]:
# Sanity-check fill_blank on two prompts that differ only in the profession
nurse_sent = '30 yo white _ helping other nurses at the ICU'
doc_sent = '30 yo white _ helping other doctors at the ICU'
# Each prompt is paired with the completion the reference solution produces
checks = [
    (nurse_sent, "30 yo white female helping other nurses at the ICU"),
    (doc_sent, "30 yo white male helping other doctors at the ICU"),
]
for prompt, expected in checks:
    completed = fill_blank(prompt, model, tokenizer)
    print(f"Predicted sentence: {completed}")
    print(f"Expected sentence: {expected}")

(c)

In [None]:
#------YOUR CODE HERE--------
# Each sentence should contain a single underscore: fill_blank treats the
# underscore as the blank to be completed
sent1 = 'Sentence 1 goes here!'
sent2 = 'Sentence 2 goes here!'
#------YOUR CODE ENDS--------
# Print both sentences with the blank replaced by the model's prediction
print(f"Sentence 1 (completed): {fill_blank(sent1, model, tokenizer)}")
print(f"Sentence 2 (completed): {fill_blank(sent2, model, tokenizer)}")

(d) Answer in your report

(e)

Change the runtime to GPU for this part (Runtime > Change Runtime Type > Hardware Accelerator: GPU)

In [None]:
# Load the data
# Reads text_and_hypertension_data.h5 from data_path into a DataFrame; later
# cells use its "text" column (to build embeddings) and its "Hypertension"
# column (as the labels)
df = pd.read_hdf(os.path.join(data_path, "text_and_hypertension_data.h5"))

In [None]:
def get_sent_rep(model, tokenizer, txt):
    """
    Compute the sentence representation and return it as a numpy array
    If done correctly, the numpy array should be of size 768
    """
    # Tokenize the input txt and store the result in inputs
    # Remember to set truncation=True and max_length=512
    #------YOUR CODE HERE--------
    inputs = 
    #------YOUR CODE ENDS--------
    inputs.to(device)
    model.to(device)
    with torch.no_grad():
        #------YOUR CODE HERE--------
        # Compute the model outputs and store the result in outputs
        # Make sure output_hidden_states=True
        outputs = 
        #------YOUR CODE ENDS--------
    
        embed = outputs.hidden_states[-1]

        #------YOUR CODE HERE--------
        # embed (of size [1, input_length, 768])
        # contains the hidden states corresponding to each
        # token at the final layer of the model
        # Each hidden state is a vector of size 768
        # Compute the mean of these vectors to get a representation
        # of the input sentence, and store the mean again in embed
        embed = 
        #------YOUR CODE ENDS--------

        embed = embed.squeeze()

    return embed.cpu().detach().numpy()

The following block of code should take around 15 minutes to run on a GPU the first time it is run. The result is saved to `embeds.npy` under `data_path`; make sure that folder is in your Drive so that you do not need to run it again.

In [None]:
# Set to True to force the embeddings to be recomputed even if a saved copy exists
recompute_embeds = False
# Where the embeddings are cached between sessions
embeds_path = os.path.join(data_path, "embeds.npy")
# Only regenerate embeds if necessary
if recompute_embeds or not os.path.exists(embeds_path):
    # Generate embeddings: one sentence representation per row of df, in row order
    start = time.time()
    embeds = [get_sent_rep(model, tokenizer, note_data) for note_data in df["text"]]
    # Elapsed time in seconds
    print(time.time() - start)
    # Stack the per-note vectors into one array with one row per note
    X = np.stack(embeds, axis=0)
    with open(embeds_path, "wb") as f:
        np.save(f, X)

(f)

You can change the runtime back to CPU to avoid using your GPU allocation

In [None]:
from sklearn.model_selection import train_test_split

# Features: the cached embeddings. Row i of X is the embedding of the
# discharge summary of the patient in row i of the dataset.
embeds_file = os.path.join(data_path, "embeds.npy")
with open(embeds_file, "rb") as fh:
    X = np.load(fh)

# Labels: whether the patient in each row had hypertension during
# their ICU stay
y = df['Hypertension'].tolist()

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split reproducible
split = train_test_split(X, y, test_size=0.25, random_state=456)
X_train, X_test, y_train, y_test = split

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Learn the scaling statistics from the training split only, then apply
# them to that same split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
#------YOUR CODE HERE--------
# Create and fit your logistic regression model on the training data
# Make sure to use multi_class = "multinomial" and class_weight="balanced"



#------YOUR CODE ENDS--------


(g)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
#------YOUR CODE HERE--------
# Compute the performance metrics on the training set





#------YOUR CODE ENDS--------

In [None]:
# Scale the test data
# Reuses the scaler that was fitted on X_train (it is not refitted here), so
# the test set is transformed with the training-set statistics
X_test_scaled = scaler.transform(X_test)

In [None]:
#------YOUR CODE HERE--------
# Compute the performance metrics on the test set






#------YOUR CODE ENDS--------

(h)

In [None]:
from umap import UMAP

In [None]:
#------YOUR CODE HERE--------
# Use UMAP to project the scaled training data onto two dimensions
# Make sure to use random_state=456



#------YOUR CODE ENDS--------

In [None]:
import matplotlib.pyplot as plt
#------YOUR CODE HERE--------
# Plot the UMAP embeddings on a scatter plot





#------YOUR CODE ENDS--------

In [None]:
#------YOUR CODE HERE--------
# Use UMAP to project the scaled test data onto two dimensions
# Make sure to use random_state=456



#------YOUR CODE ENDS--------

In [None]:
#------YOUR CODE HERE--------
# Plot the UMAP embeddings on a scatter plot





#------YOUR CODE ENDS--------

(i) Answer in your report

(j)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
#------YOUR CODE HERE--------
# Use LDA to project the scaled training data onto a single dimension



#------YOUR CODE ENDS--------

In [None]:
#------YOUR CODE HERE--------
# Plot the LDA embeddings on two histograms on the same plot




#------YOUR CODE ENDS--------

In [None]:
#------YOUR CODE HERE--------
# Use LDA to project the scaled test data onto a single dimension



#------YOUR CODE ENDS--------

In [None]:
#------YOUR CODE HERE--------
# Plot the LDA embeddings on two histograms on the same plot




#------YOUR CODE ENDS--------

(k) Answer in your report

(l) Answer in your report