This notebook was created to support the data preparation required to support our CS 598 DLH project.  The paper we have chosen for the reproducibility project is:
***Ensembling Classical Machine Learning and Deep Learning Approaches for Morbidity Identification from Clinical Notes ***



 

The data cannot be shared publicly due to the agreements required to obtain the data so we are storing the data locally and not putting in GitHub.

In [33]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import datetime
from tqdm import tqdm
import torchtext

# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
# define data path
DATA_PATH = './obesity_data/'

test_df = pd.read_pickle(DATA_PATH + '/test.pkl') 
train_df = pd.read_pickle(DATA_PATH + '/train.pkl') 
#corpus = pd.read_pickle(DATA_PATH + '/corpus.pkl')
disease_list = test_df['disease'].unique().tolist()
embedding_list = ['GloVe', "FastText"]

In [34]:
#rebuild the vocabulary
#import torchtext, torch, torch.nn.functional as F
#from torchtext.data.utils import get_tokenizer
#from torchtext.vocab import build_vocab_from_iterator

#tokenizer = get_tokenizer("basic_english")
#tokens = [tokenizer(doc) for doc in corpus]

#voc = build_vocab_from_iterator(tokens, specials = ['<pad>'])

**Deep Learning - Bag of Words - All Feature Selections - Averaged**

![DL BagOfWords AllFeatures Averaged](images\DL-BagOfWords-ByFeatureSelection.gif)

****DL Model using word embeddings****

First we start by creating a dataset.  Note this will have to take the disease as part of the init and filter just for those records.

In [35]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class ClinicalNoteDataset(Dataset):

    def __init__(self, dataframe, disease, dataformat):
        """
        TODO: init the Dataset instance.  datafomat is just the column to use from the dataframe 'vector_tokenized' , 'one_hot'
        """
        # your code here
        self.disease = disease
        self.dataformat = dataformat
        self.df = dataframe[dataframe['disease'] == disease].copy()
        self.df = self.df.reset_index()

    def __len__(self):
        """
        TODO: Denotes the total number of samples
        """
        return len(self.df)

    def __getitem__(self, i):
        """
        TODO: Generates one sample of data
            return X, y for the i-th data.
        """
        #Cannot make tensors yet, will need to happen in collate
        Y = self.df.iloc[i]['judgment']
        X = self.df.iloc[i][self.dataformat]

        return X,Y
        
def vectorize_batch_GloVe(batch):
    embedding_size_used = 300
    vec = torchtext.vocab.GloVe(name='6B', dim=embedding_size_used)    
    Xi, Yi = batch[0]
    batch_size = len(batch)

    X = torch.zeros(batch_size, len(Xi), embedding_size_used, dtype=torch.float)
    Y = torch.zeros((batch_size), dtype=torch.long)
    
    for i in range(len(batch)):
        x, y = batch[i]
        #vectors = vec.get_vecs_by_tokens(voc.lookup_tokens(x.tolist()))
        vectors = vec.get_vecs_by_tokens(x)

        X[i] = torch.tensor(vectors).float()
        Y[i] = torch.tensor(y)

    return X,Y

def vectorize_batch_FastText(batch):
    embedding_size_used = 300
    vec = torchtext.vocab.FastText()
    Xi, Yi = batch[0]
    batch_size = len(batch)

    X = torch.zeros(batch_size, len(Xi), embedding_size_used, dtype=torch.float)
    Y = torch.zeros((batch_size), dtype=torch.long)

    for i in range(len(batch)):
        x, y = batch[i]
        #vectors = vec.get_vecs_by_tokens(voc.lookup_tokens(x.tolist()))
        vectors = vec.get_vecs_by_tokens(x)

        X[i] = torch.tensor(vectors).float()
        Y[i] = torch.tensor(y)

    return X,Y 
 
          


In [36]:
##Test DataLoader
batch_size = 128
train_loader = torch.utils.data.DataLoader(ClinicalNoteDataset(train_df, 'Asthma', 'vector_tokenized'), batch_size = batch_size, shuffle=True, collate_fn=vectorize_batch_FastText)
val_loader = torch.utils.data.DataLoader(ClinicalNoteDataset(test_df, 'Asthma', 'vector_tokenized'), batch_size = batch_size, shuffle=False, collate_fn=vectorize_batch_FastText)

print("# of train batches:", len(train_loader))
print("# of val batches:", len(val_loader))

train_iter = iter(train_loader)
x,y = next(train_iter)

print(x.shape)
print(y.shape)


# of train batches: 6
# of val batches: 5


  X[i] = torch.tensor(vectors).float()
  Y[i] = torch.tensor(y)


torch.Size([128, 1430, 300])
torch.Size([128])


In [37]:
print(x)

tensor([[[-0.0511, -0.2768, -0.0731,  ...,  0.5828,  0.1450, -0.1671],
         [ 0.1799,  0.1318,  0.1369,  ...,  0.1043,  0.1077, -0.2493],
         [ 0.3602, -0.2264, -0.6756,  ..., -0.0103,  0.0824,  0.3035],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.1799,  0.1318,  0.1369,  ...,  0.1043,  0.1077, -0.2493],
         [-0.4686, -0.0940, -0.0310,  ...,  0.1116,  0.3388,  0.0599],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.1799,  0.1318,  0.1369,  ...,  0

In [38]:
class ClincalNoteEmbeddingNet(nn.Module):
    def __init__(self, embedding_type, max_tokens):
        super(ClincalNoteEmbeddingNet, self).__init__()
        
        self.embedding_type = embedding_type
        self.max_tokens = max_tokens

        if(embedding_type == 'USE'):
            self.embedding_dimension = 512
        else:
            self.embedding_dimension = 300

        self.hidden_dim1 = 128
        self.hidden_dim2 = 64
        self.num_layers = 1

        #Because it is bidirectional, the output from LTSM is coming in twice the size of the hidden states required.
        #input is (batch, #of tokens * embedding_dimension)
        self.bilstm1 = nn.LSTM(input_size = self.embedding_dimension, hidden_size = self.hidden_dim1, bidirectional = True, batch_first = True, num_layers = self.num_layers) 
        self.bilstm2 = nn.LSTM(input_size = self.hidden_dim1 * 2, hidden_size = self.hidden_dim2, bidirectional = True, batch_first = True, num_layers=self.num_layers)
        #self.fc1 = nn.Linear(self.hidden_dim2*2,2)
        self.fc1 = nn.Linear(self.hidden_dim2 * self.max_tokens * 2, 2)
        #self.fc2 = nn.Linear(self.hidden_dim2, 2)

        #self.relu1 = nn.ReLU()
        #self.relu2 = nn.ReLU()
        #self.do = nn.Dropout()
    
    #def _init_hidden(self, current_batch_size):
    #    """Sets initial hidden and cell states (for LSTM)."""
    #    num_layers = self.num_layers
    #    h0 = torch.zeros(num_layers * 2, current_batch_size, self.hidden_dim2)
    #    c0 = torch.zeros(num_layers * 2, current_batch_size, self.hidden_dim2)

    #    return h0, c0        

    def forward(self, x):
        #reshape as it is a 2 dimensional embedding
        
        #h, c = self._init_hidden(current_batch_size=batch_size)

        x, states = self.bilstm1(x)
        x, states = self.bilstm2(x)

        #output = self.relu1(output)
        #output, states = self.bilstm2(output,states)
        #output = self.relu2(output)
        #output = self.do(output)
        #out[:, -1, :]

        #print(x.shape)
        batch_size = x.shape[0]
        x = x.reshape(batch_size, -1)
        #print(x.shape)

        x = self.fc1(x)
        #x = self.fc2(x)
        #output = F.softmax(output, dim=1)

        return x



In [39]:
def train_model(tmodel, train_dataloader, n_epoch=5, lr=0.003, device=None):
    import torch.optim as optim
    
    device = device or torch.device('cpu')
    tmodel.train()

    loss_history = []

    # your code here
    optimizer = optim.Adam(tmodel.parameters(), lr=lr)
    loss_func = nn.CrossEntropyLoss()

    for epoch in range(n_epoch):
        curr_epoch_loss = []
        start = time.time()
        for X, Y in tqdm(train_dataloader,desc="Training..."):
            # your code here
            optimizer.zero_grad()

            y_hat = tmodel(X)

            loss = loss_func(y_hat, Y)
            
            loss.backward()
            optimizer.step()
            
            curr_epoch_loss.append(loss.cpu().data.numpy())
        end = time.time()
        print(f"epoch{epoch}: curr_epoch_loss={np.mean(curr_epoch_loss)},execution_time={str(datetime.timedelta(seconds = (end-start)))}")
        loss_history += curr_epoch_loss
    return tmodel, loss_history

def eval_model(emodel, dataloader, device=None):
    """
    :return:
        pred_all: prediction of model on the dataloder.
        Y_test: truth labels. Should be an numpy array of ints
    TODO:
        evaluate the model using on the data in the dataloder.
        Add all the prediction and truth to the corresponding list
        Convert pred_all and Y_test to numpy arrays 
    """
    device = device or torch.device('cpu')
    emodel.eval()
    pred_all = []
    Y_test = []
    for X, Y in tqdm(dataloader, desc="Evaluating..."):
        # your code here
        y_hat = emodel(X)
        
        pred_all.append(y_hat.detach().to('cpu'))
        Y_test.append(Y.detach().to('cpu'))
        
    pred_all = np.concatenate(pred_all, axis=0)
    Y_test = np.concatenate(Y_test, axis=0)

    return pred_all, Y_test

In [40]:
def evaluate_predictions(truth, pred):
    """
    TODO: Evaluate the performance of the predictoin via AUROC, and F1 score
    each prediction in pred is a vector representing [p_0, p_1].
    When defining the scores we are interesed in detecting class 1 only
    (Hint: use roc_auc_score and f1_score from sklearn.metrics, be sure to read their documentation)
    return: auroc, f1
    """
    from sklearn.metrics import roc_auc_score, f1_score

    # your code here
    auroc = roc_auc_score(truth, pred[:,1])
    f1 = f1_score(truth, np.argmax(pred,axis=1))
    f1_macro = f1_score(truth, np.argmax(pred,axis=1),average='macro')
    f1_micro = f1_score(truth, np.argmax(pred,axis=1),average='micro')

    return auroc, f1, f1_macro, f1_micro

Need to create a loop to train and evaluate

In [41]:
device = torch.device('cpu')
#device = 'cuda' if torch.cuda.is_available() else 'cpu'
#print(f'Using device: {device}')

n_epoch = 1
lr = 0.003
batch_size = 32
disease_input = 'Asthma'
dataformat = 'vector_tokenized'
#embedding_type = 'GloVe'
embedding_type = 'FastText'
max_tokens = 1430

if embedding_type == 'GloVe':
    train_loader = torch.utils.data.DataLoader(ClinicalNoteDataset(train_df, disease_input, dataformat), batch_size = batch_size, shuffle=True, collate_fn=vectorize_batch_GloVe)
    val_loader = torch.utils.data.DataLoader(ClinicalNoteDataset(test_df, disease_input, dataformat), batch_size = batch_size, shuffle=False, collate_fn=vectorize_batch_GloVe)
if embedding_type == 'FastText':
    train_loader = torch.utils.data.DataLoader(ClinicalNoteDataset(train_df, disease_input, dataformat), batch_size = batch_size, shuffle=True, collate_fn=vectorize_batch_FastText)
    val_loader = torch.utils.data.DataLoader(ClinicalNoteDataset(test_df, disease_input, dataformat), batch_size = batch_size, shuffle=False, collate_fn=vectorize_batch_FastText)


model = ClincalNoteEmbeddingNet(embedding_type, max_tokens = max_tokens)
model = model.to(device)

start_train = time.time()
model, loss_history = train_model(model, train_loader, n_epoch=n_epoch, lr=lr, device=device)
end_train = time.time()
start_eval = time.time()
pred, truth = eval_model(model, val_loader, device=device)
end_eval = time.time()

print(f"Train,Eval,Total execution_time={str(datetime.timedelta(seconds = (end_train-start_train)))},{str(datetime.timedelta(seconds = (end_eval-start_eval)))},{str(datetime.timedelta(seconds = (end_eval-start_train)))}")

  X[i] = torch.tensor(vectors).float()
  Y[i] = torch.tensor(y)
Training...: 100%|██████████| 21/21 [01:34<00:00,  4.51s/it]


epoch0: curr_epoch_loss=3.7445526123046875,execution_time=0:01:34.806865


Evaluating...:  44%|████▍     | 8/18 [00:27<00:34,  3.47s/it]

In [None]:
auroc, f1, f1_macro, f1_micro = evaluate_predictions(truth, pred)

print(f"AUROC={auroc} and F1={f1} and F1_macro={f1_macro} and F1_micro={f1_micro}")

pred

AUROC=0.5143474236346807 and F1=0.0 and F1_macro=0.45841584158415843 and F1_micro=0.8464351005484461


array([[ 2.5655508, -2.5954227],
       [ 1.0780275, -1.10735  ],
       [ 1.9738563, -1.9915907],
       ...,
       [ 2.1448095, -2.1703727],
       [ 0.5936565, -0.5840215],
       [ 1.7524635, -1.7765573]], dtype=float32)

**Deep Learning - Word Embeddings - All Features - With Stop Words**

![DL BagOfWords AllFeatures Averaged](images\dl-we-swyes.gif)