This notebook was created to support the data preparation required to support our CS 598 DLH project.  The paper we have chosen for the reproducibility project is:
***Ensembling Classical Machine Learning and Deep Learning Approaches for Morbidity Identification from Clinical Notes ***



 

The data cannot be shared publicly due to the agreements required to obtain the data so we are storing the data locally and not putting in GitHub.

In [1]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
# define data path
DATA_PATH = './obesity_data/'

test_df = pd.read_pickle(DATA_PATH + '/test.pkl') 
train_df = pd.read_pickle(DATA_PATH + '/train.pkl') 
corpus = pd.read_pickle(DATA_PATH + '/corpus.pkl')

In [2]:
#rebuild the vocabulary
import torchtext, torch, torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")
tokens = [tokenizer(doc) for doc in corpus]

voc = build_vocab_from_iterator(tokens, specials = ['<pad>'])

**Deep Learning - Bag of Words - All Feature Selections - Averaged**

![DL BagOfWords AllFeatures Averaged](images\DL-BagOfWords-ByFeatureSelection.gif)

****DL Model using word embeddings****

First we start by creating a dataset.  Note this will have to take the disease as part of the init and filter just for those records.

In [65]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class ClinicalNoteDataset(Dataset):

    def __init__(self, dataframe, disease, embedding):
        """
        TODO: init the Dataset instance.
        """
        # your code here
        self.disease = disease
        self.embedding = embedding
        self.df = dataframe[dataframe['disease'] == disease]

    def __len__(self):
        """
        TODO: Denotes the total number of samples
        """
        return len(self.df)

    def __getitem__(self, i):
        """
        TODO: Generates one sample of data
            return X, y for the i-th data.
        """
        Y = torch.tensor(self.df.iloc[i]['judgment']).long()

        X = self.df.iloc[i]['one_hot']

        if self.embedding == 'GloVe':
            vec = torchtext.vocab.GloVe(name='6B', dim=300)
            X = vec.get_vecs_by_tokens(voc.lookup_tokens(X))
        elif self.embedding == 'FastText':
            vec = torchtext.vocab.FastText()
            X = vec.get_vecs_by_tokens(voc.lookup_tokens(X))
        else:
            X = torch.tensor(X)
            
        return X,Y
        



In [61]:
##Test DataLoader
batch_size = 128
train_loader = torch.utils.data.DataLoader(ClinicalNoteDataset(train_df, 'Asthma', 'GloVe'), batch_size = batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(ClinicalNoteDataset(test_df, 'Asthma', 'GloVe'), batch_size = batch_size, shuffle=False)

print("# of train batches:", len(train_loader))
print("# of val batches:", len(val_loader))

train_iter = iter(train_loader)
x,y = next(train_iter)

print(x.shape)
print(y.shape)


# of train batches: 6
# of val batches: 5


  Y = torch.tensor(self.df.iloc[i]['judgment']).int()


torch.Size([128, 1430, 300])
torch.Size([128])


In [62]:
class ClincalNoteEmbeddingNet(nn.Module):
    def __init__(self, embedding_type, max_tokens):
        super(ClincalNoteEmbeddingNet, self).__init__()
        
        self.embedding_type = embedding_type
        self.max_tokens = max_tokens

        if(embedding_type == 'USE'):
            self.embedding_dimension = 512
        else:
            self.embedding_dimension = 300

        #Because it is bidirectional, the output from LTSM is coming in twice the size of the hidden states required.
        #input is (batch, #of tokens * embedding_dimension)
        self.bilstm1 = nn.LSTM(self.embedding_dimension * self.max_tokens,64, bidirectional = True, batch_first = True) 
        self.bilstm2 = nn.LSTM(64*2, 64, bidirectional = True, batch_first = True)
        self.fc1 = nn.Linear(64*2,2)
        

    def forward(self, x):
        batch_size = x.shape[0]

        #reshape as it is a 2 dimensional embedding
        x = x.reshape(batch_size, -1)

        (output,states) = self.bilstm1(x)
        (output,states) = self.bilstm2(output)
        output = self.fc1(output)
        output = F.softmax(output, dim=1)

        return output



In [63]:
def train_model(model, train_dataloader, n_epoch=5, lr=0.003, device=None):
    import torch.optim as optim
    
    device = device or torch.device('cpu')
    model.train()

    loss_history = []

    # your code here
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_func = nn.CrossEntropyLoss()

    for epoch in range(n_epoch):
        curr_epoch_loss = []
        for X, Y in train_dataloader:
            # your code here
            optimizer.zero_grad()

            y_hat = model(X)

            loss = loss_func(y_hat, Y)
            
            loss.backward()
            optimizer.step()
            
            curr_epoch_loss.append(loss.cpu().data.numpy())
        print(f"epoch{epoch}: curr_epoch_loss={np.mean(curr_epoch_loss)}")
        loss_history += curr_epoch_loss
    return model, loss_history

def eval_model(model, dataloader, device=None):
    """
    :return:
        pred_all: prediction of model on the dataloder.
        Y_test: truth labels. Should be an numpy array of ints
    TODO:
        evaluate the model using on the data in the dataloder.
        Add all the prediction and truth to the corresponding list
        Convert pred_all and Y_test to numpy arrays 
    """
    device = device or torch.device('cpu')
    model.eval()
    pred_all = []
    Y_test = []
    for X, Y in dataloader:
        # your code here
        y_hat = model(X)
        
        pred_all.append(y_hat.detach().to('cpu'))
        Y_test.append(Y.detach().to('cpu'))
        
    pred_all = np.concatenate(pred_all, axis=0)
    Y_test = np.concatenate(Y_test, axis=0)

    return pred_all, Y_test

Need to create a loop to train and evaluate

In [70]:
device = torch.device('cpu')
n_epoch = 20
lr = 0.003
batch_size = 32
disease_input = 'Asthma'
embedding_input = 'GloVe'
max_tokens = 1430

train_loader = torch.utils.data.DataLoader(ClinicalNoteDataset(train_df, disease_input, embedding_input), batch_size = batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(ClinicalNoteDataset(test_df, disease_input, embedding_input), batch_size = batch_size, shuffle=False)


model =ClincalNoteEmbeddingNet('GloVe', max_tokens = max_tokens)
model = model.to(device)


model, loss_history = train_model(model, train_loader, n_epoch=n_epoch, lr=lr, device=device)
pred, truth = eval_model(model, val_loader, device=device)

  Y = torch.tensor(self.df.iloc[i]['judgment']).long()


torch.Size([32, 2])
torch.Size([32])
torch.Size([32, 2])
torch.Size([32])


In [None]:
def evaluate_predictions(truth, pred):
    """
    TODO: Evaluate the performance of the predictoin via AUROC, and F1 score
    each prediction in pred is a vector representing [p_0, p_1].
    When defining the scores we are interesed in detecting class 1 only
    (Hint: use roc_auc_score and f1_score from sklearn.metrics, be sure to read their documentation)
    return: auroc, f1
    """
    from sklearn.metrics import roc_auc_score, f1_score

    # your code here
    auroc = roc_auc_score(truth, pred[:,1])
    f1 = f1_score(truth, np.argmax(pred,axis=1))
    f1_macro = f1_score(truth, np.argmax(pred,axis=1),average='macro')
    f1_micro = f1_score(truth, np.argmax(pred,axis=1),average='micro')

    return auroc, f1, f1_macro, f1_micro

In [None]:
pred, truth = eval_model(model, val_loader, device=device)
auroc, f1, f1_macro, f1_micro = evaluate_predictions(truth, pred)
print(f"AUROC={auroc} and F1={f1} and F1_macro={f1_macro} and F1_micro={f1_micro}")



  Y = torch.tensor(self.df.iloc[i]['judgment']).long()


AUROC=0.5616450683945284 and F1=0.0 and F1_macro=0.45841584158415843 and F1_micro=0.8464351005484461


**Deep Learning - Word Embeddings - All Features - Averaged with Stop Words**

![DL BagOfWords AllFeatures Averaged](images\dl-we-swyes.gif)