This notebook supports the data preparation for our CS 598 DLH project.  The paper we have chosen for the reproducibility project is:
***Ensembling Classical Machine Learning and Deep Learning Approaches for Morbidity Identification from Clinical Notes***



 

The data cannot be shared publicly because of the agreements required to obtain it, so we store the data locally and do not put it in GitHub.

In [60]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# Seed every random number generator used in this notebook for reproducibility.
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so this assignment
# affects child processes rather than the already-running kernel.
os.environ["PYTHONHASHSEED"] = str(seed)

# Directory holding the locally stored (non-public) pickled data.
DATA_PATH = './obesity_data/'

# os.path.join avoids the doubled separator ('./obesity_data//test.pkl') that
# plain string concatenation produced with the trailing slash in DATA_PATH.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
test_df = pd.read_pickle(os.path.join(DATA_PATH, 'test.pkl'))
train_df = pd.read_pickle(os.path.join(DATA_PATH, 'train.pkl'))
corpus = pd.read_pickle(os.path.join(DATA_PATH, 'corpus.pkl'))

In [61]:
#rebuild the vocabulary
import torchtext, torch, torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Tokenise every document in the corpus with torchtext's "basic_english" tokenizer.
tokenizer = get_tokenizer("basic_english")
tokens = list(map(tokenizer, corpus))

# Build the vocabulary from the tokenised corpus; '<pad>' is registered as a special token.
voc = build_vocab_from_iterator(tokens, specials=['<pad>'])

**Deep Learning - Bag of Words - All Feature Selections - Averaged**

![DL BagOfWords AllFeatures Averaged](images/DL-BagOfWords-ByFeatureSelection.gif)

**DL Model using word embeddings**

First we create a dataset.  Note that it takes the disease as an argument to `__init__` and keeps only the records for that disease.

In [73]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class ClinicalNoteDataset(Dataset):
    """Per-disease view of a clinical-note dataframe for use with a DataLoader.

    Only the rows whose ``disease`` column equals ``disease`` are kept. Each
    sample pairs the row's ``one_hot`` value (optionally mapped to pretrained
    word vectors) with its ``judgment`` label.

    Args:
        dataframe: frame providing ``disease``, ``judgment`` and ``one_hot`` columns.
        disease: value of the ``disease`` column to keep.
        embedding: ``'GloVe'`` or ``'FastText'`` to look up pretrained vectors;
            any other value returns ``one_hot`` converted to a tensor.
    """

    def __init__(self, dataframe, disease, embedding):
        self.disease = disease
        self.embedding = embedding
        self.df = dataframe[dataframe['disease'] == disease]
        # Load the pretrained vectors once per dataset. Instantiating them
        # inside __getitem__ rebuilt the whole embedding table for every sample.
        self.vectors = self._load_vectors(embedding)

    @staticmethod
    def _load_vectors(embedding):
        """Return the torchtext vectors for ``embedding``, or None if none is requested."""
        if embedding == 'GloVe':
            return torchtext.vocab.GloVe(name='6B', dim=300)
        if embedding == 'FastText':
            return torchtext.vocab.FastText()
        return None

    def __len__(self):
        """Return the number of rows kept for the selected disease."""
        return len(self.df)

    def __getitem__(self, i):
        """Return ``(X, Y)`` for the i-th row of the filtered frame.

        ``Y`` is the row's ``judgment`` as a tensor. ``X`` is either the
        ``one_hot`` value as a tensor, or the pretrained vectors of its tokens
        when an embedding was requested.
        """
        row = self.df.iloc[i]
        Y = torch.tensor(row['judgment'])
        X = row['one_hot']

        if self.vectors is None:
            X = torch.tensor(X)
        else:
            # The 'one_hot' entries are treated as vocabulary indices: map them
            # back to tokens via the notebook-level `voc`, then to vectors.
            X = self.vectors.get_vecs_by_tokens(voc.lookup_tokens(X))

        return (X, Y)
        



In [74]:
# Smoke test: wrap the 'Asthma' / GloVe datasets in loaders and inspect one batch.
batch_size = 128

train_dataset = ClinicalNoteDataset(train_df, 'Asthma', 'GloVe')
# The validation loader is built from the held-out test frame.
val_dataset = ClinicalNoteDataset(test_df, 'Asthma', 'GloVe')

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
)

print("# of train batches:", len(train_loader))
print("# of val batches:", len(val_loader))

# Pull a single training batch to confirm the tensor shapes.
train_iter = iter(train_loader)
x, y = next(train_iter)

print(x.shape)
print(y.shape)


# of train batches: 6
# of val batches: 5


  Y = torch.tensor(self.df.iloc[i]['judgment'])


torch.Size([128, 1430, 300])
torch.Size([128])


In [None]:
class ClincalNoteNet(nn.Module):
    """Small feed-forward binary classifier.

    Layers: Linear(input_dim, 64) -> ReLU -> Linear(64, 32) -> ReLU ->
    Dropout -> Linear(32, 1) -> Sigmoid.

    Args:
        input_dim: number of input features per sample. Defaults to 1473,
            the width that was previously hard-coded.
        dropout: dropout probability applied before the output layer.
    """

    def __init__(self, input_dim=1473, dropout=0.5):
        super().__init__()

        # The layers are kept as attributes as well as inside `self.net`,
        # matching the original module layout.
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.dropout = nn.Dropout(dropout)
        self.fc3 = nn.Linear(32, 1)

        self.net = nn.Sequential(self.fc1,
                                 nn.ReLU(),
                                 self.fc2,
                                 nn.ReLU(),
                                 self.dropout,
                                 self.fc3,
                                 nn.Sigmoid())

    def forward(self, x):
        """Run ``x`` (last dimension ``input_dim``) through the network.

        Returns a tensor whose last dimension is 1, squashed by a sigmoid.
        """
        return self.net(x)

# initialize the NN
model = ClincalNoteNet()
print(model)

In [None]:
def train_model(model, train_dataloader, n_epoch=5, lr=0.003, device=None):
    """
    Train a binary classifier with Adam and binary cross-entropy.

    The model is expected to emit one probability per sample (for example via a
    final Sigmoid layer) and the loader to yield ``(X, Y)`` batches.

    :param model: the network to train; it must already live on ``device``
    :param train_dataloader: the DataLoader of the training data, yielding (X, Y)
    :param n_epoch: number of epochs to train
    :param lr: learning rate passed to Adam
    :param device: device each batch is moved to; defaults to the CPU
    :return:
        model: trained model
        loss_history: recorded training loss history - a list of float, one per batch
    """
    import torch.optim as optim

    device = device or torch.device('cpu')
    model.train()

    loss_history = []

    optimizer = optim.Adam(model.parameters(), lr=lr)
    # BCELoss matches a single sigmoid output; CrossEntropyLoss expects
    # per-class scores and does not fit a one-unit output layer.
    loss_func = nn.BCELoss()

    for epoch in range(n_epoch):
        curr_epoch_loss = []
        for X, Y in train_dataloader:
            # Linear layers and BCELoss both need floating-point tensors.
            X = X.to(device=device, dtype=torch.float32)
            Y = Y.to(device=device, dtype=torch.float32).reshape(-1)

            optimizer.zero_grad()
            # Flatten the model output so it lines up with the flattened labels.
            y_hat = model(X).reshape(-1)

            loss = loss_func(y_hat, Y)

            loss.backward()
            optimizer.step()

            curr_epoch_loss.append(loss.item())
        print(f"epoch{epoch}: curr_epoch_loss={np.mean(curr_epoch_loss)}")
        loss_history += curr_epoch_loss
    return model, loss_history



**TODO:** create a loop to train and evaluate the model.

In [None]:
# Training hyper-parameters.
device = torch.device('cpu')
n_epoch = 4
lr = 0.003
# NOTE(review): n_channel, n_dim and T are not consumed by ClincalNoteNet;
# they are kept only in case another cell reads them - confirm and remove.
n_channel = 4
n_dim=3000
T=50

# FreqNet is not defined in the visible notebook; the network defined in this
# notebook is ClincalNoteNet.
# NOTE(review): the recorded train_loader batch has shape (128, 1430, 300),
# which does not match ClincalNoteNet's 1473 input features - confirm the
# intended loader/model pairing before running.
model = ClincalNoteNet()
model = model.to(device)

model, loss_history = train_model(model, train_loader, n_epoch=n_epoch, lr=lr, device=device)
# val_loader (built from test_df) is the only evaluation loader defined in the
# visible notebook; the previous name `test_loader` is never created.
# NOTE(review): eval_model is not defined in the visible cells - confirm it exists.
pred, truth = eval_model(model, val_loader, device=device)

In [None]:
def evaluate_predictions(truth, pred):
    """
    Evaluate binary predictions via AUROC and F1 score for the positive class (class 1).

    :param truth: ground-truth labels
    :param pred: predicted probabilities, either
        * shape (N, k) with k > 1: rows of per-class scores such as [p_0, p_1];
          column 1 is used for AUROC and the argmax for F1, or
        * shape (N,) or (N, 1): the probability of class 1 for each sample.
    :return: auroc, f1
    """
    from sklearn.metrics import roc_auc_score, f1_score

    pred = np.asarray(pred)
    if pred.ndim == 2 and pred.shape[1] > 1:
        scores = pred[:, 1]
        labels = np.argmax(pred, axis=1)
    else:
        scores = pred.reshape(-1)
        # Strict '>' mirrors argmax, which resolves a [0.5, 0.5] tie to class 0.
        labels = (scores > 0.5).astype(int)

    auroc = roc_auc_score(truth, scores)
    f1 = f1_score(truth, labels)

    return auroc, f1

In [None]:
# Evaluate on val_loader (built from test_df): it is the only evaluation loader
# defined in the visible notebook; the previous name `test_loader` is never created.
# NOTE(review): eval_model is not defined in the visible cells - confirm it exists.
pred, truth = eval_model(model, val_loader, device=device)
auroc, f1 = evaluate_predictions(truth, pred)
print(f"AUROC={auroc} and F1={f1}")

# Sanity threshold: fail loudly when the scores are implausibly low.
assert auroc > 0.8 and f1 > 0.7, "Performance is too low {}. Something's probably off.".format((auroc, f1))

**Deep Learning - Word Embeddings - All Features - Averaged with Stop Words**

![DL WordEmbeddings AllFeatures Averaged with Stop Words](images/dl-we-swyes.gif)