In [5]:

import matplotlib.pyplot as plt
from config import Config
from patient_data_reader import PatientReader
import os
import time
import numpy as np
import pandas as pd


import torch

In [6]:
# Number of units in the hidden (recurrent) layer
N_HIDDEN = 200
# Number of training sequences in each batch


# All gradients above this will be clipped
GRAD_CLIP = 100
# How often should we check the output?
EPOCH_SIZE = 100
# Number of epochs to train the net
num_epochs = 6

MAX_LENGTH = 300

In [7]:
def prepare_data(seqs, labels, vocabsize, maxlen=None):
    """Create the matrices from the datasets.

    This pad each sequence to the same lenght: the lenght of the
    longuest sequence or maxlen.

    if maxlen is set, we will cut all sequence to this maximum
    lenght.

    This swap the axis!
    """
    # x: a list of sentences
    lengths = [len(s) for s in seqs]

    eventSeq = []

    for seq in seqs:
        t = []
        for visit in seq:
            t.extend(visit)
        eventSeq.append(t)
    eventLengths = [len(s) for s in eventSeq]

    if maxlen is not None:
        new_seqs = []
        new_lengths = []
        new_labels = []
        for l, s, la in zip(lengths, seqs, labels):
            if l < maxlen:
                new_seqs.append(s)
                new_lengths.append(l)
                new_labels.append(la)
            else:
                new_seqs.append(s[:maxlen])
                new_lengths.append(maxlen)
                new_labels.append(la[:maxlen])
        lengths = new_lengths
        seqs = new_seqs
        labels = new_labels

        if len(lengths) < 1:
            return None, None, None

    n_samples = len(seqs)
    maxlen = np.max(lengths)

    x = np.zeros((n_samples, maxlen, vocabsize)).astype('int64')
    x_mask = np.zeros((n_samples, maxlen)).astype(float)
    y = np.ones((n_samples, maxlen)).astype(float)
    for idx, s in enumerate(seqs):
        x_mask[idx, :lengths[idx]] = 1
        for j, sj in enumerate(s):
            for tsj in sj:
                x[idx, j, tsj - 1] = 1
    for idx, t in enumerate(labels):
        y[idx, :lengths[idx]] = t
        # if lengths[idx] < maxlen:
        #     y[idx,lengths[idx]:] = t[-1]

    return x, x_mask, y, lengths, eventLengths

In [None]:
X_raw_data

In [8]:
FLAGS = Config()
data_sets = PatientReader(FLAGS)

X_raw_data, Y_raw_data = data_sets.get_data_from_type("train")
trainingAdmiSeqs, trainingMask, trainingLabels, trainingLengths, ltr = prepare_data(X_raw_data, Y_raw_data,
                                                                                    vocabsize=619,
                                                                                    maxlen=MAX_LENGTH)
Num_Samples, MAX_LENGTH, N_VOCAB = trainingAdmiSeqs.shape

X_valid_data, Y_valid_data = data_sets.get_data_from_type("valid")
validAdmiSeqs, validMask, validLabels, validLengths, lval = prepare_data(X_valid_data, Y_valid_data, vocabsize=619,
                                                                         maxlen=MAX_LENGTH)

X_test_data, Y_test_data = data_sets.get_data_from_type("test")
test_admiSeqs, test_mask, test_labels, testLengths, ltes = prepare_data(X_test_data, Y_test_data, vocabsize=619,
                                                                        maxlen=MAX_LENGTH)
alllength = sum(trainingLengths) + sum(validLengths) + sum(testLengths)
print(alllength)
eventNum = sum(ltr) + sum(lval) + sum(ltes)
print(eventNum)

 [*] load resource\vocab.pkl
 [*] load resource/X_train.pkl
 [*] load resource/Y_train.pkl
 [*] load resource/X_valid.pkl
 [*] load resource/Y_valid.pkl
 [*] load resource/X_test.pkl
 [*] load resource/Y_test.pkl
vocabulary size: 619
number of training documents: 2000
number of validation documents: 500
number of testing documents: 500
239887
685482


In [114]:
trainingLabels[0]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [11]:
N_VOCAB

619

In [19]:
trainingMask[0]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [14]:
Y_raw_data[0][0]

0

In [37]:
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [45]:
X_raw_data, Y_raw_data = data_sets.get_data_from_type("train")

class CustomDataset(Dataset):
    def __init__(self, seqs, hfs):
        self.x = seqs
        self.y = hfs
    def __len__(self):
        return len(self.x)
    def __getitem__(self, index):
        return (self.x[index], self.y[index])
    
dataset = CustomDataset(X_raw_data, Y_raw_data)


In [93]:
def collate_fn(data):
    """
    TODO: Collate the the list of samples into batches. For each patient, you need to pad the diagnosis
        sequences to the sample shape (max # visits, max # diagnosis codes). The padding infomation
        is stored in `mask`.
    
    Arguments:
        data: a list of samples fetched from `CustomDataset`
        
    Outputs:
        x: a tensor of shape (# patiens, max # visits, max # diagnosis codes) of type torch.long
        masks: a tensor of shape (# patiens, max # visits, max # diagnosis codes) of type torch.bool
        rev_x: same as x but in reversed time. This will be used in our RNN model for masking 
        rev_masks: same as mask but in reversed time. This will be used in our RNN model for masking
        y: a tensor of shape (# patiens) of type torch.float
        
    Note that you can obtains the list of diagnosis codes and the list of hf labels
        using: `sequences, labels = zip(*data)`
    """

    sequences, labels = zip(*data)

    x, x_mask, y, lengths, eventLengths = prepare_data(seqs=sequences, labels=labels, vocabsize=619, maxlen=None)
    
    x = torch.from_numpy(x)
    x_mask = torch.from_numpy(x_mask)
    y = torch.from_numpy(y)
    return x, x_mask, y, lengths, eventLengths
    
#     y = torch.tensor(labels, dtype=torch.float)
    
#     num_patients = len(sequences)
#     num_visits = [len(patient) for patient in sequences]
#     num_codes = [len(visit) for patient in sequences for visit in patient]

#     max_num_visits = max(num_visits)
#     max_num_codes = max(num_codes)
    
#     x = torch.zeros((num_patients, max_num_visits, max_num_codes), dtype=torch.long)
#     rev_x = torch.zeros((num_patients, max_num_visits, max_num_codes), dtype=torch.long)
#     masks = torch.zeros((num_patients, max_num_visits, max_num_codes), dtype=torch.bool)
#     rev_masks = torch.zeros((num_patients, max_num_visits, max_num_codes), dtype=torch.bool)
#     for i_patient, patient in enumerate(sequences):
#         v = len(patient)
#         for j_visit, visit in enumerate(patient):
#             """
#             TODO: update `x`, `rev_x`, `masks`, and `rev_masks`
#             """
#             # your code here
# #             raise NotImplementedError
#             l = len(visit)
#             x[i_patient, j_visit, :l] = torch.tensor(visit, dtype=torch.long)
#             masks[i_patient, j_visit, :l].fill_(1)
#             # print(v-j_visit)
#             rev_x[i_patient, v-j_visit-1, :l] = torch.tensor(visit, dtype=torch.long)
#             rev_masks[i_patient, v-j_visit-1, :l].fill_(1)    
    
#     return x, masks, rev_x, rev_masks, y

In [94]:
from torch.utils.data.dataset import random_split

split = int(len(dataset)*0.8)

lengths = [split, len(dataset) - split]
train_dataset, val_dataset = random_split(dataset, lengths)

print("Length of train dataset:", len(train_dataset))
print("Length of val dataset:", len(val_dataset))

Length of train dataset: 1600
Length of val dataset: 400


In [115]:
from torch.utils.data import DataLoader

def load_data(train_dataset, val_dataset, collate_fn):
    
    '''
    TODO: Implement this function to return the data loader for  train and validation dataset. 
    Set batchsize to 32. Set `shuffle=True` only for train dataloader.
    
    Arguments:
        train dataset: train dataset of type `CustomDataset`
        val dataset: validation dataset of type `CustomDataset`
        collate_fn: collate function
        
    Outputs:
        train_loader, val_loader: train and validation dataloaders
    
    Note that you need to pass the collate function to the data loader `collate_fn()`.
    '''
    
    batch_size=32
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    
    return train_loader, val_loader

train_loader, val_loader = load_data(train_dataset, val_dataset, collate_fn)

In [116]:
   
# loader_iter = iter(train_loader)
x, x_mask, y, lengths, eventLengths = next(loader_iter)

In [117]:
y[0]

array([1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [97]:
x.shape

(32, 177, 619)

In [98]:
x_mask.shape

(32, 177)

In [99]:
y.shape

(32, 177)

In [100]:
batch_size=32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [101]:
def get_last_visit(hidden_states, masks):
    """
    TODO: obtain the hidden state for the last true visit (not padding visits)

    Arguments:
        hidden_states: the hidden states of each visit of shape (batch_size, # visits, embedding_dim)
        masks: the padding masks of shape (batch_size, # visits, # diagnosis codes)

    Outputs:
        last_hidden_state: the hidden state for the last true visit of shape (batch_size, embedding_dim)
        
    NOTE: DO NOT use for loop.
    
    HINT: First convert the mask to a vector of shape (batch_size,) containing the true visit length; 
          and then use this length vector as index to select the last visit.
    """
    # your code here
    # raise NotImplementedError
    sum_masks = masks.sum(axis = 2)
    lens = ((sum_masks > 0).sum(axis = 1) - 1) 
    lens = lens.unsqueeze(-1)
    indices = lens.repeat(1, hidden_states.shape[2])
    indices = indices.unsqueeze(1)
    # print(hidden_states.shape, masks.shape, sum_masks.shape, lens.shape, indices.shape)

    last_visit = torch.gather(hidden_states, 1, indices)
    last_visit = last_visit.squeeze(1)
    
    return last_visit

In [102]:
# input = torch.randn(batch_size, sequence_length, input_size)

input_size = 619

class GRUModel(nn.Module):
    def __init__(self):
        super(GRUModel, self).__init__()

        self.gru = nn.GRU(input_size=619, hidden_size=256, batch_first=True)
        self.fc = self.fc = nn.Linear(in_features= 256, out_features=1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, masks):
        
        # 3. Pass the embegginds through the RNN layer;
        output, h_n = self.gru(x)
        # 4. Obtain the hidden state at the last visit.
        true_h_n = get_last_visit(output, masks)
        
        output, hidden = self.gru(input, hidden)
        logits = self.fc(true_h_n)        
        probs = self.sigmoid(logits)
        return probs.view(batch_size)

In [103]:
naive_rnn = GRUModel()
naive_rnn

GRUModel(
  (gru): GRU(619, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [104]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(naive_rnn.parameters(), lr=0.001)

In [105]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score


def eval_model(model, val_loader):
    
    """
    TODO: evaluate the model.
    
    Arguments:
        model: the RNN model
        val_loader: validation dataloader
        
    Outputs:
        precision: overall precision score
        recall: overall recall score
        f1: overall f1 score
        roc_auc: overall roc_auc score
        
    Note that please pass all four arguments to the model so that we can use this function for both 
    models. (Use `model(x, masks, rev_x, rev_masks)`.)
        
    HINT: checkout https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
    """
    
    model.eval()
    y_pred = torch.LongTensor()
    y_score = torch.Tensor()
    y_true = torch.LongTensor()
    model.eval()
    for x, x_mask, y, lengths, eventLengths in val_loader:
        y_hat = model(x, x_mask)
        y_score = torch.cat((y_score,  y_hat.detach().to('cpu')), dim=0)
        y_hat = (y_hat > 0.5).int()
        y_pred = torch.cat((y_pred,  y_hat.detach().to('cpu')), dim=0)
        y_true = torch.cat((y_true, y.detach().to('cpu')), dim=0)
    """
    TODO:
        Calculate precision, recall, f1, and roc auc scores.
        Use `average='binary'` for calculating precision, recall, and fscore.
    """
    p, r, f, _ = precision_recall_fscore_support(y_true, np.array(y_pred)>0.5, average='binary')
    roc_auc = roc_auc_score(y_true, y_score)
#     p, r, f, roc_auc = None, None, None, None
    # your code here
#     raise NotImplementedError
    return p, r, f, roc_auc

In [106]:
def train(model, train_loader, val_loader, n_epochs):
    """
    TODO: train the model.
    
    Arguments:
        model: the RNN model
        train_loader: training dataloder
        val_loader: validation dataloader
        n_epochs: total number of epochs
        
    You need to call `eval_model()` at the end of each training epoch to see how well the model performs 
    on validation data.
        
    Note that please pass all four arguments to the model so that we can use this function for both 
    models. (Use `model(x, masks, rev_x, rev_masks)`.)
    """
    
    for epoch in range(n_epochs):
        model.train()
        train_loss = 0
        for x, x_mask, y, lengths, eventLengths in train_loader:
            """
            TODO:
                1. zero grad
                2. model forward
                3. calculate loss
                4. loss backward
                5. optimizer step
            """
#             loss = None
            outputs = model(x, x_mask)
            loss = criterion(outputs, y) 
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
#             # your code here
#             raise NotImplementedError
            train_loss += loss.item()
        train_loss = train_loss / len(train_loader)
        print('Epoch: {} \t Training Loss: {:.6f}'.format(epoch+1, train_loss))
        p, r, f, roc_auc = eval_model(model, val_loader)
        print('Epoch: {} \t Validation p: {:.2f}, r:{:.2f}, f: {:.2f}, roc_auc: {:.2f}'
              .format(epoch+1, p, r, f, roc_auc))

In [107]:
n_epochs = 5
train(naive_rnn, train_loader, val_loader, n_epochs)

RuntimeError: expected scalar type Long but found Float

In [None]:
class RecognitionNet(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.fc1 = nn.Linear(1473, 64)
        
        l_in = lasagne.layers.InputLayer(shape=(N_BATCH, MAX_LENGTH, N_VOCAB))
        l_in = nn.Sequential(
            nn.Linear(in_features=N_VOCAB, out_features=N_VOCAB),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=N_VOCAB, out_features=MAX_LENGTH * N_VOCAB),
            nn.Reshape(N_BATCH, MAX_LENGTH, N_VOCAB)
        )