In [10]:
import os
import sys
import pickle
import psutil
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

# Define data path
DATA_PATH = "data"

In [11]:
pids = pickle.load(open(os.path.join(DATA_PATH,'pids.pkl'), 'rb'))
vids = pickle.load(open(os.path.join(DATA_PATH,'vids.pkl'), 'rb'))
targets = pickle.load(open(os.path.join(DATA_PATH,'targets.pkl'), 'rb'))
prob_targets = pickle.load(open(os.path.join(DATA_PATH,'prob_targets.pkl'), 'rb'))
prob_targets_allvisits = pickle.load(open(os.path.join(DATA_PATH,'prob_targets_allvisits.pkl'), 'rb'))
seqs = pickle.load(open(os.path.join(DATA_PATH,'seqs.pkl'), 'rb'))
diags = pickle.load(open(os.path.join(DATA_PATH,'diags.pkl'), 'rb'))
categories = pickle.load(open(os.path.join(DATA_PATH,'categories.pkl'), 'rb'))
sub_categories = pickle.load(open(os.path.join(DATA_PATH,'subcategories.pkl'), 'rb'))
codes = pickle.load(open(os.path.join(DATA_PATH,'icd9.pkl'), 'rb'))
assert len(pids) == len(vids) == len(targets) == len(seqs)

In [12]:
from torch.utils.data import Dataset


class CustomDataset(Dataset):
    
    def __init__(self, seqs, targets):
        
        """
        TODO: Store `seqs`. to `self.x` and `hfs` to `self.y`.
        
        Note that you DO NOT need to covert them to tensor as we will do this later.
        Do NOT permute the data.
        """
        self.x = seqs
        self.y = targets
    
    def __len__(self):
        
        """
        TODO: Return the number of samples (i.e. patients).
        """
        
        return(len(self.x))
    
    def __getitem__(self, index):
        
        """
        TODO: Generates one sample of data.
        
        Note that you DO NOT need to covert them to tensor as we will do this later.
        """
        return (self.x[index], self.y[index])

In [13]:
dataset = CustomDataset(seqs, prob_targets_allvisits)

In [14]:
def collate_fn(data):
    """
    TODO: Collate the the list of samples into batches. For each patient, you need to pad the diagnosis
        sequences to the sample shape (max # visits, max # diagnosis codes). The padding infomation
        is stored in `mask`.
    
    Arguments:
        data: a list of samples fetched from `CustomDataset`
        
    Outputs:
        x: a tensor of shape (# patiens, max # visits, max # diagnosis codes) of type torch.long
        masks: a tensor of shape (# patiens, max # visits, max # diagnosis codes) of type torch.bool
        rev_x: same as x but in reversed time. This will be used in our RNN model for masking 
        rev_masks: same as mask but in reversed time. This will be used in our RNN model for masking
        y: a tensor of shape (# patiens) of type torch.float
        
    Note that you can obtains the list of diagnosis codes and the list of hf labels
        using: `sequences, labels = zip(*data)`
    """
    sequences, targets = zip(*data)

#     y = torch.tensor(targets, dtype=torch.float)
    #import pdb; pdb.set_trace()
    num_patients = len(sequences)
    num_visits = [len(patient) for patient in sequences]
    num_codes = [len(visit) for patient in sequences for visit in patient]
    num_categories = len(targets[0][0])

    max_num_visits = max(num_visits)
    max_num_codes = max(num_codes)
    
    sum_visits = sum(num_visits)
    
    x = torch.zeros((sum_visits - num_patients, max_num_codes), dtype=torch.int)
    y = torch.zeros((sum_visits - num_patients, num_categories), dtype=torch.float32)
    x_masks = torch.zeros((sum_visits - num_patients, max_num_codes), dtype=torch.bool)

#     for i_patient, patient in enumerate(sequences):   
#         for j_visit, visit in enumerate(patient):
#             """
#             TODO: update `x`, `rev_x`, `masks`, and `rev_masks`
#             """ 
#             x[i_patient, j_visit] = torch.Tensor(visit)
#             #x_masks[i_patient, j_visit] = torch.Tensor(np.ones(num_codes, dtype=int))
#             x_masks[i_patient, j_visit] = 1
#     import pdb; pdb.set_trace()
    n = 0
    for i,patient in enumerate(sequences):
        for j,visit in enumerate(patient):
            if j == len(patient) - 1:
                break
            for k,code in enumerate(visit):
                x[n,k] = code
                x_masks[n,k] = 1
            n+=1
    n = 0
    for i,patient in enumerate(targets):
        for j,visit in enumerate(patient):
            if j == len(patient) - 1:
                break
            y[n] = torch.tensor(patient[j+1])
            n += 1
    
    
    return x, x_masks, y

In [15]:
train_split = int(len(dataset)*0.75)
test_split = int(len(dataset)*0.15)
val_split = int(len(dataset)*0.10)

In [16]:
from torch.utils.data.dataset import random_split

train_split = int(len(dataset)*0.75)
test_split = int(len(dataset)*0.15)

lengths = [train_split, test_split, len(dataset) - (train_split + test_split)]
train_dataset, test_dataset, val_dataset = random_split(dataset, lengths)

print("Length of train dataset:", len(train_dataset))
print("Length of test dataset:", len(test_dataset))
print("Length of val dataset:", len(val_dataset))

Length of train dataset: 6561
Length of test dataset: 1312
Length of val dataset: 875


In [17]:
from torch.utils.data import DataLoader

def load_data(train_dataset, test_dataset, val_dataset, collate_fn):
    
    '''
    TODO: Implement this function to return the data loader for  train and validation dataset. 
    Set batchsize to 32. Set `shuffle=True` only for train dataloader.
    
    Arguments:
        train dataset: train dataset of type `CustomDataset`
        val dataset: validation dataset of type `CustomDataset`
        collate_fn: collate function
        
    Outputs:
        train_loader, val_loader: train and validation dataloaders
    
    Note that you need to pass the collate function to the data loader `collate_fn()`.
    '''
    
    batch_size = 100
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               collate_fn=collate_fn,
                                               shuffle=False)
    test_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           collate_fn=collate_fn,
                                           shuffle=False)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             collate_fn=collate_fn,
                                             shuffle=False)
    
    return train_loader, test_loader, val_loader


train_loader, test_loader, val_loader = load_data(train_dataset, test_dataset, val_dataset, collate_fn)

In [18]:
def sum_embeddings_with_mask(x, masks):
    """
    Mask select the embeddings for true visits (not padding visits) and then sum the embeddings for each visit up.

    Arguments:
        x: the embeddings of diagnosis sequence of shape (batch_size, # visits, # diagnosis codes, embedding_dim)
        masks: the padding masks of shape (batch_size, # visits, # diagnosis codes)

    Outputs:
        sum_embeddings: the sum of embeddings of shape (batch_size, # visits, embedding_dim)
    """
    
    x = x * masks.unsqueeze(-1)
    x = torch.sum(x, dim = -2)
    return x

In [19]:
def indices_to_multihot(indices, masks, dim):
    #import pdb; pdb.set_trace()
    #indices = indices[masks.any(dim=1)]
    multihot = torch.zeros((indices.shape[0], dim), dtype=torch.int)
    for idx, row in enumerate(indices):
        y_idx = row[masks[idx]].unique()
        multihot[idx] = F.one_hot(y_idx.to(torch.int64), multihot.shape[1]).sum(0)
    return multihot

In [20]:
class BaselineMLP(nn.Module):
    
    """
    TODO: implement the naive RNN model above.
    """
    
    def __init__(self, num_codes, num_categories):
        super().__init__()
        """
        TODO: 
            1. Define the embedding layer using `nn.Embedding`. Set `embDimSize` to 128.
            2. Define the RNN using `nn.GRU()`; Set `hidden_size` to 128. Set `batch_first` to True.
            2. Define the RNN for the reverse direction using `nn.GRU()`;
               Set `hidden_size` to 128. Set `batch_first` to True.
            3. Define the linear layers using `nn.Linear()`; Set `in_features` to 256, and `out_features` to 1.
            4. Define the final activation layer using `nn.Sigmoid().

        Arguments:
            num_codes: total number of diagnosis codes
        """
        self.padding_idx = 0
        self.embedding = nn.Embedding(num_codes, embedding_dim=128)
        self.fc = nn.Linear(128, num_categories)
        self.softmax = nn.Softmax(dim=-1)
        
    
    def forward(self, x, masks):
        """
        Arguments:
            x: the diagnosis sequence of shape (batch_size, # visits, # diagnosis codes)
            masks: the padding masks of shape (batch_size, # visits, # diagnosis codes)

        Outputs:
            probs: probabilities of shape (batch_size)
        """
        #import pdb; pdb.set_trace()
#         num_codes = self.embedding.weight.shape[0]
#         x = indices_to_multihot(x, masks, num_codes)
#         x[~masks] = self.padding_idx
        x = self.embedding(x)
        x = sum_embeddings_with_mask(x, masks)
        logits = self.fc(x)
#         logits = logits.mean(dim=1)
        probs = self.softmax(logits)
        return logits
    

# load the model here
baseline_mlp = BaselineMLP(num_codes = len(codes), num_categories=len(sub_categories))
baseline_mlp

BaselineMLP(
  (embedding): Embedding(4903, 128)
  (fc): Linear(in_features=128, out_features=184, bias=True)
  (softmax): Softmax(dim=-1)
)

In [21]:
criterion = nn.CrossEntropyLoss()
#optimizer = torch.optim.Adam(baseline_mlp.parameters(), lr=0.001)
optimizer = torch.optim.Adadelta(baseline_mlp.parameters(), weight_decay=0.001)

In [22]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score


def eval_model(model, test_loader, k=15, n=-1):
    
    """
    TODO: evaluate the model.
    
    Arguments:
        model: the RNN model
        val_loader: validation dataloader
        
    Outputs:
        precision: overall precision score
        recall: overall recall score
        f1: overall f1 score
        roc_auc: overall roc_auc score
        
    Note that please pass all four arguments to the model so that we can use this function for both 
    models. (Use `model(x, masks, rev_x, rev_masks)`.)
        
    HINT: checkout https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
    """
    y_pred = torch.LongTensor()
    y_score = torch.Tensor()
    y_true = torch.LongTensor()
    all_precision = []
    all_accuracy = []
    
    model.eval()
    with torch.no_grad():
        for x, masks, y in test_loader:
#             import pdb; pdb.set_trace()
            nn = y.shape[0] - 1 if n == -1 else n
            y_hat = model(x, masks)
            y_hat = F.softmax(y_hat, dim=-1)
            num_labels = y_hat.shape[1]
            num_categories = torch.count_nonzero(y, dim=1)
            nz_rows, nz_cols = torch.nonzero(y, as_tuple=True)
            k_correct = 0
            predictions = 0
            total_precision = 0
            total_accuracy = 0
            for i in range(nn):
                visit_correct = 0
                y_true = nz_cols[nz_rows == i]
                _, y_pred = torch.topk(y_hat[i], k)
#                 for v in y_pred:
#                     if v in y_true:
#                         visit_correct += 1
                for v in y_true:
                    if v in y_pred:
                        visit_correct += 1
                predictions += len(y_true)
                visit_precision = visit_correct / min(k, len(y_true))
                visit_accuracy = visit_correct / len(y_true)
                #print(f'visit {i}: precision: {visit_precision:0.2f} accuracy: {visit_accuracy:0.2f}')
                k_correct += visit_correct
                total_precision += visit_precision
                total_accuracy += visit_accuracy
            #import pdb; pdb.set_trace()
#             precision_k = precision / k
#             accuracy_k = k_correct / predictions
            precision_k = total_precision / nn
            accuracy_k = total_accuracy / nn
            all_precision.append(precision_k)
            all_accuracy.append(accuracy_k)
            
#             y_score = torch.cat((y_score,  y_hat.detach().to('cpu')), dim=0)
#             y_hat = (y_hat > 0.5).int()
#             y_pred = torch.cat((y_pred,  y_hat.detach().to('cpu')), dim=0)
#             y_true = torch.cat((y_true, y.detach().to('cpu')), dim=0)
    """
    TODO:
        Calculate precision, recall, f1, and roc auc scores.
        Use `average='binary'` for calculating precision, recall, and fscore.
    """
#     p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average='binary')
#     roc_auc = roc_auc_score(y_true, y_score)
    total_precision_k = np.mean(all_precision)
    total_accuracy_k = np.mean(all_accuracy)
    return total_precision_k, total_accuracy_k

In [23]:
def train(model, train_loader, test_loader, n_epochs):
    """
    TODO: train the model.
    
    Arguments:
        model: the RNN model
        train_loader: training dataloder
        val_loader: validation dataloader
        n_epochs: total number of epochs
        
    You need to call `eval_model()` at the end of each training epoch to see how well the model performs 
    on validation data.
        
    Note that please pass all four arguments to the model so that we can use this function for both 
    models. (Use `model(x, masks, rev_x, rev_masks)`.)
    """
    base_cpu, base_ram = print_cpu_usage()
    for epoch in range(n_epochs):
        model.train()
        train_loss = 0
#         import pdb; pdb.set_trace()
        for x, masks, y in train_loader:
            """
            TODO:
                1. zero grad
                2. model forward
                3. calculate loss
                4. loss backward
                5. optimizer step
            """
            y_hat = model(x, masks)
#             mask_idxs = masks.sum(dim=1) - 1
#             y_hat = y_hat[range(len(masks)), mask_idxs]
            loss = criterion(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / len(train_loader)
        print_cpu_usage()
        print(f'Epoch: {epoch+1} \t Training Loss: {train_loss:.6f}')
        for k in range(5, 31, 5):
            precision_k, accuracy_k = eval_model(model, val_loader, k=k)
            print(f'Epoch: {epoch+1} \t Validation precision@k{k}: {precision_k:.4f}, accuracy@k{k}: {accuracy_k:.4f}')
    final_cpu, final_ram = print_cpu_usage()

In [26]:
def print_cpu_usage():
    load = psutil.getloadavg()[2]
    cpu_usage = (load/os.cpu_count()) * 100
    ram = psutil.virtual_memory()[2]
    print(f"CPU: {cpu_usage:0.2f}")
    print(f"RAM %: {ram}")
    return cpu_usage, ram

In [27]:
n_epochs = 100
%time train(baseline_mlp, train_loader, test_loader, n_epochs)

CPU: 16.14
RAM %: 65.0
CPU: 16.20
RAM %: 65.0
Epoch: 1 	 Training Loss: 3.799349
Epoch: 1 	 Validation precision@k5: 0.6240, accuracy@k5: 0.3241
Epoch: 1 	 Validation precision@k10: 0.5852, accuracy@k10: 0.4892
Epoch: 1 	 Validation precision@k15: 0.6282, accuracy@k15: 0.6010
Epoch: 1 	 Validation precision@k20: 0.6881, accuracy@k20: 0.6829
Epoch: 1 	 Validation precision@k25: 0.7460, accuracy@k25: 0.7454
Epoch: 1 	 Validation precision@k30: 0.7917, accuracy@k30: 0.7917
CPU: 16.20
RAM %: 65.0
Epoch: 2 	 Training Loss: 3.759209
Epoch: 2 	 Validation precision@k5: 0.6319, accuracy@k5: 0.3284
Epoch: 2 	 Validation precision@k10: 0.5927, accuracy@k10: 0.4956
Epoch: 2 	 Validation precision@k15: 0.6349, accuracy@k15: 0.6077
Epoch: 2 	 Validation precision@k20: 0.6943, accuracy@k20: 0.6890
Epoch: 2 	 Validation precision@k25: 0.7522, accuracy@k25: 0.7516
Epoch: 2 	 Validation precision@k30: 0.7968, accuracy@k30: 0.7968
CPU: 16.17
RAM %: 65.0
Epoch: 3 	 Training Loss: 3.734512
Epoch: 3 	 Vali

CPU: 16.72
RAM %: 65.0
Epoch: 19 	 Training Loss: 3.582683
Epoch: 19 	 Validation precision@k5: 0.6704, accuracy@k5: 0.3470
Epoch: 19 	 Validation precision@k10: 0.6297, accuracy@k10: 0.5268
Epoch: 19 	 Validation precision@k15: 0.6677, accuracy@k15: 0.6391
Epoch: 19 	 Validation precision@k20: 0.7241, accuracy@k20: 0.7185
Epoch: 19 	 Validation precision@k25: 0.7761, accuracy@k25: 0.7755
Epoch: 19 	 Validation precision@k30: 0.8216, accuracy@k30: 0.8216
CPU: 16.72
RAM %: 65.0
Epoch: 20 	 Training Loss: 3.577091
Epoch: 20 	 Validation precision@k5: 0.6722, accuracy@k5: 0.3480
Epoch: 20 	 Validation precision@k10: 0.6324, accuracy@k10: 0.5292
Epoch: 20 	 Validation precision@k15: 0.6701, accuracy@k15: 0.6414
Epoch: 20 	 Validation precision@k20: 0.7245, accuracy@k20: 0.7190
Epoch: 20 	 Validation precision@k25: 0.7769, accuracy@k25: 0.7763
Epoch: 20 	 Validation precision@k30: 0.8236, accuracy@k30: 0.8236
CPU: 16.70
RAM %: 65.0
Epoch: 21 	 Training Loss: 3.571705
Epoch: 21 	 Validation 

Epoch: 36 	 Validation precision@k30: 0.8343, accuracy@k30: 0.8343
CPU: 17.54
RAM %: 65.2
Epoch: 37 	 Training Loss: 3.512620
Epoch: 37 	 Validation precision@k5: 0.6956, accuracy@k5: 0.3604
Epoch: 37 	 Validation precision@k10: 0.6535, accuracy@k10: 0.5461
Epoch: 37 	 Validation precision@k15: 0.6872, accuracy@k15: 0.6574
Epoch: 37 	 Validation precision@k20: 0.7382, accuracy@k20: 0.7324
Epoch: 37 	 Validation precision@k25: 0.7926, accuracy@k25: 0.7920
Epoch: 37 	 Validation precision@k30: 0.8346, accuracy@k30: 0.8346
CPU: 17.51
RAM %: 65.2
Epoch: 38 	 Training Loss: 3.510499
Epoch: 38 	 Validation precision@k5: 0.6965, accuracy@k5: 0.3610
Epoch: 38 	 Validation precision@k10: 0.6541, accuracy@k10: 0.5466
Epoch: 38 	 Validation precision@k15: 0.6880, accuracy@k15: 0.6582
Epoch: 38 	 Validation precision@k20: 0.7379, accuracy@k20: 0.7322
Epoch: 38 	 Validation precision@k25: 0.7933, accuracy@k25: 0.7926
Epoch: 38 	 Validation precision@k30: 0.8351, accuracy@k30: 0.8351
CPU: 17.51
RAM 

Epoch: 54 	 Validation precision@k25: 0.7956, accuracy@k25: 0.7949
Epoch: 54 	 Validation precision@k30: 0.8374, accuracy@k30: 0.8374
CPU: 17.13
RAM %: 65.3
Epoch: 55 	 Training Loss: 3.491141
Epoch: 55 	 Validation precision@k5: 0.7037, accuracy@k5: 0.3654
Epoch: 55 	 Validation precision@k10: 0.6615, accuracy@k10: 0.5527
Epoch: 55 	 Validation precision@k15: 0.6914, accuracy@k15: 0.6612
Epoch: 55 	 Validation precision@k20: 0.7446, accuracy@k20: 0.7388
Epoch: 55 	 Validation precision@k25: 0.7960, accuracy@k25: 0.7953
Epoch: 55 	 Validation precision@k30: 0.8374, accuracy@k30: 0.8374
CPU: 17.29
RAM %: 65.3
Epoch: 56 	 Training Loss: 3.490605
Epoch: 56 	 Validation precision@k5: 0.7037, accuracy@k5: 0.3654
Epoch: 56 	 Validation precision@k10: 0.6618, accuracy@k10: 0.5529
Epoch: 56 	 Validation precision@k15: 0.6911, accuracy@k15: 0.6609
Epoch: 56 	 Validation precision@k20: 0.7446, accuracy@k20: 0.7388
Epoch: 56 	 Validation precision@k25: 0.7961, accuracy@k25: 0.7955
Epoch: 56 	 Val

Epoch: 72 	 Validation precision@k20: 0.7464, accuracy@k20: 0.7406
Epoch: 72 	 Validation precision@k25: 0.7954, accuracy@k25: 0.7948
Epoch: 72 	 Validation precision@k30: 0.8371, accuracy@k30: 0.8371
CPU: 17.41
RAM %: 65.3
Epoch: 73 	 Training Loss: 3.485427
Epoch: 73 	 Validation precision@k5: 0.7022, accuracy@k5: 0.3647
Epoch: 73 	 Validation precision@k10: 0.6619, accuracy@k10: 0.5527
Epoch: 73 	 Validation precision@k15: 0.6914, accuracy@k15: 0.6612
Epoch: 73 	 Validation precision@k20: 0.7466, accuracy@k20: 0.7407
Epoch: 73 	 Validation precision@k25: 0.7955, accuracy@k25: 0.7948
Epoch: 73 	 Validation precision@k30: 0.8370, accuracy@k30: 0.8370
CPU: 17.46
RAM %: 65.3
Epoch: 74 	 Training Loss: 3.485266
Epoch: 74 	 Validation precision@k5: 0.7022, accuracy@k5: 0.3647
Epoch: 74 	 Validation precision@k10: 0.6617, accuracy@k10: 0.5525
Epoch: 74 	 Validation precision@k15: 0.6912, accuracy@k15: 0.6610
Epoch: 74 	 Validation precision@k20: 0.7466, accuracy@k20: 0.7407
Epoch: 74 	 Val

Epoch: 90 	 Validation precision@k15: 0.6912, accuracy@k15: 0.6610
Epoch: 90 	 Validation precision@k20: 0.7457, accuracy@k20: 0.7399
Epoch: 90 	 Validation precision@k25: 0.7961, accuracy@k25: 0.7955
Epoch: 90 	 Validation precision@k30: 0.8368, accuracy@k30: 0.8368
CPU: 17.51
RAM %: 65.5
Epoch: 91 	 Training Loss: 3.483665
Epoch: 91 	 Validation precision@k5: 0.7021, accuracy@k5: 0.3647
Epoch: 91 	 Validation precision@k10: 0.6619, accuracy@k10: 0.5526
Epoch: 91 	 Validation precision@k15: 0.6912, accuracy@k15: 0.6610
Epoch: 91 	 Validation precision@k20: 0.7456, accuracy@k20: 0.7398
Epoch: 91 	 Validation precision@k25: 0.7963, accuracy@k25: 0.7956
Epoch: 91 	 Validation precision@k30: 0.8368, accuracy@k30: 0.8368
CPU: 17.48
RAM %: 65.5
Epoch: 92 	 Training Loss: 3.483615
Epoch: 92 	 Validation precision@k5: 0.7022, accuracy@k5: 0.3648
Epoch: 92 	 Validation precision@k10: 0.6618, accuracy@k10: 0.5526
Epoch: 92 	 Validation precision@k15: 0.6914, accuracy@k15: 0.6612
Epoch: 92 	 Val

In [28]:
for k in range(5, 31, 5):
    precision_k, accuracy_k = eval_model(baseline_mlp, test_loader, k=k)
    print(f'Validation precision@k{k}: {precision_k:.4f}, accuracy@k{k}: {accuracy_k:.4f}')

Validation precision@k5: 0.7461, accuracy@k5: 0.3857
Validation precision@k10: 0.6910, accuracy@k10: 0.5794
Validation precision@k15: 0.7183, accuracy@k15: 0.6919
Validation precision@k20: 0.7696, accuracy@k20: 0.7653
Validation precision@k25: 0.8185, accuracy@k25: 0.8183
Validation precision@k30: 0.8573, accuracy@k30: 0.8573
