Install OS dependencies.  This only needs to be run once for each new notebook instance.

Import Libraries

In [1]:
from __future__ import print_function
import numpy as np
import string
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import os
import torch
import torch.nn as nn
import time
import pickle
import random
import math
from torch.utils.data import Dataset
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader, Subset
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

In [2]:
# 
# Seed every random number generator used by the notebook so that runs are repeatable.
seed = 230729
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Python reads PYTHONHASHSEED at interpreter start-up, so this assignment only
# influences child processes launched after this point.
os.environ["PYTHONHASHSEED"] = str(seed)

# max_code is the value stored in 'events_maxcode.p' plus one.
# NOTE: pickle can execute arbitrary code while loading; only read files produced
# by a trusted preprocessing step.
with open('events_maxcode.p', 'rb') as maxcode_file:  # closes the handle deterministically
    max_code = pickle.load(maxcode_file) + 1
assert max_code==127, "EVENTS MAX CODE changed?"

In [20]:
# Size of the second dimension of the sparse notes tensors built in create_dataset.
# Presumably the largest number of visits recorded for any patient -- TODO confirm against the preprocessing step.
patients_max_visits = 505

Set the compute device: use CUDA if it is available, otherwise fall back to the CPU:

In [3]:
# Pick the GPU when CUDA is usable, otherwise run everything on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)
print()

# Extra diagnostics that are only meaningful on a CUDA device.
if device.type == 'cuda':
    bytes_per_gb = 1024**3
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    memory_report = (
        ('Allocated:', torch.cuda.memory_allocated(0)),
        ('Cached:   ', torch.cuda.memory_reserved(0)),
    )
    for label, n_bytes in memory_report:
        print(label, round(n_bytes/bytes_per_gb,1), 'GB')

Using device: cuda

NVIDIA GeForce GTX 1060 6GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


The following code lets us import data from Athena services and query Parquet files through SQL queries.

First we will need a function to load the pre-processed train and test datasets:

In [4]:
def load_events_dataset_object(prefix=''):
    """
    Load the pre-processed events data from two pickle files.

    Arguments:
        prefix: string prepended to the file names "events_item.p" and "events_value.p"

    Outputs:
        (events_items, events_values): the two unpickled objects, in that order

    NOTE: pickle can execute arbitrary code while loading; only read files that
    come from a trusted source.
    """
    # Context managers guarantee the file handles are closed, even if unpickling fails.
    with open(prefix + "events_item.p", "rb") as items_file:
        events_items = pickle.load(items_file)
    with open(prefix + "events_value.p", "rb") as values_file:
        events_values = pickle.load(values_file)
    return events_items, events_values

In [5]:
def load_notes_dataset_object(prefix = ''):
    """
    Load the pre-processed notes data from five .npy files.

    Arguments:
        prefix: string prepended to every file name

    Outputs:
        patient_subject_id: content of 'subject_id.npy' converted to a Python list
        patients_notes_fetures: array stored in 'patients_notes_fetures.npy'
        index_0: array stored in 'index_0.npy'
        index_1: array stored in 'index_1.npy'
        patient_mortality: array stored in 'patient_mortality.npy'
    """

    def _read(file_name):
        # allow_pickle=True is needed for object arrays; only load trusted files.
        return np.load(prefix + file_name, allow_pickle=True)

    return (
        _read('subject_id.npy').tolist(),
        _read('patients_notes_fetures.npy'),
        _read('index_0.npy'),
        _read('index_1.npy'),
        _read('patient_mortality.npy'),
    )

In [6]:
class NotesEventsDataset(Dataset):
    """
    Dataset pairing each patient's notes features with their events data and mortality label.

    Relies on the notebook-level globals `device` (where the tensors are stored)
    and `max_code` (width of the per-visit events vectors).
    """

    def __init__(self, patient_id, patients_notes, notes_mask, events_items, events_values, mortality):
        """
        Arguments:
            patient_id: sequence of patient ids, in the same order as the first dimension of `patients_notes`
            patients_notes: tensor of notes features, indexed by patient on its first dimension
            notes_mask: tensor indexed like `patients_notes`
            events_items: DataFrame with the columns 'subject_id' and 'codes'
            events_values: DataFrame with the columns 'subject_id' and 'values'
            mortality: tensor indexed by patient

        Raises:
            AssertionError: if the number of patients differs between notes and events,
                or if a patient id has no events rows.
        """

        self.patient_id = patient_id
        len_events_patients = len(events_items['subject_id'].unique())
        self.x = patients_notes.to(device, non_blocking=True)
        self.notes_mask = notes_mask.to(device, non_blocking=True)
        self.y = mortality.to(device, non_blocking=True)
        assert len(self.x) == len_events_patients, 'Notes patients and events patients counts do not match!'

        # One list of per-row entries for every patient, keyed by subject_id.
        items_by_patient = events_items.groupby('subject_id')['codes'].apply(list)
        values_by_patient = events_values.groupby('subject_id')['values'].apply(list)

        # groupby returns the patients in sorted order, which is not necessarily the
        # order of the notes tensors. Check every patient (not a random sample) and
        # reorder the events explicitly so that position i always refers to patient_id[i].
        ordered_ids = list(patient_id)
        for pid in ordered_ids:
            assert pid in items_by_patient.index and pid in values_by_patient.index, \
                'Notes and events patient id=' + str(pid) + ' does not match'
        self.items = items_by_patient.loc[ordered_ids].values
        self.values = values_by_patient.loc[ordered_ids].values

    def __len__(self):
        """Number of patients (size of the first dimension of the notes tensor)."""

        return len(self.x)

    def __getitem__(self, index):
        """
        Arguments:
            index: position of the patient

        Outputs:
            tuple (notes, notes_mask, events, label) where notes and notes_mask are
            dense tensors, events is a NumPy array of shape (# of events rows, max_code)
            and label is the patient's entry of the mortality tensor.
        """

        visit_codes = self.items[index]
        visit_values = self.values[index]
        events = np.zeros([len(visit_codes), max_code])

        for i, codes in enumerate(visit_codes):
            for j, code in enumerate(codes):
                v = visit_values[i][j]
                # Missing measurements (NaN) are stored as 0.0
                events[i, code] = v if not math.isnan(v) else 0.0

        return(self.x[index].to_dense(), self.notes_mask[index].to_dense(), events, self.y[index])

In [7]:
def create_dataset (cohort_type = 'original'):
    """
    Build a NotesEventsDataset from the pre-processed notes and events files.

    cohort_type = 'original' -> Unbalanced cohort will be created
    cohort_type = 'essential'  ->  Unbalanced cohort with just the minimum set of events features
    cohort_type = 'balanced_train' -> Balanced cohort for training will be created
    cohort_type = 'balanced_test' -> Balanced cohort for testing will be created

    Outputs:
        dataset: the NotesEventsDataset for the requested cohort

    Raises:
        ValueError: if cohort_type is not one of the four values above
        AssertionError: if the loaded notes and events data are inconsistent
    """
    notes_prefixes = {
        'original': "orig_",
        'essential': "orig_",
        'balanced_train': "train_",
        'balanced_test': "test_",
    }
    # Fail early on a typo instead of silently loading the test cohort notes.
    if cohort_type not in notes_prefixes:
        raise ValueError("Unknown cohort_type " + repr(cohort_type) + "; expected one of " + ", ".join(notes_prefixes))

    notes_dim = 200  # size of the last dimension of the notes tensors
    notes_prefix = notes_prefixes[cohort_type]
    subject_id, patients_notes_fetures, index_0, index_1, patient_mortality= load_notes_dataset_object(prefix = notes_prefix)
    index = [index_0, index_1]
    notes_shape = (len(subject_id),patients_max_visits,notes_dim)
    patients_notes_fetures = torch.sparse_coo_tensor(index, patients_notes_fetures, notes_shape, dtype = torch.float)

    # The mask holds ones at exactly the (index_0, index_1) positions that carry notes features.
    ones = np.ones((len(index_0),notes_dim))
    notes_mask = torch.sparse_coo_tensor(index, ones, notes_shape, dtype = torch.float)

    patient_mortality = torch.from_numpy(patient_mortality).float()

    # The events files of the 'original' cohort carry no prefix; every other cohort uses "<cohort_type>_".
    events_prefix = "" if cohort_type=='original' else cohort_type +"_"
    events_items, events_values = load_events_dataset_object(events_prefix)

    assert len(events_items)==len(events_values) and len(events_values['subject_id'].unique()) == len(events_items['subject_id'].unique()) == len(patient_mortality) , "Wrong events dataframes?"

    dataset = NotesEventsDataset(subject_id, patients_notes_fetures, notes_mask, events_items, events_values, patient_mortality)
    assert len(patient_mortality) == len(dataset), 'Wrong dataset length!'
    print ("Number of Patients:", len(patient_mortality))


    return dataset

In [8]:
# Number of patients per mini-batch; read as a global by the functions that build the DataLoaders.
batch_size = 50

In [9]:
def collate_fn(data):
    """
    Collate a list of dataset samples into one batch, padding the events data in time.

    Relies on the notebook-level globals `max_code` and `device`.

    Arguments:
        data: list of samples (notes, notes_mask, events, mortality_flag) where
              events is an array of shape (# of events rows, max_code)

    Outputs:
        x: the stacked notes tensors
        notes_mask: the stacked notes masks
        events_result: float tensor of shape (batch_size, max # of rows in the batch, max_code),
                       zero-padded after each patient's last row
        events_mask: int tensor of shape (batch_size, max # of rows in the batch),
                     1 for real rows and 0 for padding
        mortality_flag: the stacked labels
    """
    x, notes_mask, events, mortality_flag = zip(*data)

    maxvisits = max(len(p) for p in events)

    # Fill pre-allocated arrays and convert them once: building a tensor from a
    # Python list of ndarrays goes through a very slow element-wise path.
    events_padded = np.zeros((len(events), maxvisits, max_code))
    mask_padded = np.zeros((len(events), maxvisits))
    for row, p in enumerate(events):
        events_padded[row, :len(p)] = p
        mask_padded[row, :len(p)] = 1

    events_result = torch.from_numpy(events_padded).float()
    events_mask = torch.from_numpy(mask_padded).int()
    x = torch.stack(x)
    notes_mask = torch.stack(notes_mask)
    mortality_flag = torch.stack(mortality_flag)
    events_result = events_result.to(device, non_blocking=True)
    events_mask = events_mask.to(device, non_blocking=True)
    return x, notes_mask, events_result, events_mask, mortality_flag

In [10]:
def get_unbalanced_dataloaders (max_size = 0):
    """
    Build the train and validation loaders for the 'original' cohort.

    Arguments:
        max_size: when greater than 0, keep only the first max_size patients

    Outputs:
        (train_loader, val_loader): loaders over a random 80% / 20% split;
        only the training loader shuffles its samples
    """

    full_dataset = create_dataset('original')
    if max_size > 0:
        print ("***** Slicing to " + str(max_size))
        full_dataset = Subset(full_dataset, np.arange(max_size))

    # 80% of the patients go to training, the remainder to validation
    n_total = len(full_dataset)
    n_train = int(n_total*0.8)
    train_part, val_part = random_split(full_dataset, [n_train, n_total - n_train])

    loader_options = dict(batch_size=batch_size, collate_fn=collate_fn)
    train_loader = DataLoader(train_part, shuffle=True, **loader_options)
    val_loader = DataLoader(val_part, shuffle=False, **loader_options)

    return train_loader, val_loader

In [25]:
def get_balanced_dataloaders (max_size = 0):
    """
    Build the train and validation loaders from the two balanced cohorts.

    Arguments:
        max_size: when greater than 0, keep only the first max_size patients of each cohort

    Outputs:
        (train_loader, val_loader): a shuffling loader over the 'balanced_train' cohort
        and a non-shuffling loader over the 'balanced_test' cohort
    """

    print ("* Train dataset *")
    train_set = create_dataset('balanced_train')
    print ("* Test dataset *")
    test_set = create_dataset('balanced_test')

    if max_size > 0:
        print ("***** Slicing to " + str(max_size))
        train_set, test_set = (Subset(cohort, np.arange(max_size)) for cohort in (train_set, test_set))

    loader_options = dict(batch_size=batch_size, collate_fn=collate_fn)
    train_loader = DataLoader(train_set, shuffle=True, **loader_options)
    val_loader = DataLoader(test_set, shuffle=False, **loader_options)

    return train_loader, val_loader

In [22]:
class AlphaAttention(nn.Module):
    """
    Alpha attention: produces one attention weight per date with events data.
    """

    def __init__(self, hidden_dim):
        """
        Arguments:
            hidden_dim: the hidden layer dimension
        """
        super().__init__()

        # Maps every hidden state to a single score
        self.a_att = nn.Linear(hidden_dim, 1)

    def forward(self, g):
        """
        Arguments:
            g: the output tensor from RNN-alpha of shape (batch_size, seq_length, hidden_dim)

        Outputs:
            alpha: the corresponding attention weights of shape (batch_size, seq_length, 1)
        """

        scores = self.a_att(g)
        # Normalise along the sequence dimension
        return scores.softmax(dim=1)

In [13]:
class BetaAttention(nn.Module):
    """
    Beta attention: produces one attention weight per event code for every date.
    """

    def __init__(self, input_dim, emb_dim):
        """
        Arguments:
            input_dim: the hidden layer dimension
            emb_dim: the number of events codes
        """
        super().__init__()

        # Maps every hidden state to one score per event code
        self.b_att = nn.Linear(input_dim, emb_dim)

    def forward(self, h):
        """
        Arguments:
            h: the output tensor from RNN-beta of shape (batch_size, seq_length, input_dim)

        Outputs:
            beta: the corresponding attention weights of shape (batch_size, seq_length, # of events codes)
        """

        scores = self.b_att(h)
        return scores.tanh()

In [14]:
class EventsRNN(nn.Module):
    """
    Mortality classifier over the events data: two GRUs feed the alpha (per date) and
    beta (per event code) attentions, whose weighted sum of the events goes through a
    linear layer and a sigmoid.
    """

    def __init__(self, num_codes, emb_size=128):
        """
        Arguments:
            num_codes: the number of events codes (size of the last dimension of the events data)
            emb_size: accepted for compatibility with existing callers; currently not used
        """
        super().__init__()

        hidden_dim = 128  # hidden size shared by both GRUs

        # Define the RNN-alpha using `nn.GRU()`
        self.rnn_a = nn.GRU(num_codes, hidden_dim, batch_first=True)
        # Define the RNN-beta using `nn.GRU()`
        self.rnn_b = nn.GRU(num_codes, hidden_dim, batch_first=True)
        # Define the alpha-attention using `AlphaAttention()`
        self.att_a = AlphaAttention(hidden_dim)
        # Define the beta-attention using `BetaAttention()`
        self.att_b = BetaAttention(hidden_dim, num_codes)
        # Define the linear layers using `nn.Linear()`
        self.fc = nn.Linear(num_codes, 1)
        # Define the final activation layer using `nn.Sigmoid().
        self.sigmoid = nn.Sigmoid()

    def attention_sum(self, alpha, beta, x, masks):
        """
            Performs the weighted sum of the events data using alpha and beta attention weights.
            It also sets to 0 the positions corresponding to dates without events data using the masks information.

        Arguments:
            alpha: the alpha attention weights of shape (batch_size, seq_length, 1)
            beta: the beta attention weights of shape (batch_size, seq_length, # of events codes)
            x: the events data for each date with shape (batch_size, seq_length, # of events codes)
            masks: the padding masks in time of shape (batch_size, seq_length)

        Outputs:
            c: the context vector of shape (batch_size, # of events codes)
        """

        # Add a trailing dimension so the mask broadcasts over the events codes
        masks = masks.unsqueeze(-1)
        return torch.sum( beta * x * alpha * masks , dim=1 )

    def forward(self, events, masks):
        """
        Arguments:
            events: the events data of shape (batch_size, seq_length, # of events codes)
            masks: the padding masks in time of shape (batch_size, seq_length)

        Outputs:
            probs: the predicted probabilities of shape (batch_size,)
        """

        # Pass the events data through RNN-alpha
        g, _ = self.rnn_a(events)
        # Pass the events data through RNN-beta
        h, _ = self.rnn_b(events)
        # Obtain the alpha and beta attentions using `AlphaAttention()` and `BetaAttention()`;
        alpha = self.att_a(g)
        beta = self.att_b(h)
        # Perform the weighted sum of the events data using the attention weights for the dates with events data
        c = self.attention_sum(alpha, beta, events, masks)
        # Pass the context vector through the linear and activation layers.
        logits = self.fc(c)
        probs = self.sigmoid(logits)
        # Drop only the trailing size-1 dimension. A bare squeeze() would also remove the
        # batch dimension for a batch of one patient, and the result would then no longer
        # match the shape of the labels.
        return probs.squeeze(-1)

In [15]:
def create_model_and_optimizer():
    """
    Create a fresh EventsRNN model and its Adam optimizer.

    Relies on the notebook-level globals `max_code`, `device` and `learning_rate`.

    Outputs:
        model: the EventsRNN model, placed on `device`
        optimizer: Adam optimizer over the model parameters with lr=learning_rate
    """
    model = EventsRNN(max_code, emb_size=128)
    # Use the same device the data is moved to, so model and batches always match.
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    return model, optimizer

In [16]:
def train(model, train_loader, n_epochs):
    """
    Train the model for n_epochs passes over train_loader, printing the progress
    and the mean loss of every epoch.

    Relies on the notebook-level globals `optimizer` and `criterion`.

    Arguments:
        model: the model, called as model(events, events_masks)
        train_loader: iterable of batches (x, masks, events, events_masks, labels)
        n_epochs: the number of epochs

    Outputs:
        model: the same model object after training
    """
    # Switch the model to training mode
    model.train()

    for epoch in range(n_epochs):
        batch_losses = []
        print('Batch :', end = ' ')
        for step, batch in enumerate(train_loader):
            # Progress marker every 10 batches
            if step > 0 and step % 10 == 0:
                print(str(step)+',', end=' ' )
            _notes, _notes_masks, events, events_masks, labels = batch

            # Step 1. clear gradients
            optimizer.zero_grad()
            # Step 2. evaluate model output
            probs = model(events, events_masks)
            # Step 3. calculate loss
            loss = criterion(probs, labels)
            # Step 4. backward propagation
            loss.backward()
            # Step 5. optimization
            optimizer.step()
            # Step 6. record loss
            batch_losses.append(loss.detach().cpu().numpy())

        mean_loss = np.mean(batch_losses)
        print(f"Epoch {epoch}: curr_epoch_loss={mean_loss}")
    return model

In [17]:
def eval_model(model, val_loader):
    """
    Evaluate the model on val_loader.

    Arguments:
        model: the model, called as model(events, events_masks)
        val_loader: iterable of batches (x, masks, events, events_masks, labels)

    Outputs:
        precision, recall, f1: binary scores obtained by thresholding the probabilities at 0.5
        roc_auc: ROC AUC computed from the raw probabilities
    """
    # Switch the model to evaluation mode
    model.eval()
    all_labels, all_probs = [], []

    # No gradients are needed for evaluation
    with torch.no_grad():
        for _notes, _notes_masks, events, events_masks, labels in val_loader:
            batch_probs = model(events, events_masks)
            all_labels += labels.detach().cpu().numpy().tolist()
            all_probs += batch_probs.detach().cpu().numpy().reshape(-1).tolist()

    predictions = np.array(all_probs)>0.5
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, predictions, average='binary')
    roc_auc = roc_auc_score(all_labels, all_probs)

    return precision, recall, f1, roc_auc

In [18]:
def train_and_eval(model, train_loader, val_loader, n_epochs=10, filename='model.pt'):
    """
    Train the model, time the training, evaluate on val_loader and print the metrics.

    Relies on the notebook-level global `learning_rate` (only printed here).

    Arguments:
        model: the model to train
        train_loader: loader with the training batches
        val_loader: loader with the validation batches
        n_epochs: the number of training epochs
        filename: currently unused, because saving the model is disabled

    Outputs:
        p, r, f, roc_auc: precision, recall, F1 and ROC AUC as returned by eval_model
    """
    started_at = time.time()
    train(model, train_loader, n_epochs)
    processing_time = time.time() - started_at
    print('Model Training time: ' + str(processing_time))

    p, r, f, roc_auc = eval_model(model, val_loader)
    print ("Learning rate: " + str(learning_rate))
    print("Model Training time: " + str(processing_time))
    metric_lines = (
        ("Precision = ", p),
        ("Recall    = ", r),
        ("F1        = ", f),
        ("ROC AUC   = ", roc_auc),
    )
    for label, value in metric_lines:
        print(label, value)
    # Tab-separated line, handy for pasting into a spreadsheet
    print(p,"\t",r,"\t",f,"\t",roc_auc)
    # Saving is disabled. To enable it again:
    #     if filename is not None:
    #         torch.save(model.state_dict(), filename)
    return p, r, f, roc_auc

In [23]:
# Run on the original (unbalanced) cohort.
# learning_rate, criterion and optimizer are read as globals by the helper functions.
learning_rate = 0.001
n_epochs = 10
criterion = nn.BCELoss()
print('Learning Rate: ' + str(learning_rate))
print ("Number of Epochs: " + str(n_epochs))

# Banner: an empty line followed by the framed title
print ('', '--------------', 'Original model', '--------------', sep='\n')
model, optimizer = create_model_and_optimizer()
# You can pass a number to limit the number of samples
train_loader, val_loader = get_unbalanced_dataloaders()
# Last expression of the cell: the returned metrics are shown as the cell output
train_and_eval(model, train_loader, val_loader, n_epochs, 'unbalanced_model.pt')
#load_and_eval(model, 'unbalanced_model.pt', val_loader)


Learning Rate: 0.001
Number of Epochs: 10

--------------
Original model
--------------
Number of Patients: 9822
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 0: curr_epoch_loss=0.429216206073761
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 1: curr_epoch_loss=0.2886306941509247
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 2: curr_epoch_loss=0.27539151906967163
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 3: curr_epoch_loss=0.23904095590114594
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 4: curr_epoch_loss=0.23107142746448517
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 5: curr_epoch_loss=0.22080597281455994
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 6: curr_epoch_loss=0.22208595275878906
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120

(0.6032608695652174,
 0.46443514644351463,
 0.524822695035461,
 0.8506087066135938)

In [26]:
# Run on the balanced cohorts.
# learning_rate, criterion and optimizer are read as globals by the helper functions.
learning_rate = 0.0001
n_epochs = 10
criterion = nn.BCELoss()
print('Learning Rate: ' + str(learning_rate))
print ("Number of Epochs: " + str(n_epochs))
# Banner: two empty lines followed by the framed title
print ('', '', '--------------', 'Balanced model', '--------------', sep='\n')
model, optimizer = create_model_and_optimizer()
# You can pass a number to limit the number of samples
train_loader, val_loader = get_balanced_dataloaders()
# Last expression of the cell: the returned metrics are shown as the cell output
train_and_eval(model, train_loader, val_loader, n_epochs, 'balanced_model.pt')
#load_and_eval(model, 'balanced_model.pt', val_loader)


Learning Rate: 0.0001
Number of Epochs: 10


--------------
Balanced model
--------------
* Train dataset *
Number of Patients: 13790
* Test dataset *
Number of Patients: 1965
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, Epoch 0: curr_epoch_loss=0.6428784728050232
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, Epoch 1: curr_epoch_loss=0.46494531631469727
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, Epoch 2: curr_epoch_loss=0.3550058901309967
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, Epoch 3: curr_epoch_loss=0.28842878341674805
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 2

(0.7318840579710145, 0.8632478632478633, 0.792156862745098, 0.9600473023349972)