Import Libraries

In [1]:
from __future__ import print_function
import numpy as np
import string
# import nltk
#from nltk import word_tokenize
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import os
#import boto3
#from botocore.client import ClientError
# below is used to print out pretty pandas dataframes
#from IPython.display import display, HTML

#from pyathena import connect
#from pyathena.pandas.util import as_pandas
import torch
import torch.nn as nn
import time
import pickle
import random
import math
#nltk.download('punkt')

In [2]:
# Seed every random number generator the notebook uses so runs are repeatable.
seed = 230729
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(seed)
# Written to the environment as well; note the running interpreter reads
# PYTHONHASHSEED only at startup, so this affects processes launched later.
os.environ["PYTHONHASHSEED"] = str(seed)

# 0 - ALL
PATIENTS = 0

Not all patients have the same number of visit dates, so we need the maximum number of visit dates over all patients; it is used to size the visit axis of the notes tensor.

In [3]:
# Maximum number of visit dates per patient. The value is hard-coded here
# rather than computed; it is later used as the size of the visit axis when
# the sparse notes tensor is built.
patients_max_visits = 505
print(patients_max_visits)

505


To train the models on CUDA, we first check whether a CUDA device is available; the tensors and the model are later moved to that device.

In [4]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)
print()

# Additional info when using cuda: device name and memory usage in GB (bytes / 1024**3).
if device.type == 'cuda':
    bytes_per_gb = 1024**3
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    memory_report = (
        ('Allocated:', torch.cuda.memory_allocated(0)),
        ('Cached:   ', torch.cuda.memory_reserved(0)),
    )
    for memory_label, memory_bytes in memory_report:
        print(memory_label, round(memory_bytes/bytes_per_gb,1), 'GB')

Using device: cuda

NVIDIA GeForce GTX 1060 6GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


We will need a function to load the pre-processed train and test datasets:

In [5]:
def load_notes_dataset_object(prefix = ''):
    """Load the six pre-processed notes arrays stored as ``.npy`` files.

    Args:
        prefix: String prepended to every file name (e.g. a directory and/or
            a dataset tag such as ``'orig_'``).

    Returns:
        A 6-tuple, in this order: subject ids (converted to a Python list),
        notes features, index_0, index_1, last note date per patient and
        patient mortality (the last five as loaded by ``np.load``).
    """
    # NOTE(review): allow_pickle=True can execute arbitrary code while loading;
    # only use this on files produced by our own preprocessing.
    file_names = (
        'subject_id.npy',
        'patients_notes_fetures.npy',
        'index_0.npy',
        'index_1.npy',
        'patients_notes_last_date.npy',
        'patient_mortality.npy',
    )
    subject_ids, *remaining = [
        np.load(prefix + file_name, allow_pickle=True) for file_name in file_names
    ]

    return (subject_ids.tolist(), *remaining)

We now load the pre-processed events, patients and notes objects used to build the train and validation datasets:

In [6]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is always closed,
    even if unpickling raises.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# Event items/values tables and the patients object produced by preprocessing.
events_items = _load_pickle("events_item.p")
events_values = _load_pickle("events_value.p")
patients = _load_pickle('patients.p')
# +1 so that max_code can be used as the width of arrays indexed by event code
# (presumably the file stores the largest code -- TODO confirm).
max_code = _load_pickle('events_maxcode.p') + 1

# Guard against accidentally loading a different version of the event data.
assert len(events_items)==174288 and len(events_values)==174288 and len(events_values['subject_id'].unique()) == len(events_items['subject_id'].unique()) == len(patients) == 9822, "Wrong events dataframes?"
assert max_code==127, "EVENTS MAX CODE changed?"

In [7]:
# Load the arrays saved with the 'orig_' prefix.
(
    orig_subject_id,
    orig_patients_notes_fetures,
    orig_index_0,
    orig_index_1,
    orig_patients_notes_last_date,
    orig_patient_mortality,
) = load_notes_dataset_object(prefix = 'orig_')

# Two index rows + 3-D size: the first two axes (patient, visit) are sparse,
# the last axis (200 features per note) is stored densely in the values.
orig_index = [orig_index_0, orig_index_1]
notes_shape = (len(orig_subject_id),patients_max_visits,200)
orig_patients_notes_fetures = torch.sparse_coo_tensor(
    orig_index,
    orig_patients_notes_fetures,
    notes_shape,
    dtype = torch.float,
)

# Convert the remaining numpy arrays to tensors with the dtypes used downstream.
orig_patients_notes_last_date = torch.from_numpy(orig_patients_notes_last_date).long()
orig_patient_mortality = torch.from_numpy(orig_patient_mortality).float()

Next, we create a custom dataset that combines the notes and the events of each patient, so the data can later be split into batches:

In [13]:
from torch.utils.data import Dataset

class NotesEventsDataset(Dataset):
    """Dataset pairing each patient's notes features with their event codes/values.

    Notes tensors are moved to the global ``device`` at construction time;
    event lists stay on the host and are turned into a dense array per item.
    """

    def __init__(self, patient_id, patients_notes, last_date_idx, events_items, events_values, mortality):
        """Store notes tensors and per-patient event lists aligned by patient id.

        Args:
            patient_id: Sequence of subject ids, one per row of ``patients_notes``.
            patients_notes: Tensor of notes features (indexed by patient on axis 0).
            last_date_idx: Tensor with the last-note-date index per patient.
            events_items: DataFrame with ``subject_id`` and ``codes`` columns.
            events_values: DataFrame with ``subject_id`` and ``values`` columns.
            mortality: Tensor with the label per patient.

        Raises:
            AssertionError: If the number of patients differs between notes and
                events, or if any ``patient_id`` has no events.
        """
        self.patient_id = patient_id
        self.x = patients_notes.to(device, non_blocking=True)
        self.mask = last_date_idx.to(device, non_blocking=True)
        self.y = mortality.to(device, non_blocking=True)

        # One list (of per-visit entries) per patient, indexed by subject_id.
        items_by_patient = events_items.groupby('subject_id')['codes'].apply(list)
        values_by_patient = events_values.groupby('subject_id')['values'].apply(list)
        assert len(self.x) == len(items_by_patient), 'Notes patients and events patients counts do not match!'

        # groupby returns groups in sorted-key order, which is not guaranteed to
        # be the order of the notes rows. Look every patient up by id so that
        # items[i]/values[i] always belong to patient_id[i], and verify all ids
        # instead of spot-checking a single random one.
        ordered_ids = list(patient_id)
        for table_name, grouped in (('items', items_by_patient), ('values', values_by_patient)):
            missing_ids = set(ordered_ids) - set(grouped.index)
            assert not missing_ids, (
                'Notes patients without events ' + table_name + ': '
                + str(list(missing_ids)[:5])
            )
        self.items = items_by_patient.loc[ordered_ids].values
        self.values = values_by_patient.loc[ordered_ids].values

    def __len__(self):
        """Return the number of patients (rows of the notes tensor)."""
        return len(self.x)

    def __getitem__(self, index):
        """Return ``(dense notes, last-date index, events array, label)`` for one patient.

        The events array has one row per visit and ``max_code`` columns; cell
        ``[i, code]`` holds the value recorded for that code at visit ``i``,
        with NaN values replaced by 0.0 and unrecorded codes left at 0.
        """
        visit_codes = self.items[index]
        visit_values = self.values[index]
        events = np.zeros([len(visit_codes), max_code])

        for i, codes in enumerate(visit_codes):
            for j, code in enumerate(codes):
                v = visit_values[i][j]
                events[i, code] = v if not math.isnan(v) else 0.0

        return (self.x[index].to_dense(), self.mask[index], events, self.y[index])

In [14]:
# Build the combined notes + events dataset and confirm it covers every patient.
notes_orig_dataset = NotesEventsDataset(
    patient_id=orig_subject_id,
    patients_notes=orig_patients_notes_fetures,
    last_date_idx=orig_patients_notes_last_date,
    events_items=events_items,
    events_values=events_values,
    mortality=orig_patient_mortality,
)
n_patients, n_dataset_rows = len(patients), len(notes_orig_dataset)
print("Patients:", n_patients)
print("Len of dataset:", n_dataset_rows)
assert n_patients == n_dataset_rows, 'Wrong dataset length!'


Patients: 9822
Len of dataset: 9822


Now we create the data loaders, splitting the dataset into 80% train and 20% validation:

In [54]:
def collate_fn(data):
    """Assemble a list of dataset items into one padded batch.

    Args:
        data: Iterable of ``(notes, notes_mask, events, mortality)`` items, where
            ``events`` is a 2-D numpy array with one row per visit and
            ``max_code`` columns.

    Returns:
        Tuple ``(x, notes_mask, events_result, events_mask, mortality_flag)``:
        stacked notes, stacked notes masks, events zero-padded along the visit
        axis to the longest patient in the batch (float32), an int32 mask with
        1 for real visits and 0 for padding, and stacked labels. The two event
        tensors are moved to the global ``device``.
    """
    x, notes_mask, events, mortality_flag = zip(*data)

    visit_counts = [len(p) for p in events]
    maxvisits = max(visit_counts)

    # Fill pre-allocated numpy buffers and wrap them once; calling torch.tensor()
    # on a list of ndarrays is very slow and also needs one concatenation per patient.
    events_padded = np.zeros((len(events), maxvisits, max_code), dtype=np.float32)
    events_valid = np.zeros((len(events), maxvisits), dtype=np.int32)
    for row, (p, n_visits) in enumerate(zip(events, visit_counts)):
        events_padded[row, :n_visits] = p
        events_valid[row, :n_visits] = 1

    events_result = torch.from_numpy(events_padded).to(device, non_blocking=True)
    events_mask = torch.from_numpy(events_valid).to(device, non_blocking=True)

    x = torch.stack(x)
    notes_mask = torch.stack(notes_mask)
    mortality_flag = torch.stack(mortality_flag)
    return x, notes_mask, events_result, events_mask, mortality_flag

In [55]:

from torch.utils.data.dataset import random_split

# Re-seed torch right before splitting so the partition is reproducible.
torch.manual_seed(230729)

# 80% of the samples go to training; the remainder is the validation set.
n_dataset_samples = len(notes_orig_dataset)
split = int(n_dataset_samples*0.8)
lengths = [split, n_dataset_samples - split]

notes_orig_train_dataset, notes_orig_val_dataset = random_split(
    notes_orig_dataset,
    lengths,
)

In [56]:
from torch.utils.data import DataLoader
batch_size = 50
# Both loaders share the batch size and the padding collate function.
loader_options = dict(batch_size=batch_size, collate_fn=collate_fn)
notes_orig_train_loader = DataLoader(notes_orig_train_dataset, **loader_options)
notes_orig_val_loader = DataLoader(notes_orig_val_dataset, **loader_options)

# Pull a single training batch, e.g. to inspect shapes.
loader_iter = iter(notes_orig_train_loader)
first_batch = next(loader_iter)
x, masks, events, events_masks, y = first_batch

Now we can proceed to create our RNN for the notes:

In [57]:
class NotesRNN(nn.Module):
    """GRU encoder for the notes sequence followed by Linear -> ReLU -> Dropout."""

    def __init__(self, notes_emb_size=128, input_notes_emb_size=200):
        """Create the layers.

        Args:
            notes_emb_size: Hidden size of the GRU and width of the linear layer.
            input_notes_emb_size: Feature size of each element of the input sequence.
        """
        super().__init__()

        self.emb_size = notes_emb_size
        self.RNN = nn.GRU(
            input_size=input_notes_emb_size,
            hidden_size=notes_emb_size,
            batch_first=True,
        )
        self.fc1 = nn.Linear(notes_emb_size, notes_emb_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout()

    def forward(self, x, masks, step):
        """Encode ``x`` and return the embedding taken at the positions given by ``masks``.

        ``step`` is accepted for the callers' benefit but is not used here.
        """
        # nn.GRU returns (all hidden states, final hidden state); only the first is needed.
        hidden_states, _ = self.RNN(x)
        last_note_date_hs = get_last_note_date(hidden_states, masks)
        return self.dropout(self.relu(self.fc1(last_note_date_hs)))

The number of note dates is not the same for every patient, so for each patient we need the hidden state at that patient's last note date. The following function extracts it:

In [58]:
def get_last_note_date(hidden_states, masks):
    """Select, for every batch row, the hidden state at the index given in ``masks``.

    Args:
        hidden_states: 3-D tensor indexed as ``[batch, time, hidden]``.
        masks: Integer tensor of shape ``(batch, 1)`` holding, per row, the time
            index to extract.

    Returns:
        Tensor of shape ``(batch, hidden)``.
    """
    hidden_size = hidden_states.shape[2]
    # Repeat each row's index across the hidden axis and add a length-1 time
    # axis so it can drive a gather along dim 1.
    gather_index = masks.expand(-1, hidden_size)[:, None, :]
    selected = hidden_states.gather(1, gather_index)
    return selected[:, -1, :]

In [59]:
def conv_output_volume(W, K, S, P):
    """Return ``(W - K + 2*P) // S + 1``.

    This is the usual output-size formula of a convolution for input size
    ``W``, kernel size ``K``, stride ``S`` and padding ``P``.
    """
    strided_positions = (W - K + 2 * P) // S
    return strided_positions + 1

## Events Network

In [60]:
class EventsRNN(nn.Module):
    """GRU over per-visit event vectors; the embedding is the masked sum of hidden states."""

    def __init__(self, num_codes=max_code, emb_size=128):
        """Create the GRU.

        Args:
            num_codes: Size of each per-visit input vector (defaults to the
                global ``max_code`` as it is when the class is defined).
            emb_size: Hidden size of the GRU, i.e. the size of the returned embedding.
        """
        super().__init__()

        self.rnn = nn.GRU(num_codes, hidden_size=emb_size, batch_first=True)

    def forward(self, events, masks):
        """Return the sum over time of the hidden states, weighted by ``masks``.

        Positions where ``masks`` is 0 contribute nothing to the sum.
        """
        hidden_states, _ = self.rnn(events)
        # Give the mask a trailing axis and repeat it across the hidden axis.
        visit_weights = masks[..., None].expand(hidden_states.shape)
        return (hidden_states * visit_weights).sum(dim=1)

## Final Classifier Network

In [61]:
class OutNet(nn.Module):
    """Joint classifier: concatenates notes and events embeddings and outputs a probability."""

    def __init__(self, notes_embeddings=128, events_embeddings=128):
        """Create both encoders and the classification head.

        Args:
            notes_embeddings: Embedding size produced by the notes encoder.
            events_embeddings: Embedding size produced by the events encoder.
        """
        super().__init__()

        self.notes = NotesRNN(notes_embeddings)
        self.events = EventsRNN(emb_size=events_embeddings)
        self.fc1 = nn.Linear(notes_embeddings + events_embeddings, 128)
        self.fc2 = nn.Linear(128, 32)
        self.dropout = nn.Dropout()
        self.fc3 = nn.Linear(32, 1)
        self.sig = nn.Sigmoid()

        # Move the WHOLE module once every layer exists. Moving only the two
        # encoders (as before) left fc1/fc2/fc3 on the CPU, i.e. a module split
        # across devices until the caller remembered to call .cuda() again.
        if torch.cuda.device_count() >0:
            self.cuda()

    def forward(self, x, masks, events, events_masks, step):
        """Return one probability per batch row (flattened to 1-D).

        Args:
            x: Notes input passed to the notes encoder.
            masks: Notes masks passed to the notes encoder.
            events: Events input passed to the events encoder.
            events_masks: Events masks passed to the events encoder.
            step: Forwarded to the notes encoder.
        """
        notes_emb = self.notes(x, masks, step)
        events_emb = self.events(events, events_masks)

        joint_emb = torch.cat([notes_emb, events_emb], dim=1)

        # NOTE(review): there is no activation between fc1, fc2 and fc3; only
        # dropout sits between fc2 and fc3. Kept as is to preserve the model.
        fc1 = self.fc1(joint_emb)
        fc2 = self.dropout(self.fc2(fc1))
        fc3 = self.fc3(fc2)
        output = self.sig(fc3).flatten()
        return output

# Instantiate the network with its default embedding sizes; the bare name on
# the last line makes the notebook display the module's layer summary.
outNet = OutNet()
outNet

OutNet(
  (notes): NotesRNN(
    (RNN): GRU(200, 128, batch_first=True)
    (fc1): Linear(in_features=128, out_features=128, bias=True)
    (relu): ReLU()
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (events): EventsRNN(
    (rnn): GRU(127, 128, batch_first=True)
  )
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=32, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc3): Linear(in_features=32, out_features=1, bias=True)
  (sig): Sigmoid()
)

In [66]:
def train(model, train_loader, val_loader, n_epochs):
    """Train ``model`` for ``n_epochs`` passes over ``train_loader``.

    Uses the module-level ``optimizer`` and ``criterion``. Prints a progress
    marker every 10 batches and the mean batch loss after each epoch.

    Args:
        model: Network called as ``model(x, masks, events, events_masks, step)``.
        train_loader: Iterable yielding ``(x, masks, events, events_masks, labels)``.
        val_loader: Accepted but not used by this function.
        n_epochs: Number of epochs to run.

    Returns:
        The same ``model`` object after training.
    """
    model.train()  # prep model for training

    for epoch in range(n_epochs):
        batch_losses = []
        print('Batch :', end = ' ')
        for step, (x, masks, events, events_masks, labels) in enumerate(train_loader):
            if step > 0 and step % 10 == 0:
                print(str(step)+',', end=' ' )

            # Step 1. clear gradients
            optimizer.zero_grad()
            # Step 2. evaluate model output
            probs = model(x, masks, events, events_masks, step)
            # Step 3. calculate loss
            loss = criterion(probs, labels)
            # Step 4. backward propagation
            loss.backward()
            # Step 5. optimization
            optimizer.step()
            # Step 6. record loss
            batch_losses.append(loss.detach().cpu().numpy())

        print(f"Epoch {epoch}: curr_epoch_loss={np.mean(batch_losses)}")
    return model



In [67]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score


def eval_model(model, val_loader):
    """Evaluate ``model`` on ``val_loader`` and return classification metrics.

    The model is switched to eval mode (and left in it). Predictions are
    thresholded at 0.5 for precision/recall/F1; ROC AUC uses the raw
    probabilities.

    Returns:
        Tuple ``(precision, recall, f1, roc_auc)``.
    """
    model.eval()
    val_labels, val_probs = [], []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for x, masks, events, events_masks, labels in val_loader:
            probs = model(x, masks, events, events_masks, 0)
            val_labels += labels.detach().cpu().numpy().tolist()
            val_probs += probs.detach().cpu().numpy().reshape(-1).tolist()

    predicted_positive = np.array(val_probs)>0.5
    precision, recall, f1, _ = precision_recall_fscore_support(
        val_labels, predicted_positive, average='binary'
    )
    roc_auc = roc_auc_score(val_labels, val_probs)

    return precision, recall, f1, roc_auc

In [72]:
# Training configuration. `optimizer` and `criterion` are read as module-level
# names inside train(), so this cell must run before the training cell.
learning_rate = 0.001
model = OutNet(notes_embeddings=128, events_embeddings=128)
# Move the model to the GPU when at least one CUDA device is visible.
if torch.cuda.device_count() >0:
    model.cuda()
# Binary cross-entropy on probabilities (OutNet already ends with a sigmoid).
criterion = nn.BCELoss()
# Alternative optimizer, currently disabled:
#optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum = 0.9, nesterov = True)

In [69]:
# number of epochs to train the model
t0 = time.time()
print('Learning Rate: ' + str(learning_rate))
n_epochs = 10
train(model, notes_orig_train_loader, notes_orig_val_loader, n_epochs)
t1 = time.time()
processing_time = t1-t0
print('Model Training time: ' + str(processing_time))

Learning Rate: 0.001
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 0: curr_epoch_loss=0.8579788208007812
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 1: curr_epoch_loss=0.6818565726280212
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 2: curr_epoch_loss=0.43188226222991943
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 3: curr_epoch_loss=0.3713035583496094
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 4: curr_epoch_loss=0.3577061891555786
Model Training time: 149.35582661628723


In [70]:
# Evaluate on the validation loader and print one metric per line.
p, r, f, roc_auc = eval_model(model, notes_orig_val_loader)
metric_report = (
    ("Precision = ", p),
    ("Recall    = ", r),
    ("F1        = ", f),
    ("ROC AUC   = ", roc_auc),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

Precision =  0.616822429906542
Recall    =  0.30697674418604654
F1        =  0.40993788819875776
ROC AUC   =  0.8324651162790696
