Import Libraries

In [62]:
from __future__ import print_function
import numpy as np
import string
# import nltk
#from nltk import word_tokenize
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import os
#import boto3
#from botocore.client import ClientError
# below is used to print out pretty pandas dataframes
#from IPython.display import display, HTML

#from pyathena import connect
#from pyathena.pandas.util import as_pandas
import torch
import torch.nn as nn
import time
import pickle
import random
import math
#nltk.download('punkt')

In [9]:
# set seed
seed = 230729
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

PATIENTS = 0  # 0 - ALL

We know that not all patients have the same number of visit dates, therefore, we need to find what is the maximum number of visit dates for any given patient

In [10]:
patients_max_visits = 505
print(patients_max_visits)

505


In preparation to run the models training on CUDA, we need to make sure that we do have a device and load the tensors and model to CUDA

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cpu



We will need a function to load the pre-processed train and test datasets:

In [12]:
def load_notes_dataset_object(prefix = ''):
    
    patient_subject_id = np.load(prefix + 'subject_id.npy', allow_pickle=True).tolist()
    patients_notes_fetures = np.load(prefix + 'patients_notes_fetures.npy', allow_pickle=True)
    index_0 = np.load(prefix + 'index_0.npy', allow_pickle=True)
    index_1 = np.load(prefix + 'index_1.npy', allow_pickle=True)
    patients_notes_last_date = np.load(prefix + 'patients_notes_last_date.npy', allow_pickle=True)
    patient_mortality = np.load(prefix + 'patient_mortality.npy', allow_pickle=True)
    
    return patient_subject_id, patients_notes_fetures, index_0, index_1, patients_notes_last_date, patient_mortality

We now load the objects we for train and test dataset:

In [13]:
events_items = pickle.load( open( "events_item.p", "rb" ) )
events_values = pickle.load(open("events_value.p", "rb") )
patients = pickle.load(open('patients.p', 'rb'))
max_code = pickle.load(open('events_maxcode.p', 'rb')) + 1

assert len(events_items)==174288 and len(events_values)==174288 and len(events_values['subject_id'].unique()) == len(events_items['subject_id'].unique()) == len(patients) == 9822, "Wrong events dataframes?"
assert max_code==127, "EVENTS MAX CODE changed?"

In [14]:
orig_subject_id, orig_patients_notes_fetures, orig_index_0, orig_index_1, orig_patients_notes_last_date, orig_patient_mortality = load_notes_dataset_object(prefix = 'orig_')
orig_index = [orig_index_0, orig_index_1]
orig_patients_notes_fetures = torch.sparse_coo_tensor(orig_index, orig_patients_notes_fetures, (len(orig_subject_id),patients_max_visits,200), dtype = torch.float)
orig_patients_notes_last_date = torch.from_numpy(orig_patients_notes_last_date).long()
orig_patient_mortality = torch.from_numpy(orig_patient_mortality).float()

Now we are going to create a custom notes dataset to then partition the data in batches:

In [64]:
from torch.utils.data import Dataset

class NotesEventsDataset(Dataset):
    
    def __init__(self, patient_id, patients_notes, last_date_idx, events_items, events_values, mortality):
        
        self.patient_id = patient_id
        len_events_patients = len(events_items['subject_id'].unique())
        if torch.cuda.device_count() >0:
            self.x = patients_notes.to(device, non_blocking=True)
            self.mask = last_date_idx.to(device, non_blocking=True)
            self.y = mortality.to(device, non_blocking=True)
            self.items = events_items.groupby('subject_id').agg('codes').apply(list).values.to(device, non_blocking=True)
            self.values = events_values.groupby('subject_id').agg('values').apply(list).values.to(device, non_blocking=True)            
        else:
            self.x = patients_notes
            self.mask = last_date_idx
            self.y = mortality
            self.items = events_items.groupby('subject_id').agg('codes').apply(list).values
            self.values = events_values.groupby('subject_id').agg('values').apply(list).values
        assert len(self.x) == len_events_patients, 'Notes patients and events patients counts do not match!'
        r = random.randrange(len(self.x))
        assert events_items['subject_id'].unique()[r] == self.patient_id[r], 'Notes and events patient id=' + str(r) + ' does not match'
    
    def __len__(self):

        return len(self.x)
    
    def __getitem__(self, index):
        
        events = np.zeros([len(self.items[index]), max_code])

        for i, codes in enumerate(self.items[index]):
            for j, code in enumerate(codes):
                v = self.values[index][i][j]
                events[i, code] = v if not math.isnan(v) else 0.0
        
        return(self.patient_id , self.x[index].to_dense(), self.mask[index], events, self.y[index])

In [65]:
notes_orig_dataset = NotesEventsDataset(orig_subject_id, orig_patients_notes_fetures, orig_patients_notes_last_date, events_items, events_values, orig_patient_mortality)
print ("Patients:", len(patients))
print ("Len of dataset:", len(notes_orig_dataset))
assert len(patients) == len(notes_orig_dataset), 'Wrong dataset length!'

Patients: 9822
Len of dataset: 9822


Now we create the data loaders, splitting the dataset on 80% train, 20% validation:

In [96]:
def collate_fn(data):
    subject_id, x, notes_mask, events, mortality_flag = zip(*data)
    
    maxvisits = max([len(p) for p in events])
    
    events_result = torch.tensor([np.concatenate((p, np.zeros([maxvisits - len(p), max_code]))) for p in events]).float()
    events_mask = torch.tensor([np.concatenate((np.ones(len(p)), np.zeros(maxvisits - len(p)))) for p in events]).int()
    
    return torch.tensor(subject_id).int(), x, notes_mask, events_result, events_mask, mortality_flag

In [97]:

from torch.utils.data.dataset import random_split

torch.manual_seed(230729)
split = int(len(notes_orig_dataset)*0.8)
lengths = [split, len(notes_orig_dataset) - split]

notes_orig_train_dataset, notes_orig_val_dataset = random_split(notes_orig_dataset, lengths)

In [98]:
from torch.utils.data import DataLoader
batch_size = 50
notes_orig_train_loader = DataLoader(notes_orig_train_dataset, batch_size=batch_size, collate_fn=collate_fn)
notes_orig_val_loader = DataLoader(notes_orig_val_dataset, batch_size=batch_size, collate_fn=collate_fn)

Now We can proceed to create our RNN

In [99]:
class NotesRNN(nn.Module):
    
    def __init__(self, notes_emb_size):
        super().__init__()
        
        self.emb_size = notes_emb_size
        self.RNN = nn.GRU(input_size = notes_emb_size, hidden_size = notes_emb_size, batch_first = True)
        self.fc1 = nn.Linear(notes_emb_size,notes_emb_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout()
        self.fc2 = nn.Linear(notes_emb_size,1)
        self.sig = nn.Sigmoid()
        
    def forward(self, x, masks, step):
                
        rnn_out = self.RNN(x)
        last_note_date_hs = get_last_note_date(rnn_out[0],masks)
        fc1_out = self.fc1(last_note_date_hs)
        fc1_out = self.relu(fc1_out)
        dp_out = self.dropout(fc1_out)
        fc2_out = self.fc2(dp_out)
        out = self.sig(fc2_out).flatten()

        return out

Since the number of date_notes is not the same for each patient, we need to get the hidden state for the last note date for each patient, for that we implement the following function:

In [100]:
def get_last_note_date(hidden_states, masks):   
    #last_visit = ((masks.sum(axis = 2) > 0).sum(axis = 1) - 1).unsqueeze(-1)
    #if(step == 134):
    #print(masks)
    #print(hidden_states.shape)
    last_visit = masks.expand(-1,hidden_states.shape[2]).unsqueeze(1)
    
    out = torch.gather(hidden_states,dim = 1,index = last_visit)[:,-1,:]
    return out

In [101]:
def conv_output_volume(W, K, S, P):
    

    
    return  (((W-K+2*P)//S)+1)

In [106]:
def train(model, train_loader, val_loader, n_epochs):
    model.train() # prep model for training
    
    for epoch in range(n_epochs):
        curr_epoch_loss = []
        print('Batch :', end = ' ')
        for step, batch in enumerate(train_loader):
            if step % 10 == 0 and step>0:
                print(str(step)+',', end=' ' )
                #print(f"Epoch {epoch}: curr_epoch_loss={np.mean(curr_epoch_loss)}")
            subject_id, x, masks, events, events_masks, labels = batch
        
            """ Step 1. clear gradients """
            optimizer.zero_grad()
            """ Step 2. evaluate model ouput  """
            probs = model(x, masks, step)
            """ Step 3. Calculate loss  """
            loss = criterion(probs, labels)
            """ Step 4. Backward propagation  """
            loss.backward()
            """ Step 5. optimization """
            optimizer.step()
            """ Step 6. record loss """
            curr_epoch_loss.append(loss.cpu().data.numpy())
        
        
        print(f"Epoch {epoch}: curr_epoch_loss={np.mean(curr_epoch_loss)}")
    return model



In [107]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score


def eval_model(model, val_loader):
    model.eval()
    val_labels = []
    val_probs = []
    
    for step, batch in enumerate(val_loader):
        subject_id, x, masks, events, events_masks, labels = batch
        
        with torch.no_grad():
            
            probs = model(x, masks,0)
            val_labels.extend(labels.detach().cpu().numpy().tolist())
            val_probs.extend(probs.detach().cpu().numpy().reshape(-1).tolist())

    precision, recall, f1, _ = precision_recall_fscore_support(val_labels, np.array(val_probs)>0.5, average='binary')
    roc_auc = roc_auc_score(val_labels, val_probs)
    
    return precision, recall, f1, roc_auc

In [108]:
learning_rate = 0.0001
notes_rnn = NotesRNN(notes_emb_size = 200)
if torch.cuda.device_count() >0:
    notes_rnn.cuda()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(notes_rnn.parameters(), lr=learning_rate)

In [109]:
# number of epochs to train the model
t0 = time.time()
print('Learning Rate: ' + str(learning_rate))
n_epochs = 5
train(notes_rnn, notes_orig_train_loader, notes_orig_val_loader, n_epochs)
t1 = time.time()
processing_time = t1-t0
print('Model Training time: ' + str(processing_time))

Learning Rate: 0.0001
Batch : 

AttributeError: 'tuple' object has no attribute 'size'

In [17]:
p, r, f, roc_auc = eval_model(notes_rnn, notes_orig_val_loader)
print(p, r, f, roc_auc)

0.7209302325581395 0.14418604651162792 0.24031007751937988 0.7779495016611296
