Install the Python package dependencies. This only needs to be run once for each new notebook instance.

!pip install PyAthena

!pip install --upgrade gensim

Import Libraries

In [5]:
from __future__ import print_function
import numpy as np
import string
# import nltk
#from nltk import word_tokenize
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import os
import boto3
from botocore.client import ClientError
# below is used to print out pretty pandas dataframes
#from IPython.display import display, HTML

from pyathena import connect
from pyathena.pandas.util import as_pandas
import torch
import torch.nn as nn
import time
#nltk.download('punkt')

Set the compute device: use CUDA when it is available, otherwise fall back to the CPU:

In [307]:
# Use the GPU when PyTorch reports CUDA as available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

The following code sets up a connection to Amazon Athena so that we can query the Parquet files with SQL and load the results into pandas.

In [6]:

# Build the name of the Athena query-results bucket from the caller's account id and region.
s3 = boto3.resource('s3')
client = boto3.client("sts")
account_id = client.get_caller_identity()["Account"]
my_session = boto3.session.Session()
region = my_session.region_name
athena_query_results_bucket = 'aws-athena-query-results-'+account_id+'-'+region

try:
    # head_bucket raises ClientError when the bucket does not exist or cannot be accessed.
    s3.meta.client.head_bucket(Bucket=athena_query_results_bucket)
except ClientError as err:
    # Only a missing bucket should trigger creation; other failures (e.g. 403) are
    # re-raised instead of being hidden behind a create attempt.
    if err.response.get('Error', {}).get('Code') not in ('404', 'NoSuchBucket', 'NotFound'):
        raise
    if region == 'us-east-1':
        # us-east-1 is the default location and must not be passed as a LocationConstraint.
        bucket = s3.create_bucket(Bucket=athena_query_results_bucket)
    else:
        # Every other region has to be named explicitly or S3 rejects the request.
        bucket = s3.create_bucket(
            Bucket=athena_query_results_bucket,
            CreateBucketConfiguration={'LocationConstraint': region},
        )
    print('Creating bucket '+athena_query_results_bucket)
# Athena writes query results under this staging prefix; the cursor is reused by the queries below.
cursor = connect(s3_staging_dir='s3://'+athena_query_results_bucket+'/athena/temp').cursor()


# The Glue database name of your MIMIC-III parquet data
#gluedatabase="mimiciii"

We have already pre-processed our cohort using SQL queries. The following query fetches the list of cohort patients; we will then process the notes for each of these patients.

# Fetch subject_id and mortality_flag for the first 1000 cohort patients, ordered by subject_id.
query = (
    'select cohort.subject_id, cohort.mortality_flag '
    'from default.diabetic_patients_cohort cohort '
    'order by cohort.subject_id limit 1000'
)
cursor.execute(query)
cohort_patients_df = as_pandas(cursor)

cohort_patients_df

Not all patients have the same number of visit dates, so we need to find the maximum number of visit dates across all patients.

In [7]:
# Count the note dates per patient and keep the largest count.
query = (
    'select max(number_dates) from ('
    'select nts.subject_id, count(nts.chart_date) as number_dates '
    'from default.diabetic_patients_notes_agg nts group by nts.subject_id)'
)
cursor.execute(query)
# The result is a 1x1 frame; pull out its only cell.
patients_max_visits = as_pandas(cursor).values[0][0]
print(patients_max_visits)

505


The following function pre-processes the notes: it removes numbers and punctuation, lower-cases the text, and tokenizes it into words.

def preprocess_dataset(df):
    '''Clean and tokenize the aggregated clinical notes of a dataframe.

    Each value of ``df.notes_agg`` is cleaned as follows: missing values become
    a blank string, newlines and carriage returns become spaces, ASCII
    punctuation and digits are deleted, and the text is lower-cased. The
    cleaned text is then split on whitespace.

    Whitespace splitting replaces the former ``word_tokenize`` call, whose
    import is commented out in this notebook and therefore raised NameError.

    Args:
        df: dataframe with a ``notes_agg`` text column. It is not modified.

    Returns:
        A list with one entry per row of ``df``; each entry is the list of
        tokens of that row's note (empty for a missing or blank note).
    '''
    # One translation table deletes punctuation and digits in a single pass.
    drop_chars = str.maketrans('', '', string.punctuation + string.digits)

    cleaned_notes = (
        df.notes_agg
        .fillna(' ')                           # remove NA
        .str.replace('\n', ' ', regex=False)   # literal replace, independent of the pandas regex default
        .str.replace('\r', ' ', regex=False)
        .str.translate(drop_chars)
        .str.lower()
    )

    # tokenize
    return [note.split() for note in cleaned_notes.values]

To process our notes, we first need to load our pre-trained word2vec model:

from gensim.models import Word2Vec
from gensim.models import KeyedVectors


#pubMedWord2VecModel = KeyedVectors.load_word2vec_format('PubMed-w2v.bin', binary=True)
# Load the keyed word vectors stored in the local file 'note_vectors.kv';
# the note-processing loop looks words up through word2vec_model.get_vector.
word2vec_model = KeyedVectors.load('note_vectors.kv')

#word2vec_model.get_vector('and')

Now we can start processing the notes for each patient:

t0 = time.time()

# Size of one word vector; each note is represented by the sum of its word vectors.
emb_size = 200

# One (patients_max_visits x emb_size) slab per patient; dates a patient does not have stay all-zero.
patients_notes_fetures = torch.zeros(len(cohort_patients_df), patients_max_visits, emb_size, dtype = torch.float)
# Index of the last note date that was filled in for each patient.
patients_notes_fetures_mask = torch.zeros(len(cohort_patients_df), 1, dtype = torch.long)
missing_words = 0
patient_subject_id = []

# Notes of the same 1000 cohort patients, ordered by patient and then by chart date.
query = ('select nts.subject_id, nts.chart_date, nts.notes_agg from default.diabetic_patients_notes_agg nts '
         'join (select cohort.subject_id, cohort.mortality_flag from default.diabetic_patients_cohort cohort order by cohort.subject_id limit 1000) cohort '
         'on nts.subject_id = cohort.subject_id '
         'order by nts.subject_id asc, nts.chart_date asc;')
cursor.execute(query)
patients_date_notes = as_pandas(cursor)

for patient_idx, patient in enumerate(cohort_patients_df.subject_id.values):
    patient_subject_id.append(patient)
    # Tokenized notes of this patient, one token list per chart date.
    patient_date_notes_list = preprocess_dataset(patients_date_notes[patients_date_notes.subject_id == patient].copy())
    for date_idx, note in enumerate(patient_date_notes_list):
        patient_date_note = torch.zeros(emb_size)
        for note_word in note:
            try:
                patient_date_note = patient_date_note + torch.FloatTensor(word2vec_model.get_vector(note_word))
            except KeyError:
                # The word is not in the word2vec vocabulary: count it and move on.
                # Any other error now surfaces instead of being counted as a missing word.
                missing_words = missing_words + 1
        patients_notes_fetures[patient_idx][date_idx][:] = patient_date_note
        # Overwritten on every date, so it ends up holding the last date index.
        patients_notes_fetures_mask[patient_idx][0] = date_idx
t1 = time.time()
total = t1-t0

print(total)

# Size of the dense feature tensor: bytes per element times element count, scaled by 1024 three times.
features_num_bytes = patients_notes_fetures.element_size() * patients_notes_fetures.nelement()
print(f"{features_num_bytes/1024/1024/1024} GB")


# Number of word occurrences that had no vector in the word2vec model.
print(missing_words)

print(patients_notes_fetures.shape)
print(patients_notes_fetures.shape[0])

#print(patients_notes_fetures[1][0])
#print(patients_notes_fetures_mask[0][4])


# Mortality flags of the cohort as a float tensor.
mortality_flags = cohort_patients_df.mortality_flag.values
patients_mortality = torch.FloatTensor(mortality_flags)

First we will need a function to load the pre-processed train and test datasets:

In [37]:
def load_notes_dataset_object(prefix = ''):
    """Load the six ``.npy`` files of one saved dataset split.

    Args:
        prefix: string prepended to every file name (e.g. ``'train_'``).

    Returns:
        A 6-tuple in this order: subject ids (converted to a Python list),
        note features, index_0, index_1, last note dates and mortality
        (the last five as loaded by ``np.load``).
    """
    file_names = (
        'subject_id.npy',
        'patients_notes_fetures.npy',
        'index_0.npy',
        'index_1.npy',
        'patients_notes_last_date.npy',
        'patient_mortality.npy',
    )
    # NOTE(review): allow_pickle=True can run code embedded in a file; only load trusted files.
    subject_ids, *other_arrays = (
        np.load(prefix + file_name, allow_pickle=True) for file_name in file_names
    )
    return (subject_ids.tolist(), *other_arrays)

We now load the saved objects for the train and test datasets and create the sparse notes-features tensor for each:

In [262]:
# Load the saved 'train_' arrays, then rebuild the feature tensor and convert the targets to torch.
train_arrays = load_notes_dataset_object(prefix = 'train_')
(train_subject_id, train_patients_notes_fetures, train_index_0,
 train_index_1, train_patients_notes_last_date, train_patient_mortality) = train_arrays
train_index = [train_index_0, train_index_1]
# Sparse COO tensor of size (number of patients, patients_max_visits, 200).
train_patients_notes_fetures = torch.sparse_coo_tensor(
    train_index,
    train_patients_notes_fetures,
    (len(train_subject_id), patients_max_visits, 200),
    dtype = torch.float,
)
train_patients_notes_last_date = torch.from_numpy(train_patients_notes_last_date).long()
train_patient_mortality = torch.from_numpy(train_patient_mortality).float()

In [263]:
# Load the saved 'test_' arrays, then rebuild the feature tensor and convert the targets to torch.
test_arrays = load_notes_dataset_object(prefix = 'test_')
(test_subject_id, test_patients_notes_fetures, test_index_0,
 test_index_1, test_patients_notes_last_date, test_patient_mortality) = test_arrays
test_index = [test_index_0, test_index_1]
# Sparse COO tensor of size (number of patients, patients_max_visits, 200).
test_patients_notes_fetures = torch.sparse_coo_tensor(
    test_index,
    test_patients_notes_fetures,
    (len(test_subject_id), patients_max_visits, 200),
    dtype = torch.float,
)
test_patients_notes_last_date = torch.from_numpy(test_patients_notes_last_date).long()
test_patient_mortality = torch.from_numpy(test_patient_mortality).float()

In [211]:
#print(test_patients_notes_last_date[0:50])
# Show the last-note-date tensor of the training split.
print(train_patients_notes_last_date)
#.expand(-1,200)
#print(last_visit)

tensor([[ 5],
        [16],
        [10],
        ...,
        [10],
        [50],
        [ 6]])


Now we are going to create a custom notes dataset to then partition the data in batches:

In [212]:
from torch.utils.data import Dataset

class NotesDataset(Dataset):
    """Per-patient dataset of note features, last-date index and mortality label.

    Args:
        patient_id: indexable sequence of patient identifiers, one per patient.
        patients_notes: indexable object whose items support ``.to_dense()``
            (the notebook passes a sparse COO tensor with one slice per patient).
        last_date_idx: indexable object holding the last note-date index of each patient.
        mortality: indexable object holding the target label of each patient.
    """

    def __init__(self, patient_id, patients_notes, last_date_idx, mortality):

        self.patient_id = patient_id
        self.x = patients_notes
        self.mask = last_date_idx
        self.y = mortality


    def __len__(self):
        """Return the number of patients (length of the notes container)."""

        return len(self.x)

    def __getitem__(self, index):
        """Return ``(patient id, dense note features, last-date index, label)`` for one patient."""

        # Return only this patient's id. Returning the whole id list made every
        # sample carry (and the DataLoader collate) all ids of the dataset.
        return (self.patient_id[index], self.x[index].to_dense(), self.mask[index], self.y[index])

In [264]:
# Wrap the training split in the dataset class.
notes_train_dataset = NotesDataset(
    patient_id=train_subject_id,
    patients_notes=train_patients_notes_fetures,
    last_date_idx=train_patients_notes_last_date,
    mortality=train_patient_mortality,
)

In [265]:
# Wrap the test split in the dataset class; it is used as the validation set.
notes_val_dataset = NotesDataset(
    patient_id=test_subject_id,
    patients_notes=test_patients_notes_fetures,
    last_date_idx=test_patients_notes_last_date,
    mortality=test_patient_mortality,
)

In [266]:
# Report how many patients each dataset holds.
print(f"Length of train dataset: {len(notes_train_dataset)}")
print(f"Length of val dataset: {len(notes_val_dataset)}")

Length of train dataset: 13790
Length of val dataset: 1965


from torch.utils.data import DataLoader

from torch.utils.data.dataset import random_split
torch.manual_seed(230729)

# The train and validation datasets are built from the separately saved
# 'train_' and 'test_' files, so no split is performed in this cell.
# The earlier 80/20 random split is kept below for reference only: it used
# `notes_Dataset`, which is not defined in the visible notebook, so running it
# raised NameError and would have overwritten the datasets created earlier.
#split = int(len(notes_Dataset)*0.8)
#lengths = [split, len(notes_Dataset) - split]
#notes_train_dataset, notes_val_dataset = random_split(notes_Dataset, lengths)


In [267]:
from torch.utils.data import DataLoader

# Number of patients per mini-batch; reused by every loader in this notebook.
batch_size = 50
# NOTE(review): no shuffle argument is passed, so batches follow dataset order — confirm this is intended.
notes_train_loader = DataLoader(dataset=notes_train_dataset, batch_size=batch_size)


In [268]:
# Validation batches use the same batch size as training.
notes_val_loader = DataLoader(dataset=notes_val_dataset, batch_size=batch_size)

In [269]:
# Number of batches in the training and validation loaders, one per line.
print(len(notes_train_loader), len(notes_val_loader), sep='\n')

276
40


Since the number of note dates is not the same for each patient, we need the hidden state at the last note date of each patient. The following function selects it using the stored last-date index:

In [278]:
def get_last_note_date(hidden_states, masks):
    """Pick, for every batch row, the hidden state at the position given in ``masks``.

    ``masks`` is expanded along its second dimension to the size of the last
    dimension of ``hidden_states`` and used as the index of a gather along
    dimension 1.

    Args:
        hidden_states: 3-D tensor; dimension 1 is the one being indexed.
        masks: integer tensor of positions, expandable to
            ``(rows, hidden_states.shape[2])``.

    Returns:
        The gathered tensor with dimension 1 removed (its last slice is taken).
    """
    hidden_dim = hidden_states.shape[2]
    # Repeat each position across the hidden dimension, then add the gather axis.
    gather_index = masks.expand(-1, hidden_dim).unsqueeze(1)
    selected = torch.gather(hidden_states, 1, gather_index)
    return selected[:, -1, :]

In [271]:
def conv_output_volume(W, K, S, P):
    """Return ``((W - K + 2*P) // S) + 1``.

    NOTE(review): this matches the usual convolution output-size formula with
    W = input size, K = kernel size, S = stride, P = padding — confirm the
    intended meaning of the arguments against the callers.
    """
    padded_span = W - K + 2 * P
    steps = padded_span // S
    return steps + 1

Now we can proceed to create our RNN:

In [272]:
class NotesRNN(nn.Module):
    """GRU over the note sequence followed by a two-layer sigmoid classifier.

    Args:
        notes_emb_size: size of one note embedding; also used as the GRU hidden size.
    """

    def __init__(self, notes_emb_size):
        super().__init__()

        self.emb_size = notes_emb_size
        self.RNN = nn.GRU(notes_emb_size, notes_emb_size, batch_first = True)
        self.fc1 = nn.Linear(notes_emb_size, notes_emb_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout()
        self.fc2 = nn.Linear(notes_emb_size, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x, masks, step):
        """Return one probability per batch row.

        Args:
            x: input passed to the GRU (batch first).
            masks: positions passed to ``get_last_note_date`` to choose the hidden state.
            step: accepted for interface compatibility; not used here.
        """
        # nn.GRU returns (all hidden states, final hidden state); only the first is needed.
        hidden_states, _ = self.RNN(x)
        last_hidden = get_last_note_date(hidden_states, masks)
        activated = self.relu(self.fc1(last_hidden))
        logits = self.fc2(self.dropout(activated))
        return self.sig(logits).flatten()

In [273]:
# Fresh model, moved to the GPU when at least one CUDA device is present.
notes_rnn = NotesRNN(200)
has_gpu = torch.cuda.device_count() > 0
if has_gpu:
    notes_rnn.cuda()
# Binary cross-entropy on the sigmoid outputs, optimized with Adam.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(notes_rnn.parameters(), lr=1e-4)

In [304]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score


def eval_model(model, val_loader):
    """Evaluate ``model`` on ``val_loader`` and return classification metrics.

    The model is switched to eval mode and run without gradient tracking.
    Inputs are moved to the module-level ``device``; predictions are moved
    back to the CPU before being converted to NumPy.

    Args:
        model: module called as ``model(x, masks, 0)`` that returns one probability per sample.
        val_loader: iterable of ``(subject_id, x, masks, labels)`` batches.

    Returns:
        ``(precision, recall, f1, roc_auc)``; the first three use a 0.5
        threshold on the predicted probabilities with binary averaging.
    """
    model.eval()
    val_labels = []
    val_probs = []

    with torch.no_grad():
        for batch in val_loader:
            _, x, masks, labels = batch
            x = x.to(device, non_blocking=True)
            masks = masks.to(device, non_blocking=True)

            probs = model(x, masks, 0)
            # .cpu() is required before .numpy(): calling .numpy() on a CUDA tensor raises TypeError.
            val_labels.extend(labels.detach().cpu().numpy().tolist())
            val_probs.extend(probs.detach().cpu().numpy().reshape(-1).tolist())

    precision, recall, f1, _ = precision_recall_fscore_support(val_labels, np.array(val_probs)>0.5, average='binary' )
    roc_auc = roc_auc_score(val_labels, val_probs)

    return precision, recall, f1, roc_auc

In [305]:
def train(model, train_loader, val_loader, n_epochs):
    """Train ``model`` for ``n_epochs`` passes over ``train_loader``.

    Relies on the module-level ``optimizer``, ``criterion`` and ``device``.
    Prints a progress marker every 10 batches and the mean batch loss after
    each epoch.

    Args:
        model: module called as ``model(x, masks, step)``.
        train_loader: iterable of ``(subject_id, x, masks, labels)`` batches.
        val_loader: accepted for interface compatibility; not used here.
        n_epochs: number of passes over ``train_loader``.

    Returns:
        The same ``model`` object after training.
    """
    model.train() # prep model for training

    for epoch in range(n_epochs):
        batch_losses = []
        print('Batch :', end = ' ')
        for step, (subject_id, x, masks, labels) in enumerate(train_loader):
            # Progress marker on every 10th batch, skipping batch 0.
            if step > 0 and step % 10 == 0:
                print(f"{step},", end=' ')

            # Push the tensors to the compute device.
            x = x.to(device, non_blocking=True)
            masks = masks.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            # 1. clear gradients, 2. forward pass, 3. loss, 4. backward pass, 5. parameter update
            optimizer.zero_grad()
            probs = model(x, masks, step)
            loss = criterion(probs, labels)
            loss.backward()
            optimizer.step()

            # 6. record the batch loss as a NumPy scalar array
            batch_losses.append(loss.cpu().data.numpy())

        print(f"Epoch {epoch}: curr_epoch_loss={np.mean(batch_losses)}")
    return model



In [279]:
# number of epochs to train the model
t0 = time.time()
n_epochs = 5
train(notes_rnn, notes_train_loader, notes_val_loader, n_epochs)
t1 = time.time()
processing_time = t1-t0
print('Model Training time: ' + str(processing_time))

Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, Epoch 0: curr_epoch_loss=0.4811288118362427
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, Epoch 1: curr_epoch_loss=0.6054174304008484
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, Epoch 2: curr_epoch_loss=0.5302814245223999
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, Epoch 3: curr_epoch_loss=0.49054014682769775
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, Epoch 4: curr_epoch_loss=0.45929083228111267
Model Training time: 1301.986031293869


In [280]:
# Precision, recall, F1 and ROC AUC on the validation loader.
val_metrics = eval_model(notes_rnn, notes_val_loader)
p, r, f, roc_auc = val_metrics
print(*val_metrics)

0.1323529411764706 1.0 0.23376623376623376 0.8351219837355018


Trying with original dataset

In [281]:
# Load the saved 'orig_' arrays, then rebuild the feature tensor and convert the targets to torch.
orig_arrays = load_notes_dataset_object(prefix = 'orig_')
(orig_subject_id, orig_patients_notes_fetures, orig_index_0,
 orig_index_1, orig_patients_notes_last_date, orig_patient_mortality) = orig_arrays
orig_index = [orig_index_0, orig_index_1]
# Sparse COO tensor of size (number of patients, patients_max_visits, 200).
orig_patients_notes_fetures = torch.sparse_coo_tensor(
    orig_index,
    orig_patients_notes_fetures,
    (len(orig_subject_id), patients_max_visits, 200),
    dtype = torch.float,
)
orig_patients_notes_last_date = torch.from_numpy(orig_patients_notes_last_date).long()
orig_patient_mortality = torch.from_numpy(orig_patient_mortality).float()

In [282]:
# Wrap the original (un-split) data in the dataset class.
notes_orig_dataset = NotesDataset(
    patient_id=orig_subject_id,
    patients_notes=orig_patients_notes_fetures,
    last_date_idx=orig_patients_notes_last_date,
    mortality=orig_patient_mortality,
)

In [283]:

from torch.utils.data.dataset import random_split

# Fixed seed so the random 80/20 split is reproducible.
torch.manual_seed(230729)
n_orig_samples = len(notes_orig_dataset)
split = int(n_orig_samples*0.8)
lengths = [split, n_orig_samples - split]

notes_orig_train_dataset, notes_orig_val_dataset = random_split(notes_orig_dataset, lengths)

In [293]:
# Batch the two parts of the split with the shared batch size.
notes_orig_train_loader = DataLoader(dataset=notes_orig_train_dataset, batch_size=batch_size)
notes_orig_val_loader = DataLoader(dataset=notes_orig_val_dataset, batch_size=batch_size)

In [308]:
# Re-create the model, loss and optimizer so training on the original data starts from scratch.
notes_rnn = NotesRNN(200)
has_gpu = torch.cuda.device_count() > 0
if has_gpu:
    notes_rnn.cuda()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(notes_rnn.parameters(), lr=1e-4)

In [309]:
# number of epochs to train the model
t0 = time.time()
n_epochs = 5
train(notes_rnn, notes_orig_train_loader, notes_orig_val_loader, n_epochs)
t1 = time.time()
processing_time = t1-t0
print('Model Training time: ' + str(processing_time))

Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 0: curr_epoch_loss=0.4201892912387848
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 1: curr_epoch_loss=0.3786067068576813
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 2: curr_epoch_loss=0.36374491453170776
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 3: curr_epoch_loss=0.34486886858940125
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 4: curr_epoch_loss=0.32579171657562256
Model Training time: 688.6786022186279


In [310]:
# Precision, recall, F1 and ROC AUC on the validation part of the original-data split.
orig_val_metrics = eval_model(notes_rnn, notes_orig_val_loader)
p, r, f, roc_auc = orig_val_metrics
print(*orig_val_metrics)

0.4090909090909091 0.04186046511627907 0.0759493670886076 0.6922245847176081


In [299]:
class NotesRNN(nn.Module):
    """GRU whose per-date hidden states are summed and fed to a linear "attention" layer.

    This definition reuses the name ``NotesRNN`` and therefore replaces the
    earlier class of the same name for all code that runs after it.

    Args:
        notes_emb_size: size of one note embedding; also used as the GRU hidden size.
        max_visits: length of the date dimension of the input. The linear layers
            are sized by it, so inputs must have exactly this many dates.
            Defaults to 505, the value that was previously hard-coded.
    """

    def __init__(self, notes_emb_size, max_visits = 505):
        super().__init__()

        self.emb_size = notes_emb_size
        self.max_visits = max_visits
        self.RNN = nn.GRU(input_size = notes_emb_size, hidden_size = notes_emb_size, batch_first = True)
        self.attention = nn.Linear(max_visits, max_visits)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout()
        self.fc2 = nn.Linear(max_visits, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x, masks, step):
        """Return one probability per batch row.

        Args:
            x: input passed to the GRU (batch first); its dimension 1 must equal ``max_visits``.
            masks: accepted for interface compatibility; not used here.
            step: accepted for interface compatibility; not used here.
        """
        # nn.GRU returns (all hidden states, final hidden state); only the first is needed.
        hidden_states, _ = self.RNN(x)
        # Collapse the hidden dimension so each date contributes a single value.
        sum_hidden_states = hidden_states.sum(dim = 2)
        attention_out = self.attention(sum_hidden_states)
        attention_out = self.relu(attention_out)
        dp_out = self.dropout(attention_out)
        fc2_out = self.fc2(dp_out)
        out = self.sig(fc2_out).flatten()

        return out

In [300]:
# Instantiate the most recently defined NotesRNN, with a fresh loss and optimizer.
notes_rnn = NotesRNN(200)
has_gpu = torch.cuda.device_count() > 0
if has_gpu:
    notes_rnn.cuda()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(notes_rnn.parameters(), lr=1e-4)

In [301]:
# number of epochs to train the model
t0 = time.time()
n_epochs = 5
train(notes_rnn, notes_orig_train_loader, notes_orig_val_loader, n_epochs)
t1 = time.time()
processing_time = t1-t0
print('Model Training time: ' + str(processing_time))

Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 0: curr_epoch_loss=0.42356574535369873
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 1: curr_epoch_loss=0.37926116585731506
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 2: curr_epoch_loss=0.3613307774066925
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 3: curr_epoch_loss=0.3446982204914093
Batch : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, Epoch 4: curr_epoch_loss=0.3264455199241638
Model Training time: 713.2944731712341


In [302]:
# Precision, recall, F1 and ROC AUC of the new model on the original-data validation part.
attention_val_metrics = eval_model(notes_rnn, notes_orig_val_loader)
p, r, f, roc_auc = attention_val_metrics
print(*attention_val_metrics)

0.32142857142857145 0.04186046511627907 0.07407407407407408 0.6954803986710963
