00. Environment setup

In [2]:
# Notebook environment setup
%pip install google-cloud-bigquery
%pip install google-cloud
%pip install google-cloud-vision
%pip install tqdm
%pip install numpy
%pip install tensorflow
%pip install matplotlib
%pip install gensim
%pip install pandas
%pip install sklearn

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/pytho

In [135]:
# Import packages
%load_ext autoreload
%autoreload 2
import numpy as np
import gdown
import numpy as np
import pandas as pd
import gensim
import torch
import os
from google.cloud import bigquery
from google.oauth2 import service_account
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.metrics import precision_score, recall_score, f1_score
from model_utils import train_model, score_model, evaluate
from model_utils import ArrayDataset, D2VDataset

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


---
# 01. Fetch Discharge Summaries

In [114]:
# Fetch MIMIC-III Discharge Summaries from BigQuery
credentials = service_account.Credentials.from_service_account_file('/users/fahimkhan/Downloads/endless-upgrade-415215-8ffb43b4399b.json')

project_id = 'endless-upgrade-415215'
client = bigquery.Client(credentials= credentials,project=project_id)

# Query all discharge summaries from the note events table
train_query = client.query("""
  SELECT t1.HADM_ID, STRING_AGG(CAST(t1.TEXT AS STRING), ' ') AS concatenated_text
  FROM `endless-upgrade-415215.mimic3_notes.noteevents` AS t1
  JOIN (
      SELECT DISTINCT HADM_ID
      FROM `endless-upgrade-415215.mimic3_notes.noteevents`
  ) AS t2
  ON t1.HADM_ID = t2.HADM_ID
  WHERE t1.CATEGORY = "Discharge summary"
  GROUP BY t1.HADM_ID
 """)

# Run query
train_ds = train_query.result()

In [115]:
print(train_ds.total_rows)

52726


Fetch Labels

In [126]:
# Load labels
labels = np.load('data/finallabels.npz')['arr_0']
total_patients = labels.shape[0]
labels_tensor = torch.FloatTensor(labels)

print(labels[1])

[0. 0. 0. ... 0. 0. 0.]


---
# 02. Doc2Vector 

Tokenize Discharge Summaries ~5mins

In [116]:
def read_corpus(results, tokens_only=False):
    i = 0
    for row in results:
        tokens = gensim.utils.simple_preprocess(row[1])
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
        i+=1

train_corpus = list(read_corpus(train_ds))

Train Doc2Vec Model

Avg Time ~ 2.5 mins

In [118]:
d2v_vector_size = 128
d2v_model = gensim.models.doc2vec.Doc2Vec(vector_size=d2v_vector_size, 
                                          min_count=2, 
                                          epochs=5,
                                          dbow_words=1)
d2v_model.build_vocab(train_corpus)
d2v_model.train(train_corpus, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

Create DataLoaders

In [136]:
doc_vectors = np.array([d2v_model.dv[i] for i in range(len(d2v_model.dv))])

# Calculate the number of samples for each set
percentage = 0.1
num_samples = int(np.floor(percentage * len(doc_vectors)))
train_samples = int(np.floor(0.7 * num_samples))
valid_samples = int(np.floor(0.15 * num_samples))  # Assuming 50% of the leftovers for validation
test_samples = num_samples - train_samples - valid_samples

# Slice the arrays to create train/test sets
train_data = doc_vectors[:train_samples]
train_labels = labels[:train_samples]

valid_data = doc_vectors[train_samples:train_samples + valid_samples]
valid_labels = labels[train_samples:train_samples + valid_samples]

test_data = doc_vectors[train_samples + valid_samples:train_samples + valid_samples + test_samples]
test_labels = labels[train_samples + valid_samples:train_samples + valid_samples + test_samples]

# Create ArrayDataset objects for train, validation, and test sets
train_dataset = D2VDataset(train_data, train_labels)
valid_dataset = D2VDataset(valid_data, valid_labels)
test_dataset = D2VDataset(test_data, test_labels)

# Create DataLoader objects for train, validation, and test sets
batch_size = 32  # Choose your desired batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


Fully Connected Neural Network to extract D2V features

In [137]:
class FCNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.9):
        super(FCNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.dropout = nn.Dropout(p=1 - dropout_rate)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

Training FCNN using D2V output

Avg Time ~ 0.5 mins

In [129]:
# Define input size and output size
input_size = 128
hidden_size = 64
output_size = 6918

# Number of epochs
num_epochs = 5

# Create an instance of the model
fcnn_model = FCNN(input_size=input_size, hidden_size=hidden_size, output_size=output_size)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(fcnn_model.parameters(), lr=0.001)

d2v_outputs = []

# Training loop
for epoch in range(num_epochs):
    #for i in range(len(d2v_model.dv)):
    for i in range(5722):
        input_data = torch.tensor(d2v_model.dv[i], dtype=torch.float)
        
        # Forward pass
        output = fcnn_model(input_data)

        # Compute the loss
        loss = criterion(output, labels_tensor[i])
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Backward pass
        loss.backward()
        
        # Update the weights
        optimizer.step()
    
    # Print loss every epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [1/5], Loss: 0.6932
Epoch [2/5], Loss: 0.6932
Epoch [3/5], Loss: 0.6931
Epoch [4/5], Loss: 0.6931
Epoch [5/5], Loss: 0.6931


Train Classifier

In [142]:
num_epochs = 1
# 10% data, 5 Epochs ~ 8.5 mins

# Define input size and output size
input_size = 128
hidden_size = 64
output_size = 6918

# Number of epochs
num_epochs = 50

# Create an instance of the model
fcnn_model = FCNN(input_size=input_size, hidden_size=hidden_size, output_size=output_size)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(fcnn_model.parameters(), lr=0.001)

tlosses, tprecs, trecs, tf1s = [],[],[], []
vlosses, vprecs, vrecs, vf1s = [],[],[], []

best_val_f1 = 0.0

for epoch in range(num_epochs):
  tloss, tprec, trec, tf1 = train_model(fcnn_model, train_loader, criterion, optimizer)
  vloss, vprec, vrec, vf1 = evaluate(fcnn_model, test_loader, criterion)

  tlosses.append(tloss)
  tprecs.append(tprec)
  trecs.append(trec)
  tf1s.append(tf1)

  vlosses.append(vloss)
  vrecs.append(vprec)
  vrecs.append(vrec)
  vf1s.append(vf1)

  is_best = vf1 > best_val_f1
  best_val_f1 = best_val_f1*(not is_best) + vf1*is_best

  # if is_best:
  #   best_val_f1 = vf1
    # torch.save(model, "myCNN.pth", _use_new_zipfile_serialization=False)

  print(f'Epoch {epoch+1}, Loss: {tlosses[-1]}, Valid F1: {vf1s[-1]}')

Epoch 1, Loss: 0.16248567018593693, Valid F1: 0.06870397123992839
Epoch 2, Loss: 0.018996160686144542, Valid F1: 0.0537811714610336
Epoch 3, Loss: 0.018570418477636474, Valid F1: 0.09648207950916379
Epoch 4, Loss: 0.018369201875837713, Valid F1: 0.08705806341553426
Epoch 5, Loss: 0.018319569327386803, Valid F1: 0.08166270409791393
Epoch 6, Loss: 0.018250893592706013, Valid F1: 0.06897344098675019
Epoch 7, Loss: 0.01819218653414784, Valid F1: 0.06703061056673174
Epoch 8, Loss: 0.018091250387630587, Valid F1: 0.07779081127628033
Epoch 9, Loss: 0.01810520860077492, Valid F1: 0.06930783023847704
Epoch 10, Loss: 0.018123624878453797, Valid F1: 0.08346485868002165
Epoch 11, Loss: 0.01804174426621918, Valid F1: 0.07310014459843396
Epoch 12, Loss: 0.01799318265844265, Valid F1: 0.0736259347800419
Epoch 13, Loss: 0.01796421450402202, Valid F1: 0.08484381416438888
Epoch 14, Loss: 0.01794897755286817, Valid F1: 0.06548282429244313
Epoch 15, Loss: 0.01792500033203898, Valid F1: 0.08280347827798831

In [143]:
fcnn_model.eval()  # Setting the model to evaluation mode

all_true_labels = []
all_outputs = []

with torch.no_grad():  # No need to track gradients during evaluation
    for inputs, labels in test_loader:
        outputs = fcnn_model(inputs)
        # print(inputs.shape, labels.shape, outputs.shape)
        all_outputs.extend(outputs.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())

In [144]:
thresh_ls = [0.005, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
for i in thresh_ls:
  print('threshold:', i,
        *["{}: {}".format(name, num) for name, num in zip(
          ["precision", "recall","f1"],
          score_model(all_true_labels, all_outputs, thresh=i))])

threshold: 0.005 precision: 0.25390851187028796 recall: 0.05282814288295855 f1: 0.08745948641236513
threshold: 0.05 precision: 0.2570220477197144 recall: 0.05124036608863168 f1: 0.08544605652894136
threshold: 0.1 precision: 0.25826972010177296 recall: 0.048856799037304166 f1: 0.08216960129528354
threshold: 0.2 precision: 0.26446860750612894 recall: 0.04529888855512139 f1: 0.07734919983586298
threshold: 0.3 precision: 0.26939405634888963 recall: 0.041881675267010435 f1: 0.0724931193851579
threshold: 0.4 precision: 0.27595044852625905 recall: 0.038712770420087264 f1: 0.06789993693504238
threshold: 0.5 precision: 0.2791353383458515 recall: 0.03554119547657491 f1: 0.06305397802664335


---
# 03. **CNN Part**

Load Weights

In [10]:
# Load Weights
data = np.load('data/w2v_stack3.npz')

#Change to (Batch, Feats/Channels, Sequences)
data = torch.FloatTensor(data['arr_0']).permute(2,1,0)
weights = np.load('data/loss_weights.npz')['arr_0']

Load Labels

In [11]:
# Load labels
labels = np.load('data/finallabels.npz')['arr_0']
total_patients = labels.shape[0]
labels = torch.FloatTensor(labels)

CNN Model

In [12]:
VECTOR_FEATURE_SIZE = 100
NUM_LABELS = labels.shape[1]
MAX_LENGTH = 700

class MyModel(nn.Module):
    def __init__(self, embedding_matrix=None):
        super(MyModel, self).__init__()

        self.embedding_matrix = embedding_matrix

        if self.embedding_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)

        input_length = 100

        # Define convolutional layers for 3 different kernel sizes
        k1, k2, k3 = 3,4,5
        self.conv1 = nn.Conv1d(in_channels=VECTOR_FEATURE_SIZE, out_channels=64, kernel_size=k1)
        self.conv2 = nn.Conv1d(in_channels=VECTOR_FEATURE_SIZE, out_channels=64, kernel_size=k2)
        self.conv3 = nn.Conv1d(in_channels=VECTOR_FEATURE_SIZE, out_channels=64, kernel_size=k3)

        # Max Pooling
        self.global_max_pool = nn.AdaptiveMaxPool1d(output_size=1)

        # Define a dropout layer to prevent overfitting
        self.dropout = nn.Dropout(0.75)

        # Fully connected layer
        self.fc = nn.Linear(64 * 3, NUM_LABELS)

        self.activationR = nn.ReLU()
        self.activationS = nn.Sigmoid() #Actually not using this because the loss function handles it

    def forward(self, x):

        if self.embedding_matrix is not None:
            # Convert token IDs to embeddings
            x = self.embedding(x)

        x = x.float()

        # Apply convolution
        x1 = F.pad(x,(1,1))
        x1 = self.conv1(x)
        x1 = self.activationR(x1)
        x2 = F.pad(x,(1,2))
        x2 = self.conv2(x)
        x2 = self.activationR(x2)
        x3 = F.pad(x,(2,2))
        x3 = self.conv3(x)
        x3 = self.activationR(x3)

        # Apply global max pooling to each convolutional layer's output
        x1 = self.global_max_pool(x1).squeeze(2)
        x2 = self.global_max_pool(x2).squeeze(2)
        x3 = self.global_max_pool(x3).squeeze(2)

        # Concatenate the pooled features from the three convolutional layers
        x = torch.cat((x1, x2, x3), 1)

        # Apply dropout
        x = self.dropout(x)

        # Pass through the fully connected layer
        x = self.fc(x)
        return x

Train 10%

In [13]:
percentage = 0.10
num_samples = int(np.floor(percentage * total_patients))
train_samples = int(np.floor(0.7 * num_samples))
leftovers = num_samples - train_samples
valid_samples = train_samples + int(np.floor(0.5*leftovers))
start_test_samples = num_samples - valid_samples

# Initialize Dataset
train_dataset = ArrayDataset(data[:train_samples, :, :],
                              labels[:train_samples])
valid_dataset = ArrayDataset(data[train_samples:valid_samples, :, :],
                            labels[train_samples:valid_samples])
test_dataset = ArrayDataset(data[start_test_samples:num_samples, :, :],
                              labels[start_test_samples:num_samples])

# Initialize DataLoader
train_loader = DataLoader(train_dataset, batch_size=50, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=50, shuffle=True)

Train CNN part model

In [14]:
num_epochs = 5
# 10% data, 5 Epochs ~ 8.5 mins

model = MyModel()
weights_tensor = torch.tensor(weights, dtype=torch.float)
criterion = nn.BCEWithLogitsLoss(pos_weight=weights_tensor)
optimizer = optim.Adam(model.parameters(), lr=0.00001)

tlosses, tprecs, trecs, tf1s = [],[],[], []
vlosses, vprecs, vrecs, vf1s = [],[],[], []

best_val_f1 = 0.0

for epoch in range(num_epochs):
  tloss, tprec, trec, tf1 = train_model(model, train_loader, criterion, optimizer)
  vloss, vprec, vrec, vf1 = evaluate(model, test_loader, criterion)

  tlosses.append(tloss)
  tprecs.append(tprec)
  trecs.append(trec)
  tf1s.append(tf1)

  vlosses.append(vloss)
  vrecs.append(vprec)
  vrecs.append(vrec)
  vf1s.append(vf1)

  is_best = vf1 > best_val_f1
  best_val_f1 = best_val_f1*(not is_best) + vf1*is_best

  # if is_best:
  #   best_val_f1 = vf1
    # torch.save(model, "myCNN.pth", _use_new_zipfile_serialization=False)

  print(f'Epoch {epoch+1}, Loss: {tlosses[-1]}, Valid F1: {vf1s[-1]}')

Epoch 1, Loss: 5.185114973300212, Valid F1: 0.00600193880847875
Epoch 2, Loss: 5.157990416964969, Valid F1: 0.0071628597926002464
Epoch 3, Loss: 4.738494074022448, Valid F1: 0.008219478206535699
Epoch 4, Loss: 4.80270152800792, Valid F1: 0.009942361780542544
Epoch 5, Loss: 4.536849837045412, Valid F1: 0.011084900467470491


Test Model

In [15]:
model.eval()  # Setting the model to evaluation mode

all_true_labels = []
all_outputs = []

with torch.no_grad():  # No need to track gradients during evaluation
    for inputs, labels in test_loader:
        outputs = model(inputs)
        # print(inputs.shape, labels.shape, outputs.shape)
        all_outputs.extend(outputs.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())

Score Model

In [16]:
thresh_ls = [0.005, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
for i in thresh_ls:
  print('threshold:', i,
        *["{}: {}".format(name, num) for name, num in zip(
          ["precision", "recall","f1"],
          score_model(all_true_labels, all_outputs, thresh=i))])

threshold: 0.005 precision: 0.005430653218971768 recall: 0.5024583296586995 f1: 0.0107451708438315
threshold: 0.05 precision: 0.005458236111413131 recall: 0.4948183390050765 f1: 0.010797368739154269
threshold: 0.1 precision: 0.005465280431274714 recall: 0.4804477251257711 f1: 0.01080761996633599
threshold: 0.2 precision: 0.005607303775326335 recall: 0.4667975390149474 f1: 0.011081493523103433
threshold: 0.3 precision: 0.005658425100023264 recall: 0.44343219181823096 f1: 0.011174260827628617
threshold: 0.4 precision: 0.005589148302828285 recall: 0.411702671261918 f1: 0.011028576063405867
threshold: 0.5 precision: 0.005529607567410775 recall: 0.37613770844540034 f1: 0.010898988892925896
