In [200]:
import fasttext
import os
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data.dataset import random_split



In [201]:
MIMIC_DATA_PATH = "/Users/ericahlgren/Documents/UIUC/CS598/Project/data/mimic-iii-clinical-database-1.4"
ICU_CSV = "icu_diag_merge.csv"
OUTPUT_TEXT = "data/icd_long_title.txt"
DATA_PATH = "data/"

In [202]:
icu_df = pd.read_csv(os.path.join(MIMIC_DATA_PATH, ICU_CSV))
targs = pickle.load(open(os.path.join(DATA_PATH,'targets.pkl'), 'rb'))
seqs = pickle.load(open(os.path.join(DATA_PATH,'text_seqs.pkl'), 'rb'))
num_seqs = pickle.load(open(os.path.join(DATA_PATH,'seqs.pkl'), 'rb'))
codes = pickle.load(open(os.path.join(DATA_PATH,'icd9.pkl'), 'rb'))
text = pickle.load(open(os.path.join(DATA_PATH,'icd9_text.pkl'), 'rb'))
categories = pickle.load(open(os.path.join(DATA_PATH,'categories.pkl'), 'rb'))
sub_categories = pickle.load(open(os.path.join(DATA_PATH,'subcategories.pkl'), 'rb'))

In [203]:
icu_df.to_csv(OUTPUT_TEXT, columns=["LONG_TITLE_REPL"], header=False, index=False, sep='\n')
#icu_df.to_csv(OUTPUT_TEXT, columns=["ICD_SUBCATEGORY_DESC_REPL"], header=False, index=False, sep='\n')

In [204]:
ft_model = fasttext.train_unsupervised(OUTPUT_TEXT, model='skipgram', dim=300, minCount=1)

Read 2M words
Number of words:  3165
Number of labels: 0
Progress: 100.0% words/sec/thread:  142244 lr:  0.000000 avg.loss:  0.783721 ETA:   0h 0m 0s


In [205]:
class CustomDataset(Dataset):
    
    def __init__(self, text):

        self.x = text
        self.y = [i for i in range(len(text))]
    
    def __len__(self):
        
        return(len(self.x))
    
    def __getitem__(self, index):
        
        return (self.x[index], self.y[index])

In [206]:
dataset = CustomDataset(text)

In [207]:
def collate_fn(data):
    """
    Arguments:
        data: a list of samples fetched from `CustomDataset`
        
    Outputs:
        x: a tensor of shape (# patiens, max # visits, max # diagnosis codes) of type torch.long
        masks: a tensor of shape (# patiens, max # visits, max # diagnosis codes) of type torch.bool
        rev_x: same as x but in reversed time. This will be used in our RNN model for masking 
        rev_masks: same as mask but in reversed time. This will be used in our RNN model for masking
        y: a tensor of shape (# patiens) of type torch.float
    """
    text, indices = zip(*data)
    word_embed_dim = 300

    y = torch.tensor(indices, dtype=torch.long)
#     import pdb; pdb.set_trace()
    num_codes = len(text)
    num_words = [len(words.split()) for words in text]

    max_num_words = max(max(num_words), 4)

    global ft_model
#     x = torch.zeros((num_codes, word_embed_dim * max_num_words), dtype=torch.float)
#     x_masks = torch.zeros((num_codes, word_embed_dim * max_num_words), dtype=torch.bool)
#     for i, code in enumerate(text):
#         for j, word in enumerate(code.split()):
#             word_embed = ft_model[word]
#             x[i, j*300:j*300+300] = torch.tensor(word_embed, dtype=torch.float)
#             x_masks[i, j*300:j*300+300] = 1
    x = torch.zeros((num_codes, max_num_words, word_embed_dim), dtype=torch.float)
    x_masks = torch.zeros((num_codes, max_num_words, word_embed_dim), dtype=torch.bool)
    for i, code in enumerate(text):
        for j, word in enumerate(code.split()):
            word_embed = ft_model[word]
            x[i,j] = torch.tensor(word_embed, dtype=torch.float)
            x_masks[i,j] = torch.ones(word_embed_dim)

    
    return x, y, x_masks

In [208]:
def load_data(train_dataset, collate_fn):
    
    '''
    Arguments:
        train dataset: train dataset of type `CustomDataset`
        val dataset: validation dataset of type `CustomDataset`
        collate_fn: collate function
        
    Outputs:
        train_loader, val_loader: train and validation dataloaders
    
    Note that you need to pass the collate function to the data loader `collate_fn()`.
    '''
    
    batch_size = 100
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               collate_fn=collate_fn,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=4903,
                                               collate_fn=collate_fn,
                                               shuffle=False)
    
    return train_loader, test_loader


train_loader, test_loader = load_data(dataset, collate_fn)

In [209]:
def mask_conv2d(outputs, masks):
    masks = masks.any(dim=2)
    masks = masks.unsqueeze(1)
    masks = masks.repeat(1,100,1)
    x = []
    for mat in outputs:
        outmat = mat.clone()
        dim = outmat.shape[2]
        outmat[~masks[:,:,:dim]] = 0
        x.append(outmat)
    return x

In [210]:
class EmbeddingCNN(torch.nn.Module):

    def __init__(self, num_descriptions, embedding_dim, num_class, num_kernel, kernel_sizes):
        super().__init__()
        """
        Arguments:
            hidden_dim: the hidden dimension
        """
        self.embed = nn.Embedding(num_descriptions, embedding_dim)
        self.conv1 = nn.Conv2d(1, num_kernel, (2, embedding_dim))
        self.conv2 = nn.Conv2d(1, num_kernel, (3, embedding_dim))
        self.conv3 = nn.Conv2d(1, num_kernel, (4, embedding_dim))
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, num_kernel, (K, embedding_dim)) for K in kernel_sizes]
        )
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(len(kernel_sizes) * num_kernel, num_descriptions)

    def forward(self, x, masks):
        """
        Arguments:
            g: the output tensor from RNN-alpha of shape (batch_size, seq_length, hidden_dim) 
        
        Outputs:
            alpha: the corresponding attention weights of shape (batch_size, seq_length, 1)
        """
#         import pdb; pdb.set_trace()
#         x = self.embed(x)
        x = x.unsqueeze(1)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        x = mask_conv2d(x, masks)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)
        x_train = self.dropout(x)
        logit = self.fc(x_train)
        return logit, x

embedding_cnn = EmbeddingCNN(
    num_descriptions=len(codes), embedding_dim=300, num_class=len(codes),
    num_kernel=100, kernel_sizes=[2,3,4])
embedding_cnn

EmbeddingCNN(
  (embed): Embedding(4903, 300)
  (conv1): Conv2d(1, 100, kernel_size=(2, 300), stride=(1, 1))
  (conv2): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1))
  (conv3): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(2, 300), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=4903, bias=True)
)

In [211]:
criterion = nn.CrossEntropyLoss()
#criterion = nn.BCELoss()
#criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(embedding_cnn.parameters(), lr=0.001)
#optimizer = torch.optim.Adadelta(baseline_retain.parameters(), weight_decay=0.001)

In [212]:
def train(model, train_loader, n_epochs, n_class):
    """ 
    Arguments:
        model: the RNN model
        train_loader: training dataloder
        val_loader: validation dataloader
        n_epochs: total number of epochs
    """
    #base_cpu, base_ram = print_cpu_usage()
    for epoch in range(n_epochs):
        model.train()
        train_loss = 0
        for feature, target, masks in train_loader:
#             import pdb; pdb.set_trace()
            optimizer.zero_grad()
            logit, embedding = model(feature, masks)

#             y_mh = indices_to_multihot(target, masks, logit)
#             y_hat = F.one_hot(target, n_class)
            loss = criterion(logit, target)

            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / len(train_loader)
        print(f'Epoch: {epoch+1} \t Training Loss: {train_loss:.6f}')
#         eval_model(model, train_loader)
        

In [213]:
def eval_model(model, test_loader):
    
    """
    Arguments:
        model: the RNN model
        val_loader: validation dataloader
        
    Outputs:
        precision: overall precision score
        recall: overall recall score
        f1: overall f1 score
        roc_auc: overall roc_auc score
        
    """
    y_pred = torch.LongTensor()
    y_score = torch.Tensor()
    y_true = torch.LongTensor()
    all_precision = []
    all_accuracy = []
    
    model.eval()
#     import pdb; pdb.set_trace()
    with torch.no_grad():
        n_correct = 0
        n_total = 0
        for feature, target, masks in test_loader:
            logit, embedding = model(feature, masks)
            y_hat = F.softmax(logit, dim=-1)
            pred = torch.argmax(y_hat, dim=1)
            n_correct += (pred == target).sum()
            n_total += pred.shape[0]
        success = n_correct / n_total
        print(f'{n_correct}/{n_total} correct \t success rate: {success:.4f}')
    return embedding

In [214]:
def indices_to_multihot(indices, masks, y_hat):
#     import pdb; pdb.set_trace()
    #indices = indices[masks.any(dim=1)]
    multihot = torch.zeros_like(y_hat, dtype=torch.float)
    for idx, row in enumerate(indices):
        y_idx = row[masks[idx]].unique()
        multihot[idx] = F.one_hot(y_idx, y_hat.shape[1]).sum(0).float()
    return multihot

In [215]:
num_seqs[0]

[[4695, 4555, 4691], [4695, 4555, 4691]]

In [166]:
n_epochs = 100
%time train(embedding_cnn, train_loader, n_epochs, len(codes))

Epoch: 1 	 Training Loss: 8.448616
Epoch: 2 	 Training Loss: 8.132655
Epoch: 3 	 Training Loss: 7.329388
Epoch: 4 	 Training Loss: 5.963246
Epoch: 5 	 Training Loss: 4.663459
Epoch: 6 	 Training Loss: 3.687683
Epoch: 7 	 Training Loss: 2.982101
Epoch: 8 	 Training Loss: 2.478072
Epoch: 9 	 Training Loss: 2.076313
Epoch: 10 	 Training Loss: 1.887517
Epoch: 11 	 Training Loss: 1.640090
Epoch: 12 	 Training Loss: 1.481388
Epoch: 13 	 Training Loss: 1.353482
Epoch: 14 	 Training Loss: 1.239966
Epoch: 15 	 Training Loss: 1.158452
Epoch: 16 	 Training Loss: 1.039990
Epoch: 17 	 Training Loss: 1.017113
Epoch: 18 	 Training Loss: 0.959317
Epoch: 19 	 Training Loss: 0.884966
Epoch: 20 	 Training Loss: 0.922540
Epoch: 21 	 Training Loss: 0.804284
Epoch: 22 	 Training Loss: 0.732477
Epoch: 23 	 Training Loss: 0.698252
Epoch: 24 	 Training Loss: 0.651712
Epoch: 25 	 Training Loss: 0.644411
Epoch: 26 	 Training Loss: 0.615422
Epoch: 27 	 Training Loss: 0.579622
Epoch: 28 	 Training Loss: 0.560954
E

In [169]:
embedding = eval_model(embedding_cnn, test_loader)

4840/4903 correct 	 success rate: 0.9872


In [107]:
E = embedding.T

In [108]:
E

tensor([[0.0000, 0.0415, 0.0000,  ..., 2.1525, 2.3754, 0.0000],
        [0.0000, 0.9073, 0.9623,  ..., 0.0000, 0.2491, 0.0000],
        [2.2889, 1.4867, 0.4477,  ..., 5.4431, 2.1662, 0.8727],
        ...,
        [2.3640, 1.5446, 0.0000,  ..., 2.2595, 1.3269, 0.0000],
        [0.3830, 1.6254, 0.8770,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [101]:
E.shape

torch.Size([300, 4903])

In [172]:
torch.save(E1, os.path.join(DATA_PATH, 'embedding_matrix.pt'))