In [1]:
import fasttext
import os
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data.dataset import random_split

In [2]:
# Directory holding the merged MIMIC-III CSV. Set the MIMIC_DATA_PATH environment
# variable to point at your own copy; the fallback is the path this notebook was
# originally run with, so existing setups keep working unchanged.
MIMIC_DATA_PATH = os.environ.get(
    "MIMIC_DATA_PATH",
    "/Users/ericahlgren/Documents/UIUC/CS598/Project/data/mimic-iii-clinical-database-1.4",
)
# File name of the merged ICU/diagnosis table inside MIMIC_DATA_PATH.
ICU_CSV = "icu_diag_merge.csv"
# Plain-text corpus written from the CSV and fed to fastText.
OUTPUT_TEXT = "data/icd_long_title.txt"
# Directory with the pickled inputs; the embedding matrix is saved here as well.
DATA_PATH = "data/"

In [3]:
def _load_pickle(filename):
    """Unpickle ``filename`` from DATA_PATH and close the file handle afterwards."""
    # NOTE(security): pickle.load can execute arbitrary code embedded in the file.
    # Only load artifacts produced by your own preprocessing run.
    with open(os.path.join(DATA_PATH, filename), 'rb') as handle:
        return pickle.load(handle)


# Merged ICU / diagnosis table; provides the ICD9_CODE and LONG_TITLE_REPL columns used below.
icu_df = pd.read_csv(os.path.join(MIMIC_DATA_PATH, ICU_CSV))

# Pre-computed artifacts. Only `codes` and `text` are used by the cells visible in this
# notebook; the remaining objects are loaded exactly as before.
targs = _load_pickle('targets.pkl')
seqs = _load_pickle('text_seqs.pkl')
num_seqs = _load_pickle('seqs.pkl')
codes = _load_pickle('icd9.pkl')  # len(codes) sizes the model's output layer
text = _load_pickle('icd9_text.pkl')  # description strings, split into words by collate_fn
categories = _load_pickle('categories.pkl')
sub_categories = _load_pickle('subcategories.pkl')

In [4]:
# Peek at the ICD-9 code column next to its LONG_TITLE_REPL text column.
icu_df[['ICD9_CODE','LONG_TITLE_REPL']].head()

Unnamed: 0,ICD9_CODE,LONG_TITLE_REPL
0,V3001,single liveborn born in hospital delivered by ...
1,V053,need for prophylactic vaccination and inoculat...
2,V290,observation for suspected infectious condition
3,V3001,single liveborn born in hospital delivered by ...
4,V053,need for prophylactic vaccination and inoculat...


In [5]:
# Count the distinct ICD-9 codes present in the merged table.
icu_df.ICD9_CODE.nunique()

4903

In [6]:
# Write only the LONG_TITLE_REPL column to OUTPUT_TEXT, without header or index,
# producing the plain-text corpus that fastText is trained on in the next cell.
# NOTE(review): sep='\n' presumably keeps pandas from quoting titles that contain
# commas (only one column is written) - confirm.
icu_df.to_csv(OUTPUT_TEXT, columns=["LONG_TITLE_REPL"], header=False, index=False, sep='\n')
# Alternative corpus column, currently disabled:
#icu_df.to_csv(OUTPUT_TEXT, columns=["ICD_SUBCATEGORY_DESC_REPL"], header=False, index=False, sep='\n')

In [7]:
# Train an unsupervised fastText skip-gram model on the exported titles.
# dim=300 is the vector size that collate_fn and the CNN below also assume.
# minCount=1 presumably keeps words that occur only once - see the fastText docs.
ft_model = fasttext.train_unsupervised(OUTPUT_TEXT, model='skipgram', dim=300, minCount=1)

Read 2M words
Number of words:  3165
Number of labels: 0
Progress: 100.0% words/sec/thread:  146126 lr:  0.000000 avg.loss:  0.779818 ETA:   0h 0m 0s  6.7% words/sec/thread:  151550 lr:  0.046634 avg.loss:  2.081994 ETA:   0h 0m 4s


In [8]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs every description with its position in the list.

    The target of sample ``i`` is simply ``i``, i.e. each description is its own class.
    """

    def __init__(self, text):
        # Descriptions are stored as given; targets are the positions 0 .. len(text) - 1.
        self.x = text
        self.y = list(range(len(text)))

    def __len__(self):
        # One sample per description.
        return len(self.x)

    def __getitem__(self, index):
        # Return the (description, target index) pair for the requested position.
        description = self.x[index]
        label = self.y[index]
        return (description, label)

In [9]:
# Wrap the ICD-9 description strings; each description's target is its own index.
dataset = CustomDataset(text)

In [10]:
def collate_fn(data, embed_model=None, word_embed_dim=300, min_num_words=4):
    """
    Turn a batch of (description, index) samples into padded word-embedding tensors.

    Arguments:
        data: a non-empty list of samples fetched from `CustomDataset`
        embed_model: object mapping a word to its vector via ``embed_model[word]``;
            defaults to the notebook-level fastText model `ft_model`
        word_embed_dim: length of each word vector (300 for the model trained above)
        min_num_words: lower bound for the padded sequence length; the default of 4
            presumably exists so the widest convolution kernel (4) still fits

    Outputs:
        x: a tensor of shape (batch size, max # words, word embedding dim) of type torch.float
        y: a tensor of shape (batch size) of type torch.long
        masks: a tensor of shape (batch size, max # words, word embedding dim) of type
            torch.bool, True on every position that holds a real word

    Raises:
        ValueError: if `data` is empty
    """
    if len(data) == 0:
        raise ValueError("collate_fn received an empty batch")
    if embed_model is None:
        # Fall back to the fastText model trained earlier in the notebook.
        embed_model = ft_model

    text, indices = zip(*data)
    y = torch.tensor(indices, dtype=torch.long)

    # Split every description once and reuse the tokens for sizing and filling.
    tokenized = [description.split() for description in text]
    num_codes = len(tokenized)
    max_num_words = max(max(len(words) for words in tokenized), min_num_words)

    x = torch.zeros((num_codes, max_num_words, word_embed_dim), dtype=torch.float)
    masks = torch.zeros((num_codes, max_num_words, word_embed_dim), dtype=torch.bool)
    for i, words in enumerate(tokenized):
        for j, word in enumerate(words):
            x[i, j] = torch.tensor(embed_model[word], dtype=torch.float)
            # Mark the whole embedding row of a real word; padded rows stay False.
            masks[i, j] = True

    return x, y, masks

In [11]:
def load_data(train_dataset, collate_fn, batch_size=100):
    '''
    Build the training and evaluation dataloaders over the same dataset.

    Arguments:
        train dataset: train dataset of type `CustomDataset`
        collate_fn: collate function
        batch_size: mini-batch size of the shuffled training loader (default 100)

    Outputs:
        train_loader, test_loader: train and test dataloaders; the test loader is
            unshuffled and serves the whole dataset as a single batch
    '''
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               collate_fn=collate_fn,
                                               shuffle=True)
    # Size the evaluation batch from the dataset instead of a hard-coded 4903, so the
    # evaluation pass always sees every sample in one batch. max(..., 1) keeps the
    # DataLoader constructor happy for an empty dataset.
    eval_batch_size = max(len(train_dataset), 1)
    test_loader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=eval_batch_size,
                                              collate_fn=collate_fn,
                                              shuffle=False)

    return train_loader, test_loader


# Build both loaders over the same dataset: a shuffled one for training and an
# unshuffled one for evaluation.
train_loader, test_loader = load_data(dataset, collate_fn)

In [12]:
def mask_conv2d(outputs, masks):
    """
    Zero out convolution outputs whose window starts on a padded word.

    Arguments:
        outputs: list of tensors of shape (batch_size, num_channels, num_positions),
            one per kernel size; num_positions may differ between entries
        masks: bool tensor of shape (batch_size, max_num_words, word_embedding_dim),
            True where the input held a real word

    Outputs:
        list of new tensors with the same shapes as `outputs`; the inputs are not modified
    """
    # Collapse the embedding axis: a word is real if any of its mask entries is set.
    # Shape (batch_size, 1, max_num_words); the singleton axis broadcasts over channels,
    # so any number of filters works (previously this was hard-coded to 100).
    word_is_real = masks.any(dim=2).unsqueeze(1)
    masked = []
    for mat in outputs:
        num_positions = mat.shape[2]
        # Position p of the conv output is kept only if word p is a real word.
        padded = ~word_is_real[:, :, :num_positions]
        masked.append(mat.masked_fill(padded, 0))
    return masked

In [13]:
class EmbeddingCNN(torch.nn.Module):
    """
    Text CNN over pre-computed word embeddings of ICD-9 descriptions.

    Arguments:
        num_descriptions: number of icd9 descriptions
        embedding_dim: size of word embedding dim (from fastText)
        num_class: number of classes to predict (size of the output layer)
        num_kernel: number of filters for each kernel size
        kernel_sizes: list of kernel heights (in words) to convolve with
    """

    def __init__(self, num_descriptions, embedding_dim, num_class, num_kernel, kernel_sizes):
        super().__init__()
        # NOTE(review): forward() never calls this layer because the inputs already are
        # word vectors. It is kept so parameter/state_dict layout and the random
        # initialisation order stay exactly as before.
        self.embed = nn.Embedding(num_descriptions, embedding_dim)
        # One convolution per kernel size; each kernel spans the full embedding width.
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, num_kernel, (K, embedding_dim)) for K in kernel_sizes]
        )
        self.dropout = nn.Dropout(0.5)
        # The output layer is sized by num_class (it previously ignored that argument
        # and used num_descriptions; the notebook passes the same value for both).
        self.fc = nn.Linear(len(kernel_sizes) * num_kernel, num_class)

    def forward(self, x, masks):
        """
        Arguments:
            x: the input tensor of icd9 description of size (batch_size, max_num_words, word_embedding_dim)
            masks: masks for the padded words of size (batch_size, max_num_words, word_embedding_dim)

        Outputs:
            logit: logits for the cross entropy loss, of size (batch_size, num_class)
            embedding: pooled convolution features before dropout, of size
                (batch_size, len(kernel_sizes) * num_kernel)
        """
        # Add the single input channel expected by Conv2d.
        x = x.unsqueeze(1)
        # Each conv collapses the embedding axis to width 1, which squeeze(3) removes.
        conv_maps = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Suppress activations that belong to padded positions before pooling.
        conv_maps = mask_conv2d(conv_maps, masks)
        # Max-pool over all positions so every filter yields one value per sample.
        pooled = [F.max_pool1d(fmap, fmap.size(2)).squeeze(2) for fmap in conv_maps]
        embedding = torch.cat(pooled, 1)
        logit = self.fc(self.dropout(embedding))
        return logit, embedding

# One output class per ICD-9 code; embedding_dim=300 matches the fastText vector size
# (dim=300), and kernel sizes 2/3/4 are convolution heights measured in words.
embedding_cnn = EmbeddingCNN(
    num_descriptions=len(codes), embedding_dim=300, num_class=len(codes),
    num_kernel=100, kernel_sizes=[2,3,4])
# Show the module structure.
embedding_cnn

EmbeddingCNN(
  (embed): Embedding(4903, 300)
  (conv1): Conv2d(1, 100, kernel_size=(2, 300), stride=(1, 1))
  (conv2): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1))
  (conv3): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(2, 300), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=4903, bias=True)
)

In [14]:
# Loss over class logits with integer targets; train() reads this global.
criterion = nn.CrossEntropyLoss()
# Alternative losses that were tried, currently disabled:
#criterion = nn.BCELoss()
#criterion = nn.BCEWithLogitsLoss()
# Adam over all model parameters; train() reads this global as well.
optimizer = torch.optim.Adam(embedding_cnn.parameters(), lr=0.001)
# Disabled alternative; note it refers to `baseline_retain`, which is not defined in the visible cells.
#optimizer = torch.optim.Adadelta(baseline_retain.parameters(), weight_decay=0.001)

In [15]:
def train(model, train_loader, n_epochs, n_class, opt=None, loss_fn=None):
    """
    Run the training loop and print the mean batch loss after every epoch.

    Arguments:
        model: the CNN model; called as ``model(feature, masks)`` and expected to
            return ``(logit, embedding)``
        train_loader: training dataloader yielding ``(feature, target, masks)`` batches
        n_epochs: total number of epochs
        n_class: num of classes to learn (currently unused; kept for existing callers)
        opt: optimizer to step; defaults to the notebook-level `optimizer`
        loss_fn: loss called as ``loss_fn(logit, target)``; defaults to the
            notebook-level `criterion`
    """
    # Resolve the notebook globals only when no explicit override is given, so the
    # function can also be used (and tested) without that hidden state.
    opt = optimizer if opt is None else opt
    loss_fn = criterion if loss_fn is None else loss_fn

    for epoch in range(n_epochs):
        model.train()
        train_loss = 0
        for feature, target, masks in train_loader:
            opt.zero_grad()
            # The pooled embedding is not needed while training.
            logit, _ = model(feature, masks)
            loss = loss_fn(logit, target)
            loss.backward()
            opt.step()
            train_loss += loss.item()
        # Report the mean loss per batch for this epoch.
        train_loss = train_loss / len(train_loader)
        print(f'Epoch: {epoch+1} \t Training Loss: {train_loss:.6f}')
        

In [16]:
def eval_model(model, test_loader):
    """
    Evaluate the model, print its accuracy and return the learned embeddings.

    Arguments:
        model: the CNN model; called as ``model(feature, masks)`` and expected to
            return ``(logit, embedding)``
        test_loader: validation dataloader yielding ``(feature, target, masks)`` batches

    Outputs:
        embedding: embeddings of all evaluated samples, concatenated along dim 0 in
            loader order (previously only the last batch's embedding was returned)

    Raises:
        ValueError: if `test_loader` yields no samples
    """
    model.eval()
    n_correct = 0
    n_total = 0
    embeddings = []
    with torch.no_grad():
        for feature, target, masks in test_loader:
            logit, embedding = model(feature, masks)
            # Softmax is monotonic, so the arg-max can be taken on the logits directly.
            pred = torch.argmax(logit, dim=1)
            n_correct += (pred == target).sum().item()
            n_total += pred.shape[0]
            embeddings.append(embedding)

    if n_total == 0:
        raise ValueError("test_loader yielded no samples to evaluate")

    success = n_correct / n_total
    print(f'{n_correct}/{n_total} correct \t success rate: {success:.4f}')
    return torch.cat(embeddings, dim=0)

In [17]:
def indices_to_multihot(indices, masks, y_hat):
    """
    Convert rows of class indices into multi-hot rows shaped like `y_hat`.

    For every row, only the entries selected by the matching row of `masks` are kept;
    each distinct kept index sets one position of the output row to 1.
    """
    num_classes = y_hat.shape[1]
    multihot = torch.zeros_like(y_hat, dtype=torch.float)
    for row_number in range(len(indices)):
        # Drop masked-out entries, then collapse duplicates.
        kept = indices[row_number][masks[row_number]]
        distinct = torch.unique(kept)
        # One one-hot row per distinct index; summing them gives the multi-hot row.
        encoded = F.one_hot(distinct, num_classes)
        multihot[row_number] = encoded.sum(dim=0).float()
    return multihot

In [18]:
# Train for 100 epochs; %time reports how long the whole run takes.
# len(codes) is passed as n_class.
n_epochs = 100
%time train(embedding_cnn, train_loader, n_epochs, len(codes))

Epoch: 1 	 Training Loss: 8.552966
Epoch: 2 	 Training Loss: 8.478482
Epoch: 3 	 Training Loss: 8.302736
Epoch: 4 	 Training Loss: 7.672985
Epoch: 5 	 Training Loss: 6.665219
Epoch: 6 	 Training Loss: 5.490110
Epoch: 7 	 Training Loss: 4.447985
Epoch: 8 	 Training Loss: 3.671042
Epoch: 9 	 Training Loss: 3.120104
Epoch: 10 	 Training Loss: 2.695646
Epoch: 11 	 Training Loss: 2.389871
Epoch: 12 	 Training Loss: 2.127404
Epoch: 13 	 Training Loss: 1.939855
Epoch: 14 	 Training Loss: 1.726716
Epoch: 15 	 Training Loss: 1.630548
Epoch: 16 	 Training Loss: 1.501281
Epoch: 17 	 Training Loss: 1.455339
Epoch: 18 	 Training Loss: 1.309868
Epoch: 19 	 Training Loss: 1.223125
Epoch: 20 	 Training Loss: 1.203669
Epoch: 21 	 Training Loss: 1.107222
Epoch: 22 	 Training Loss: 1.057209
Epoch: 23 	 Training Loss: 1.020391
Epoch: 24 	 Training Loss: 0.969140
Epoch: 25 	 Training Loss: 0.965282
Epoch: 26 	 Training Loss: 0.889821
Epoch: 27 	 Training Loss: 0.845714
Epoch: 28 	 Training Loss: 0.769169
E

In [19]:
# Evaluate on the unshuffled loader and keep the embedding returned by eval_model.
embedding = eval_model(embedding_cnn, test_loader)

4835/4903 correct 	 success rate: 0.9861


In [20]:
# Transpose the embedding so that descriptions index the columns (see E.shape below).
E = embedding.T

In [21]:
# Inspect the transposed embedding matrix.
E

tensor([[0.1408, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0120],
        [0.8244, 0.0000, 4.8920,  ..., 0.5002, 0.0000, 4.2985],
        [1.4374, 0.2580, 0.1317,  ..., 2.3848, 2.0505, 0.0000],
        ...,
        [0.8518, 0.0000, 0.3237,  ..., 0.9934, 0.5658, 0.1184],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.1696, 2.2176, 0.0000,  ..., 2.6231, 0.6750, 1.3424]])

In [22]:
# Check the dimensions of the transposed embedding matrix.
E.shape

torch.Size([300, 4903])

In [23]:
# Persist the transposed embedding matrix as embedding_matrix.pt under DATA_PATH.
torch.save(E, os.path.join(DATA_PATH, 'embedding_matrix.pt'))