In [1]:
import fasttext
import os
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data.dataset import random_split

In [2]:
# Configuration: directories used by the rest of the notebook.
# The three commented-out settings below are only referenced from other
# commented-out cells; they are kept for reference and are not needed to run
# the active cells.
# MIMIC_DATA_PATH = "/Users/ericahlgren/Documents/UIUC/CS598/Project/data/mimic-iii-clinical-database-1.4"
# ICU_CSV = "icu_diag_merge.csv"
# OUTPUT_TEXT = "data/icd_long_title.txt"
# Directory the pickled inputs and the fastText binary are read from, and the
# learned embedding matrix is written to.
DATA_PATH = "data/"
# Directory the trained model checkpoint is written to.
CHECKPOINT_PATH = "models/"

In [4]:
# icu_df = pd.read_csv(os.path.join(MIMIC_DATA_PATH, ICU_CSV))
def _load_pickle(filename):
    """Unpickle `filename` from DATA_PATH and close the file handle afterwards.

    NOTE(security): unpickling can execute arbitrary code, so this must only be
    pointed at files produced by this project's own preprocessing.
    """
    with open(os.path.join(DATA_PATH, filename), 'rb') as handle:
        return pickle.load(handle)


# Pickled inputs. Only `codes` and `text` are used by the cells below; the
# others are loaded under their original names for anything that relies on them.
targs = _load_pickle('targets.pkl')
seqs = _load_pickle('text_seqs.pkl')
num_seqs = _load_pickle('seqs.pkl')
codes = _load_pickle('icd9.pkl')
text = _load_pickle('icd9_text.pkl')
categories = _load_pickle('categories.pkl')
sub_categories = _load_pickle('subcategories.pkl')
#pretrained_word_vecs = os.path.join(DATA_PATH, 'wiki-news-300d-1M-subword.vec')
# pretrained_word_vecs = os.path.join(DATA_PATH, 'crawl-300d-2M-subword.vec')
# Path of the pretrained fastText binary model loaded in the next active cell.
pretrained_word_model = os.path.join(DATA_PATH, 'crawl-300d-2M-subword.bin')


In [5]:
# with open(pretrained_word_vecs, 'r') as fin:
#     n, d = map(int, fin.readline().split())
#     ft_model = {}
#     for line in fin:
#         tokens = line.rstrip().split(' ')
#         ft_model[tokens[0]] = torch.Tensor([float(t) for t in tokens[1:]])

In [6]:
# Load the pretrained fastText binary model; build_vocab queries it for one
# vector per distinct word of the corpus.
ft_model = fasttext.load_model(pretrained_word_model)



In [7]:
# icu_df.to_csv(OUTPUT_TEXT, columns=["LONG_TITLE_REPL"], header=False, index=False, sep='\n')
#icu_df.to_csv(OUTPUT_TEXT, columns=["ICD_SUBCATEGORY_DESC_REPL"], header=False, index=False, sep='\n')

In [8]:
# ft_model = fasttext.train_unsupervised(OUTPUT_TEXT, model='skipgram', dim=300, minCount=1)

In [9]:
class CustomDataset(Dataset):
    """Dataset that pairs every description with its position in the corpus.

    `x` is the sequence of descriptions exactly as passed in; `y` is the list
    `0 .. len(text) - 1`, so sample `i` is `(text[i], i)`.
    """

    def __init__(self, text):
        # Keep a reference to the caller's sequence; labels are just positions.
        self.x = text
        self.y = list(range(len(text)))

    def __len__(self):
        """Number of descriptions in the dataset."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the `(description, label)` pair stored at `index`."""
        description = self.x[index]
        label = self.y[index]
        return description, label

In [10]:
# One sample per entry of `text`; each sample's label is its position in `text`.
dataset = CustomDataset(text)

In [11]:
def build_vocab(text, ft_model):
    """
    Build the word-embedding table for every distinct word in the corpus.

    Arguments:
        text: full corpus of text with all ICD9 code descriptions; an iterable
            of strings, each split on whitespace
        ft_model: the pretrained fastText model; it must provide
            `get_dimension()` and return a word vector for `ft_model[word]`

    Outputs:
        vocab: a tensor of shape (# distinct words in corpus, word embedding dim)
            of type torch.float; row i is the vector of `words[i]`
        words: a sorted list of all distinct words in corpus
        lookup: a dict which returns the row index in `vocab` for each word

    An empty corpus yields an empty (0, dim) tensor, an empty list and an
    empty dict.
    """
    word_embed_dim = ft_model.get_dimension()

    # Sorting makes the word -> row assignment deterministic across runs.
    words = sorted({token for description in text for token in description.split()})
    lookup = {word: index for index, word in enumerate(words)}

    vocab = torch.zeros((len(words), word_embed_dim), dtype=torch.float)
    for index, word in enumerate(words):
        vocab[index] = torch.as_tensor(ft_model[word], dtype=torch.float)

    return vocab, words, lookup

In [12]:
# Embedding table, sorted word list and word -> row index map for the corpus.
# `lookup` is read as a module-level global by collate_fn.
vocab_built, words, lookup = build_vocab(text, ft_model)

In [13]:
def collate_fn(data):
    """
    Turn a list of `(description, index)` samples into padded word-index tensors.

    Each description is split on whitespace and every word is replaced by its
    row index taken from the module-level `lookup` dict.

    Arguments:
        data: a list of samples fetched from `CustomDataset`

    Outputs:
        x: a tensor of shape (batch size, padded # words) of type torch.long
            holding word indices; padding positions hold 0, which is also a
            valid word index, so use `masks` to tell them apart
        y: a tensor of shape (batch size) of type torch.long with the sample indices
        masks: a tensor of shape (batch size, padded # words) of type torch.bool,
            True where `x` holds a real word

    Raises:
        KeyError: if a description contains a word that is not in `lookup`
    """
    text, indices = zip(*data)
    y = torch.tensor(indices, dtype=torch.long)

    tokenized = [description.split() for description in text]
    # Always pad to at least 4 positions -- presumably so the widest convolution
    # kernel (4 rows in this notebook) still fits; confirm if kernel sizes change.
    max_num_words = max(max(len(tokens) for tokens in tokenized), 4)

    x = torch.zeros((len(tokenized), max_num_words), dtype=torch.long)
    masks = torch.zeros((len(tokenized), max_num_words), dtype=torch.bool)
    for i, tokens in enumerate(tokenized):
        # One slice write per description instead of one write per word.
        x[i, :len(tokens)] = torch.tensor([lookup[word] for word in tokens], dtype=torch.long)
        masks[i, :len(tokens)] = True

    return x, y, masks

In [14]:
def load_data(train_dataset, collate_fn, batch_size=100):
    '''
    Build the training and evaluation loaders over the same dataset.

    The model is trained on the full dataset shuffled in batches of
    `batch_size`; the test loader delivers the full dataset, not shuffled, in
    one single batch whatever the dataset size.

    Arguments:
        train_dataset: train dataset of type `CustomDataset`
        collate_fn: collate function
        batch_size: number of samples per training batch (default 100)

    Outputs:
        train_loader, test_loader: train and test dataloaders
    '''
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               collate_fn=collate_fn,
                                               shuffle=True)
    # Size the evaluation batch from the dataset itself so it always arrives as
    # one batch; DataLoader rejects a batch size of 0, hence the lower bound.
    full_batch_size = max(len(train_dataset), 1)
    test_loader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=full_batch_size,
                                              collate_fn=collate_fn,
                                              shuffle=False)

    return train_loader, test_loader


# Build both loaders over the same dataset: a shuffled one for training and an
# unshuffled one for evaluation.
train_loader, test_loader = load_data(dataset, collate_fn)

In [15]:
def mask_conv2d(outputs, masks):
    """
    Zero the convolution feature maps at padded word positions.

    Arguments:
        outputs: a list of tensors of shape (batch size, # filters, # positions),
            one per kernel size; # positions may differ between entries
        masks: a bool tensor of shape (batch size, padded # words), True for
            real words

    Outputs:
        a list of new tensors with the same shapes as `outputs`; the inputs are
        not modified

    Position p of a feature map is kept when word p of the description is a
    real word; the mask is truncated to each feature map's own length.
    """
    # Shape (batch size, 1, padded # words): the singleton filter axis
    # broadcasts over any number of filters instead of assuming exactly 100.
    keep = masks.unsqueeze(1)
    return [mat.masked_fill(~keep[:, :, :mat.shape[2]], 0) for mat in outputs]

In [16]:
class EmbeddingCNN(torch.nn.Module):
    """
    Convolutional text classifier over word embeddings.

    Each description (a row of word indices) is embedded, convolved with one
    bank of filters per kernel size, masked, max-pooled over positions and
    concatenated into a feature vector. A linear layer maps that vector to one
    logit per description.
    """

    def __init__(self, num_descriptions, max_num_words, vocab_built,
                 embedding_dim, num_class, num_kernel, kernel_sizes):
        """
        Arguments:
            num_descriptions: number of icd9 descriptions; size of the output layer
            max_num_words: not used by the model; kept so existing callers keep working
            vocab_built: float tensor of shape (vocab size, embedding_dim) with the
                pretrained word vectors used to initialise the embedding layer
            embedding_dim: size of word embedding dim (from fastText)
            num_class: not used by the model (the output size comes from
                `num_descriptions`); kept so existing callers keep working
            num_kernel: number of filters for each kernel size
            kernel_sizes: list of kernel heights (in words), one convolution each

        Raises:
            ValueError: if `vocab_built` is not of shape (vocab size, embedding_dim)
        """
        super().__init__()
        if tuple(vocab_built.shape) != (len(vocab_built), embedding_dim):
            raise ValueError(
                f"vocab_built has shape {tuple(vocab_built.shape)}, "
                f"expected (vocab size, {embedding_dim})"
            )
        self.embed = nn.Embedding(len(vocab_built), embedding_dim)
        # Start from the pretrained vectors; the layer stays trainable.
        with torch.no_grad():
            self.embed.weight.copy_(vocab_built)
        # Each kernel spans the full embedding width, so it slides over words only.
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, num_kernel, (K, embedding_dim)) for K in kernel_sizes]
        )
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(len(kernel_sizes) * num_kernel, num_descriptions)

    def forward(self, x, masks):
        """
        Both the logit for training and the embedding matrix are output, so the embedding matrix
        can be obtained once training is complete.

        Arguments:
            x: word indices of the icd9 descriptions, of size (batch_size, padded # words)
               and type torch.long; padded # words must be at least the largest kernel size
            masks: bool masks of size (batch_size, padded # words), True for real words

        Outputs:
            logit: logits of size (batch_size, num_descriptions) for the cross entropy loss
            embedding: pooled features of size (batch_size, len(kernel_sizes) * num_kernel),
                taken before dropout
        """
        x = self.embed(x)                 # (batch, words, embedding_dim)
        x = x.unsqueeze(1)                # add the single input channel for Conv2d
        # Each conv collapses the embedding axis to width 1, which is squeezed away.
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        x = mask_conv2d(x, masks)
        # Max over all positions of each feature map.
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        embedding = torch.cat(x, 1)
        x = self.dropout(embedding)
        logit = self.fc(x)
        return logit, embedding

# Instantiate the model with one output logit per entry of `codes`; three kernel
# sizes with 100 filters each give a 300-dimensional pooled feature vector.
# NOTE(review): max_num_words and num_class are accepted by the constructor but
# never read by it.
embedding_cnn = EmbeddingCNN(
    num_descriptions=len(codes), max_num_words=30, vocab_built=vocab_built,
    embedding_dim=300, num_class=len(codes), num_kernel=100, kernel_sizes=[2,3,4])
# Display the module structure.
embedding_cnn

EmbeddingCNN(
  (embed): Embedding(3164, 300)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(2, 300), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=4903, bias=True)
)

In [17]:
vocab_built.shape

torch.Size([3164, 300])

In [18]:
# Loss and optimiser. `train` reads both as module-level globals, so this cell
# must run before training.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(embedding_cnn.parameters(), lr=0.001)
# Disabled alternative; it refers to `baseline_retain`, which is not defined in
# this notebook.
#optimizer = torch.optim.Adadelta(baseline_retain.parameters(), weight_decay=0.001)

In [19]:
def train(model, train_loader, n_epochs, n_class):
    """
    Train `model` using the module-level `optimizer` and `criterion`.

    Arguments:
        model: the CNN model; called as `model(feature, masks)` and expected to
            return `(logit, embedding)`
        train_loader: training dataloder yielding `(feature, target, masks)`
        n_epochs: total number of epochs
        n_class: num of classes to learn (not used by this function; kept so
            existing callers keep working)

    The mean batch loss is printed after every epoch. If a module-level
    `print_cpu_usage()` helper returning `(cpu, ram)` exists, its peak values
    are tracked and printed at the end. That helper is not defined in the cells
    visible here, so when it is missing the resource report is skipped instead
    of raising NameError.
    """
    usage_probe = globals().get("print_cpu_usage")
    track_usage = callable(usage_probe)
    if track_usage:
        max_cpu, max_ram = usage_probe()

    for epoch in range(n_epochs):
        model.train()
        train_loss = 0
        for feature, target, masks in train_loader:
            optimizer.zero_grad()
            logit, _ = model(feature, masks)  # the embedding is not needed while training
            loss = criterion(logit, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / len(train_loader)
        if track_usage:
            cpu, ram = usage_probe()
            max_cpu = cpu if cpu > max_cpu else max_cpu
            max_ram = ram if ram > max_ram else max_ram
        print(f'Epoch: {epoch+1} \t Training Loss: {train_loss:.6f}')

    if track_usage:
        # Final probe kept from the original flow; its return value is unused.
        usage_probe()
        print(f"Max CPU usage: {max_cpu:.3f}\tMax RAM % usage: {max_ram}")

In [20]:
def eval_model(model, test_loader):
    """
    Evaluate the model and collect the embeddings it produces.

    Arguments:
        model: the EmbeddingCNN model; called as `model(feature, masks)` and
            expected to return `(logit, embedding)`
        test_loader: validation dataloader yielding `(feature, target, masks)`

    Outputs:
        embedding: learned embedding matrix of the evaluated model, with the
            rows of every batch concatenated in loader order

    Prints the number of correct predictions and the success rate.

    Raises:
        ValueError: if `test_loader` yields no batches
    """
    n_correct = 0
    n_total = 0
    embeddings = []

    model.eval()
    with torch.no_grad():
        for feature, target, masks in test_loader:
            logit, embedding = model(feature, masks)
            # Softmax is monotonic, so the arg-max of the logits is the prediction.
            pred = torch.argmax(logit, dim=1)
            n_correct += (pred == target).sum().item()
            n_total += pred.shape[0]
            embeddings.append(embedding)

    if not embeddings:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    success = n_correct / n_total
    print(f'{n_correct}/{n_total} correct \t success rate: {success:.4f}')
    # Return every batch's rows, not just those of the last batch.
    return torch.cat(embeddings, dim=0)

In [21]:
def indices_to_multihot(indices, masks, y_hat):
    """
    Convert masked rows of class indices into multi-hot vectors.

    Arguments:
        indices: integer tensor with one row of class indices per sample
        masks: bool tensor matching `indices`; only positions that are True
            contribute to the result
        y_hat: tensor whose shape (# samples, # classes) the result copies

    Outputs:
        a torch.float tensor shaped like `y_hat` with 1.0 at every class that
        appears among the unmasked indices of the row and 0.0 elsewhere
    """
    n_classes = y_hat.shape[1]
    encoded = torch.zeros_like(y_hat, dtype=torch.float)
    for row_no, row in enumerate(indices):
        # De-duplicate first so the summed one-hot rows never exceed 1.
        active = torch.unique(row[masks[row_no]])
        one_hot_rows = F.one_hot(active, n_classes)
        encoded[row_no] = one_hot_rows.sum(0).float()
    return encoded

In [22]:
# Train for 250 epochs; the %time magic reports the CPU and wall-clock cost of
# the call.
n_epochs = 250
%time train(embedding_cnn, train_loader, n_epochs, len(codes))

Epoch: 1 	 Training Loss: 8.527969
Epoch: 2 	 Training Loss: 8.485851
Epoch: 3 	 Training Loss: 8.357236
Epoch: 4 	 Training Loss: 7.961706
Epoch: 5 	 Training Loss: 7.381756
Epoch: 6 	 Training Loss: 6.573472
Epoch: 7 	 Training Loss: 5.590941
Epoch: 8 	 Training Loss: 4.656707
Epoch: 9 	 Training Loss: 3.782525
Epoch: 10 	 Training Loss: 3.079560
Epoch: 11 	 Training Loss: 2.493061
Epoch: 12 	 Training Loss: 2.069575
Epoch: 13 	 Training Loss: 1.827362
Epoch: 14 	 Training Loss: 1.498974
Epoch: 15 	 Training Loss: 1.300489
Epoch: 16 	 Training Loss: 1.166075
Epoch: 17 	 Training Loss: 1.052268
Epoch: 18 	 Training Loss: 0.942415
Epoch: 19 	 Training Loss: 0.846413
Epoch: 20 	 Training Loss: 0.837249
Epoch: 21 	 Training Loss: 0.684282
Epoch: 22 	 Training Loss: 0.649651
Epoch: 23 	 Training Loss: 0.604320
Epoch: 24 	 Training Loss: 0.539677
Epoch: 25 	 Training Loss: 0.523210
Epoch: 26 	 Training Loss: 0.456316
Epoch: 27 	 Training Loss: 0.412293
Epoch: 28 	 Training Loss: 0.391364
E

Epoch: 226 	 Training Loss: 0.031662
Epoch: 227 	 Training Loss: 0.030190
Epoch: 228 	 Training Loss: 0.027645
Epoch: 229 	 Training Loss: 0.029301
Epoch: 230 	 Training Loss: 0.028920
Epoch: 231 	 Training Loss: 0.027884
Epoch: 232 	 Training Loss: 0.043787
Epoch: 233 	 Training Loss: 0.029446
Epoch: 234 	 Training Loss: 0.031361
Epoch: 235 	 Training Loss: 0.032797
Epoch: 236 	 Training Loss: 0.030525
Epoch: 237 	 Training Loss: 0.030813
Epoch: 238 	 Training Loss: 0.031792
Epoch: 239 	 Training Loss: 0.026598
Epoch: 240 	 Training Loss: 0.031852
Epoch: 241 	 Training Loss: 0.029479
Epoch: 242 	 Training Loss: 0.030377
Epoch: 243 	 Training Loss: 0.029373
Epoch: 244 	 Training Loss: 0.031467
Epoch: 245 	 Training Loss: 0.032780
Epoch: 246 	 Training Loss: 0.029207
Epoch: 247 	 Training Loss: 0.030976
Epoch: 248 	 Training Loss: 0.028690
Epoch: 249 	 Training Loss: 0.035247
Epoch: 250 	 Training Loss: 0.031400
CPU times: user 47min 6s, sys: 12min 6s, total: 59min 12s
Wall time: 43min 

In [23]:
# Report accuracy on the evaluation loader and keep the feature matrix the
# model returns alongside its logits.
embedding = eval_model(embedding_cnn, test_loader)

4856/4903 correct 	 success rate: 0.9904


In [24]:
# Transpose so that descriptions run along the columns (the recorded output
# shape is 300 x 4903).
E = embedding.T

In [25]:
E.shape

torch.Size([300, 4903])

In [26]:
# Persist the transposed embedding matrix in the data directory.
torch.save(E, os.path.join(DATA_PATH, 'embedding_matrix.pt'))

In [None]:
# Create the checkpoint directory on first use so the save does not fail when
# it is missing.
os.makedirs(CHECKPOINT_PATH, exist_ok=True)
# NOTE(review): this pickles the whole module object, so loading the file later
# needs the EmbeddingCNN class definition to be available. Saving
# `embedding_cnn.state_dict()` is the more portable option, but it would change
# the file format for anything that already loads this checkpoint.
torch.save(embedding_cnn, os.path.join(CHECKPOINT_PATH, "EmbeddingCNN_250.pth"))