In [None]:
import fasttext
import os
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data.dataset import random_split

### Set local paths for input data and output models
NOTE: Download the `fastText` pretrained model `crawl-300d-2M-subword.zip` [here](https://fasttext.cc/docs/en/english-vectors.html), unzip it, and place the extracted `crawl-300d-2M-subword.bin` in `DATA_PATH`.

In [None]:
# Directory holding the pickled inputs and the fastText binary; the embedding
# matrix produced at the end of the notebook is also written here.
DATA_PATH = "data/"
# Directory where the trained EmbeddingCNN checkpoint is saved.
CHECKPOINT_PATH = "models/"

### Load input data and pre-trained model

In [None]:
def _load_pickle(filename):
    """Load one pickled object from `DATA_PATH`, closing the file afterwards."""
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only load files that come from a trusted source.
    with open(os.path.join(DATA_PATH, filename), 'rb') as handle:
        return pickle.load(handle)


# `text` and `codes` are the inputs used by the cells below; the remaining
# objects are loaded but not referenced again in the cells visible here.
targs = _load_pickle('targets.pkl')
seqs = _load_pickle('text_seqs.pkl')
num_seqs = _load_pickle('seqs.pkl')
codes = _load_pickle('icd9.pkl')
text = _load_pickle('icd9_text.pkl')
categories = _load_pickle('categories.pkl')
sub_categories = _load_pickle('subcategories.pkl')
# Path of the pretrained fastText binary (loaded in the next cell).
pretrained_word_model = os.path.join(DATA_PATH, 'crawl-300d-2M-subword.bin')

In [None]:
# Load the pretrained fastText model from the path stored in
# `pretrained_word_model`; it supplies the word vectors used by build_vocab.
ft_model = fasttext.load_model(pretrained_word_model)

### Define and load custom dataset

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs every text entry with its positional index."""

    def __init__(self, text):
        # Entry i of `text` is labelled with the integer i.
        self.x = text
        self.y = list(range(len(text)))

    def __len__(self):
        """Number of text entries held by the dataset."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the (text, label) pair stored at `index`."""
        sample, label = self.x[index], self.y[index]
        return sample, label

In [None]:
# Wrap the description texts so that every sample is (description, index).
dataset = CustomDataset(text)

### Define `build_vocab` function and build vocabulary

In [None]:
def build_vocab(text, ft_model):
    """
    Build the vocabulary of a corpus together with its pretrained embedding table.

    Arguments:
        text: full corpus of text with all ICD9 code descriptions; an iterable of
            strings, each of which is split on whitespace to obtain its words
        ft_model: the pretrained fastText model; it must provide `get_dimension()`
            and word lookup through `ft_model[word]`

    Outputs:
        vocab: a tensor of shape (# unique words in corpus, word embedding dim) of type torch.float
        words: a sorted list of all unique words in corpus
        lookup: a dict which returns the index value (row of `vocab`) for each word

    An empty corpus yields a (0, word embedding dim) tensor, an empty list and an
    empty dict instead of raising.
    """
    word_embed_dim = ft_model.get_dimension()

    # Sorting makes the word -> index assignment deterministic across runs.
    words = sorted({word for description in text for word in description.split()})
    lookup = {word: index for index, word in enumerate(words)}

    vocab = torch.zeros((len(words), word_embed_dim), dtype=torch.float)
    for index, word in enumerate(words):
        vocab[index] = torch.as_tensor(ft_model[word], dtype=torch.float)

    return vocab, words, lookup

In [None]:
# Build the embedding table, the sorted word list and the word -> index dict.
# `lookup` is read as a module-level global by collate_fn.
vocab_built, words, lookup = build_vocab(text, ft_model)

### Define collate and data loader functions

In [None]:
def collate_fn(data):
    """
    Collate a batch of (description, index) samples into padded word-index tensors.

    Words are mapped to vocabulary indices through the module-level `lookup`
    dict; a word missing from `lookup` raises KeyError.

    Arguments:
        data: a list of samples fetched from `CustomDataset`

    Outputs:
        x: a tensor of shape (batch size, max # words) of type torch.long holding the
           vocabulary index of every word; padded positions hold 0, which is also a
           valid word index, so use `masks` to tell padding from real words
        y: a tensor of shape (batch size) of type torch.long
        masks: a tensor of shape (batch size, max # words) of type torch.bool, True at
           real words and False at padding
    """
    # Every batch is padded to at least this many words.
    # NOTE(review): presumably chosen to match the largest convolution kernel
    # size (4) used when the model is built - confirm if kernel sizes change.
    min_num_words = 4

    text, indices = zip(*data)
    y = torch.tensor(indices, dtype=torch.long)

    tokenized = [description.split() for description in text]
    max_num_words = max([min_num_words] + [len(tokens) for tokens in tokenized])

    x = torch.zeros((len(tokenized), max_num_words), dtype=torch.long)
    masks = torch.zeros((len(tokenized), max_num_words), dtype=torch.bool)
    for row, tokens in enumerate(tokenized):
        if not tokens:
            continue  # empty description: the row stays fully padded
        x[row, :len(tokens)] = torch.tensor(
            [lookup[word] for word in tokens], dtype=torch.long)
        masks[row, :len(tokens)] = True

    return x, y, masks

NOTE: `train_loader` yields shuffled batches of size 100, while `test_loader` yields the entire dataset in a single unshuffled batch.

In [None]:
def load_data(train_dataset, collate_fn, batch_size=100):
    '''
    Build the training and evaluation dataloaders over the same dataset.

    The model is trained on the full dataset shuffled in batches of `batch_size`;
    the test dataset is the full dataset delivered in one large batch, not shuffled.

    Arguments:
        train_dataset: train dataset of type `CustomDataset`
        collate_fn: collate function
        batch_size: size of the shuffled training batches (default 100)

    Outputs:
        train_loader, test_loader: train and test dataloaders
    '''
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               collate_fn=collate_fn,
                                               shuffle=True)
    # Size the evaluation batch from the dataset itself so that it is always a
    # single batch, whatever the number of samples; max(..., 1) keeps the
    # batch size valid for an empty dataset.
    test_loader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=max(len(train_dataset), 1),
                                              collate_fn=collate_fn,
                                              shuffle=False)

    return train_loader, test_loader


# Both loaders are built over the same full dataset.
train_loader, test_loader = load_data(dataset, collate_fn)

### Define masking function to set masked indices to 0

In [None]:
def mask_conv2d(outputs, masks):
    """
    Zero the convolution outputs at positions whose word is padding.

    Arguments:
        outputs: an iterable of tensors of shape (batch size, # channels, length);
            `length` may differ between tensors and must not exceed max # words
        masks: a tensor of shape (batch size, max # words) of type torch.bool, True
            at real words and False at padding

    Outputs:
        a list with one new tensor per entry of `outputs`, where position j of
        every channel is set to 0 whenever masks[:, j] is False; the inputs are
        left unmodified
    """
    # A singleton channel axis lets the mask broadcast over any number of
    # channels instead of assuming exactly 100 filters.
    channel_masks = masks.unsqueeze(1)
    masked = []
    for mat in outputs:
        length = mat.shape[2]
        padding = ~channel_masks[:, :, :length]
        masked.append(mat.masked_fill(padding, 0))
    return masked

### Define helper function to convert indices to multihot vector

In [None]:
def indices_to_multihot(indices, masks, y_hat):
    """
    Convert rows of class indices into multi-hot float vectors shaped like `y_hat`.

    For every row of `indices`, the entries selected by the matching row of
    `masks` are de-duplicated and the corresponding columns of the output row
    are set to 1; all other entries are 0.
    """
    num_classes = y_hat.shape[1]
    multihot = torch.zeros_like(y_hat, dtype=torch.float)
    for row_number, row_indices in enumerate(indices):
        selected = row_indices[masks[row_number]]
        distinct = torch.unique(selected)
        encoded = F.one_hot(distinct, num_classes)
        multihot[row_number] = encoded.sum(0).float()
    return multihot

### Define EmbeddingCNN model

In [None]:
class EmbeddingCNN(torch.nn.Module):
    """
    Convolutional network over the word embeddings of each description.

    Word indices are embedded, convolved with one bank of filters per kernel
    size, masked at padded positions, max-pooled over the word axis and
    concatenated into a single feature vector per description. A linear layer
    on top of that vector produces one logit per description.
    """

    def __init__(self, num_descriptions, max_num_words, vocab_built,
                 embedding_dim, num_class, num_kernel, kernel_sizes):
        """
        Arguments:
            num_descriptions: number of icd9 descriptions; size of the output layer
            max_num_words: not used by the model; kept so existing callers keep working
            vocab_built: tensor of shape (vocabulary size, embedding_dim) whose values
                initialise the embedding layer (copied, so the input is not modified)
            embedding_dim: size of word embedding dim (from fastText); must equal
                vocab_built.shape[1]
            num_class: not used by the model; kept so existing callers keep working
            num_kernel: number of filters for each kernel size
            kernel_sizes: list of kernel heights (number of words covered per filter)
        """
        super().__init__()
        self.embed = nn.Embedding(len(vocab_built), embedding_dim)
        # Initialise from the pretrained vectors; the weights remain trainable.
        with torch.no_grad():
            self.embed.weight.copy_(vocab_built)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, num_kernel, (K, embedding_dim)) for K in kernel_sizes]
        )
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(len(kernel_sizes) * num_kernel, num_descriptions)

    def forward(self, x, masks):
        """
        Both the logit for training and the embedding matrix are output, so the embedding matrix
        can be obtained once training is complete.

        Arguments:
            x: the input tensor of word indices of size (batch_size, max_num_words), type
               torch.long; max_num_words must be at least the largest kernel size
            masks: masks for the padded words of size (batch_size, max_num_words), type
               torch.bool, True at real words

        Outputs:
            logit: logits of size (batch_size, num_descriptions) for the cross entropy loss
            embedding: learned features of size (batch_size, len(kernel_sizes) * num_kernel),
               taken before dropout
        """
        x = self.embed(x)                    # (batch, words, embedding_dim)
        x = x.unsqueeze(1)                   # add the single input channel
        # Each kernel spans the full embedding width, so the last axis collapses to 1.
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        x = mask_conv2d(x, masks)
        # Max over the word axis gives one value per filter.
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        embedding = torch.cat(x, 1)
        x = self.dropout(embedding)
        logit = self.fc(x)
        return logit, embedding

# One output class per ICD9 code. embedding_dim must match the width of
# vocab_built, whose values are copied into the embedding layer. Three kernel
# sizes with 100 filters each give a 300-feature embedding per description.
# max_num_words and num_class are accepted by the constructor but not used.
embedding_cnn = EmbeddingCNN(
    num_descriptions=len(codes), max_num_words=30, vocab_built=vocab_built,
    embedding_dim=300, num_class=len(codes), num_kernel=100, kernel_sizes=[2,3,4])
# Bare expression: displays the module structure as the cell output.
embedding_cnn

### Define loss and optimizer, and train and evaluation functions

In [None]:
# Cross-entropy between the model logits and the target indices; read as a
# module-level global by train().
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters; also read as a global by train().
optimizer = torch.optim.Adam(embedding_cnn.parameters(), lr=0.001)
# NOTE(review): disabled alternative; it references `baseline_retain`, which
# is not defined in the cells visible in this notebook.
#optimizer = torch.optim.Adadelta(baseline_retain.parameters(), weight_decay=0.001)

In [None]:
def _sample_resource_usage():
    """Return `print_cpu_usage()` if that helper exists in the notebook, else None."""
    # NOTE(review): `print_cpu_usage` is not defined in any cell visible in this
    # notebook; resolving it lazily keeps training runnable on a fresh kernel.
    try:
        usage_fn = print_cpu_usage
    except NameError:
        return None
    return usage_fn()


def train(model, train_loader, n_epochs, n_class):
    """
    Train `model` with the module-level `optimizer` and `criterion`.

    The mean batch loss is printed after every epoch. When a `print_cpu_usage`
    helper returning a (cpu, ram) pair is defined in the notebook, it is sampled
    before training, after every epoch and once at the end, and the maxima seen
    up to the last epoch are printed; otherwise resource reporting is skipped.

    Arguments:
        model: the CNN model; called as model(feature, masks) and expected to
            return (logit, embedding)
        train_loader: training dataloader yielding (feature, target, masks)
        n_epochs: total number of epochs
        n_class: num of classes to learn (currently unused)
    """
    peak = _sample_resource_usage()
    for epoch in range(n_epochs):
        model.train()
        train_loss = 0
        for feature, target, masks in train_loader:
            optimizer.zero_grad()
            logit, _ = model(feature, masks)  # the embedding is not needed here
            loss = criterion(logit, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / len(train_loader)
        usage = _sample_resource_usage()
        if peak is not None and usage is not None:
            peak = (max(peak[0], usage[0]), max(peak[1], usage[1]))
        print(f'Epoch: {epoch+1} \t Training Loss: {train_loss:.6f}')
    # Final sample: its values are not folded into the maxima, but the call is
    # kept so any output the helper produces is still emitted.
    _sample_resource_usage()
    if peak is not None:
        max_cpu, max_ram = peak
        print(f"Max CPU usage: {max_cpu:.3f}\tMax RAM % usage: {max_ram}")

NOTE: The evaluation function returns the learned embedding matrix, which is transposed below to obtain $E$.

In [None]:
def eval_model(model, test_loader):
    """
    Evaluate the model, print its accuracy and return the learned embedding.

    Arguments:
        model: the EmbeddingCNN model; called as model(feature, masks) and expected
            to return (logit, embedding)
        test_loader: validation dataloader yielding (feature, target, masks)

    Outputs:
        embedding: learned embedding matrix of the evaluated model, with the rows
            of all batches concatenated in loader order

    Raises:
        ValueError: if `test_loader` yields no batches
    """
    n_correct = 0
    n_total = 0
    embeddings = []

    model.eval()
    with torch.no_grad():
        for feature, target, masks in test_loader:
            logit, embedding = model(feature, masks)
            # argmax of the logits equals argmax of their softmax.
            pred = torch.argmax(logit, dim=1)
            n_correct += (pred == target).sum().item()
            n_total += pred.shape[0]
            embeddings.append(embedding)

    if not embeddings:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    success = n_correct / n_total if n_total else 0.0
    print(f'{n_correct}/{n_total} correct \t success rate: {success:.4f}')
    # Keep every batch, not just the last one, so the result covers the whole
    # loader even when it is split into several batches.
    return torch.cat(embeddings, dim=0)

### Set num epochs and train model

In [None]:
# Number of passes over the training loader.
n_epochs = 250
# %time is an IPython magic that reports how long the training call takes;
# len(codes) is passed as n_class.
%time train(embedding_cnn, train_loader, n_epochs, len(codes))

### Evaluate model and extract embedding matrix, then transpose

In [None]:
# Print the accuracy on the evaluation loader and keep the embedding that
# eval_model returns.
embedding = eval_model(embedding_cnn, test_loader)

In [None]:
# Transpose the returned embedding so its two axes are swapped
# (presumably features x descriptions afterwards - confirm against consumers of E).
E = embedding.T

### Save embedding matrix $E$ and trained model

In [None]:
# Persist the transposed embedding matrix alongside the input data.
torch.save(E, os.path.join(DATA_PATH, 'embedding_matrix.pt'))

In [None]:
# Create the checkpoint directory first: torch.save does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs(CHECKPOINT_PATH, exist_ok=True)
# The whole module object is pickled (not just its state_dict), so loading the
# file later requires the EmbeddingCNN class definition to be importable.
torch.save(embedding_cnn, os.path.join(CHECKPOINT_PATH, "EmbeddingCNN_250.pth"))