In [0]:
#@title Model hyperparameters
num_train = 600 #@param {type:"slider", min:2, max:600, step:2}
learning_rate = 0.008 #@param {type:"slider", min:0.005, max:0.1, step:0.001}
num_epochs = 30 #@param {type:"slider", min:10, max:300, step:10}

# Install and Load Package Dependencies

In [0]:
# Install pytorch
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision

# Install gensim, more recent seaborn, slack API wrapper
!pip install gensim seaborn==0.9.0 slacker

You will need your own Slack token to utilize the Slack API. 

In [0]:
from string import whitespace
import time

# arrays and dataframes
import numpy as np
import pandas as pd

# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

import sklearn.metrics as metrics

# pytorch: neural nets including LSTMs
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
torch.manual_seed(1)

# gensim: word embeddings w/ word2vec
from gensim.test.utils import get_tmpfile
from gensim.models import Word2Vec, KeyedVectors

# google drive api
from google.colab import drive
drive.mount('/gdrive')

from slacker import Slacker
slack = Slacker('YOUR_TOKEN_HERE')


%matplotlib inline

# Import data and class labels

In [0]:
# Define data filepaths and read
drive_dir = '/gdrive/My Drive/med277/'

tags_filename = '20181204_MR_ncbi-disease-corpus_bio-translation.txt'
tags_filepath = tags_filename
corpus_tag = open(tags_filepath, 'r')

text_filename = '20181204_MR_ncbi-disease-corpus_text.txt'
text_filepath = text_filename
corpus_text = open(text_filepath, 'r')

# format input text
corpus_list = []
for entry in corpus_text.readlines():
    a = entry.split('\t')[1:]
    a = [word.strip() for word in a]
    corpus_list.append(a)
    
# format input tags
tag_list = []
for entry in corpus_tag.readlines():
    a = entry.split('\t')[1:]
    a = [tag.strip() for tag in a]
    tag_list.append(a)
    
# remove entries that don't parse correctly
print('Corpus size: {}'.format(len(corpus_list)))
remove_a = []
remove_b = []
for a, b in zip(corpus_list, tag_list):
    if len(a) != len(b):
        remove_a.append(a)
        remove_b.append(b)

[corpus_list.remove(v) for v in remove_a]
[tag_list.remove(v) for v in remove_b]

print('Corpus size (filtered): {}'.format(len(corpus_list)))

## Word2vec embeddings

In [0]:
wv_filename = 'corpus_model.wv'
wv_path = wv_filename

### Train and save

In [0]:
# train word2vec model
model = Word2Vec(corpus_list, size=100, window=5, min_count=1, workers=8)

# save corpus word embedding to drive
model.wv.save(wv_path)

# save some memory
word_vectors = model.wv
del model

### Load pretrained

In [0]:
word_vectors = KeyedVectors.load(wv_path, mmap='r')

# Define BiLSTM model

## Helper functions

In [0]:
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()

# def prepare_sequence(seq, to_ix):
#     idxs = [to_ix[w] for w in seq]
#     return torch.tensor(idxs, dtype=torch.long)
def prepare_sequence(seq, word_vectors):
    # return word2vec word embedding
    idxs = [word_vectors.index2word.index(w) for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
  
# Gets output of model from data
def get_classes(data, model):
    y_true = []
    y_pred = []

    with torch.no_grad():
        for sentence, tags in training_data:
            if len(sentence) != len(tags):
                print(sentence)

            true = np.array([tag_to_ix[t] for t in tags])
            true = (true != 2).tolist()
            y_true.extend(true)

            pred = np.array(model(prepare_sequence(sentence, word_vectors))[1])
#             pred = np.array(model(prepare_sequence(sentence, word_to_ix))[1])
            pred = (pred != 2).tolist()
            y_pred.extend(pred)

    return (y_true, y_pred)

## BiLSTM class



In [0]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, embedding_vectors, tag_to_ix, hidden_dim):
#     def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
#         self.embedding_dim = embedding_dim
        self.embedding_dim = embedding_vectors.vector_size
        self.hidden_dim = hidden_dim
        self.vocab_size = len(embedding_vectors.vocab)
#         self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_vectors.vectors))
#         self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the previous step, plus the score of transitioning from tag i to next_tag. We don't include the emission scores here because the max does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

# Train BiLSTM model



In [0]:
def train_model(word_vectors, tag_to_ix, HIDDEN_DIM, learning_rate, num_epochs, training_data):

  def slack_print(msg):
    print(msg)
#     slack.chat.post_message(channel='nlp-disease-model',
#                             text=msg,
#                             username='LSTM-log')
    
  def slack_savefig(fig, fname, fmt):
    fig.savefig(fname=fname, fmt=fmt)
#     slack.files.upload(file_=fname,
#                        channels='nlp-disease-model', 
#                        initial_comment=fname)
    
  slack_print('`START OF TRAINING`')
  slack_print('training size: {}'.format(len(training_data)))
  slack_print('learning rate: {}'.format(learning_rate))
  slack_print('epochs: {}'.format(num_epochs))
  
  fname_params = 'train-size-{}.lr-{}.epochs-{}'.format(num_train, learning_rate, num_epochs)
  
  # Initialize model & optimizer
  model = BiLSTM_CRF(word_vectors, tag_to_ix, HIDDEN_DIM)
  # model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
  optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-4)


  # Training...
  # Make sure prepare_sequence from earlier in the LSTM section is loaded
  hist = np.zeros(num_epochs)
  outputs = []
  for epoch in range(num_epochs):  # again, normally you would NOT do 300 epochs, it is toy data
      start = time.time()
      for sentence, tags in training_data:
          # Step 1. Remember that Pytorch accumulates gradients. We need to clear them out before each instance
          model.zero_grad()

          # Step 2. Get our inputs ready for the network, that is, turn them into Tensors of word indices.
          sentence_in = prepare_sequence(sentence, word_vectors)
  #         sentence_in = prepare_sequence(sentence, word_to_ix)
          targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

          # Step 3. Run our forward pass.
          loss = model.neg_log_likelihood(sentence_in, targets)

          # Step 4. Compute the loss, gradients, and update the parameters by
          # calling optimizer.step()
          loss.backward()
          optimizer.step()

      end = time.time()
      log_msg = ' '.join(["Epoch ", str(epoch), " | MSE: ", str(loss.item()), ' | Time elapsed: ', str(end - start)])
      slack_print(log_msg)
      hist[epoch] = loss.item()
      outputs.append(get_classes(training_data, model))

      # Plot loss
      loss_fig = plt.figure()
      plt.plot(hist)
      slack_savefig(loss_fig, fname='.loss.' + fname_params + '.png', fmt='png')

      # Calculate F1 scores
      f1_macro = []
      f1_micro = []
      for y_true, y_pred in outputs:
        f1_macro.append(metrics.f1_score(y_true, y_pred, average='macro'))
        f1_micro.append(metrics.f1_score(y_true, y_pred, average='micro'))

      # Plot F1 scores over epochs
      f1_fig = plt.figure()
      model_metrics = pd.DataFrame([range(num_epochs), f1_macro, f1_micro], index=['epoch', 'F1 (macro)', 'F1 (micro)']).T
      sns.lineplot(x='epoch', y='value', hue='variable', style='variable', dashes=False, data=model_metrics.melt(id_vars='epoch'))
      sns.despine()
      plt.ylim(0,1)
      
      slack_savefig(f1_fig, fname=drive_dir  + '.f1_scores.' + fname_params + '.png', fmt='png')

      
      # Save trained model
      torch.save(model.state_dict(), 'model.{}.pickle'.format(fname_params))

  with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_vectors)
    tag_scores = model(inputs)
    slack_print(tag_scores)
  
  slack_print('<!ckmah> `TRAINING DONE.`')
  return hist, outputs, loss_fig, f1_fig

## Define parameters and train

**NOTE:** do not run unless retraining model

In [0]:
# Model parameters
EMBEDDING_DIM = 5
HIDDEN_DIM = 4
START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}

In [0]:
# Define training and test data
training_data = [(sentence, tags) for sentence, tags in zip(corpus_list[:num_train], tag_list[:num_train])]
test_data = [(sentence, tags) for sentence, tags in zip(corpus_list[num_train:], tag_list[num_train:])]


results = train_model(word_vectors, tag_to_ix, HIDDEN_DIM, learning_rate, num_epochs, training_data)

# Evaluate Model

In [14]:
model = BiLSTM_CRF(word_vectors, tag_to_ix, HIDDEN_DIM)
model.load_state_dict(torch.load('model.train-size-100.lr-0.008.epochs-30.pickle'))

model.eval()

BiLSTM_CRF(
  (word_embeds): Embedding(14406, 100)
  (lstm): LSTM(100, 2, bidirectional=True)
  (hidden2tag): Linear(in_features=4, out_features=5, bias=True)
)

In [26]:
training_pred = get_classes(training_data, model)
print('F1-macro:', metrics.f1_score(training_pred[0], training_pred[1], average='macro'))
print('F1-micro:', metrics.f1_score(training_pred[0], training_pred[1], average='micro'))
print('Recall: ', metrics.recall_score(training_pred[0], training_pred[1]))
print('Precision: ', metrics.precision_score(training_pred[0], training_pred[1]))

F1-macro: 0.5750981969422336
F1-micro: 0.9380308109140802
Recall:  0.10164729919550504
Precision:  0.887402452619844


In [25]:
test_pred = get_classes(test_data, model)
print('Test data:')
print('F1-macro:', metrics.f1_score(test_pred[0], test_pred[1], average='macro'))
print('F1-micro:', metrics.f1_score(test_pred[0], test_pred[1], average='micro'))
print('Recall: ', metrics.recall_score(test_pred[0], test_pred[1]))
print('Precision: ', metrics.precision_score(test_pred[0], test_pred[1]))

Test data:
F1-macro: 0.5751365428442881
F1-micro: 0.938056862983483
Recall:  0.10164729919550504
Precision:  0.8903803131991052
