# 2-2. NER with LSTM + CRF
Another experiement I ran for the NER task with RNN based models is a bidirectional LSTM + CRF. I used a Bidirectional LSTM unit to learn the word embeddings and input to output mapping, and passed the output word vectors as input sequences to the CRF module to incorporate the label transition probabilities. 

In [13]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats

# sklearn imports
import sklearn
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, cross_val_predict

# pytorch imports 
import torch
import torch.autograd as autograd
from torch.utils.data import Dataset, DataLoader

from torch import Tensor
import torch.nn as nn
import torch.optim as optim

# set a random seed
torch.manual_seed(10);

# model saving and inspection
import joblib
import eli5
from datetime import datetime

import pdb # debugging

# auto-reloads
%reload_ext autoreload
%autoreload 2

In [14]:
print(f"sklearn version: {sklearn.__version__}")
print(f"pytorch version: {torch.__version__}")
# make sure we are using pytorch > 0.4.0

sklearn version: 0.20.0
pytorch version: 0.4.1


In [15]:
print(sys.path)
if ".." not in sys.path:
    sys.path.insert(0, "..")

['..', '..', '', '/home/hayley/miniconda3/envs/fastai/lib/python36.zip', '/home/hayley/miniconda3/envs/fastai/lib/python3.6', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/lib-dynload', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/site-packages', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/site-packages/defusedxml-0.5.0-py3.6.egg', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/site-packages/IPython/extensions', '/home/hayley/.ipython']


In [16]:
print(sys.path)

['..', '..', '', '/home/hayley/miniconda3/envs/fastai/lib/python36.zip', '/home/hayley/miniconda3/envs/fastai/lib/python3.6', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/lib-dynload', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/site-packages', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/site-packages/defusedxml-0.5.0-py3.6.egg', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/site-packages/IPython/extensions', '/home/hayley/.ipython']


In [17]:
pwd

'/home/hayley/Workspace/Class/IE/HW1/notebooks'

## Load data

In [18]:
from nlp_utils import data_converter, conlleval

In [19]:
# train_data_iob = data_converter.read_conll('../data/eng.train')[1:] #ignore header
train_data_bio = data_converter.read_conll('../data/train.bio')[1:] #ignore header
dev_data_bio = data_converter.read_conll('../data/testa.bio')[1:]
test_data = data_converter.read_conll('../data/eng.testb')[1:]

In [20]:
all_data = train_data_bio.copy()
all_data.extend(dev_data_bio)
all_data.extend(test_data)

In [25]:
datasets = [train_data_bio, dev_data_bio, test_data, all_data]
print("Dataset sizes")
print(" train, dev, test, all")
print(list(map(lambda data: len(data), datasets)))

Dataset sizes
 train, dev, test, all
[14985, 3464, 3683, 22132]


In [27]:
train_sentences = joblib.load('../data/train_sentences.sav')
train_labels = joblib.load('../data/train_labels.sav')

dev_sentences = joblib.load('../data/testa_sentences.sav')
dev_labels = joblib.load('../data/testa_labels.sav')

test_sentences = joblib.load('../data/testb_sentences.sav')

# indice mappers
word2idx = joblib.load('../data/word2idx.sav')
tag2idx = joblib.load('../data/tag2idx.sav')

In [28]:
# Basic statistics on the datasets and lookup tables
print('train sentences: ', len(train_sentences), len(train_labels))
print('dev sentences: ', len(dev_sentences), len(dev_labels))
print('test sentences: ', len(test_sentences))
print('vocab size: ', len(word2idx))
print('number of tags: ', len(tag2idx))

train sentences:  10490 10490
dev sentences:  3464 3464
test sentences:  3683
vocab size:  15002
number of tags:  11


Unlike the CRF models, we don't need to hand-engineer the features for our word representation. Instead, we use the embedding matrix and learn the parameters for a vector representation of a word.

In [29]:
# Define START AND STOP TAGS
START_TAG = '<START>'
STOP_TAG = '<STOP>'

In [31]:
# Sanity check
idx2word = {i:w for (w,i) in word2idx.items()}
some_words = ['EU', 'German']
for w in some_words:
    idx = word2idx[w]
    print(f'w: {w}, idx: {idx}')
    inv_w = idx2word[idx]
    print(f'inv_w: {inv_w}')

w: EU, idx: 1045
inv_w: EU
w: German, idx: 238
inv_w: German


## Helper functions
Let's define some helper functions for the Viterbi algorithm and the input sentence processing.

In [32]:
# numerically stable log-sum computation for Viterbi forward
def log_sum_exp(vec):
    vec = vec.view(1,-1)
    max_val, _ = torch.max(vec, 1)
    return max_val + torch.log(torch.sum(torch.exp(vec - max_val)))

Simple test on log_sum_exp function

In [33]:
def test_logsumexp():
    t = torch.rand(2,2).view(1,-1).fill_(1e6)
    print(f"t: {t}")
    print(f"logsumexp: {log_sum_exp(t)}")
    print(f"\nt: {t.fill_(1e-10)}")
    print(f"logsumexp: {log_sum_exp(t)}")
test_logsumexp()

t: tensor([[1000000., 1000000., 1000000., 1000000.]])
logsumexp: tensor([1000001.3750])

t: tensor([[1.0000e-10, 1.0000e-10, 1.0000e-10, 1.0000e-10]])
logsumexp: tensor([1.3863])


In [34]:
# Helper function to convert words and tags to corresponding indices
def prepare_sentence(sentence, word2idx):
    """
    Returns a tensor of word indices given a list of words
    Args:
    - sentence (list): a list of word_infos. Each word_info is a tuple of (word, ..., label)
    - word2idx (dict): a  dictionary mapping each word in the vocab
                        to a unique index
    Returns:
    - indices (torch.LongTensor): a tensor of word indices
    - tags: (torch.LongTensor): a tensor of tag indices
    """
    indices = []
    tags = []
    for word_info in sentence:
        w,*_,t = word_info
        indices.append(word2idx[w])
        tags.append(tag2idx[t])

#     return torch.tensor(indices, dtype=torch.long), torch.tensor(tags, dtype=torch.long)
    return indices, tags

In [35]:
def test_prepare_sentence():
    test_sent = train_data_bio[0]
    word_indices, tag_indices = prepare_sentence(test_sent, word2idx)
    print(f"word_indices: \n{word_indices}")
    print(f"tag_indices: \n{tag_indices}")
test_prepare_sentence()

word_indices: 
[1045, 10620, 238, 824, 5, 3808, 229, 8246, 1]
tag_indices: 
[0, 1, 2, 1, 1, 1, 2, 1, 1]


## Create BiLSTM CRF model
Now, let's define our main model using Bidirectional LSTM for word->feature extraction 
and to learn the output probability over the tag space using CRF based model.

The viterbi forward and backtracing were mostly taken from [this tutorial](https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html).

In [37]:
class BILSTM_CRF(nn.Module):
    def __init__(self, dim_embedding, dim_hidden, vocab_size, tag2idx,
                 n_lstm_layers=1):
        super(BILSTM_CRF, self).__init__()
        self.dim_embedding = dim_embedding
        self.dim_hidden = dim_hidden
        self.n_lstm_layers = n_lstm_layers
        self.vocab_size = vocab_size
        self.tag2idx = tag2idx
        self.output_size = len(tag2idx) #n_tags
        
        self.embedding = nn.Embedding(vocab_size, dim_embedding)
        self.lstm = nn.LSTM(dim_embedding, dim_hidden//2,
                            num_layers=n_lstm_layers, bidirectional=True)
        
        # output of biLSTM to tag
        self.hidden2tag = nn.Linear(self.dim_hidden, self.output_size)
        
        # Transition matrix for CRF
        ## T(i,j) = log_prob(tag_j -> tag_i). Note **from** j **to** i
        self.transitions = nn.Parameter(torch.randn(self.output_size, self.output_size))
        ## Never transit tag_i -> START_TAG and END_TAG -> tag_i
        self.transitions.data[tag2idx[START_TAG], :] = -1e6
        self.transitions.data[:, tag2idx[STOP_TAG],] = -1e6
        
        # Initial hidden layers
        self.hidden = self.init_hidden()
    
    def init_hidden(self):
        return [torch.randn(2,1,self.dim_hidden//2), 
                torch.randn(2,1,self.dim_hidden//2)]
    def _viterbi_forward(self, feats):
        """
        Args:
        - feats (tensor): output feature vector from LSTM layer
        """
        # Forward pass to compute the partition function
        init_alphas = torch.full((1, self.output_size), -1e-6)
        
        # Fill in the entries for START_TAG
        init_alphas[0][self.tag2idx[START_TAG]] = 0.0
        
        # For automatic backprop
        forward_var = init_alphas
        
        # Iterate through the sequence
        for feat in feats:
            alphas = []
            for tag in range(self.output_size):
                emit_score = torch.full((1,self.output_size), feat[tag].item())
                
                # jth entry of trans_score is the score of transitioning from j
                # to tag
                trans_score = self.transitions[tag].view(1,-1)
                
                tag_var = forward_var + emit_score + trans_score
                alphas.append(log_sum_exp(tag_var).view(1))
            forward_var = torch.cat(alphas).view(1,-1)
        terminal_var = forward_var + self.transitions[self.tag2idx[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha
    
    def _get_lstm_features(self, sentence):
        """
        Args:
        - sentence (torch.LongTensor): a 1D LongTensor of word indices
        """
        self.hidden = self.init_hidden()
        embedding = self.embedding(sentence).view(len(sentence), 1, -1)
        
        # Forward through LSTM
        lstm_out, self.hidden = self.lstm(embedding, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.dim_hidden)
        
        # Forward the feature vector from LSTM to output activation
        # through another linear layer
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats
    
    def _score_sentence(self, feats, tags):
        # Returns the score of the input tag sequence
        score = torch.zeros(1)
        
        # Prepend the START_TAG
        tags = torch.cat([torch.tensor([self.tag2idx[START_TAG]], dtype=torch.long),
                          tags])
        
        for i, feat in enumerate(feats):
            score += self.transitions[tags[i+1], tags[i]] + feat[tags[i+1]]
      
        # Lastly, add the transition score to the STOP_TAG
        score += self.transitions[self.tag2idx[STOP_TAG], tags[-1]]
        return score
    
    def _viterbi_decode(self, feats):
        backpointers = []
        
        # Initialize the viterbi vars in log domain
        init_vvars = torch.full( (1, self.output_size), -1e6 )
        init_vvars[0][self.tag2idx[START_TAG]] = 0 # initial starting point
        
        # Forward viterbi algorithm
        forward_var = init_vvars
        for feat in feats:
            
            bptrs = [] # backpointers for this time step
            vvars = [] # viberbi vars for this time step
            for tag in range(self.output_size):
                tag_var = forward_var + self.transitions[tag]
                _, best_tid = torch.max(tag_var,1)
                best_tid = best_tid.item()
                bptrs.append(best_tid)
                vvars.append(tag_var[0][best_tid].view(1))
            # Add in the emission scores 
            forward_var = (torch.cat(vvars) + feat).view(1,-1)
            backpointers.append(bptrs)
            
        # Add transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag2idx[STOP_TAG]]
        _, best_tid =  torch.max(terminal_var,1)
        best_tid = best_tid.item()
        path_score = terminal_var[0][best_tid]
        
        # Backtrace the backpointers to find the best path
        best_path = [best_tid]
        for bptrs in reversed(backpointers):
            best_tid = bptrs[best_tid]
            best_path.append(best_tid)
        
        # Remove the START_TAG 
        start = best_path.pop()
        assert (start == self.tag2idx[START_TAG])
        
        # Reverse the path order
        best_path.reverse()
        
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        # Computes negative log likelihood of having tags given the sentence
        feats = self._get_lstm_features(sentence)
        forward_score = self._viterbi_forward(feats)
        score = self._score_sentence(feats, tags)
        return forward_score - score
    
    def forward(self, sentence):
        # Returns the best path score and the best path, given the setence
        
        # features from the BILSTM
        lstm_feats = self._get_lstm_features(sentence)
        
        # Find the best path and its score, given the input sentence
        best_score, tag_seq = self._viterbi_decode(lstm_feats)
        return best_score, tag_seq

## Train the model

In [40]:
# Set hyperparams
DIM_EMBEDDING = 500
DIM_HIDDEN = 200
LR = 0.0001
N_EPOCH = 1000

In [41]:
# Initialize the model
model = BILSTM_CRF(dim_embedding=DIM_EMBEDDING, 
                   dim_hidden=DIM_HIDDEN,
                   vocab_size=len(word2idx),
                   tag2idx=tag2idx)
optimizer = optim.Adam(model.parameters(), lr=LR)

# Initial prediciton before training
with torch.no_grad():
    precheck_sent, precheck_tag = prepare_sentence(train_data_bio[0], word2idx)
    print("Sentence 0: ", precheck_sent)
    print("GT tag: ", precheck_tag)
    print("="*50)
    print("Pretrain prediciton on sentence 0")
    print(model(precheck_sent))

Sentence 0:  [1045, 10620, 238, 824, 5, 3808, 229, 8246, 1]
GT tag:  [0, 1, 2, 1, 1, 1, 2, 1, 1]
Pretrain prediciton on sentence 0


TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list

In [None]:
# Training loop
for epoch in range(N_EPOCH):
    for _sent in train_data_bio:
        
        # Get input sentence and tags
        sent, tags = prepare_sentence(_sent, word2idx)
        
        # Make sure no gradient lingering from previous iters
        model.zero_grad()
        
        # Forward pass
        loss = model.neg_log_likelihood(sent, tags)
        
        # Update the weights
        loss.backward()
        optimizer.step()     

In [None]:
# Evaluate the predicitons after training
with torch.no_grad():
    sent0, tags0 = prepare_sentence(train_data_bio[0], word2idx)
    print("Predicition after {N_EPOCH} epochs")
    print("GT: {tags0}")
    print(model(sent0))

In [143]:
# Save the model
def get_current_time():
    now = datetime.now()
    now_str = f"{now.strftime('%m')}_{now.strftime('%d')}_{now.strftime('%H')}_{now.strftime('%M')}"
    return now_str

joblib.dump(model, '../progress_rnn2/bilstm_crf_epoch:65_.sav')

['../trained/bilstm_crf_epoch:65_.sav']

In [142]:
epoch

65