In [87]:
from fastNLP.modules.encoder.lstm import LSTM
from fastNLP.modules.encoder.linear import Linear
from fastNLP.modules.encoder.embedding import Embedding
from fastNLP.modules.decoder.CRF import ConditionalRandomField
from fastNLP.io.dataset_loader import Conll2003Loader
from fastNLP.models.base_model import BaseModel
from fastNLP import Vocabulary
from fastNLP.modules import decoder, encoder
from fastNLP.modules.utils import seq_mask
from fastNLP.core.metrics import MetricBase
from fastNLP import Trainer, Tester
from fastNLP import AccuracyMetric
from fastNLP.core.optimizer import SGD
import numpy as np
import torch
import torch.nn as nn

In [76]:
class BiLSTMCRF(nn.Module):
    
    def __init__(self, config):
        super(BiLSTMCRF, self).__init__()
        vocab_size = config["vocab_size"]
        word_emb_dim = config["word_emb_dim"]
        hidden_dim = config["rnn_hidden_units"]
        num_classes = config["num_classes"]
        bi_direciton = config["bi_direction"]
        self.Embedding = Embedding(vocab_size, word_emb_dim)
        self.Lstm = LSTM(word_emb_dim, hidden_dim, bidirectional=bi_direciton)
        self.Linear = Linear(2*hidden_dim if bi_direciton else hidden_dim, num_classes)
        self.Crf = ConditionalRandomField(num_classes)
        self.mask = None
        

    def forward(self, token_index_list, speech_index_list=None):
        max_len = len(token_index_list)
        self.mask = self.make_mask(token_index_list, max_len)
        
        x = self.Embedding(token_index_list) # [batch_size, max_len, word_emb_dim]
        x = self.Lstm(x) # [batch_size, max_len, hidden_size]
        x = self.Linear(x) # [batch_size, max_len, num_classes]
        
        loss = None
        ## Calculate the loss value
        if speech_index_list is not None:
            total_loss = self.Crf(x, speech_index_list, self.mask) ## [batch_size, 1]
            loss = torch.mean(total_loss)
        ## Get the part of speech
        tag_seq = self.Crf.viterbi_decode(x, self.mask)
        
        # pad prediction to equal length
#         import pdb;pdb.set_trace()
        for pred in tag_seq:
            if len(pred) < max_len:
                pred = torch.cat((pred, torch.LongTensor([0] * (max_len - len(pred))).cuda()))
        
        return {
            "loss": loss,
            "pred": tag_seq
        }
        
    
    def make_mask(self, x, seq_len):
        batch_size, max_len = x.size(0), x.size(1)
        mask = seq_mask(seq_len, max_len)
        mask = mask.view(batch_size, max_len)
        mask = mask.to(x).float()
        return mask



# if __name__ == "__main__":


data_loader = Conll2003Loader()
train_data = data_loader.load("/remote-home/nndl/data/CONLL2003/train.txt")
valid_data = data_loader.load("/remote-home/nndl/data/CONLL2003/valid.txt")
test_data = data_loader.load("/remote-home/nndl/data/CONLL2003/test.txt")



        

In [77]:
#Lower case the words in the sentences
train_data.apply(lambda x: list(map(lambda item: item.lower(), x['token_list'])), new_field_name='token_list')
test_data.apply(lambda x: list(map(lambda item: item.lower(), x['token_list'])), new_field_name='token_list')
valid_data.apply(lambda x: list(map(lambda item: item.lower(), x['token_list'])), new_field_name='token_list')

vocab = Vocabulary(min_freq=1)
train_data.apply(lambda x: [vocab.add(word) for word in x['token_list']])
valid_data.apply(lambda x: [vocab.add(word) for word in x['token_list']])
test_data.apply(lambda x: [vocab.add(word) for word in x['token_list']])
vocab.build_vocab()

speech_vocab = Vocabulary(min_freq=1)
train_data.apply(lambda x: [speech_vocab.add(word) for word in x['label0_list']])
valid_data.apply(lambda x: [speech_vocab.add(word) for word in x['label0_list']])
test_data.apply(lambda x: [speech_vocab.add(word) for word in x['label0_list']])

[[None, None, None, None, None, None, None, None, None, None, None, None],
 [None, None],
 [None, None, None, None, None, None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  No

In [78]:
train_data.apply(lambda x: [vocab.to_index(word) for word in x['token_list']], new_field_name='token_index_list')
test_data.apply(lambda x: [vocab.to_index(word) for word in x['token_list']], new_field_name='token_index_list')
valid_data.apply(lambda x: [vocab.to_index(word) for word in x['token_list']], new_field_name='token_index_list')

train_data.apply(lambda x: [speech_vocab.to_index(word) for word in x['label0_list']], new_field_name='speech_index_list')
test_data.apply(lambda x: [speech_vocab.to_index(word) for word in x['label0_list']], new_field_name='speech_index_list')
valid_data.apply(lambda x: [vocab.to_index(word) for word in x['label0_list']], new_field_name='speech_index_list')



In [79]:
class PosMetric(MetricBase):
    def __init__(self, pred=None, target=None):
        super().__init__()

        self._init_param_map(pred=pred, target=target)

        self.total = 0
        self.acc_count = 0
        

    def evaluate(self, pred, target):
        """

        :param pred: List of (torch.Tensor, or numpy.ndarray). Element's shape can be:
                torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), torch.Size([B, max_len, n_classes])
        :param target: List of (torch.Tensor, or numpy.ndarray). Element's can be:
                torch.Size([B,]), torch.Size([B,]), torch.Size([B, max_len]), torch.Size([B, max_len])
        :param seq_lens: List of (torch.Tensor, or numpy.ndarray). Element's can be:
                None, None, torch.Size([B], torch.Size([B]). ignored if masks are provided.
        :return: dict({'acc': float})
        """
        
        self.acc_count += torch.sum(torch.eq(pred, target).float()).item()
        self.total += np.prod(list(pred.size()))

    def get_metric(self):
        return {
            'acc': round(self.acc_count / self.total, 6)
        }

In [86]:
config = {
    "vocab_size": len(vocab),
    "word_emb_dim": 200, 
    "rnn_hidden_units": 600,
    "num_classes": len(speech_vocab),
    "bi_direction": True
}
train_data.set_input("token_index_list", "speech_index_list")
test_data.set_input("token_index_list", "speech_index_list")
valid_data.set_input("token_index_list")

train_data.set_target("speech_index_list")
test_data.set_target("speech_index_list")
valid_data.set_target("speech_index_list")

model = BiLSTMCRF(config)

trainer = Trainer(
    model=model, 
    train_data=train_data, 
    dev_data=valid_data,
    use_cuda=True,
    metrics=PosMetric(pred='pred', target='speech_index_list'),
    optimizer=SGD(lr=0.1),
    n_epochs=100, 
    batch_size=1000,
    save_path="./"
)
trainer.train()

input fields after batch(if batch size is 2):
	token_index_list: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 9]) 
	speech_index_list: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 9]) 
target fields after batch(if batch size is 2):
	speech_index_list: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 9]) 

training epochs started 2019-01-09 00-03-30


HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=1500), HTML(value='')), layout=Layout(display…



RuntimeError: CUDA out of memory. Tried to allocate 1.29 GiB (GPU 0; 7.93 GiB total capacity; 5.02 GiB already allocated; 1001.25 MiB free; 1.38 GiB cached)

In [84]:
test = Tester(data=test_data, 
              model=model, 
              metrics=PosMetric(pred='pred', target='speech_index_list')
       )

In [85]:
test.test()

[tester] 
PosMetric: acc=0.671786


{'PosMetric': {'acc': 0.671786}}