# Imports

In [1]:
import pandas as pd
from collections import Counter
import re
from torch.utils.data import Dataset
import numpy as np
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


# Vocab

In [2]:
class Vocab(object):
    def __init__(self, iter, max_size=None, sos_token=None, eos_token=None, unk_token=None):
        """Initialize the vocabulary.
        Args:
            iter: An iterable which produces sequences of tokens used to update
                the vocabulary.
            max_size: (Optional) Maximum number of tokens in the vocabulary.
            sos_token: (Optional) Token denoting the start of a sequence.
            eos_token: (Optional) Token denoting the end of a sequence.
            unk_token: (Optional) Token denoting an unknown element in a
                sequence.
        """
        self.max_size = max_size
        self.pad_token = '<pad>'
        self.sos_token = sos_token
        self.eos_token = eos_token
        self.unk_token = unk_token

        # Add special tokens.
        id2word = [self.pad_token]
        if sos_token is not None:
            id2word.append(self.sos_token)
        if eos_token is not None:
            id2word.append(self.eos_token)
        if unk_token is not None:
            id2word.append(self.unk_token)

        # Update counter with token counts.
        counter = Counter()
        for x in iter:
            counter.update(x)

        # Extract lookup tables.
        if max_size is not None:
            counts = counter.most_common(max_size)
        else:
            counts = counter.items()
            counts = sorted(counts, key=lambda x: x[1], reverse=True)
        words = [x[0] for x in counts]
        id2word.extend(words)
        word2id = {x: i for i, x in enumerate(id2word)}

        self._id2word = id2word
        self._word2id = word2id

    def __len__(self):
        return len(self._id2word)

    def word2id(self, word):
        """Map a word in the vocabulary to its unique integer id.
        Args:
            word: Word to lookup.
        Returns:
            id: The integer id of the word being looked up.
        """
        if word in self._word2id:
            return self._word2id[word]
        elif self.unk_token is not None:
            return self._word2id[self.unk_token]
        else:
            raise KeyError('Word "%s" not in vocabulary.' % word)

    def id2word(self, id):
        """Map an integer id to its corresponding word in the vocabulary.
        Args:
            id: Integer id of the word being looked up.
        Returns:
            word: The corresponding word.
        """
        return self._id2word[id]

# CoNLLDataset e Annotation

In [3]:
class Annotation(object):
    def __init__(self):
        """A helper object for storing annotation data."""
        self.tokens = []
        self.pos_tags = []


class CoNLLDataset(Dataset):
    def __init__(self, fname, max_exs=None):
        """Initializes the CoNLLDataset.
        Args:
            fname: The .conllu file to load data from.
        """
        self.fname = fname
        self.annotations = self.process_conll_file(fname, max_exs)
        self.token_vocab = Vocab([x.tokens for x in self.annotations],
                                 unk_token='<unk>')
        self.pos_vocab = Vocab([x.pos_tags for x in self.annotations])
        

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        annotation = self.annotations[idx]
        input = [self.token_vocab.word2id(x) for x in annotation.tokens]
        target = [self.pos_vocab.word2id(x) for x in annotation.pos_tags]
        return input, target

    def process_conll_file(self, fname, max_exs):
        # Read the entire file.
        with open(fname, 'r') as f:
            raw_text = f.read()
        # Split into chunks on blank lines.
        chunks = re.split(r'^\n', raw_text, flags=re.MULTILINE)
        #print(chunks)
        # Process each chunk into an annotation.
        annotations = []
        exs = 0
        for chunk in chunks:
            if not max_exs or exs < max_exs:
                annotation = Annotation()
                lines = chunk.split('\n')
                # Iterate over all lines in the chunk.
                for line in lines:
                    # If line is empty ignore it.
                    if len(line)==0:
                        continue
                    # If line is a commend ignore it.
                    if line[0] == '#':
                        continue
                    # Otherwise split on tabs and retrieve the token and the
                    # POS tag fields.
                    fields = line.split('\t')
                    annotation.tokens.append(fields[1])
                    annotation.pos_tags.append(fields[3])
                if (len(annotation.tokens) > 0) and (len(annotation.pos_tags) > 0):
                    annotations.append(annotation)
            exs += 1
        return annotations

# Funções: pad() e collate_annotations()

In [4]:
def pad(sequences, max_length, pad_value=0):
    """Pads a list of sequences.
    Args:
        sequences: A list of sequences to be padded.
        max_length: The length to pad to.
        pad_value: The value used for padding.
    Returns:
        A list of padded sequences.
    """
    out = []
    for sequence in sequences:
        padded = sequence + [0]*(max_length - len(sequence))
        out.append(padded)
    return out


def collate_annotations(batch):
    """Function used to collate data returned by CoNLLDataset."""
    # Get inputs, targets, and lengths.
    inputs, targets = zip(*batch)
    lengths = [len(x) for x in inputs]
    # Sort by length.
    sort = sorted(zip(inputs, targets, lengths),
                  key=lambda x: x[2],
                  reverse=True)
    inputs, targets, lengths = zip(*sort)
    # Pad.
    max_length = max(lengths)
    inputs = pad(inputs, max_length)
    targets = pad(targets, max_length)
    # Transpose.
    inputs = list(map(list, zip(*inputs)))
    targets = list(map(list, zip(*targets)))
    # Convert to PyTorch variables.
    inputs = Variable(torch.LongTensor(inputs))
    targets = Variable(torch.LongTensor(targets))
    lengths = Variable(torch.LongTensor(lengths))
    if torch.cuda.is_available():
        inputs = inputs.cuda()
        targets = targets.cuda()
        lengths = lengths.cuda()
    return inputs, targets, lengths

# Tagger - GRU

In [5]:

from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Tagger(nn.Module):
    def __init__(self,
                 input_dim,
                 output_dim,
                 n_layers, 
                 embedding_dim=64,
                 hidden_dim=64,
                 dropout=0.5,
                 bidirectional=True,
                 pad_idx=0):
        """Initializes the tagger.
        
        Args:
            input_dim: Size of the input vocabulary, projection
            output_dim: Size of the output vocabulary.
            embedding_dim: Dimension of the word embeddings.
            hidden_dim: Number of units in each LSTM hidden layer.
            bidirectional: Whether or not to use a bidirectional rnn.
        """
        super(Tagger, self).__init__()

        # Store parameters
        self.input_dim = input_dim 
        self.output_dim = output_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
          
        # Define layers
        self.word_embeddings = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers = n_layers, 
                          bidirectional=bidirectional,
                          dropout = dropout if n_layers > 1 else 0)
        
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.activation = nn.LogSoftmax(dim=2)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths=None, hidden=None):
        """Computes a forward pass of the language model.
        
        Args:
            x: A LongTensor w/ dimension [seq_len, batch_size].
            lengths: The lengths of the sequences in x.
            hidden: Hidden state to be fed into the lstm.
            
        Returns:
            net: Probability of the next word in the sequence.
            hidden: Hidden state of the lstm.
        """
        seq_len, batch_size = x.size()
        
        # If no hidden state is provided, then default to zeros.
        if hidden is None:
            if self.bidirectional:
                num_directions = 2
            else:
                num_directions = 1
            hidden = Variable(torch.zeros(num_directions, batch_size, self.hidden_dim))
            if torch.cuda.is_available():
                hidden = hidden.cuda()

        net = self.word_embeddings(x)
        # Pack before feeding into the RNN.
        if lengths is not None:
            lengths = lengths.data.view(-1).tolist()
            net = pack_padded_sequence(net, lengths)
        net, hidden = self.rnn(net, hidden)
        # Unpack after
        if lengths is not None:
            net, _ = pad_packed_sequence(net)
        net = self.fc(net)
        net = self.activation(net)

        return net, hidden

# Training Model

In [6]:
# Load datasets.
train_dataset = CoNLLDataset('./datasets/pt_bosque-ud-train.conllu', 4096)
dev_dataset = CoNLLDataset('./datasets/pt_bosque-ud-dev.conllu', 1024)

dev_dataset.token_vocab = train_dataset.token_vocab
dev_dataset.pos_vocab = train_dataset.pos_vocab

# Hyperparameters / constants.
input_vocab_size = len(train_dataset.token_vocab)
output_vocab_size = len(train_dataset.pos_vocab)
batch_size = 16
epochs = 6
n_layers = 1

# Initialize the model.
model = Tagger(input_vocab_size, output_vocab_size, n_layers)
if torch.cuda.is_available():
    model = model.cuda()

# Loss function weights.
weight = torch.ones(output_vocab_size)
weight[0] = 0
if torch.cuda.is_available():
    weight = weight.cuda()
    
# Initialize loss function and optimizer.
loss_function = torch.nn.NLLLoss(weight)
optimizer = torch.optim.Adam(model.parameters())

# Main training loop.
data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                         collate_fn=collate_annotations)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False,
                        collate_fn=collate_annotations)
losses = []
i = 0
for epoch in range(epochs):
    for inputs, targets, lengths in data_loader:
        optimizer.zero_grad()
        outputs, _ = model(inputs, lengths=lengths)

        outputs = outputs.view(-1, output_vocab_size)
        print('---------------outputs----------------')
        print(outputs.shape)
        print(outputs)
        targets = targets.view(-1)
        print('---------------targets----------------')
        print(targets.shape)
        print(targets)

        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

        #losses.append(loss.data[0])
        losses.append(loss.item())
        if (i % 10) == 0:
            # Compute dev loss over entire dev set.
            # NOTE: This is expensive. You may want to only use a 
            # subset of the dev set.
            #print('iteration, ', i)
            dev_losses = []
            for inputs, targets, lengths in dev_loader:
                outputs, _ = model(inputs, lengths=lengths)
                outputs = outputs.view(-1, output_vocab_size)
                targets = targets.view(-1)
                loss = loss_function(outputs, targets)
                dev_losses.append(loss.item())
            avg_train_loss = np.mean(losses)
            avg_dev_loss = np.mean(dev_losses)
            losses = []
            #print('here')
            print('Epoch %i Iteration %i - Train Loss: %0.6f - Dev Loss: %0.6f' % (epoch, i, avg_train_loss, avg_dev_loss), end='\n')
            torch.save(model, 'pos_tagger_gru.pt')
        i += 1
        
torch.save(model, 'pos_tagger_gru.final.pt')

---------------outputs----------------
torch.Size([720, 19])
tensor([[-2.9833, -2.9880, -3.0604,  ..., -3.0257, -3.1751, -2.8746],
        [-2.6667, -3.1559, -2.9370,  ..., -2.6749, -3.1603, -2.7346],
        [-3.0176, -3.1779, -2.9734,  ..., -2.8453, -3.2018, -2.9602],
        ...,
        [-2.8657, -3.0024, -2.9583,  ..., -2.9776, -2.9520, -2.9357],
        [-2.8657, -3.0024, -2.9583,  ..., -2.9776, -2.9520, -2.9357],
        [-2.8657, -3.0024, -2.9583,  ..., -2.9776, -2.9520, -2.9357]],
       grad_fn=<ViewBackward0>)
---------------targets----------------
torch.Size([720])
tensor([ 2, 13,  7,  2,  2,  2,  6, 11,  6,  4,  1,  6, 13,  4,  1,  1,  1, 10,
         3,  1,  1,  1, 12,  5,  6,  5, 12,  9,  1, 13,  7,  8,  4,  9,  2,  4,
         5,  5, 14,  2,  4,  3,  5, 12,  9,  9,  3,  0,  1,  3,  1,  5,  1,  4,
         7,  1,  1,  1, 13,  5,  9,  5,  2,  0,  4,  9,  5,  7,  3,  7,  3,  3,
         8, 10,  2,  7, 13, 10,  1,  0, 12,  5,  4,  3,  1,  3,  2,  1,  7,  5,
         1,  3, 

In [7]:
train_dataset[0]

([235, 19, 7, 5, 75], [6, 7, 3, 2, 1])

In [8]:
data_loader

<torch.utils.data.dataloader.DataLoader at 0x7f4cd03fcb20>

In [9]:
# Collect the predictions and targets
y_true = []
y_pred = []

for inputs, targets, lengths in dev_loader:
    outputs, _ = model(inputs, lengths=lengths)
    _, preds = torch.max(outputs, dim=2)
    targets = targets.view(-1)
    preds = preds.view(-1)
    if torch.cuda.is_available():
        targets = targets.cpu()
        preds = preds.cpu()
    y_true.append(targets.data.numpy())
    y_pred.append(preds.data.numpy())

# Stack into numpy arrays
y_true = np.concatenate(y_true)
y_pred = np.concatenate(y_pred)

# Compute accuracy
acc = np.mean(y_true[y_true != 0] == y_pred[y_true != 0])
print('Accuracy - %0.6f\n' % acc)

# Evaluate f1-score
from sklearn.metrics import f1_score
score = f1_score(y_true, y_pred, average=None)
print('F1-scores:\n')
for label, score in zip(dev_dataset.pos_vocab._id2word[1:], score[1:]):
    print('%s - %0.6f' % (label, score))

Accuracy - 0.869652

F1-scores:

NOUN - 0.823098
DET - 0.959861
ADP - 0.963025
PUNCT - 0.997705
VERB - 0.091743
PROPN - 0.611173
_ - 0.983504
ADJ - 0.622965
ADV - 0.820966
PRON - 0.874222
CCONJ - 0.989862
AUX - 0.893428
SCONJ - 0.699286
NUM - 0.763359
SYM - 1.000000
X - 0.000000
INTJ - 0.000000


In [10]:
model = torch.load('pos_tagger_gru.final.pt')

def inference(sentence):
    # Convert words to id tensor.
    ids = [[dataset.token_vocab.word2id(x)] for x in sentence]
    ids = Variable(torch.LongTensor(ids))
    if torch.cuda.is_available():
        ids = ids.cuda()
    # Get model output.
    output, _ = model(ids)
    _, preds = torch.max(output, dim=2)
    if torch.cuda.is_available():
        preds = preds.cpu()
    preds = preds.data.view(-1).numpy()
    pos_tags = [dataset.pos_vocab.id2word(x) for x in preds]
    for word, tag in zip(sentence, pos_tags):
        print('%s - %s' % (word, tag))

In [11]:
def inference_with_labels(sentence, labels):
    #print(sentence)
    # Convert words to id tensor.
    ids = [[dataset.token_vocab.word2id(x)] for x in sentence]
    print(ids)
    ids = Variable(torch.LongTensor(ids))
    if torch.cuda.is_available():
        ids = ids.cuda()
    # Get model output.
    output, _ = model(ids)
    _, preds = torch.max(output, dim=2)
    if torch.cuda.is_available():
        preds = preds.cpu()
    preds = preds.data.view(-1).numpy()
    pos_tags = [dataset.pos_vocab.id2word(x) for x in preds]
    #labels = [dataset.pos_vocab.id2word(x) for x in labels]
    #sentence = [test_dataset.token_vocab.id2word(x) for x in ids]
    for word, tag, label in zip(sentence, pos_tags, labels):
        print('%s - %s - %s' % (word, tag, label))

In [12]:
test_dataset = CoNLLDataset('./datasets/pt_bosque-ud-test.conllu')
dataset = CoNLLDataset('./datasets/pt_bosque-ud-train.conllu')

sentence, labels = test_dataset[10]
sentence = [test_dataset.token_vocab.id2word(x) for x in sentence]
print(sentence)
labels = [test_dataset.pos_vocab.id2word(x) for x in labels]
inference_with_labels(sentence, labels)

['Os', 'policiais', 'federais', 'de', 'Mato', 'Grosso', 'do', 'de', 'o', 'Sul', 'entraram', 'em', 'greve', 'ontem', ',', 'em', 'adesão', 'ao', 'a', 'o', 'movimento', 'iniciado', 'no', 'em', 'o', 'Distrito', 'Federal', '.']
[[48], [1361], [3935], [2], [3265], [3266], [11], [2], [5], [496], [3737], [7], [1219], [58], [4], [7], [3524], [31], [3], [5], [962], [6250], [22], [7], [5], [15291], [653], [6]]
Os - SCONJ - DET
policiais - ADV - NOUN
federais - ADJ - ADJ
de - ADP - ADP
Mato - NOUN - PROPN
Grosso - VERB - PROPN
do - PRON - _
de - ADP - ADP
o - DET - DET
Sul - NOUN - PROPN
entraram - VERB - VERB
em - ADP - ADP
greve - ADJ - NOUN
ontem - SYM - ADV
, - AUX - PUNCT
em - ADP - ADP
adesão - NOUN - NOUN
ao - PROPN - _
a - PUNCT - ADP
o - DET - DET
movimento - NOUN - NOUN
iniciado - VERB - VERB
no - _ - _
em - ADP - ADP
o - DET - DET
Distrito - PROPN - PROPN
Federal - NOUN - PROPN
. - PUNCT - PUNCT
