In [1]:
!pwd

/content


In [2]:
from google.colab import drive
# drive.mount('/content/drive/', force_remount=True)
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
!pwd

/content


In [4]:
cd "drive/MyDrive/Doutorado/Disciplinas/[2022.1] [UFF] Processamento de Linguagem Natural - Professora: Aline Marins Paes Carvalho/Trabalhos/Trabalho 2 - POS  e Transfer Learning/"

/content/drive/MyDrive/Doutorado/Disciplinas/[2022.1] [UFF] Processamento de Linguagem Natural - Professora: Aline Marins Paes Carvalho/Trabalhos/Trabalho 2 - POS  e Transfer Learning


In [5]:
!pwd

/content/drive/MyDrive/Doutorado/Disciplinas/[2022.1] [UFF] Processamento de Linguagem Natural - Professora: Aline Marins Paes Carvalho/Trabalhos/Trabalho 2 - POS  e Transfer Learning


# Imports

In [6]:
import pandas as pd
from collections import Counter
import re
from torch.utils.data import Dataset
import numpy as np
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Vocab

In [7]:
class Vocab(object):
    def __init__(self, iter, max_size=None, sos_token=None, eos_token=None, unk_token=None):
        """Initialize the vocabulary.
        Args:
            iter: An iterable which produces sequences of tokens used to update
                the vocabulary.
            max_size: (Optional) Maximum number of tokens in the vocabulary.
            sos_token: (Optional) Token denoting the start of a sequence.
            eos_token: (Optional) Token denoting the end of a sequence.
            unk_token: (Optional) Token denoting an unknown element in a
                sequence.
        """
        self.max_size = max_size
        self.pad_token = '<pad>'
        self.sos_token = sos_token
        self.eos_token = eos_token
        self.unk_token = unk_token

        # Add special tokens.
        id2word = [self.pad_token]
        if sos_token is not None:
            id2word.append(self.sos_token)
        if eos_token is not None:
            id2word.append(self.eos_token)
        if unk_token is not None:
            id2word.append(self.unk_token)

        # Update counter with token counts.
        counter = Counter()
        for x in iter:
            counter.update(x)

        # Extract lookup tables.
        if max_size is not None:
            counts = counter.most_common(max_size)
        else:
            counts = counter.items()
            counts = sorted(counts, key=lambda x: x[1], reverse=True)
        words = [x[0] for x in counts]
        id2word.extend(words)
        word2id = {x: i for i, x in enumerate(id2word)}

        self._id2word = id2word
        self._word2id = word2id

    def __len__(self):
        return len(self._id2word)

    def word2id(self, word):
        """Map a word in the vocabulary to its unique integer id.
        Args:
            word: Word to lookup.
        Returns:
            id: The integer id of the word being looked up.
        """
        if word in self._word2id:
            return self._word2id[word]
        elif self.unk_token is not None:
            return self._word2id[self.unk_token]
        else:
            raise KeyError('Word "%s" not in vocabulary.' % word)

    def id2word(self, id):
        """Map an integer id to its corresponding word in the vocabulary.
        Args:
            id: Integer id of the word being looked up.
        Returns:
            word: The corresponding word.
        """
        return self._id2word[id]

# CoNLLDataset e Annotation

In [8]:
class Annotation(object):
    def __init__(self):
        """A helper object for storing annotation data."""
        self.tokens = []
        self.pos_tags = []


class CoNLLDataset(Dataset):
    def __init__(self, fname, max_exs=None):
        """Initializes the CoNLLDataset.
        Args:
            fname: The .conllu file to load data from.
        """
        self.fname = fname
        self.annotations = self.process_conll_file(fname, max_exs)
        self.token_vocab = Vocab([x.tokens for x in self.annotations],
                                 unk_token='<unk>')
        self.pos_vocab = Vocab([x.pos_tags for x in self.annotations])
        

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        annotation = self.annotations[idx]
        input = [self.token_vocab.word2id(x) for x in annotation.tokens]
        target = [self.pos_vocab.word2id(x) for x in annotation.pos_tags]
        return input, target

    def process_conll_file(self, fname, max_exs):
        # Read the entire file.
        with open(fname, 'r') as f:
            raw_text = f.read()
        # Split into chunks on blank lines.
        chunks = re.split(r'^\n', raw_text, flags=re.MULTILINE)
        #print(chunks)
        # Process each chunk into an annotation.
        annotations = []
        exs = 0
        for chunk in chunks:
            if not max_exs or exs < max_exs:
                annotation = Annotation()
                lines = chunk.split('\n')
                # Iterate over all lines in the chunk.
                for line in lines:
                    # If line is empty ignore it.
                    if len(line)==0:
                        continue
                    # If line is a commend ignore it.
                    if line[0] == '#':
                        continue
                    # Otherwise split on tabs and retrieve the token and the
                    # POS tag fields.
                    fields = line.split('\t')
                    annotation.tokens.append(fields[1])
                    annotation.pos_tags.append(fields[3])
                if (len(annotation.tokens) > 0) and (len(annotation.pos_tags) > 0):
                    annotations.append(annotation)
            exs += 1
        return annotations

# Funções: pad() e collate_annotations()

In [9]:
def pad(sequences, max_length, pad_value=0):
    """Pads a list of sequences.
    Args:
        sequences: A list of sequences to be padded.
        max_length: The length to pad to.
        pad_value: The value used for padding.
    Returns:
        A list of padded sequences.
    """
    out = []
    for sequence in sequences:
        padded = sequence + [0]*(max_length - len(sequence))
        out.append(padded)
    return out


def collate_annotations(batch):
    """Function used to collate data returned by CoNLLDataset."""
    # Get inputs, targets, and lengths.
    inputs, targets = zip(*batch)
    lengths = [len(x) for x in inputs]
    # Sort by length.
    sort = sorted(zip(inputs, targets, lengths),
                  key=lambda x: x[2],
                  reverse=True)
    inputs, targets, lengths = zip(*sort)
    # Pad.
    max_length = max(lengths)
    inputs = pad(inputs, max_length)
    targets = pad(targets, max_length)
    # Transpose.
    inputs = list(map(list, zip(*inputs)))
    targets = list(map(list, zip(*targets)))
    # Convert to PyTorch variables.
    inputs = Variable(torch.LongTensor(inputs))
    targets = Variable(torch.LongTensor(targets))
    lengths = Variable(torch.LongTensor(lengths))
    if torch.cuda.is_available():
        inputs = inputs.cuda()
        targets = targets.cuda()
        lengths = lengths.cuda()
    return inputs, targets, lengths

# Tagger - LSTM

In [10]:

from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Tagger(nn.Module):
    def __init__(self,
                 input_dim,
                 output_dim,
                 n_layers, 
                 embedding_dim=64,
                 hidden_dim=64,
                 dropout=0.5,
                 bidirectional=True,
                 pad_idx=0):
        """Initializes the tagger.
        
        Args:
            input_dim: Size of the input vocabulary, projection
            output_dim: Size of the output vocabulary.
            embedding_dim: Dimension of the word embeddings.
            hidden_dim: Number of units in each LSTM hidden layer.
            bidirectional: Whether or not to use a bidirectional rnn.
        """
        super(Tagger, self).__init__()

        # Store parameters
        self.input_dim = input_dim 
        self.output_dim = output_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
          
        # Define layers
        self.word_embeddings = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers = n_layers, 
                          bidirectional=bidirectional,
                          dropout = dropout if n_layers > 1 else 0)
        
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.activation = nn.LogSoftmax(dim=2)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths=None, hidden=None):
        """Computes a forward pass of the language model.
        
        Args:
            x: A LongTensor w/ dimension [seq_len, batch_size].
            lengths: The lengths of the sequences in x.
            hidden: Hidden state to be fed into the lstm.
            
        Returns:
            net: Probability of the next word in the sequence.
            hidden: Hidden state of the lstm.
        """
        seq_len, batch_size = x.size()
        
        # If no hidden state is provided, then default to zeros.
        if hidden is None:
            if self.bidirectional:
                num_directions = 2
            else:
                num_directions = 1
            hidden = Variable(torch.zeros(num_directions, batch_size, self.hidden_dim))
            if torch.cuda.is_available():
                hidden = hidden.cuda()

        net = self.word_embeddings(x)
        # Pack before feeding into the RNN.
        if lengths is not None:
            lengths = lengths.data.view(-1).tolist()
            net = pack_padded_sequence(net, lengths)
        # net, hidden = self.rnn(net, hidden) # Daniel
        net, hidden = self.rnn(net, (hidden, hidden)) # Daniel
        # Unpack after
        if lengths is not None:
            net, _ = pad_packed_sequence(net)
        net = self.fc(net)
        net = self.activation(net)

        return net, hidden

# Training Model

In [11]:
# Load datasets.
train_dataset = CoNLLDataset('./datasets/pt_bosque-ud-train.conllu', 4096)
dev_dataset = CoNLLDataset('./datasets/pt_bosque-ud-dev.conllu', 1024)

dev_dataset.token_vocab = train_dataset.token_vocab
dev_dataset.pos_vocab = train_dataset.pos_vocab

# Hyperparameters / constants.
input_vocab_size = len(train_dataset.token_vocab)
output_vocab_size = len(train_dataset.pos_vocab)
batch_size = 16
epochs = 50
n_layers = 1

# Initialize the model.
model = Tagger(input_vocab_size, output_vocab_size, n_layers)
if torch.cuda.is_available():
    model = model.cuda()

# Loss function weights.
weight = torch.ones(output_vocab_size)
weight[0] = 0
if torch.cuda.is_available():
    weight = weight.cuda()
    
# Initialize loss function and optimizer.
loss_function = torch.nn.NLLLoss(weight)
optimizer = torch.optim.Adam(model.parameters())

# Main training loop.
data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                         collate_fn=collate_annotations)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False,
                        collate_fn=collate_annotations)
losses = []
i = 0
for epoch in range(epochs):
    for inputs, targets, lengths in data_loader:
        optimizer.zero_grad()
        outputs, _ = model(inputs, lengths=lengths)

        outputs = outputs.view(-1, output_vocab_size)
        targets = targets.view(-1)

        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

        #losses.append(loss.data[0])
        losses.append(loss.item())
        if (i % 10) == 0:
            # Compute dev loss over entire dev set.
            # NOTE: This is expensive. You may want to only use a 
            # subset of the dev set.
            #print('iteration, ', i)
            dev_losses = []
            for inputs, targets, lengths in dev_loader:
                outputs, _ = model(inputs, lengths=lengths)
                outputs = outputs.view(-1, output_vocab_size)
                targets = targets.view(-1)
                loss = loss_function(outputs, targets)
                dev_losses.append(loss.item())
            avg_train_loss = np.mean(losses)
            avg_dev_loss = np.mean(dev_losses)
            losses = []
            #print('here')
            print('Epoch %i Iteration %i - Train Loss: %0.6f - Dev Loss: %0.6f' % (epoch, i, avg_train_loss, avg_dev_loss), end='\n')
            torch.save(model, 'pos_tagger_lstm.pt')
        i += 1
        
torch.save(model, 'pos_tagger_lstm.final.pt')

Epoch 0 Iteration 0 - Train Loss: 2.918808 - Dev Loss: 2.887232
Epoch 0 Iteration 10 - Train Loss: 2.800273 - Dev Loss: 2.674023
Epoch 0 Iteration 20 - Train Loss: 2.571697 - Dev Loss: 2.379754
Epoch 0 Iteration 30 - Train Loss: 2.283765 - Dev Loss: 2.135228
Epoch 0 Iteration 40 - Train Loss: 2.012758 - Dev Loss: 1.921213
Epoch 0 Iteration 50 - Train Loss: 1.826985 - Dev Loss: 1.742305
Epoch 0 Iteration 60 - Train Loss: 1.637574 - Dev Loss: 1.588861
Epoch 0 Iteration 70 - Train Loss: 1.499662 - Dev Loss: 1.455115
Epoch 0 Iteration 80 - Train Loss: 1.392854 - Dev Loss: 1.345650
Epoch 0 Iteration 90 - Train Loss: 1.289880 - Dev Loss: 1.257149
Epoch 0 Iteration 100 - Train Loss: 1.225468 - Dev Loss: 1.185523
Epoch 0 Iteration 110 - Train Loss: 1.134778 - Dev Loss: 1.127009
Epoch 0 Iteration 120 - Train Loss: 1.140387 - Dev Loss: 1.082329
Epoch 0 Iteration 130 - Train Loss: 1.096165 - Dev Loss: 1.046026
Epoch 0 Iteration 140 - Train Loss: 1.048191 - Dev Loss: 1.010638
Epoch 0 Iteration 150

In [12]:
# Collect the predictions and targets
y_true = []
y_pred = []

for inputs, targets, lengths in dev_loader:
    outputs, _ = model(inputs, lengths=lengths)
    _, preds = torch.max(outputs, dim=2)
    targets = targets.view(-1)
    preds = preds.view(-1)
    if torch.cuda.is_available():
        targets = targets.cpu()
        preds = preds.cpu()
    y_true.append(targets.data.numpy())
    y_pred.append(preds.data.numpy())

In [13]:
# Stack into numpy arrays
y_real = np.concatenate(y_true)
y_pred = np.concatenate(y_pred)

In [14]:
y_real_temp = []
for id in y_real:
  y_real_temp.append(dev_dataset.pos_vocab._id2word[ id ])
y_real = y_real_temp
print(y_real)

['DET', 'NUM', 'PROPN', 'DET', 'DET', 'DET', 'PUNCT', 'DET', 'ADV', 'ADV', 'DET', 'DET', 'NOUN', 'NOUN', 'NOUN', 'PROPN', 'NOUN', 'DET', 'PROPN', 'NOUN', 'NOUN', 'ADJ', 'PROPN', 'NOUN', 'ADV', 'ADV', 'NOUN', 'DET', 'VERB', 'AUX', 'VERB', 'PROPN', 'ADP', 'NOUN', 'PROPN', 'ADP', 'VERB', 'NOUN', 'PUNCT', 'VERB', 'AUX', 'NUM', 'NUM', 'NUM', 'ADP', 'NOUN', 'CCONJ', 'PROPN', 'NOUN', '_', 'PUNCT', 'DET', 'DET', 'VERB', 'SCONJ', '_', 'ADJ', 'NOUN', 'AUX', 'NOUN', 'NOUN', '_', 'VERB', '<pad>', 'ADP', 'ADP', 'NUM', 'NOUN', 'NOUN', 'ADP', 'AUX', 'ADP', 'VERB', 'VERB', 'DET', 'PRON', 'VERB', 'ADP', 'NOUN', '<pad>', 'DET', 'DET', 'PUNCT', '_', '_', 'NUM', 'NUM', 'DET', 'SCONJ', 'AUX', 'NOUN', 'AUX', '_', 'DET', 'CCONJ', '<pad>', 'NOUN', 'NOUN', 'VERB', 'ADP', 'ADP', 'NOUN', '_', 'NOUN', 'DET', 'VERB', 'ADP', '_', 'ADP', 'NOUN', 'NOUN', '<pad>', '_', 'VERB', '_', 'DET', 'DET', 'PRON', 'ADP', 'PUNCT', 'NOUN', '_', 'NOUN', 'ADP', 'DET', '<pad>', '<pad>', '<pad>', 'ADP', '_', 'ADP', 'NOUN', 'NOUN', 'AD

In [15]:
y_pred_temp = []
for id in y_pred:
  y_pred_temp.append(dev_dataset.pos_vocab._id2word[ id ])
y_pred = y_pred_temp
print(y_pred)

['DET', 'VERB', 'PROPN', 'DET', 'DET', 'DET', 'PUNCT', 'DET', 'ADV', 'ADP', 'DET', 'DET', 'AUX', 'PRON', 'PRON', 'PRON', 'NOUN', 'DET', 'VERB', 'NOUN', 'NOUN', 'NOUN', 'PRON', 'NOUN', 'ADV', 'DET', 'NOUN', 'DET', 'VERB', 'AUX', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'PROPN', 'ADP', 'VERB', 'ADJ', 'PUNCT', 'VERB', 'AUX', 'NUM', 'NUM', 'NUM', 'ADP', 'NOUN', 'CCONJ', 'NOUN', 'NOUN', '_', 'PUNCT', 'DET', 'DET', 'VERB', 'SCONJ', '_', 'NOUN', 'NOUN', 'AUX', 'NOUN', 'NOUN', '_', 'NOUN', 'PROPN', 'ADP', 'ADP', 'NUM', 'NOUN', 'NOUN', 'ADP', 'AUX', 'ADP', 'VERB', 'VERB', 'DET', 'PRON', 'PRON', 'ADP', 'NOUN', 'PROPN', 'DET', 'DET', 'PUNCT', '_', '_', 'NUM', 'NUM', 'DET', 'SCONJ', 'AUX', 'NOUN', 'AUX', '_', 'DET', 'CCONJ', 'PROPN', 'NOUN', 'NOUN', 'VERB', 'ADP', 'ADP', 'NOUN', '_', 'NOUN', 'DET', 'VERB', 'ADP', '_', 'ADP', 'NOUN', 'NOUN', 'PROPN', '_', 'VERB', '_', 'DET', 'DET', 'PRON', 'ADP', 'PUNCT', 'NOUN', '_', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'PROPN', 'ADP', '_', 'ADP', 'NOUN', 'NOUN', 'ADV', 

In [16]:
len(y_real)

59168

In [17]:
len(y_pred)

59168

In [18]:
print( classification_report( y_real, y_pred ) )
f1 = f1_score( y_real, y_pred, average='weighted' )
acc = accuracy_score( y_real, y_pred )
print( f'F1: {f1:.2}' )
print( f'Accuracy: {acc:.2}' )

# # Compute accuracy
# acc = np.mean(y_real[y_real != 0] == y_pred[y_real != 0])
# print('Accuracy - %0.6f\n' % acc)

# # Evaluate f1-score
# from sklearn.metrics import f1_score
# score = f1_score(y_real, y_pred, average=None)
# print('F1-scores:\n')
# for label, score in zip(dev_dataset.pos_vocab._id2word[1:], score[1:]):
#     print('%s - %0.6f' % (label, score))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       <pad>       0.00      0.00      0.00     33153
         ADJ       0.80      0.60      0.69      1157
         ADP       0.96      0.97      0.96      3549
         ADV       0.90      0.87      0.89       844
         AUX       0.94      0.87      0.90       581
       CCONJ       0.99      0.99      0.99       542
         DET       0.96      0.97      0.97      3702
        INTJ       0.00      0.00      0.00         3
        NOUN       0.77      0.91      0.84      4415
         NUM       0.93      0.76      0.84       461
        PRON       0.70      0.86      0.77       835
       PROPN       0.03      0.50      0.06      2143
       PUNCT       1.00      1.00      1.00      3267
       SCONJ       0.72      0.68      0.70       542
         SYM       1.00      1.00      1.00        36
        VERB       0.72      0.84      0.77      2166
           X       0.20      0.05      0.08        19
           _       0.98    

In [19]:
model = torch.load('pos_tagger_lstm.final.pt')

def inference(sentence):
    # Convert words to id tensor.
    ids = [[dataset.token_vocab.word2id(x)] for x in sentence]
    ids = Variable(torch.LongTensor(ids))
    if torch.cuda.is_available():
        ids = ids.cuda()
    # Get model output.
    output, _ = model(ids)
    _, preds = torch.max(output, dim=2)
    if torch.cuda.is_available():
        preds = preds.cpu()
    preds = preds.data.view(-1).numpy()
    pos_tags = [dataset.pos_vocab.id2word(x) for x in preds]
    for word, tag in zip(sentence, pos_tags):
        print('%s - %s' % (word, tag))

In [20]:
def inference_with_labels(sentence, labels):
    #print(sentence)
    # Convert words to id tensor.
    ids = [[dataset.token_vocab.word2id(x)] for x in sentence]
    print(ids)
    ids = Variable(torch.LongTensor(ids))
    if torch.cuda.is_available():
        ids = ids.cuda()
    # Get model output.
    output, _ = model(ids)
    _, preds = torch.max(output, dim=2)
    if torch.cuda.is_available():
        preds = preds.cpu()
    preds = preds.data.view(-1).numpy()
    pos_tags = [dataset.pos_vocab.id2word(x) for x in preds]
    #labels = [dataset.pos_vocab.id2word(x) for x in labels]
    #sentence = [test_dataset.token_vocab.id2word(x) for x in ids]
    for word, tag, label in zip(sentence, pos_tags, labels):
        print('%s - %s - %s' % (word, tag, label))

In [21]:
test_dataset = CoNLLDataset('./datasets/pt_bosque-ud-test.conllu')
dataset = CoNLLDataset('./datasets/pt_bosque-ud-train.conllu')

sentence, labels = test_dataset[10]
sentence = [test_dataset.token_vocab.id2word(x) for x in sentence]
print(sentence)
labels = [test_dataset.pos_vocab.id2word(x) for x in labels]
inference_with_labels(sentence, labels)

['Os', 'policiais', 'federais', 'de', 'Mato', 'Grosso', 'do', 'de', 'o', 'Sul', 'entraram', 'em', 'greve', 'ontem', ',', 'em', 'adesão', 'ao', 'a', 'o', 'movimento', 'iniciado', 'no', 'em', 'o', 'Distrito', 'Federal', '.']
[[48], [1361], [3935], [2], [3265], [3266], [11], [2], [5], [496], [3737], [7], [1219], [58], [4], [7], [3524], [31], [3], [5], [962], [6250], [22], [7], [5], [15291], [653], [6]]
Os - SCONJ - DET
policiais - VERB - NOUN
federais - NOUN - ADJ
de - AUX - ADP
Mato - VERB - PROPN
Grosso - VERB - PROPN
do - PRON - _
de - ADP - ADP
o - DET - DET
Sul - NOUN - PROPN
entraram - ADJ - VERB
em - ADP - ADP
greve - PROPN - NOUN
ontem - SYM - ADV
, - ADP - PUNCT
em - ADP - ADP
adesão - NOUN - NOUN
ao - ADJ - _
a - PUNCT - ADP
o - DET - DET
movimento - NOUN - NOUN
iniciado - VERB - VERB
no - _ - _
em - ADP - ADP
o - DET - DET
Distrito - NOUN - PROPN
Federal - NOUN - PROPN
. - PUNCT - PUNCT
