# Assignment 3: Evaluating and extending an RNN based POS tagger

In [1]:
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn

In [2]:
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(torch.cuda.get_device_name(i))
else:
    print("No GPU available")

Tesla T4


## Download and parse UD data

In [3]:
!curl --remote-name-all https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3687{/ud-treebanks-v2.8.tgz,/ud-documentation-v2.8.tgz,/ud-tools-v2.8.tgz}
!tar -xf ud-treebanks-v2.8.tgz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  410M  100  410M    0     0  25.2M      0  0:00:16  0:00:16 --:--:-- 25.1M
100 89.9M  100 89.9M    0     0  27.5M      0  0:00:03  0:00:03 --:--:-- 30.2M
100  533k  100  533k    0     0  2051k      0 --:--:-- --:--:-- --:--:-- 2051k


In [4]:
def parse(file):
    X = []
    y = []
    with open(file, 'r') as infile:
        sents = infile.read().split('\n\n')
        if sents[-1] == '':
            sents = sents[:-1]
        for sent in sents:
            words, tags = [], []
            lines = sent.split('\n')
            for line in lines:
                if line.startswith('#'):
                    continue
                line = line.strip().split('\t')
                words.append(line[1])
                tags.append(line[3])
            X.append(words)
            y.append(tags)
            
    assert len(X) == len(y)

    return X, y


### Train/test split

English

In [22]:
treebank_train = '/content/ud-treebanks-v2.8/UD_English-EWT/en_ewt-ud-train.conllu'
treebank_test = '/content/ud-treebanks-v2.8/UD_English-EWT/en_ewt-ud-train.conllu'
X_train_en, y_train_en = parse(treebank_train)
X_test_en, y_test_en = parse(treebank_test)

Swedish

In [16]:
treebank_train = '/content/ud-treebanks-v2.8/UD_Swedish-LinES/sv_lines-ud-train.conllu'
treebank_test = '/content/ud-treebanks-v2.8/UD_Swedish-LinES/sv_lines-ud-test.conllu'
X_train_sv, y_train_sv = parse(treebank_train)
X_test_sv, y_test_sv = parse(treebank_test)

Norwegian

In [30]:
treebank_train = '/content/ud-treebanks-v2.8/UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu'
treebank_test = '/content/ud-treebanks-v2.8/UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu'
X_train_no, y_train_no = parse(treebank_train)
X_test_no, y_test_no = parse(treebank_test)

Danish

In [36]:
treebank_train = '/content/ud-treebanks-v2.8/UD_Danish-DDT/da_ddt-ud-train.conllu'
treebank_test = '/content/ud-treebanks-v2.8/UD_Danish-DDT/da_ddt-ud-test.conllu'
X_train_da, y_train_da = parse(treebank_train)
X_test_da, y_test_da = parse(treebank_test)

Icelandic

In [42]:
treebank_train = '/content/ud-treebanks-v2.8/UD_Icelandic-Modern/is_modern-ud-train.conllu'
treebank_test = '/content/ud-treebanks-v2.8/UD_Icelandic-Modern/is_modern-ud-test.conllu'
X_train_is, y_train_is = parse(treebank_train)
X_test_is, y_test_is = parse(treebank_test)

Faroese

In [53]:
treebank_train = '/content/ud-treebanks-v2.8/UD_Faroese-FarPaHC/fo_farpahc-ud-train.conllu'
treebank_test = '/content/ud-treebanks-v2.8/UD_Faroese-FarPaHC/fo_farpahc-ud-test.conllu'
X_train_fo, y_train_fo = parse(treebank_train)
X_test_fo, y_test_fo = parse(treebank_test)

## Data preprocessing

In [54]:
tokens = {token for sentence in X_train_fo for token in sentence}
idx2token = list(tokens)
idx2token.insert(0, '<UNK>')
idx2token.append('<PAD>')
token2idx = {token:idx for idx, token in enumerate(idx2token)}

tags = {tag for tags in y_train_fo for tag in tags}
idx2tag = list(tags)
idx2tag.append('<PAD>')
tag2idx = {tag:idx for idx, tag in enumerate(idx2tag)}


def pad_and_encode(sentences, labels):
    assert len(sentences)==len(labels)
    assert np.all([len(sentence)==len(tags) for sentence, tags in zip(sentences, labels)])
    max_sentence_length = np.max([len(sentence) for sentence in sentences])
    padded_sentences = torch.zeros(len(sentences), max_sentence_length,    
                                    dtype=torch.long)
    padded_sentences[:] = token2idx['<PAD>']
    padded_labels = torch.zeros(len(sentences), max_sentence_length, 
                                dtype=torch.long)
    padded_labels[:] = tag2idx['<PAD>']
    for i, (sentence, tags) in enumerate(zip(sentences, labels)):               
        for j, token in enumerate(sentence):
            if token in token2idx.keys():
                padded_sentences[i, j] = token2idx[token]
            else:
                padded_sentences[i, j] = token2idx['<UNK>']
        for j, tag in enumerate(tags):
            padded_labels[i, j] = tag2idx[tag]
    return padded_sentences, padded_labels


def batch_iterator(sentences, labels, batch_size=64):
    """Helper function for iterating over batches of the data"""
    assert len(sentences) == len(labels)
    for i in range(0, len(sentences), batch_size):
        X, y = pad_and_encode(sentences[i:min(i+batch_size, len(sentences))], 
                            labels[i:min(i+batch_size, len(sentences))])
        if torch.cuda.is_available():                                               
            yield (X.cuda(), y.cuda())
        else:
            yield (X, y)

next(batch_iterator(X_train_fo, y_train_fo, batch_size=5))

(tensor([[ 910,   63, 2063,  901, 1551, 2046,  901, 2063, 1790, 1477, 1551, 2046,
           901, 2063, 1339, 1842, 2297, 2297, 2297, 2297, 2297, 2297],
         [2155, 2063,  754,   63, 1790, 1477, 1842, 2297, 2297, 2297, 2297, 2297,
          2297, 2297, 2297, 2297, 2297, 2297, 2297, 2297, 2297, 2297],
         [2239, 1412,  103, 1318,  989,  635, 1186, 1551, 2046, 1296, 1926, 2270,
          1575,  989,  730, 1186, 1551, 1415,  989,   91, 1899, 1842],
         [ 910, 1186, 2063, 1375, 1551, 2046,  966, 2063,  136, 2277, 1842, 2297,
          2297, 2297, 2297, 2297, 2297, 2297, 2297, 2297, 2297, 2297],
         [ 342, 2174, 1302,  754, 1146, 1551, 2046, 1915, 2208, 1489,  635, 1186,
          1842, 2297, 2297, 2297, 2297, 2297, 2297, 2297, 2297, 2297]],
        device='cuda:0'),
 tensor([[ 6,  0,  9,  3,  5,  8,  3,  9,  6, 14,  5,  8,  3,  9, 14,  5, 17, 17,
          17, 17, 17, 17],
         [ 7,  9,  6,  0,  6, 14,  5, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
          17, 17,

## Model

In [55]:
class Tagger(nn.Module):
    def __init__(self, word_embedding_dim, model_hidden_dim, vocabulary_size, tagset_size, model='lstm', num_layers=1, dropout=.5):
        super(Tagger, self).__init__()                                          
        self.model_hidden_dim_ = model_hidden_dim                                     
        self.vocabulary_size_ = vocabulary_size
        self.tagset_size_ = tagset_size
        self.num_layers_ = num_layers                                           # allow for stacked model
        self.dropout_ = dropout                                                 # introduce dropout, default = 50%
        self.model = model                                                      # allow for GRU option, default LSTM

        self._word_embedding = nn.Embedding(num_embeddings=vocabulary_size,         
                                            embedding_dim=word_embedding_dim, 
                                            padding_idx=token2idx['<PAD>'])
        if self.model == 'lstm': 
            self._lstm = nn.LSTM(input_size=word_embedding_dim,                        
                                hidden_size=model_hidden_dim,                           
                                num_layers=num_layers,
                                batch_first=True,
                                dropout=dropout)
        elif self.model == 'gru':
            self._lstm = nn.GRU(input_size=word_embedding_dim,                         
                                hidden_size=model_hidden_dim,                          
                                batch_first=True,
                                num_layers=num_layers,
                                dropout=dropout)
        self._fc = nn.Linear(model_hidden_dim, tagset_size)                         
        self._softmax = nn.LogSoftmax(dim=1)                                        

        if torch.cuda.is_available():                                               
            self.cuda()


    def forward(self, padded_sentences):
        """The forward pass through the network"""
        batch_size, max_sentence_length = padded_sentences.size()

        embedded_sentences = self._word_embedding(padded_sentences)   

        sentence_lengths = (padded_sentences!=token2idx['<PAD>']).sum(dim=1)
        sentence_lengths = sentence_lengths.long().cpu()
        X = nn.utils.rnn.pack_padded_sequence(embedded_sentences, sentence_lengths, 
                                            batch_first=True, enforce_sorted=False)
        lstm_out, _ = self._lstm(X)                                               
        X, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)        

        X = X.contiguous().view(-1, X.shape[2])                                 
        tag_space = self._fc(X)                 
        tag_scores = self._softmax(tag_space)                       
        return tag_scores.view(batch_size, max_sentence_length, self.tagset_size_)


    def fit(self, X_train, y_train):
        """Training the network"""
        loss_function = nn.NLLLoss(ignore_index=tag2idx['<PAD>'])
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01) 

        batch_size = 256  
        for epoch in range(5):  
            with tqdm(batch_iterator(X_train, y_train, batch_size=batch_size), 
                    total=len(X_train)//batch_size+1, unit="batch", desc="Epoch %i" % epoch) as batches:
                for inputs, targets in batches:   
                    self.zero_grad()            
                    scores = self(inputs)      
                    loss = loss_function(scores.view(-1, self.tagset_size_),   
                                        targets.view(-1))               
                    loss.backward()
                    optimizer.step()   
                    predictions = scores.argmax(dim=2, keepdim=True).squeeze() 
                    mask = targets!=tag2idx['<PAD>']  
                    correct = (predictions[mask] == targets[mask]).sum().item()  
                    accuracy = correct / mask.sum().item()*100
                    batches.set_postfix(loss=loss.item(), accuracy=accuracy)


    def score(self, X_test, y_test):
        """Get the accuracy of the model given the data"""
        with torch.no_grad():
            n_correct = 0
            n_total = 0
            for inputs, targets in batch_iterator(X_test, y_test, batch_size=64): 
                scores = self(inputs)
                predictions = scores.argmax(dim=2, keepdim=True).squeeze()
                mask = targets!=tag2idx['<PAD>'] 
                n_correct += (predictions[mask] == targets[mask]).sum().item() 
                n_total += mask.sum().item()
        print("Test accuracy %.1f%%" % (100*n_correct/n_total))



In [56]:
model = Tagger(word_embedding_dim=64,
               model_hidden_dim=128,
               vocabulary_size=len(token2idx),
               tagset_size=len(tag2idx)-1,
               model='gru')                                 

  "num_layers={}".format(dropout, num_layers))


## Quantitative results


English

In [28]:
model.fit(X_train_en, y_train_en)

Epoch 0: 100%|██████████| 49/49 [00:02<00:00, 19.12batch/s, accuracy=94.6, loss=0.158]
Epoch 1: 100%|██████████| 49/49 [00:02<00:00, 19.33batch/s, accuracy=96.2, loss=0.111]
Epoch 2: 100%|██████████| 49/49 [00:02<00:00, 19.10batch/s, accuracy=96.7, loss=0.0961]
Epoch 3: 100%|██████████| 49/49 [00:02<00:00, 19.16batch/s, accuracy=97.6, loss=0.0696]
Epoch 4: 100%|██████████| 49/49 [00:02<00:00, 19.44batch/s, accuracy=98.2, loss=0.0568]


In [29]:
model.score(X_test_en, y_test_en)

Test accuracy 98.6%


Swedish

In [20]:
model.fit(X_train_sv, y_train_sv)

Epoch 0: 100%|██████████| 13/13 [00:00<00:00, 19.40batch/s, accuracy=63, loss=1.17]
Epoch 1: 100%|██████████| 13/13 [00:00<00:00, 19.51batch/s, accuracy=74.4, loss=0.733]
Epoch 2: 100%|██████████| 13/13 [00:00<00:00, 19.66batch/s, accuracy=83.9, loss=0.494]
Epoch 3: 100%|██████████| 13/13 [00:00<00:00, 19.62batch/s, accuracy=89.7, loss=0.34]
Epoch 4: 100%|██████████| 13/13 [00:00<00:00, 19.65batch/s, accuracy=93.3, loss=0.226]


In [21]:
model.score(X_test_sv, y_test_sv)

Test accuracy 78.9%


Norwegian

In [34]:
model.fit(X_train_no, y_train_no)

Epoch 0: 100%|██████████| 62/62 [00:02<00:00, 21.24batch/s, accuracy=76.8, loss=0.661]
Epoch 1: 100%|██████████| 62/62 [00:02<00:00, 21.41batch/s, accuracy=87.5, loss=0.353]
Epoch 2: 100%|██████████| 62/62 [00:02<00:00, 21.60batch/s, accuracy=94.9, loss=0.168]
Epoch 3: 100%|██████████| 62/62 [00:02<00:00, 21.34batch/s, accuracy=97.3, loss=0.0894]
Epoch 4: 100%|██████████| 62/62 [00:02<00:00, 21.49batch/s, accuracy=98.7, loss=0.0547]


In [35]:
model.score(X_test_no, y_test_no)

Test accuracy 86.5%


Danish

In [40]:
model.fit(X_train_da, y_train_da)

Epoch 0: 100%|██████████| 18/18 [00:00<00:00, 18.70batch/s, accuracy=68.4, loss=1]
Epoch 1: 100%|██████████| 18/18 [00:00<00:00, 18.40batch/s, accuracy=82.1, loss=0.555]
Epoch 2: 100%|██████████| 18/18 [00:00<00:00, 18.61batch/s, accuracy=92.8, loss=0.291]
Epoch 3: 100%|██████████| 18/18 [00:00<00:00, 18.69batch/s, accuracy=97.8, loss=0.136]
Epoch 4: 100%|██████████| 18/18 [00:00<00:00, 18.76batch/s, accuracy=98.9, loss=0.0696]


In [41]:
model.score(X_test_da, y_test_da)

Test accuracy 81.4%


Icelandic

In [46]:
model.fit(X_train_is, y_train_is)

Epoch 0:  95%|█████████▌| 21/22 [00:01<00:00, 14.33batch/s, accuracy=60.6, loss=1.27]
Epoch 1:  95%|█████████▌| 21/22 [00:01<00:00, 14.30batch/s, accuracy=76.1, loss=0.73]
Epoch 2:  95%|█████████▌| 21/22 [00:01<00:00, 14.25batch/s, accuracy=88.1, loss=0.386]
Epoch 3:  95%|█████████▌| 21/22 [00:01<00:00, 14.52batch/s, accuracy=95.1, loss=0.192]
Epoch 4:  95%|█████████▌| 21/22 [00:01<00:00, 14.63batch/s, accuracy=97.8, loss=0.0981]


In [47]:
model.score(X_test_is, y_test_is)

Test accuracy 94.4%


Faroese

In [57]:
model.fit(X_train_fo, y_train_fo)

Epoch 0: 100%|██████████| 4/4 [00:00<00:00, 15.13batch/s, accuracy=37.7, loss=1.97]
Epoch 1: 100%|██████████| 4/4 [00:00<00:00, 15.34batch/s, accuracy=65.5, loss=1.25]
Epoch 2: 100%|██████████| 4/4 [00:00<00:00, 13.27batch/s, accuracy=72.6, loss=0.889]
Epoch 3: 100%|██████████| 4/4 [00:00<00:00, 14.71batch/s, accuracy=80.2, loss=0.678]
Epoch 4: 100%|██████████| 4/4 [00:00<00:00, 14.58batch/s, accuracy=84.3, loss=0.523]


In [58]:
model.score(X_test_fo, y_test_fo)

Test accuracy 71.0%


### Baseline

*for English*

In [60]:
from sklearn.dummy import DummyClassifier

baseline_X = []
for sentence in X_train_en:
    baseline_X.extend(sentence)

baseline_y = []
for sentence in y_train_en:
    baseline_y.extend(sentence)

baseline = DummyClassifier(strategy='most_frequent')
baseline.fit(baseline_X, baseline_y)

baseline_X_test = []
for sentence in X_test_en:
    baseline_X_test.extend(sentence)

baseline_y_test = []
for sentence in y_test_en:
    baseline_y_test.extend(sentence)

baseline.score(baseline_X_test, baseline_y_test)

0.16791973902636761

## Report

As extensions, I chose to implement a GRU option, dropout, and comparing my tagger to the current state-of-the-art. Perhaps unsurprisingly, the model performs consistently more poorly on all data than the state-of-the-art. For English, I was able to get an accuracy of about 97% on the EWT data set, while [state-of-the-art](https://arxiv.org/abs/1906.01569) is at approximately 96%. In terms of methodology, the biggest difference between my model and the state-of-the-art is that they tend to use much larger embeddings like BERT. They also use [bi-LSTMs](https://arxiv.org/abs/1604.05529), which I could have implemented but did not. This is probably my model's largest shortcoming, since a bi-LSTM would likely increase robustness. Additionally, extending my model to the character level would probably help too, but at the end of the day my model still does quite well, which is somewhat surprising when comparing to the state-of-the-art. However, it's also not so surprising at the end of the day since English is such a high resource language with a good amount of training data available.

In order to get this decent accuracy, I had to increase the dimensionality of my embeddings quite a bit. This does support the trend that a more complex model leads to better accuracy, but I do not want to say that is a rule. As with all models, at a certain point increasing complexity does not contribute significantly to better accuracy, and the more important factor is how much data is available to train on. Even looking at the best performing models on NLP progress, the average accuracy is within 1 percentage point. This is, however, using clean, UD data. Looking at the social media data, the scores are quite a bit lower since the data is messy, and no amount of complexity can (probably) fix that. At least at this point.

However, on some of the Scandinavian languages I tested, like Faroese with a much smaller data set of only 40,000 tokens, the model performs quite a bit worse. This indicates that the size of the data set has a greater impact. I did, though, find that a slight change in dimensionality had a much greater impact on this small data set. By increasing the dimensionalities from 62 to 128 and 128 to 256, the model acuracy jumped by about 10 percentage points.

Across all languages, though, my model is consistently better than a majority baseline that simply guesses the most common tag for each word. For English, this was a little under 20%, so my 97% is much better. 

Overall, implementing extensions was not too difficult, as it generally required simply adding one or two lines of code since the PyTorch API is quite consistent. However, I did find it significantly harder to implement the model as an sk-learn class than it needed to be. After speaking with some of the TAs, I decided to do pre-processing before implementing the class, which made life a lot simpler, and in reality is still consistent with sk-learn classifiers which require pre-processed data. Additionally, I decided not to implement a ```predict``` method since it was not a part of the code we were given, and it was not necessary for the rest of the assignment.