<a href="https://colab.research.google.com/github/darisoy/EE517_Sp21/blob/master/hw3/hw3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 🐍 Setup Python environment

In [31]:
!pip install transformers



In [32]:
import numpy as np
import pandas as pd
import math
import torch
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from tokenizers import decoders
from sklearn.metrics import classification_report

import pickle

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 📀 Load the data

In [33]:
labels = {'O' : 0,
          'B-geo-loc' : 1,
          'I-geo-loc' : 2,
          'B-product' : 3,
          'I-product' : 4,
          'B-facility' : 5,
          'I-facility' : 6,
          'B-company' : 7,
          'I-company' : 8,
          'B-person' : 9,
          'I-person' : 10,
          'B-sportsteam' : 11,
          'I-sportsteam' : 12,
          'B-musicartist' : 13,
          'I-musicartist' : 14,
          'B-movie' : 15,
          'I-movie' : 16,
          'B-tvshow' : 17,
          'I-tvshow' : 18,
          'B-other' : 19,
          'I-other' : 20,
          }
end_token = '<END>'
beg_token = '<BEG>'

In [34]:
def get_sentences(df):
    sentences = []
    labels = []
    running_sentence = ''
    runnnig_label = []
    for idx, row in df.iterrows():
        if row.word == end_token:
            if len(running_sentence[:-1]) > 0:
                sentences.append(running_sentence[:-1])
                labels.append(runnnig_label)
            running_sentence = ''
            runnnig_label = []
        else:
            running_sentence += row.word + ' '
            runnnig_label.append(row.tag)
    return sentences, labels

def get_data(type):
    data = pd.read_csv('https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/wnut16/data/' + type, delimiter='\t', names=["word", "tag"], skip_blank_lines=False, quoting=3)
    data = data.fillna({'word': end_token, 'tag': 'O'})
    data.tag = data.tag.apply((lambda x: labels[x]))
    return get_sentences(data)

# 🔐 Encode the data using BERT transformer

## Load the transformer

In [35]:
transformer_name = "distilbert-base-uncased"
transformer = DistilBertModel.from_pretrained(transformer_name)
tokenizer = DistilBertTokenizer.from_pretrained(transformer_name)
tokenizer.decoder = decoders.WordPiece()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Get dataset embeddings

In [37]:
def get_sublist_start_end(sl, l):
    results = []
    sll = len(sl)
    for ind in (i for i,e in enumerate(l) if e==sl[0]):
        if tokenizer.decode(l[ind:ind+sll]) ==  tokenizer.decode(sl):
            results.append([ind,ind+sll-1])
    return results

def get_embeddings(sentences):
    transformer.eval()
    transformer.to(device)
    data = []
    with torch.no_grad():
        for sentence in tqdm(sentences):
            tokens = tokenizer.encode(sentence)
            out = transformer(torch.tensor(tokens).unsqueeze(0).to(device))
            embed = []
            for i, word in enumerate(sentence.split()):
                target = word
                target_ids = tokenizer.encode(target, add_special_tokens=False)
                start_end_found = get_sublist_start_end(target_ids , tokens)
                if len(start_end_found) > 1:
                    occurrences_pos_of_cur_word = [word_i for word_i, x in enumerate(sentence.split()) if x == word]
                    cur_occ_num =  occurrences_pos_of_cur_word.index(i)
                    target_idx = start_end_found[cur_occ_num]
                else:
                    target_idx = start_end_found[0]
                embed.append(torch.mean(out[0][0][target_idx[0]:target_idx[1]+1], 0))
            data.append(torch.stack(embed))
    return data

In [38]:
train_sentences, train_tags = get_data('train')
train_embeddings = get_embeddings(train_sentences)

HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))




In [39]:
valid_sentences, valid_tags = get_data('dev')
valid_embeddings = get_embeddings(valid_sentences)

HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))




In [40]:
test_sentences, test_tags = get_data('test')
test_embeddings = get_embeddings(test_sentences)

HBox(children=(FloatProgress(value=0.0, max=3860.0), HTML(value='')))




# 🧑‍💻 Classify the embeddings using RNN

In [41]:
# Model Definition
class RNN(nn.Module):
    def __init__(self, hidden_dim, num_layers):
        super(RNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.rnn = nn.GRU(768, self.hidden_dim, self.num_layers, batch_first=True) 
        self.fc = nn.Linear(self.hidden_dim, len(labels))
    
    def forward(self, x):
        h = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_().to(device)
        out, (h) = self.rnn(x, (h.detach()))
        out = self.fc(out) 
        return out

In [42]:
def train(classifier, device, hidden_dim, num_layers, epochs, learning_rate):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)
    classifier.train()
    # train
    for epoch in tqdm(range(epochs)):
        running_loss = 0.0
        for i, sentence in enumerate(train_embeddings):
            tags = torch.tensor(train_tags[i])
            sentence, tags = sentence.to(device), tags.to(device)
            optimizer.zero_grad()
            outputs = classifier(sentence.unsqueeze(dim=0))
            loss = criterion(outputs.squeeze(dim=0), tags)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print('[Epoch %d]\tTrain Loss: \t\t%.3f' % (epoch+1, running_loss / len(train_embeddings)))


In [12]:
def evaluate(classifier, device, sentences, sentence_tags):
    criterion = nn.CrossEntropyLoss()
    correct = 0
    total = 0
    running_loss = 0
    truth = []
    preds = []
    classifier.eval()
    print("Evaluating ... ")
    with torch.no_grad():
        for i, sentence in enumerate(tqdm(sentences)):
            tags = torch.tensor(sentence_tags[i])
            sentence, tags = sentence.to(device), tags.to(device)

            outputs = classifier(sentence.unsqueeze(dim=0)).squeeze(dim=0)
            pred = outputs.squeeze(dim=1).argmax(dim=1)
            loss = criterion(outputs, tags)
            running_loss += loss.item()

            correct += torch.sum(tags == pred)
            total += len(tags)

            for t in sentence_tags[i]:
                truth.append(t)
            for p in pred:
                preds.append(p.item())

    loss = running_loss/len(sentences)
    accuracy = 100*correct/total

    print('Overall Accuracy: \t%.3f%% \tloss: %.3f' % (accuracy, loss))
    return preds, truth, loss, accuracy

In [13]:

# We vary these as hyperparamters
hidden_size_variations = [64, 128]
num_layers_variations = [2, 5]
num_epoch_variations = [5, 10]
learning_rate_variations = [0.01, 0.001]

combinations = len(hidden_size_variations) * len(num_layers_variations) \
                * len(num_epoch_variations) * len(learning_rate_variations)

In [14]:
loss_accuracy_comb = [[None, None, None] for i in range(combinations)]
num_comb = 0
for hidden_dim in hidden_size_variations:
    for num_layers in num_layers_variations:
        for epochs in num_epoch_variations:
            for learning_rate in learning_rate_variations:
                # train model
                classifier = RNN(hidden_dim, num_layers).to(device)
                train(classifier, device, hidden_dim, num_layers, epochs, learning_rate)
                _, _, loss, accuracy = evaluate(classifier, device, valid_embeddings, valid_tags)
                loss_accuracy_comb[num_comb][0] = float(loss)
                loss_accuracy_comb[num_comb][1] = float(accuracy)
                loss_accuracy_comb[num_comb][2] = "hidden_dim=" + str(hidden_dim) + ", num_layers=" + str(num_layers) \
                    + ", num_epoch=" + str(epochs) +   ", learning_rate=" + str(learning_rate)
                print(str(loss_accuracy_comb[num_comb][0]) + ":::" + str(loss_accuracy_comb[num_comb][1]) + ":::" + loss_accuracy_comb[num_comb][2])
                num_comb += 1



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.306
[Epoch 2]	Train Loss: 		0.246
[Epoch 3]	Train Loss: 		0.233
[Epoch 4]	Train Loss: 		0.229
[Epoch 5]	Train Loss: 		0.233

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	93.024% 	loss: 0.355
0.3554153390528666:::93.02411651611328:::hidden_dim=64, num_layers=2, num_epoch=5, learning_rate=0.01


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.233
[Epoch 2]	Train Loss: 		0.143
[Epoch 3]	Train Loss: 		0.104
[Epoch 4]	Train Loss: 		0.074
[Epoch 5]	Train Loss: 		0.054

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	94.691% 	loss: 0.242
0.24210654832239056:::94.69119262695312:::hidden_dim=64, num_layers=2, num_epoch=5, learning_rate=0.001


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.296
[Epoch 2]	Train Loss: 		0.250
[Epoch 3]	Train Loss: 		0.242
[Epoch 4]	Train Loss: 		0.233
[Epoch 5]	Train Loss: 		0.228
[Epoch 6]	Train Loss: 		0.223
[Epoch 7]	Train Loss: 		0.229
[Epoch 8]	Train Loss: 		0.239
[Epoch 9]	Train Loss: 		0.245
[Epoch 10]	Train Loss: 		0.235

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	93.639% 	loss: 0.322
0.3218309618393377:::93.63927459716797:::hidden_dim=64, num_layers=2, num_epoch=10, learning_rate=0.01


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.240
[Epoch 2]	Train Loss: 		0.148
[Epoch 3]	Train Loss: 		0.113
[Epoch 4]	Train Loss: 		0.084
[Epoch 5]	Train Loss: 		0.060
[Epoch 6]	Train Loss: 		0.045
[Epoch 7]	Train Loss: 		0.033
[Epoch 8]	Train Loss: 		0.031
[Epoch 9]	Train Loss: 		0.025
[Epoch 10]	Train Loss: 		0.019

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	94.304% 	loss: 0.314
0.313645365262637:::94.30364227294922:::hidden_dim=64, num_layers=2, num_epoch=10, learning_rate=0.001


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.397
[Epoch 2]	Train Loss: 		0.393
[Epoch 3]	Train Loss: 		0.392
[Epoch 4]	Train Loss: 		0.392
[Epoch 5]	Train Loss: 		0.391

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	93.061% 	loss: 0.459
0.45947823270873084:::93.06101989746094:::hidden_dim=64, num_layers=5, num_epoch=5, learning_rate=0.01


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.343
[Epoch 2]	Train Loss: 		0.214
[Epoch 3]	Train Loss: 		0.180
[Epoch 4]	Train Loss: 		0.163
[Epoch 5]	Train Loss: 		0.145

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	93.092% 	loss: 0.265
0.2651546497314849:::93.09178161621094:::hidden_dim=64, num_layers=5, num_epoch=5, learning_rate=0.001


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.397
[Epoch 2]	Train Loss: 		0.391
[Epoch 3]	Train Loss: 		0.391
[Epoch 4]	Train Loss: 		0.391
[Epoch 5]	Train Loss: 		0.391
[Epoch 6]	Train Loss: 		0.391
[Epoch 7]	Train Loss: 		0.391
[Epoch 8]	Train Loss: 		0.390
[Epoch 9]	Train Loss: 		0.390
[Epoch 10]	Train Loss: 		0.390

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	93.061% 	loss: 0.458
0.4584947709153859:::93.06101989746094:::hidden_dim=64, num_layers=5, num_epoch=10, learning_rate=0.01


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.368
[Epoch 2]	Train Loss: 		0.267
[Epoch 3]	Train Loss: 		0.197
[Epoch 4]	Train Loss: 		0.172
[Epoch 5]	Train Loss: 		0.156
[Epoch 6]	Train Loss: 		0.143
[Epoch 7]	Train Loss: 		0.134
[Epoch 8]	Train Loss: 		0.122
[Epoch 9]	Train Loss: 		0.111
[Epoch 10]	Train Loss: 		0.106

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	93.935% 	loss: 0.264
0.2641883613072811:::93.9345474243164:::hidden_dim=64, num_layers=5, num_epoch=10, learning_rate=0.001


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.326
[Epoch 2]	Train Loss: 		0.269
[Epoch 3]	Train Loss: 		0.264
[Epoch 4]	Train Loss: 		0.250
[Epoch 5]	Train Loss: 		0.242

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	93.326% 	loss: 0.325
0.3245607111969794:::93.3255386352539:::hidden_dim=128, num_layers=2, num_epoch=5, learning_rate=0.01


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.227
[Epoch 2]	Train Loss: 		0.143
[Epoch 3]	Train Loss: 		0.105
[Epoch 4]	Train Loss: 		0.074
[Epoch 5]	Train Loss: 		0.052

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	94.605% 	loss: 0.246
0.24551738443053947:::94.60507202148438:::hidden_dim=128, num_layers=2, num_epoch=5, learning_rate=0.001


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.350
[Epoch 2]	Train Loss: 		0.292
[Epoch 3]	Train Loss: 		0.287
[Epoch 4]	Train Loss: 		0.290
[Epoch 5]	Train Loss: 		0.276
[Epoch 6]	Train Loss: 		0.271
[Epoch 7]	Train Loss: 		0.273
[Epoch 8]	Train Loss: 		0.254
[Epoch 9]	Train Loss: 		0.262
[Epoch 10]	Train Loss: 		0.254

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	93.547% 	loss: 0.344
0.3441495222711782:::93.5469970703125:::hidden_dim=128, num_layers=2, num_epoch=10, learning_rate=0.01


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.225
[Epoch 2]	Train Loss: 		0.142
[Epoch 3]	Train Loss: 		0.104
[Epoch 4]	Train Loss: 		0.075
[Epoch 5]	Train Loss: 		0.054
[Epoch 6]	Train Loss: 		0.037
[Epoch 7]	Train Loss: 		0.031
[Epoch 8]	Train Loss: 		0.029
[Epoch 9]	Train Loss: 		0.020
[Epoch 10]	Train Loss: 		0.022

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	94.593% 	loss: 0.303
0.3028141982119554:::94.59276580810547:::hidden_dim=128, num_layers=2, num_epoch=10, learning_rate=0.001


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.415
[Epoch 2]	Train Loss: 		0.410
[Epoch 3]	Train Loss: 		0.409
[Epoch 4]	Train Loss: 		0.408
[Epoch 5]	Train Loss: 		0.409

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	93.061% 	loss: 0.476
0.4763931387131022:::93.06101989746094:::hidden_dim=128, num_layers=5, num_epoch=5, learning_rate=0.01


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.368
[Epoch 2]	Train Loss: 		0.300
[Epoch 3]	Train Loss: 		0.215
[Epoch 4]	Train Loss: 		0.192
[Epoch 5]	Train Loss: 		0.175

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	93.006% 	loss: 0.256
0.2559136118677631:::93.00566101074219:::hidden_dim=128, num_layers=5, num_epoch=5, learning_rate=0.001


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.415
[Epoch 2]	Train Loss: 		0.409
[Epoch 3]	Train Loss: 		0.408
[Epoch 4]	Train Loss: 		0.407
[Epoch 5]	Train Loss: 		0.406
[Epoch 6]	Train Loss: 		0.406
[Epoch 7]	Train Loss: 		0.412
[Epoch 8]	Train Loss: 		0.409
[Epoch 9]	Train Loss: 		0.409
[Epoch 10]	Train Loss: 		0.409

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	93.061% 	loss: 0.478
0.4779852944299029:::93.06101989746094:::hidden_dim=128, num_layers=5, num_epoch=10, learning_rate=0.01


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.369
[Epoch 2]	Train Loss: 		0.358
[Epoch 3]	Train Loss: 		0.345
[Epoch 4]	Train Loss: 		0.299
[Epoch 5]	Train Loss: 		0.228
[Epoch 6]	Train Loss: 		0.199
[Epoch 7]	Train Loss: 		0.183
[Epoch 8]	Train Loss: 		0.170
[Epoch 9]	Train Loss: 		0.160
[Epoch 10]	Train Loss: 		0.154

Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	93.529% 	loss: 0.278
0.2776309128208615:::93.5285415649414:::hidden_dim=128, num_layers=5, num_epoch=10, learning_rate=0.001


In [15]:
with open('val_acc.data', 'wb') as use_val_data_save:
    # store the data as binary data stream
    pickle.dump(loss_accuracy_comb, use_val_data_save)

In [16]:
loss_accuracy_comb

[[0.3554153390528666,
  93.02411651611328,
  'hidden_dim=64, num_layers=2, num_epoch=5, learning_rate=0.01'],
 [0.24210654832239056,
  94.69119262695312,
  'hidden_dim=64, num_layers=2, num_epoch=5, learning_rate=0.001'],
 [0.3218309618393377,
  93.63927459716797,
  'hidden_dim=64, num_layers=2, num_epoch=10, learning_rate=0.01'],
 [0.313645365262637,
  94.30364227294922,
  'hidden_dim=64, num_layers=2, num_epoch=10, learning_rate=0.001'],
 [0.45947823270873084,
  93.06101989746094,
  'hidden_dim=64, num_layers=5, num_epoch=5, learning_rate=0.01'],
 [0.2651546497314849,
  93.09178161621094,
  'hidden_dim=64, num_layers=5, num_epoch=5, learning_rate=0.001'],
 [0.4584947709153859,
  93.06101989746094,
  'hidden_dim=64, num_layers=5, num_epoch=10, learning_rate=0.01'],
 [0.2641883613072811,
  93.9345474243164,
  'hidden_dim=64, num_layers=5, num_epoch=10, learning_rate=0.001'],
 [0.3245607111969794,
  93.3255386352539,
  'hidden_dim=128, num_layers=2, num_epoch=5, learning_rate=0.01'],
 [

In [17]:
sorted(loss_accuracy_comb, key = lambda x: x[1], reverse = True)

[[0.24210654832239056,
  94.69119262695312,
  'hidden_dim=64, num_layers=2, num_epoch=5, learning_rate=0.001'],
 [0.24551738443053947,
  94.60507202148438,
  'hidden_dim=128, num_layers=2, num_epoch=5, learning_rate=0.001'],
 [0.3028141982119554,
  94.59276580810547,
  'hidden_dim=128, num_layers=2, num_epoch=10, learning_rate=0.001'],
 [0.313645365262637,
  94.30364227294922,
  'hidden_dim=64, num_layers=2, num_epoch=10, learning_rate=0.001'],
 [0.2641883613072811,
  93.9345474243164,
  'hidden_dim=64, num_layers=5, num_epoch=10, learning_rate=0.001'],
 [0.3218309618393377,
  93.63927459716797,
  'hidden_dim=64, num_layers=2, num_epoch=10, learning_rate=0.01'],
 [0.3441495222711782,
  93.5469970703125,
  'hidden_dim=128, num_layers=2, num_epoch=10, learning_rate=0.01'],
 [0.2776309128208615,
  93.5285415649414,
  'hidden_dim=128, num_layers=5, num_epoch=10, learning_rate=0.001'],
 [0.3245607111969794,
  93.3255386352539,
  'hidden_dim=128, num_layers=2, num_epoch=5, learning_rate=0.01

# 📈 Evaluate valid and test data


We achieve the best validation accuracy of 94.82% using hidden_size=128, num_layers=2, num_epoch=5, learning_rate=0.001 . This will be the set up we use for reporting our results.

## Helper functions

In [43]:
import sys
from collections import defaultdict

def split_tag(chunk_tag):
    """
    split chunk tag into IOBES prefix and chunk_type
    e.g. 
    B-PER -> (B, PER)
    O -> (O, None)
    """
    if chunk_tag == 0:
        return ('O', None)
    return list(labels.keys())[chunk_tag].split('-', maxsplit=1)

def is_chunk_end(prev_tag, tag):
    """
    check if the previous chunk ended between the previous and current word
    e.g. 
    (B-PER, I-PER) -> False
    (B-LOC, O)  -> True
    Note: in case of contradicting tags, e.g. (B-PER, I-LOC)
    this is considered as (B-PER, B-LOC)
    """
    prefix1, chunk_type1 = split_tag(prev_tag)
    prefix2, chunk_type2 = split_tag(tag)

    if prefix1 == 'O':
        return False
    if prefix2 == 'O':
        return prefix1 != 'O'

    if chunk_type1 != chunk_type2:
        return True

    return prefix2 in ['B', 'S'] or prefix1 in ['E', 'S']

def is_chunk_start(prev_tag, tag):
    """
    check if a new chunk started between the previous and current word
    """
    prefix1, chunk_type1 = split_tag(prev_tag)
    prefix2, chunk_type2 = split_tag(tag)

    if prefix2 == 'O':
        return False
    if prefix1 == 'O':
        return prefix2 != 'O'

    if chunk_type1 != chunk_type2:
        return True

    return prefix2 in ['B', 'S'] or prefix1 in ['E', 'S']


def calc_metrics(tp, p, t, percent=True):
    """
    compute overall precision, recall and FB1 (default values are 0.0)
    if percent is True, return 100 * original decimal value
    """
    precision = tp / p if p else 0
    recall = tp / t if t else 0
    fb1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
    if percent:
        return 100 * precision, 100 * recall, 100 * fb1
    else:
        return precision, recall, fb1


def count_chunks(true_seqs, pred_seqs):
    """
    true_seqs: a list of true tags
    pred_seqs: a list of predicted tags
    return: 
    correct_chunks: a dict (counter), 
                    key = chunk types, 
                    value = number of correctly identified chunks per type
    true_chunks:    a dict, number of true chunks per type
    pred_chunks:    a dict, number of identified chunks per type
    correct_counts, true_counts, pred_counts: similar to above, but for tags
    """
    correct_chunks = defaultdict(int)
    true_chunks = defaultdict(int)
    pred_chunks = defaultdict(int)

    correct_counts = defaultdict(int)
    true_counts = defaultdict(int)
    pred_counts = defaultdict(int)

    prev_true_tag, prev_pred_tag = 0, 0
    correct_chunk = None

    for true_tag, pred_tag in zip(true_seqs, pred_seqs):
        if true_tag == pred_tag:
            correct_counts[true_tag] += 1
        true_counts[true_tag] += 1
        pred_counts[pred_tag] += 1

        _, true_type = split_tag(true_tag)
        _, pred_type = split_tag(pred_tag)

        if correct_chunk is not None:
            true_end = is_chunk_end(prev_true_tag, true_tag)
            pred_end = is_chunk_end(prev_pred_tag, pred_tag)

            if pred_end and true_end:
                correct_chunks[correct_chunk] += 1
                correct_chunk = None
            elif pred_end != true_end or true_type != pred_type:
                correct_chunk = None

        true_start = is_chunk_start(prev_true_tag, true_tag)
        pred_start = is_chunk_start(prev_pred_tag, pred_tag)

        if true_start and pred_start and true_type == pred_type:
            correct_chunk = true_type
        if true_start:
            true_chunks[true_type] += 1
        if pred_start:
            pred_chunks[pred_type] += 1

        prev_true_tag, prev_pred_tag = true_tag, pred_tag
    if correct_chunk is not None:
        correct_chunks[correct_chunk] += 1

    return (correct_chunks, true_chunks, pred_chunks, 
        correct_counts, true_counts, pred_counts)

def get_result(correct_chunks, true_chunks, pred_chunks,
    correct_counts, true_counts, pred_counts, verbose=True):
    """
    if verbose, print overall performance, as well as preformance per chunk type;
    otherwise, simply return overall prec, rec, f1 scores
    """
    # sum counts
    sum_correct_chunks = sum(correct_chunks.values())
    sum_true_chunks = sum(true_chunks.values())
    sum_pred_chunks = sum(pred_chunks.values())

    sum_correct_counts = sum(correct_counts.values())
    sum_true_counts = sum(true_counts.values())
    O_correct_counts = sum(v for k, v in correct_counts.items() if k == 0)
    O_true_counts = sum(v for k, v in true_counts.items() if k == 0)
    O_pred_counts = sum(v for k, v in pred_counts.items() if k == 0)

    chunk_types = sorted(list(set(list(true_chunks) + list(pred_chunks))))

    # compute overall precision, recall and FB1 (default values are 0.0)
    prec, rec, f1 = calc_metrics(sum_correct_chunks, sum_pred_chunks, sum_true_chunks)
    res = (prec, rec, f1)
    if not verbose:
        return res

    print("processed %i tokens with %i phrases; " % (sum_true_counts, sum_true_chunks), end='')
    print("found: %i phrases; correct: %i.\n" % (sum_pred_chunks, sum_correct_chunks), end='')
    print()
    print("%i Entity Types:" % (len(chunk_types)))
    print("accuracy: %6.2f%%; " % (100*sum_correct_counts/sum_true_counts), end='')
    print("precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f" % (prec, rec, f1))

    for t in chunk_types:
        prec, rec, f1 = calc_metrics(correct_chunks[t], pred_chunks[t], true_chunks[t])
        print("%17s: " %t , end='')
        print("precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f" %
                    (prec, rec, f1), end='')
        print("  %d" % pred_chunks[t])

    print()
    print("No Types: ")
    print("accuracy: %6.2f%%; " % (100*O_correct_counts/O_true_counts), end='')
    prec, rec, f1 = calc_metrics(O_correct_counts, O_pred_counts, O_true_counts)
    print("precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f" % (prec, rec, f1), end='')
    print("  %d" % O_pred_counts)
    return res

def ConLLEval(true_seqs, pred_seqs, verbose=True):
    (correct_chunks, true_chunks, pred_chunks,
        correct_counts, true_counts, pred_counts) = count_chunks(true_seqs, pred_seqs)
    result = get_result(correct_chunks, true_chunks, pred_chunks,
        correct_counts, true_counts, pred_counts, verbose=verbose)
    return result

## Run

In [51]:
# Based on perplexity on validation data, we choose these values for hyperparamters
hidden_dim=64
num_layers=2
epochs=5
learning_rate=0.001

# train model
classifier = RNN(hidden_dim, num_layers).to(device)
train(classifier, device, hidden_dim, num_layers, epochs, learning_rate)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

[Epoch 1]	Train Loss: 		0.239
[Epoch 2]	Train Loss: 		0.145
[Epoch 3]	Train Loss: 		0.108
[Epoch 4]	Train Loss: 		0.080
[Epoch 5]	Train Loss: 		0.058



In [52]:
print('[Validation Data]')
preds, truth, loss, accuracy = evaluate(classifier, device, valid_embeddings, valid_tags)
print("Loss:", loss)
print("Accuracy:", loss)

print()
ConLLEval(truth, preds)

[Validation Data]
Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Overall Accuracy: 	94.710% 	loss: 0.231
Loss: 0.23104000055883858
Accuracy: 0.23104000055883858

processed 16256 tokens with 661 phrases; found: 636 phrases; correct: 266.

10 Entity Types:
accuracy:  94.71%; precision:  41.82%; recall:  40.24%; FB1:  41.02
          company: precision:  18.45%; recall:  48.72%; FB1:  26.76  103
         facility: precision:  12.50%; recall:  10.53%; FB1:  11.43  32
          geo-loc: precision:  57.58%; recall:  65.52%; FB1:  61.29  132
            movie: precision:   0.00%; recall:   0.00%; FB1:   0.00  2
      musicartist: precision:  23.53%; recall:   9.76%; FB1:  13.79  17
            other: precision:  14.71%; recall:   7.58%; FB1:  10.00  68
           person: precision:  63.69%; recall:  66.67%; FB1:  65.14  179
          product: precision:   9.52%; recall:   5.41%; FB1:   6.90  21
       sportsteam: precision:  52.86%; recall:  52.86%; FB1:  52.86  70
           tvshow: precision:   0.00%; recall:   0.00%; FB1:   0.00  12

No Types: 
accurac

(41.82389937106918, 40.24205748865356, 41.017733230531995)

In [53]:
print('[Test Data]')
preds, truth, loss, accuracy = evaluate(classifier, device, test_embeddings, test_tags)
print("Loss:", loss)
print("Accuracy:", loss)

print()
ConLLEval(truth, preds)

[Test Data]
Evaluating ... 


HBox(children=(FloatProgress(value=0.0, max=3860.0), HTML(value='')))


Overall Accuracy: 	92.563% 	loss: 0.379
Loss: 0.37926863981989734
Accuracy: 0.37926863981989734

processed 61880 tokens with 3473 phrases; found: 3260 phrases; correct: 1296.

10 Entity Types:
accuracy:  92.56%; precision:  39.75%; recall:  37.32%; FB1:  38.50
          company: precision:  38.56%; recall:  47.50%; FB1:  42.57  765
         facility: precision:  16.13%; recall:   9.88%; FB1:  12.25  155
          geo-loc: precision:  56.88%; recall:  59.52%; FB1:  58.17  923
            movie: precision:   0.00%; recall:   0.00%; FB1:   0.00  6
      musicartist: precision:  14.71%; recall:   2.62%; FB1:   4.44  34
            other: precision:  12.68%; recall:   5.99%; FB1:   8.14  276
           person: precision:  48.05%; recall:  68.88%; FB1:  56.61  691
          product: precision:   5.56%; recall:   2.03%; FB1:   2.98  90
       sportsteam: precision:  25.36%; recall:  48.30%; FB1:  33.26  280
           tvshow: precision:   7.50%; recall:   9.09%; FB1:   8.22  40

No Types: 
a

(39.75460122699386, 37.31644111718975, 38.49695529481657)