<a href="https://colab.research.google.com/github/darisoy/EE517_Sp21/blob/master/hw3/hw3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🐍Setup Python environment

In [5]:
!pip install transformers



In [24]:
import numpy as np
import pandas as pd
import math
import torch
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from tokenizers import decoders
from sklearn.metrics import classification_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 📀Load the data

In [7]:
labels = {'O' : 0,
          'B-geo-loc' : 1,
          'I-geo-loc' : 2,
          'B-product' : 3,
          'I-product' : 4,
          'B-facility' : 5,
          'I-facility' : 6,
          'B-company' : 7,
          'I-company' : 8,
          'B-person' : 9,
          'I-person' : 10,
          'B-sportsteam' : 11,
          'I-sportsteam' : 12,
          'B-musicartist' : 13,
          'I-musicartist' : 14,
          'B-movie' : 15,
          'I-movie' : 16,
          'B-tvshow' : 17,
          'I-tvshow' : 18,
          'B-other' : 19,
          'I-other' : 20,
          }
end_token = '<END>'
beg_token = '<BEG>'

In [8]:
def get_sentences(df):
    sentences = []
    labels = []
    running_sentence = ''
    runnnig_label = []
    for idx, row in df.iterrows():
        if row.word == end_token:
            if len(running_sentence[:-1]) > 0:
                sentences.append(running_sentence[:-1])
                labels.append(runnnig_label)
            running_sentence = ''
            runnnig_label = []
        else:
            running_sentence += row.word + ' '
            runnnig_label.append(row.tag)
    return sentences, labels

def get_data(type):
    data = pd.read_csv('https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/wnut16/data/' + type, delimiter='\t', names=["word", "tag"], skip_blank_lines=False, quoting=3)
    data = data.fillna({'word': end_token, 'tag': 'O'})
    data.tag = data.tag.apply((lambda x: labels[x]))
    return get_sentences(data)

# 🔐Encode the data using BERT transformer

## Load the transformer

In [9]:
transformer_name = "distilbert-base-uncased"
transformer = DistilBertModel.from_pretrained(transformer_name)
tokenizer = DistilBertTokenizer.from_pretrained(transformer_name)
tokenizer.decoder = decoders.WordPiece()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




## Get dataset embeddings

In [10]:
def get_sublist_start_end(sl, l):
    results = []
    sll = len(sl)
    for ind in (i for i,e in enumerate(l) if e==sl[0]):
        if tokenizer.decode(l[ind:ind+sll]) ==  tokenizer.decode(sl):
            results.append([ind,ind+sll-1])
    return results

def get_embeddings(sentences):
    transformer.eval()
    transformer.to(device)
    data = []
    for sentence in tqdm(sentences):
        with torch.no_grad():
            tokens = tokenizer.encode(sentence)
            out = transformer(torch.tensor(tokens).unsqueeze(0).to(device))
            embed = []
            for i, word in enumerate(sentence.split()):
                target = word
                target_ids = tokenizer.encode(target, add_special_tokens=False)
                target_idx = get_sublist_start_end(target_ids, tokens)[0]
                embed.append(torch.mean(out[0][0][target_idx[0]:target_idx[1]+1], 0))
            data.append(torch.stack(embed))
    return data

In [11]:
train_sentences, train_tags = get_data('train')
train_embeddings = get_embeddings(train_sentences)

HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))




In [12]:
valid_sentences, valid_tags = get_data('dev')
valid_embeddings = get_embeddings(valid_sentences)

HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))




In [13]:
test_sentences, test_tags = get_data('test')
test_embeddings = get_embeddings(test_sentences)

HBox(children=(FloatProgress(value=0.0, max=3860.0), HTML(value='')))




# 🧑‍💻Classify the embeddings using RNN

In [19]:
# Model Definition
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()
        hidden_size = 256
        self.rnn = nn.LSTM(input_size=768, hidden_size=hidden_size) 
        self.fc = nn.Linear(in_features=hidden_size, out_features=len(labels))

    def forward(self, sentence, h=None):
        out = []
        for X in sentence:
            tmp, h = self.rnn(X.unsqueeze(dim=0).unsqueeze(dim=0), h)
            out.append(self.fc(tmp))
        return torch.stack(out).squeeze(1), h

In [20]:
classifier = RNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=0.001)
epochs = 10

In [21]:
# train
for epoch in range(epochs):
    running_loss = 0.0
    for i, sentence in enumerate(tqdm(train_embeddings)):
        tags = torch.tensor(train_tags[i])
        sentence, tags = sentence.to(device), tags.to(device)
        optimizer.zero_grad()
        outputs, _ = classifier(sentence)
        loss = criterion(outputs.squeeze(dim=1), tags)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print('[Epoch %d]\tTrain Loss: \t\t%.3f' % (epoch+1, running_loss / len(train_embeddings)))

HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))


[Epoch 1]	Train Loss: 		0.208


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))


[Epoch 2]	Train Loss: 		0.132


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))


[Epoch 3]	Train Loss: 		0.099


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))


[Epoch 4]	Train Loss: 		0.071


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))


[Epoch 5]	Train Loss: 		0.047


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))


[Epoch 6]	Train Loss: 		0.033


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))


[Epoch 7]	Train Loss: 		0.024


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))


[Epoch 8]	Train Loss: 		0.017


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))


[Epoch 9]	Train Loss: 		0.015


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))


[Epoch 10]	Train Loss: 		0.013


In [22]:
def evaluate(sentences, sentence_tags):
    correct = 0
    total = 0
    running_loss = 0
    truth = []
    preds = []
    for i, sentence in enumerate(tqdm(sentences)):
        tags = torch.tensor(sentence_tags[i])
        sentence, tags = sentence.to(device), tags.to(device)
        optimizer.zero_grad()

        outputs, _ = classifier(sentence)
        pred = outputs.squeeze(dim=1).argmax(dim=1)
        loss = criterion(outputs.squeeze(dim=1), tags)

        correct += torch.sum(tags == pred)
        total += len(tags)
        running_loss += loss.item()

        for t in sentence_tags[i]:
            truth.append(t)
        for p in pred:
            preds.append(p.item())

    print('Overall Accuracy: \t%.3f%% \tloss: %.3f' % (100*correct/total, running_loss/len(sentences)))
    return preds, truth

In [25]:
print('[Validation Data]')
preds, truth = evaluate(valid_embeddings, valid_tags)
print()
print(classification_report(truth, preds, target_names=labels))

[Validation Data]


HBox(children=(FloatProgress(value=0.0, max=1003.0), HTML(value='')))


Accuracy: 	94.667% 	loss: 0.292

               precision    recall  f1-score   support

            O       0.97      0.99      0.98     15128
    B-geo-loc       0.53      0.65      0.58       116
    I-geo-loc       0.67      0.29      0.40        42
    B-product       0.54      0.19      0.28        37
    I-product       1.00      0.09      0.17       121
   B-facility       0.30      0.37      0.33        38
   I-facility       0.43      0.33      0.38        39
    B-company       0.37      0.49      0.42        39
    I-company       0.06      0.10      0.07        10
     B-person       0.73      0.71      0.72       171
     I-person       0.78      0.69      0.73        95
 B-sportsteam       0.82      0.26      0.39        70
 I-sportsteam       0.67      0.31      0.42        13
B-musicartist       0.30      0.07      0.12        41
I-musicartist       0.57      0.11      0.19        35
      B-movie       0.67      0.13      0.22        15
      I-movie       0.20      

  _warn_prf(average, modifier, msg_start, len(result))


# 📊Results on test data

In [30]:
labels_fun = {'❌O' : 0,
          '📍B-geo-loc' : 1,
          '📍I-geo-loc' : 2,
          '🎧B-product' : 3,
          '🎧I-product' : 4,
          '🏭B-facility' : 5,
          '🏭I-facility' : 6,
          '🏬B-company' : 7,
          '🏬I-company' : 8,
          '🧑B-person' : 9,
          '🧑I-person' : 10,
          '⚽️B-sportsteam' : 11,
          '⚽️I-sportsteam' : 12,
          '🎶B-musicartist' : 13,
          '🎶I-musicartist' : 14,
          '🎥B-movie' : 15,
          '🎥I-movie' : 16,
          '📺B-tvshow' : 17,
          '📺I-tvshow' : 18,
          '🤷‍B-other' : 19,
          '🤷‍I-other' : 20,
          }

In [31]:
print('[Test Data]')
preds, truth = evaluate(test_embeddings, test_tags)
print()
print(classification_report(truth, preds, target_names=labels_fun))

[Test Data]


HBox(children=(FloatProgress(value=0.0, max=3860.0), HTML(value='')))


Accuracy: 	92.846% 	loss: 0.468

                precision    recall  f1-score   support

            ❌O       0.95      0.99      0.97     55925
    📍B-geo-loc       0.65      0.67      0.66       882
    📍I-geo-loc       0.51      0.53      0.52       219
    🎧B-product       0.48      0.11      0.17       246
    🎧I-product       0.58      0.05      0.10       500
   🏭B-facility       0.42      0.32      0.36       253
   🏭I-facility       0.55      0.42      0.48       366
    🏬B-company       0.62      0.44      0.52       621
    🏬I-company       0.45      0.20      0.28       265
     🧑B-person       0.57      0.73      0.64       482
     🧑I-person       0.74      0.72      0.73       300
⚽️B-sportsteam       0.60      0.26      0.36       147
⚽️I-sportsteam       0.52      0.27      0.36        48
🎶B-musicartist       0.29      0.05      0.08       191
🎶I-musicartist       0.39      0.09      0.15       140
      🎥B-movie       0.29      0.06      0.10        34
      🎥I-movi