<a href="https://colab.research.google.com/github/darisoy/EE517_Sp21/blob/master/hw3/hw3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Python environment

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 9.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 36.4MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 52.0MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [2]:
import numpy as np
import pandas as pd
import math
import torch
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the data

In [3]:
labels = {'O' : 0,
          'B-geo-loc' : 1,
          'I-geo-loc' : 2,
          'B-product' : 3,
          'I-product' : 4,
          'B-facility' : 5,
          'I-facility' : 6,
          'B-company' : 7,
          'I-company' : 8,
          'B-person' : 9,
          'I-person' : 10,
          'B-sportsteam' : 11,
          'I-sportsteam' : 12,
          'B-musicartist' : 13,
          'I-musicartist' : 14,
          'B-movie' : 15,
          'I-movie' : 16,
          'B-tvshow' : 17,
          'I-tvshow' : 18,
          'B-other' : 19,
          'I-other' : 20,
          }
end_token = '<END>'
beg_token = '<BEG>'

In [4]:
def get_sentences(df):
    sentences = []
    labels = []
    running_sentence = [beg_token]
    runnnig_label = [0]
    for idx, row in df.iterrows():
        running_sentence.append(row.word)
        runnnig_label.append(row.tag)
        if row.word == end_token:
            sentences.append(running_sentence)
            labels.append(runnnig_label)
            running_sentence = [beg_token]
            runnnig_label = [0]
    return sentences, labels

def get_data(type):
    data = pd.read_csv('https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/wnut16/data/' + type, delimiter='\t', names=["word", "tag"], skip_blank_lines=False, quoting=3)
    data = data.fillna({'word': end_token, 'tag': 'O'})
    data.tag = data.tag.apply((lambda x: labels[x]))
    return get_sentences(data)

# Encode the data using BERT transformer

## Load the transformer

In [5]:
transformer_name = "distilbert-base-uncased"
transformer = DistilBertModel.from_pretrained(transformer_name)
tokenizer = DistilBertTokenizer.from_pretrained(transformer_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




## Preprocessing before sending to the transformer

In [6]:
def get_BERT_loader(dataset, max_len=182, batch_size=64):
    sentences, labels = get_data(dataset)
    token = [[tokenizer.encode(w, add_special_tokens=True) for w in s] for s in sentences]
    flat_token = [list(np.concatenate(sentence).flat) for sentence in token]
    padded = np.array([i + [0]*(max_len-len(i)) for i in flat_token])
    attention_mask = np.where(padded != 0, 1, 0)
    data = TensorDataset(torch.from_numpy(padded), torch.from_numpy(attention_mask))
    data_loader = DataLoader(data, shuffle=False, batch_size=batch_size)
    return data_loader, labels, token

In [7]:
train_trans_loader, train_labels, train_token = get_BERT_loader('train')
valid_trans_loader, valid_labels, valid_token = get_BERT_loader('dev')
test_trans_loader, test_labels, test_token = get_BERT_loader('test')

## Encode Using BERT

In [8]:
def get_embeddings(target_loader):
    transformer.eval()
    transformer.to(device)
    all_embed = []
    for sample, sample_mask in tqdm(target_loader):
        with torch.no_grad():
            sample, sample_mask = sample.to(device), sample_mask.to(device)
            last_hidden_states = transformer(sample, attention_mask=sample_mask)
            all_embed.append(last_hidden_states[0][:,0,:])
    return torch.cat(all_embed,dim =0)

In [9]:
train_embed = get_embeddings(train_trans_loader)
valid_embed = get_embeddings(valid_trans_loader)
test_embed = get_embeddings(test_trans_loader)

HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))




In [82]:
def clean_embed_output(embed, token):
    data_clean_embed = []
    for i, token_sentence in enumerate(token):
        clean_embed = []
        embed_idx = 0
        for word in token_sentence:
            clean_embed.append(torch.mean(embed[i][embed_idx:embed_idx+len(word)]).item())
            embed_idx += len(word)
        data_clean_embed.append(torch.tensor(clean_embed[1:len(clean_embed)-1]))
    return data_clean_embed

In [83]:
train_clean_embed = clean_embed_output(train_embed, train_token)
valid_clean_embed = clean_embed_output(valid_embed, valid_token)
test_clean_embed = clean_embed_output(test_embed, test_token)

In [12]:
train_labels_clean = [torch.tensor(x[1:len(x)-1]) for x in train_labels]
valid_labels_clean = [torch.tensor(x[1:len(x)-1]) for x in valid_labels]
test_labels_clean = [torch.tensor(x[1:len(x)-1]) for x in test_labels]

# Classify the embeddings using RNN

In [65]:
# Model Definition
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()
        hidden_size = 256
        self.rnn = nn.LSTM(input_size=1, hidden_size=hidden_size) 
        self.fc = nn.Linear(in_features=hidden_size, out_features=len(labels))

    def forward(self, sentence, h=None):
        out = []
        for X in sentence:
            tmp, h = self.rnn(X.unsqueeze(dim=0).unsqueeze(dim=0).unsqueeze(dim=0), h)
            out.append(self.fc(tmp))
        return torch.stack(out).squeeze(1), h

In [66]:
# train the classifier NOT TESTED YET
classifier = RNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(classifier.parameters(), lr=0.01)
epochs = 4
for epoch in range(epochs):
    running_loss = 0.0
    for i, sentence in enumerate(tqdm(train_clean_embed)):
        tags = train_labels_clean[i]
        sentence, tags = sentence.to(device), tags.to(device)
        optimizer.zero_grad()
        outputs, _ = classifier(sentence)
        loss = criterion(outputs.squeeze(dim=1), tags)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print('[Epoch %d]\tTrain Loss: \t\t%.3f' % (epoch+1, running_loss / len(train_clean_embed)))

HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))


[Epoch 1]	Train Loss: 		0.531


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))


[Epoch 2]	Train Loss: 		0.373


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))


[Epoch 3]	Train Loss: 		0.365


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))


[Epoch 4]	Train Loss: 		0.361


In [None]:
correct = 0
total = 0
running_loss = 0
true_distribution = [0 for _ in range(len(labels))]
pick_distribution = [0 for _ in range(len(labels))]
for i, sentence in enumerate(tqdm(valid_clean_embed)):
    if len(sentence) > 0:
        tags = valid_labels_clean[i]
        sentence, tags = sentence.to(device), tags.to(device)
        optimizer.zero_grad()
        outputs, _ = classifier(sentence)
        total += len(tags)
        correct += torch.sum(tags == outputs.squeeze(dim=1).argmax(dim=1))
        loss = criterion(outputs.squeeze(dim=1), tags)
        running_loss += loss.item()
print('Accuracy: \t%.3f%%' % (100*correct/total))

In [84]:
total = 0
for i in range(len(train_clean_embed)):
    sentence = train_clean_embed[i]
    tags = train_labels_clean[i]
    sentence, tags = sentence.to(device), tags.to(device)
    optimizer.zero_grad()
    outputs, _ = classifier(sentence)
    pred = outputs.squeeze(dim=1).argmax(dim=1)
    # print()
    # print(tags)
    total += torch.sum(pred).item()
    print(pred)
print(total)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0