<a href="https://colab.research.google.com/github/darisoy/EE517_Sp21/blob/master/hw3/hw3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Python environment

In [55]:
!pip install transformers



In [56]:
import numpy as np
import pandas as pd
import math
import torch
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the data

In [57]:
# Tag name -> integer class id. 'O' is 0; after that every entity type
# contributes its 'B-' tag followed by its 'I-' tag, in the order listed.
labels = {
    tag: class_id
    for class_id, tag in enumerate(
        ['O'] + [
            prefix + entity
            for entity in (
                'geo-loc', 'product', 'facility', 'company', 'person',
                'sportsteam', 'musicartist', 'movie', 'tvshow', 'other',
            )
            for prefix in ('B-', 'I-')
        ]
    )
}

# Marker words: one opens every sentence, the other closes it.
beg_token = '<BEG>'
end_token = '<END>'

In [58]:
def get_sentences(df):
    """Split a flat word/tag table into per-sentence word and tag lists.

    Args:
        df: DataFrame with a ``word`` and a ``tag`` column, one row per
            token. A row whose word equals ``end_token`` closes the
            sentence currently being collected.

    Returns:
        A ``(sentences, sentence_labels)`` pair of parallel lists. Every
        sentence starts with ``beg_token`` (tagged ``0``) and ends with
        the ``end_token`` row that closed it, together with that row's
        tag. Rows after the last ``end_token`` row are not returned.
    """
    sentences = []
    # Named so the module-level ``labels`` tag map is not shadowed.
    sentence_labels = []
    running_sentence = [beg_token]
    running_label = [0]
    # Walk the two columns in lockstep instead of DataFrame.iterrows(),
    # which constructs a Series object for every single row.
    for word, tag in zip(df['word'], df['tag']):
        running_sentence.append(word)
        running_label.append(tag)
        if word == end_token:
            sentences.append(running_sentence)
            sentence_labels.append(running_label)
            running_sentence = [beg_token]
            running_label = [0]
    return sentences, sentence_labels

def get_data(type):
    """Download one WNUT-16 split and return its sentences and label ids.

    Args:
        type: File name of the split inside the ``wnut16/data`` directory
            of the twitter_nlp repository (this file uses ``'train'``,
            ``'dev'`` and ``'test'``). The name shadows the builtin but is
            kept so existing callers continue to work.

    Returns:
        The ``(sentences, labels)`` pair produced by ``get_sentences``,
        with every tag already mapped to its integer id.

    Raises:
        KeyError: If the file contains a tag that is not a key of ``labels``.
    """
    data = pd.read_csv(
        'https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/wnut16/data/' + type,
        delimiter='\t',
        names=["word", "tag"],
        # Blank lines are kept as rows; they become the sentence boundaries.
        skip_blank_lines=False,
        # 3 == csv.QUOTE_NONE: quote characters are ordinary text.
        quoting=3,
        # Only empty fields count as missing. With pandas' default NA
        # strings, literal tokens such as "NA", "null" or "nan" would be
        # parsed as NaN and then turned into a spurious end-of-sentence
        # marker by the fillna below.
        keep_default_na=False,
        na_values=[''],
    )
    # A row with no word is a sentence boundary; give it the 'O' tag.
    data = data.fillna({'word': end_token, 'tag': 'O'})
    data['tag'] = data['tag'].apply(lambda tag: labels[tag])
    return get_sentences(data)

# Encode the data using a BERT transformer

## Load the transformer

In [59]:
# Checkpoint name shared by the model and the tokenizer, so both are
# loaded from the same pretrained weights/vocabulary.
transformer_name = "distilbert-base-uncased"
# Encoder model; used by get_embeddings to produce hidden states.
transformer = DistilBertModel.from_pretrained(transformer_name)
# Tokenizer that turns words into the token ids the model consumes.
tokenizer = DistilBertTokenizer.from_pretrained(transformer_name)

## Preprocessing before sending to the transformer

In [60]:
def get_BERT_loader(dataset, max_len=182, batch_size=64):
    """Tokenize one data split and wrap it in a DataLoader for the transformer.

    Each word is encoded on its own with
    ``tokenizer.encode(word, add_special_tokens=True)``; the per-word id
    lists of a sentence are concatenated and right-padded with ``0`` up
    to ``max_len``.

    Args:
        dataset: Split name forwarded to ``get_data``.
        max_len: Length every id sequence is padded to.
        batch_size: Batch size of the returned loader.

    Returns:
        ``(data_loader, labels)``: an unshuffled DataLoader yielding
        ``(input_ids, attention_mask)`` tensor pairs, and the
        per-sentence label lists returned by ``get_data``.

    Raises:
        ValueError: If any tokenized sentence is longer than ``max_len``
            (padding could not produce a rectangular array).
    """
    sentences, sentence_labels = get_data(dataset)
    token = [[tokenizer.encode(w, add_special_tokens=True) for w in s] for s in sentences]
    # Flatten word -> ids into one id list per sentence.
    flat_token = [[token_id for word in sentence for token_id in word] for sentence in token]

    # Fail early with a clear message instead of building a ragged array.
    longest = max((len(ids) for ids in flat_token), default=0)
    if longest > max_len:
        raise ValueError(
            "Tokenized sentence of length %d exceeds max_len=%d in split %r"
            % (longest, max_len, dataset)
        )

    # 0 doubles as the padding id and the "masked out" marker below.
    # NOTE(review): assumes the tokenizer's padding id is 0 - confirm.
    padded = np.array(
        [ids + [0] * (max_len - len(ids)) for ids in flat_token],
        dtype=np.int64,  # explicit so the id dtype does not depend on the platform
    )
    attention_mask = (padded != 0).astype(np.int64)
    data = TensorDataset(torch.from_numpy(padded), torch.from_numpy(attention_mask))
    # shuffle=False keeps loader order aligned with the returned labels.
    data_loader = DataLoader(data, shuffle=False, batch_size=batch_size)
    return data_loader, sentence_labels

In [61]:
# Build, for each split, the transformer input loader and the matching
# per-sentence label lists. Each call downloads and tokenizes its split.
train_trans_loader, train_labels = get_BERT_loader('train')
valid_trans_loader, valid_labels = get_BERT_loader('dev')
test_trans_loader, test_labels = get_BERT_loader('test')

## Encode Using BERT

In [48]:
def get_embeddings(target_loader):
    """Run the transformer over a loader and collect position-0 hidden states.

    The module-level ``transformer`` is switched to eval mode and moved
    to ``device`` before the loader is consumed.

    Args:
        target_loader: Iterable yielding ``(input_ids, attention_mask)``
            batches.

    Returns:
        A single tensor built by concatenating, along dim 0, the slice
        ``output[0][:, 0, :]`` of the transformer output for every batch.
    """
    transformer.eval()
    transformer.to(device)
    batch_embeddings = []
    # Inference only: no autograd graph is needed for any batch.
    with torch.no_grad():
        for input_ids, mask in tqdm(target_loader):
            input_ids = input_ids.to(device)
            mask = mask.to(device)
            model_output = transformer(input_ids, attention_mask=mask)
            # Keep only sequence position 0 of the first output element.
            first_position = model_output[0][:, 0, :]
            batch_embeddings.append(first_position)
    return torch.cat(batch_embeddings, dim=0)

In [29]:
# Encode every split with the transformer; each result is one tensor with
# a row per sentence, in loader order.
train_embed = get_embeddings(train_trans_loader)
valid_embed = get_embeddings(valid_trans_loader)
test_embed = get_embeddings(test_trans_loader)

In [222]:
def clean_embed_output(embed, token):
    """Collapse each word's slice of a sentence embedding into one scalar.

    For sentence ``i`` the words in ``token[i]`` claim consecutive,
    non-overlapping index ranges of ``embed[i]``; the range length is the
    number of ids in the word. Each range is reduced with ``torch.mean``
    over all of its elements, so every word yields a single float.

    Args:
        embed: Tensor (or sequence of tensors) indexed by sentence.
        token: Per sentence, a list of per-word token-id lists.

    Returns:
        A list with one 1-D tensor per sentence holding the word scalars,
        with the first and the last word of the sentence dropped.
    """
    per_sentence = []
    for sent_idx, words in enumerate(token):
        word_means = []
        start = 0
        for word in words:
            stop = start + len(word)
            # No ``dim`` argument: the whole slice is averaged to a scalar.
            word_means.append(embed[sent_idx][start:stop].mean().item())
            start = stop
        # Drop the entries of the leading and trailing marker words.
        per_sentence.append(torch.tensor(word_means[1:-1]))
    return per_sentence

In [214]:
# clean_embed_output needs, for every sentence, the per-word token-id
# lists. get_BERT_loader computes them internally but only returns the
# loader and the labels, so they are rebuilt here with the same encoding
# call. This downloads each split a second time.
def _encode_words(dataset):
    """Return, for each sentence of the split, one token-id list per word."""
    sentences, _ = get_data(dataset)
    return [[tokenizer.encode(w, add_special_tokens=True) for w in s] for s in sentences]


train_token = _encode_words('train')
valid_token = _encode_words('dev')
test_token = _encode_words('test')

# One 1-D tensor of per-word scalars per sentence, marker words removed.
train_clean_embed = clean_embed_output(train_embed, train_token)
valid_clean_embed = clean_embed_output(valid_embed, valid_token)
test_clean_embed = clean_embed_output(test_embed, test_token)

In [64]:
# Drop the first and last label of every sentence (the entries that belong
# to the begin/end marker words) and convert what remains to a tensor.
train_labels_clean = [torch.tensor(sentence_labels[1:-1]) for sentence_labels in train_labels]
valid_labels_clean = [torch.tensor(sentence_labels[1:-1]) for sentence_labels in valid_labels]
test_labels_clean = [torch.tensor(sentence_labels[1:-1]) for sentence_labels in test_labels]

# Classify the embeddings using an RNN

In [238]:
# Model Definition
class RNN(nn.Module):
    """Single-layer ``nn.RNN`` followed by a linear per-step classifier.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the recurrent hidden state.
        num_classes: Number of output classes per time step; the default
            matches the 21 entries of the tag map.
    """

    def __init__(self, input_size=1, hidden_size=128, num_classes=21):
        super().__init__()
        self.rnn_layer = nn.RNN(input_size=input_size, hidden_size=hidden_size)
        self.out_layer = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, sentence, h=None):
        """Classify every time step of one sentence.

        Args:
            sentence: Tensor holding a single sentence. It is reshaped to
                ``(seq_len, 1, input_size)``, so a 1-D tensor of per-word
                scalars works when ``input_size`` is 1, and a
                ``(seq_len, input_size)`` tensor works in general.
            h: Optional initial hidden state of shape
                ``(1, 1, hidden_size)``; ``None`` starts from zeros.

        Returns:
            ``(logits, h)``: logits of shape ``(seq_len, num_classes)``
            and the final hidden state of shape ``(1, 1, hidden_size)``.
        """
        # nn.RNN with batch_first=False takes (seq_len, batch, input_size);
        # the sentence is fed as a batch of one in a single call instead
        # of stepping through it word by word.
        rnn_in = sentence.reshape(-1, 1, self.rnn_layer.input_size)
        hidden_seq, h = self.rnn_layer(rnn_in, h)
        # Remove the batch axis so logits line up with per-word targets.
        return self.out_layer(hidden_seq).squeeze(1), h

In [None]:
# Train the classifier (NOT TESTED YET).
# The model is moved to `device` because every input is moved there too.
classifier = RNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=0.001)
epochs = 10
for epoch in range(epochs):
    classifier.train()
    running_loss = 0.0
    num_steps = 0
    sentence_pairs = zip(train_clean_embed, train_labels_clean)
    # `targets` (not `labels`) so the module-level tag map is not overwritten.
    for inputs, targets in tqdm(sentence_pairs, total=len(train_clean_embed)):
        if inputs.numel() == 0:
            # Nothing left after stripping the marker words; no loss to compute.
            continue
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs, _ = classifier(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_steps += 1
    # Average over the sentences actually trained on; guard the empty case.
    print('[Epoch %d]\tTrain Loss: \t\t%.3f' % (epoch+1, running_loss / max(num_steps, 1)))