<a href="https://colab.research.google.com/github/darisoy/EE517_Sp21/blob/master/hw3/hw3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [237]:
import numpy as np
import pandas as pd
import math
import torch
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the data

In [138]:
labels = {'O' : 0,
          'B-geo-loc' : 1,
          'I-geo-loc' : 2,
          'B-product' : 3,
          'I-product' : 4,
          'B-facility' : 5,
          'I-facility' : 6,
          'B-company' : 7,
          'I-company' : 8,
          'B-person' : 9,
          'I-person' : 10,
          'B-sportsteam' : 11,
          'I-sportsteam' : 12,
          'B-musicartist' : 13,
          'I-musicartist' : 14,
          'B-movie' : 15,
          'I-movie' : 16,
          'B-tvshow' : 17,
          'I-tvshow' : 18,
          'B-other' : 19,
          'I-other' : 20,
          }
end_token = '<END>'
beg_token = '<BEG>'

In [141]:
def get_sentences(df):
    sentences = []
    labels = []

    running_sentence = [beg_token]
    runnnig_label = [0]
    for idx, row in df.iterrows():
        running_sentence.append(row.word)
        runnnig_label.append(row.tag)
        if row.word == end_token:
            sentences.append(running_sentence)
            labels.append(runnnig_label)

            running_sentence = [beg_token]
            runnnig_label = [0]

    return sentences, labels

def get_data(type):
    data = pd.read_csv('https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/wnut16/data/' + type, delimiter='\t', names=["word", "tag"], skip_blank_lines=False, quoting=3)
    data = data.fillna({'word': end_token, 'tag': 'O'})
    data.tag = data.tag.apply((lambda x: labels[x]))
    return get_sentences(data)

In [142]:
train_sentences, train_labels = get_data('train')
valid_sentences, valid_labels = get_data('dev')
test_sentences, test_labels = get_data('test')

# Encode the data

## Load the model

In [144]:
model_name = "distilbert-base-uncased"
model = DistilBertModel.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

## Preprocessing before sending to the model

In [154]:
# This turns every word into the list of ids
train_token = [[tokenizer.encode(w, add_special_tokens=True) for w in sentence] for sentence in train_sentences]
valid_token = [[tokenizer.encode(w, add_special_tokens=True) for w in sentence] for sentence in valid_sentences]
test_token = [[tokenizer.encode(w, add_special_tokens=True) for w in sentence] for sentence in test_sentences]

In [153]:
train_token[0]

[[101, 1026, 11693, 1028, 102],
 [101, 1030, 3520, 9856, 27610, 25855, 2213, 102],
 [101, 1030, 1056, 2290, 10790, 2581, 2620, 2487, 102],
 [101, 2027, 102],
 [101, 2097, 102],
 [101, 2022, 102],
 [101, 2035, 102],
 [101, 2589, 102],
 [101, 2011, 102],
 [101, 4465, 102],
 [101, 3404, 102],
 [101, 2033, 102],
 [101, 1008, 16837, 1008, 102],
 [101, 1026, 2203, 1028, 102]]

In [163]:
flat_train_token = [list(np.concatenate(sentence).flat) for sentence in train_token]
flat_valid_token = [list(np.concatenate(sentence).flat) for sentence in valid_token]
flat_test_token = [list(np.concatenate(sentence).flat) for sentence in test_token]

In [164]:
def get_max_len(tokenized):
    max_len = 0
    for i in tokenized:
        if len(i) > max_len:
            max_len = len(i)
    return max_len

In [165]:
# Padding
max_len = max(get_max_len(flat_train_token), get_max_len(flat_valid_token), get_max_len(flat_test_token))

padded_train = np.array([i + [0]*(max_len-len(i)) for i in flat_train_token])
padded_valid = np.array([i + [0]*(max_len-len(i)) for i in flat_valid_token])
padded_test = np.array([i + [0]*(max_len-len(i)) for i in flat_test_token])

print("Padded shape (train): ", np.array(padded_train).shape)
print("Padded shape (valid): ", np.array(padded_valid).shape)
print("Padded shape (test): ", np.array(padded_test).shape)

Padded shape (train):  (2394, 182)
Padded shape (valid):  (1005, 182)
Padded shape (test):  (3878, 182)


In [167]:
# Tell BERT to ignore padding
attention_mask_train = np.where(padded_train != 0, 1, 0)
attention_mask_valid = np.where(padded_valid != 0, 1, 0)
attention_mask_test = np.where(padded_test != 0, 1, 0)

In [218]:
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(padded_train), torch.from_numpy(attention_mask_train))
valid_data = TensorDataset(torch.from_numpy(padded_valid), torch.from_numpy(attention_mask_valid))
test_data = TensorDataset(torch.from_numpy(padded_test), torch.from_numpy(attention_mask_test))

batch_size = 64

train_loader = DataLoader(train_data, shuffle=False, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [219]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_mask_x = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample input mask size: ', sample_mask_x.size()) # batch_size, seq_length
print('Sample input mask: \n', sample_mask_x)

Sample input size:  torch.Size([64, 182])
Sample input: 
 tensor([[  101,  1026, 11693,  ...,     0,     0,     0],
        [  101,  1026, 11693,  ...,     0,     0,     0],
        [  101,  1026, 11693,  ...,     0,     0,     0],
        ...,
        [  101,  1026, 11693,  ...,     0,     0,     0],
        [  101,  1026, 11693,  ...,     0,     0,     0],
        [  101,  1026, 11693,  ...,     0,     0,     0]])

Sample input mask size:  torch.Size([64, 182])
Sample input mask: 
 tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])


## Using BERT to encode

In [220]:
def get_embeddings(target_loader):
    model.eval()
    model.to(device)
    all_embed = []
    for sample, sample_mask in target_loader:
        with torch.no_grad():
            sample, sample_mask = sample.to(device), sample_mask.to(device)
            last_hidden_states = model(sample, attention_mask=sample_mask)
            all_embed.append(last_hidden_states[0][:,0,:])
    return torch.cat(all_embed,dim =0)

In [221]:
train_embed = get_embeddings(train_loader)
valid_embed = get_embeddings(valid_loader)
test_embed = get_embeddings(test_loader)

In [222]:
def clean_embed_output(embed, token):
    data_clean_embed = []
    for i, token_sentence in enumerate(token):
        clean_embed = []
        embed_idx = 0
        for word in token_sentence:
            clean_embed.append(torch.mean(embed[i][embed_idx:embed_idx+len(word)]).item())
            embed_idx += len(word)
        data_clean_embed.append(clean_embed[1:len(clean_embed)-1])
    return data_clean_embed

In [214]:
train_clean_embed = clean_embed_output(train_embed, train_token)
valid_clean_embed = clean_embed_output(valid_embed, valid_token)
test_clean_embed = clean_embed_output(test_embed, test_token)

In [223]:
print('embedded train sentences: ', len(train_clean_embed))
print('embedded valid sentences: ', len(valid_clean_embed))
print('embedded test sentences: ', len(test_clean_embed))

embedded train sentences:  2394
embedded valid sentences:  1005
embedded test sentences:  3878


In [234]:
train_labels_clean = [x[1:len(x)-1] for x in train_labels]
valid_labels_clean = [x[1:len(x)-1] for x in valid_labels]
test_labels_clean = [x[1:len(x)-1] for x in test_labels]

In [235]:
bert_embeddings = [train_clean_embed, train_labels_clean, valid_clean_embed, valid_labels_clean, test_clean_embed, test_labels_clean]

# Classify the embeddings

In [238]:
# Model Definition
class RNN(nn.Module):
    def __init__(self, p = 0.5):
        super(RNN, self).__init__()
        self.rnn_layer = nn.RNN(input_size=1, hidden_size=128)
        self.out_layer = nn.Linear(in_features=128, out_features=21)
        self.p = p #Whether to use actual seq or output for next step

    def forward(self,seq, h = None):
        out = []
        X_in = torch.unsqueeze(seq[0],0)
        for X in seq:
            if np.random.rand()>self.p: #Use teacher forcing
                X_in = X.unsqueeze(dim = 0)
            tmp, h = self.rnn_layer(X_in, h)
            X_in = self.out_layer(tmp)
            out.append(X_in)
        return torch.stack(out).squeeze(1), h