# CS 224N - Neural Network Prediction of Word Embeddings
Grabbing GPT-2 word embeddings and training ROBERTA to learn the correct embeddings.

## Setting up PyTorch
Using PyTorch on the GPU

In [1]:
import torch
print("Using GPU: " + str(torch.cuda.is_available()))
from torch.utils.data import Dataset, DataLoader

Using GPU: True


In [2]:
import time

## Grab GPT-2 and Word Embeddings
Look at word embeddings GPT-2 has processed

In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
word_embeddings = model.transformer.wte.weight



In [4]:
print(len(word_embeddings))

50257


## Load Dictionary Definitions
Using `Wordset`, find all of the dictionary words and their definitions. Right now, starting off with all the words from letter `A`.

In [5]:
import json
import numpy as np
import time

# All letters of alphabet, plus all eventual words and definitions
letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
words = np.array([])
definitions = np.array([])

# Grab all the words
for letter in letters:
    # Load in the data for each letter
    f = open('dictionary/' + letter + '.json')
    data = json.load(f)
    
    # Define empty current words and definitions
    curr_words, curr_definitions = [], []
    for word in list(data.keys()):
        if ('meanings' in data[word]):
            curr_words.append(word)
            curr_definitions.append("")
            for index in range(len(data[word]['meanings'])):
                curr_definitions[-1] += data[word]['meanings'][index]['def'] + ". "
    
    # Update existing numpy array
    words = np.concatenate((words, np.array(curr_words)))
    definitions = np.concatenate((definitions, np.array(curr_definitions)))

## Get all Words that have Embeddings 
Tokenize each word, check which are singular tokens, and collect all correspondings words and their definitions.

In [6]:
final_embeddings, final_words, final_definitions = [], [], []
for i in range(len(words)):
    word = words[i]
    tokens = tokenizer.encode(word,add_prefix_space=True)
    if (len(tokens) == 1):
        final_embeddings.append(word_embeddings[tokens,:])
        final_words.append(word)
        final_definitions.append(definitions[i])

In [7]:
print(final_words[1])
print(final_embeddings[1][0])
print(final_definitions[1])


about
tensor([ 3.5818e-03,  6.9158e-02,  2.6934e-02,  3.2574e-02,  1.2125e-03,
        -6.7487e-02, -2.6660e-01,  1.1424e-02, -8.1788e-02,  1.1982e-01,
         1.1785e-02,  1.5385e-02,  9.6082e-02, -7.0277e-02,  1.7012e-01,
        -1.5220e-02, -6.4566e-02, -3.2492e-02,  8.0022e-02,  6.3599e-02,
         1.5231e-01, -7.5073e-02, -1.0552e-01,  9.6400e-02,  4.6874e-03,
         3.7594e-02, -2.7366e-02,  9.4609e-02,  4.5500e-02, -5.7275e-02,
        -5.9990e-02, -4.1737e-02,  4.1300e-02, -1.2985e-02,  1.7531e-01,
         1.2025e-01, -3.1544e-01, -1.4215e-01,  4.9248e-03,  4.8260e-02,
         8.3238e-02,  2.1300e-02, -1.0378e-01,  1.1654e-01, -4.5384e-02,
         6.3146e-03, -1.8227e-01,  6.3809e-02, -5.2265e-02, -7.5506e-02,
        -6.4237e-02,  9.1597e-02,  1.0758e-01,  7.3340e-02,  5.0310e-02,
        -1.1204e-01,  3.0986e-02, -1.1251e-01, -8.1787e-02,  6.0744e-02,
        -1.0306e-01, -2.9245e-02,  7.6112e-02, -1.7006e-02, -1.9620e-01,
         6.0342e-02, -4.6941e-03, -4.7242e-02

Now it's time to add in our RoBERTa model

In [8]:
# Randomize order of words and definitions + embeddings
import random
c = list(zip(final_words, final_definitions, final_embeddings))
random.shuffle(c)
final_words, final_definitions, final_embeddings = zip(*c)

In [9]:
#defines a custom pytorch dataset
# train_words = 90% of final_words
class DefinitionAndEmbeddings(Dataset):

    def __init__(self, final_words, final_embeddings, final_definitions, tokenizer):
        self.words = final_words
        self.input = final_definitions
        self.labels = final_embeddings
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        curr_definition = self.input[idx]
        tokenized_input = self.tokenizer(curr_definition, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
        tokenized_input['input_ids'] = tokenized_input['input_ids'].squeeze(0)
        curr_embedding = self.labels[idx].squeeze(0)
        return {'input': tokenized_input, 'output': curr_embedding, 'word': self.words[idx]}

In [10]:
from transformers import AutoTokenizer, RobertaModel
import torch

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

train_words = final_words[:int(0.9 * len(final_words))]
train_definitions = final_definitions[:int(0.9 * len(final_definitions))]
train_embeddings = final_embeddings[:int(0.9 * len(final_embeddings))]

train_dataset = DefinitionAndEmbeddings(train_words, train_embeddings, train_definitions, tokenizer)

test_words = final_words[int(0.9 * len(final_words)):]
test_definitions = final_definitions[int(0.9 * len(final_definitions)):]
test_embeddings = final_embeddings[int(0.9 * len(final_embeddings)):]

test_dataset = DefinitionAndEmbeddings(test_words, test_embeddings, test_definitions, tokenizer)

In [11]:
train_params = {'batch_size': 10,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(train_dataset, **train_params)

In [12]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

model = RobertaModel.from_pretrained("roberta-base")
if (torch.cuda.is_available()):
    print("Using GPU")
    model.to('cuda')

# Define loss function and optimizer
mse_loss = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

model.train()
for i in range(2):
    running_loss = 0.0
    
    for j, data in tqdm(enumerate(training_loader, 0)):
        optimizer.zero_grad()

        input_ids = data['input']['input_ids'].to('cuda')
        outputs = model(input_ids=input_ids)

        last_hidden = outputs.last_hidden_state[:,0,:]
        print(last_hidden.shape)
        orig_embeddings = data['output'].to('cuda')
        print(orig_embeddings.shape)
        loss = mse_loss(last_hidden, orig_embeddings)
        loss.backward(retain_graph=True)
        optimizer.step()

        running_loss += loss.item()
    print(running_loss)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using GPU


0it [00:00, ?it/s]

torch.Size([10, 768])
torch.Size([10, 768])


1it [00:01,  1.34s/it]

torch.Size([10, 768])
torch.Size([10, 768])


2it [00:02,  1.00s/it]

torch.Size([10, 768])
torch.Size([10, 768])


3it [00:02,  1.12it/s]

torch.Size([10, 768])
torch.Size([10, 768])


4it [00:03,  1.19it/s]

torch.Size([10, 768])
torch.Size([10, 768])


5it [00:04,  1.24it/s]

torch.Size([10, 768])
torch.Size([10, 768])


6it [00:05,  1.25it/s]

torch.Size([10, 768])
torch.Size([10, 768])


7it [00:05,  1.27it/s]

torch.Size([10, 768])
torch.Size([10, 768])


8it [00:06,  1.29it/s]

torch.Size([10, 768])
torch.Size([10, 768])


9it [00:07,  1.30it/s]

torch.Size([10, 768])
torch.Size([10, 768])


10it [00:08,  1.30it/s]

torch.Size([10, 768])
torch.Size([10, 768])


11it [00:08,  1.30it/s]

torch.Size([10, 768])
torch.Size([10, 768])


12it [00:09,  1.30it/s]

torch.Size([10, 768])
torch.Size([10, 768])


13it [00:10,  1.31it/s]

torch.Size([10, 768])
torch.Size([10, 768])


14it [00:11,  1.30it/s]

torch.Size([10, 768])
torch.Size([10, 768])


15it [00:12,  1.30it/s]

torch.Size([10, 768])
torch.Size([10, 768])


16it [00:12,  1.30it/s]

torch.Size([10, 768])
torch.Size([10, 768])


17it [00:13,  1.30it/s]

torch.Size([10, 768])
torch.Size([10, 768])


18it [00:14,  1.30it/s]

torch.Size([10, 768])
torch.Size([10, 768])


19it [00:15,  1.30it/s]

torch.Size([10, 768])
torch.Size([10, 768])


19it [00:15,  1.20it/s]


KeyboardInterrupt: 

In [None]:
running_loss = 0.0

#for j, data in enumerate(training_loader, 0):
#    print(data)
#model = RobertaModel.from_pretrained("roberta-base")
#model = RobertaModel.from_pretrained("weights/first_test")
model.to('cuda')
for j, data in tqdm(enumerate(training_loader, 0)):

    input_ids = data['input']['input_ids'].to('cuda')
    outputs = model(input_ids=input_ids)

    last_hidden = outputs.last_hidden_state[:,0,:]
    #print(last_hidden.shape)
    orig_embeddings = data['output'].to('cuda')
    #print(orig_embeddings.shape)
    loss = mse_loss(last_hidden, orig_embeddings)

    running_loss += loss.item()
print(running_loss)

#print(running_loss / 11)

1120it [01:49, 10.19it/s]

14.820028885267675





In [None]:
model.save_pretrained('weights/test2')

In [None]:
test_params = {'batch_size': 10,
                'shuffle': False,
                'num_workers': 0
                }

testing_loader = DataLoader(test_dataset, **test_params)
#test_dataset = DefinitionAndEmbeddings(train_words, train_embeddings, train_definitions, tokenizer)
# 14.785202487371862 / 1120

In [None]:
for j, data in tqdm(enumerate(testing_loader, 0)):

    input_ids = data['input']['input_ids'].to('cuda')
    outputs = model(input_ids=input_ids)

    last_hidden = outputs.last_hidden_state[:,0,:]
    #print(last_hidden.shape)
    orig_embeddings = data['output'].to('cuda')
    #print(orig_embeddings.shape)
    loss = mse_loss(last_hidden, orig_embeddings)

    running_loss += loss.item()
print(running_loss)

125it [00:12, 10.28it/s]

16.4687871215865





In [None]:
modelBasic = RobertaModel.from_pretrained("roberta-base")
modelBasic.to('cuda')
for j, data in tqdm(enumerate(testing_loader, 0)):

    input_ids = data['input']['input_ids'].to('cuda')
    outputs = modelBasic(input_ids=input_ids)

    last_hidden = outputs.last_hidden_state[:,0,:]
    #print(last_hidden.shape)
    orig_embeddings = data['output'].to('cuda')
    #print(orig_embeddings.shape)
    loss = mse_loss(last_hidden, orig_embeddings)

    running_loss += loss.item()
print(running_loss)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
125it [00:10, 11.58it/s]

37.53986347001046





In [None]:
%history > history.txt