# CS 224N - Neural Network Prediction of Word Embeddings
Grabbing GPT-2 word embeddings and training RoBERTa to learn the correct embeddings.

## Setting up PyTorch
Using PyTorch on the GPU

In [1]:
import torch
print("Using GPU: " + str(torch.cuda.is_available()))
from torch.utils.data import Dataset, DataLoader

Using GPU: True


## Grab GPT-2 and Word Embeddings
Look at word embeddings GPT-2 has processed

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
word_embeddings = model.transformer.wte.weight

## Look at WordNet Definitions and Words
Using online package `wn`.

In [3]:
from nltk.corpus import wordnet as wn

def create_wordnet_dataset():
    words, embeddings, definitions = [], [], []

    for ss in wn.all_synsets():
        for lemma in ss.lemmas():
            word = lemma.name()
            if (word is not None):
                tokens = tokenizer.encode(word,add_prefix_space=True)
                if (len(tokens) == 1):
                    words.append(word)
                    definitions.append(ss.definition())
                    embeddings.append(word_embeddings[tokens,:])
    return words, embeddings, definitions

## Look at Wordset Dictionary Definitions
Using `Wordset`, find all of the dictionary words and their definitions. Right now, starting off with all the words from letter `A`.

In [4]:
import json
import numpy as np
import time

def create_wordset_dataset():
    # Create return arrays
    final_embeddings, final_words, final_definitions = [], [], []

    # All letters of alphabet, plus all eventual words and definitions
    letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    words = np.array([])
    definitions = np.array([])

    # Grab all the words
    for letter in letters:
        # Load in the data for each letter
        f = open('dictionary/' + letter + '.json')
        data = json.load(f)

        # Define empty current words and definitions
        curr_words, curr_definitions = [], []
        for word in list(data.keys()):
            if ('meanings' in data[word]):
                curr_words.append(word)
                curr_definitions.append("")
                for index in range(len(data[word]['meanings'])):
                    curr_definitions[-1] += data[word]['meanings'][index]['def'] + ". "

        # Update existing numpy array
        words = np.concatenate((words, np.array(curr_words)))
        definitions = np.concatenate((definitions, np.array(curr_definitions)))
    
    # Get all words that have embeddings
    for i in range(len(words)):
        word = words[i]
        tokens = tokenizer.encode(word,add_prefix_space=True)
        if (len(tokens) == 1):
            final_embeddings.append(word_embeddings[tokens,:])
            final_words.append(word)
            final_definitions.append(definitions[i])
            
    return final_words, final_embeddings, final_definitions

## Shuffle Dataset
Randomize order of words and definitions + embeddings.

In [5]:
import random

def shuffle_dataset(final_words, final_embeddings, final_definitions):
    c = list(zip(final_words, final_definitions, final_embeddings))
    random.shuffle(c)
    final_words, final_definitions, final_embeddings = zip(*c)
    return final_words, final_embeddings, final_definitions

## Create the Custom Dataset
Helpful for extracting embeddings and definitions

In [6]:
class DefinitionAndEmbeddings(Dataset):

    def __init__(self, final_words, final_embeddings, final_definitions, tokenizer):
        self.words = final_words
        self.input = final_definitions
        self.labels = final_embeddings
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        curr_definition = self.input[idx]
        print(curr_definition)
        tokenized_input = self.tokenizer(curr_definition, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
        tokenized_input['input_ids'] = tokenized_input['input_ids'].squeeze(0)
        curr_embedding = self.labels[idx].squeeze(0)
        return {'input': tokenized_input, 'output': curr_embedding, 'word': self.words[idx]}

## Construct Train and Test Datasets
Call from above to generate from either WordNet or WordSet.

In [7]:
from transformers import AutoTokenizer, RobertaModel
import torch

# Create the stuff from existing functions
final_words, final_embeddings, final_definitions = create_wordnet_dataset()
final_words, final_embeddings, final_definitions = shuffle_dataset(final_words, final_embeddings, final_definitions)

tokenizer = AutoTokenizer.from_pretrained("roberta-large")

train_words = final_words[:int(0.9 * len(final_words))]
train_definitions = final_definitions[:int(0.9 * len(final_definitions))]
train_embeddings = final_embeddings[:int(0.9 * len(final_embeddings))]

train_dataset = DefinitionAndEmbeddings(train_words, train_embeddings, train_definitions, tokenizer)

test_words = final_words[int(0.9 * len(final_words)):]
test_definitions = final_definitions[int(0.9 * len(final_definitions)):]
test_embeddings = final_embeddings[int(0.9 * len(final_embeddings)):]

test_dataset = DefinitionAndEmbeddings(test_words, test_embeddings, test_definitions, tokenizer)

## Set up Training and Testing `DataLoader`s
For use in iterating and processing through batches.

In [8]:
train_params = {'batch_size': 2,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(train_dataset, **train_params)

test_params = {'batch_size': 2,
                'shuffle': False,
                'num_workers': 0
                }

testing_loader = DataLoader(test_dataset, **test_params)

## Training the Network
Also make sure to save weights after every couple of iterations.

In [9]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
if (torch.cuda.is_available()):
    print("Using GPU")
    model.to('cuda')

# Define loss function and optimizer
mse_loss = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

model.train()
best = float('inf')
for i in range(5):
    training_running_loss = 0.0
    
    for j, data in tqdm(enumerate(training_loader, 0)):
        if (j % 10 == 0):
            optimizer.step()
            optimizer.zero_grad()

        input_ids = data['input']['input_ids'].to('cuda')
        outputs = model(input_ids=input_ids)

        last_hidden = outputs.last_hidden_state[:,0,:]
        orig_embeddings = data['output'].to('cuda')
        loss = mse_loss(last_hidden, orig_embeddings)
        loss.backward(retain_graph=True)

        training_running_loss += loss.item()
    
    # Take a step once we get outside the batches
    optimizer.step()
    optimizer.zero_grad()
    
    # Printing and saving
    print("training running loss: ", training_running_loss)
    if i % 5 == 0:
        model.save_pretrained('weights/wordnetepoch' + str(i+1))

    # evaluate on test set after every epoch:
    testing_running_loss = 0 

    for j, data in tqdm(enumerate(testing_loader, 0)):

        input_ids = data['input']['input_ids'].to('cuda')
        outputs = model(input_ids=input_ids)

        last_hidden = outputs.last_hidden_state[:,0,:]
        orig_embeddings = data['output'].to('cuda')
        loss = mse_loss(last_hidden, orig_embeddings)
        testing_running_loss += loss.item()

    if testing_running_loss < best:
        best = testing_running_loss
        model.save_pretrained('weights/GPT2Wordnet')
    print("testing running loss: ", testing_running_loss)
    print("")

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using GPU


21524it [1:47:24,  3.34it/s]


training running loss:  203.53077329788357


2392it [02:34, 15.49it/s]


testing running loss:  19.590693108271807



21524it [1:47:06,  3.35it/s]


training running loss:  176.6444700337015


2392it [02:34, 15.50it/s]


testing running loss:  19.57458110107109



21524it [1:47:10,  3.35it/s]


training running loss:  176.58015252044424


2392it [02:34, 15.51it/s]


testing running loss:  19.569623061455786



21524it [1:47:30,  3.34it/s]


training running loss:  176.55484220013022


2392it [02:34, 15.46it/s]


testing running loss:  19.567571917083114



21524it [1:47:43,  3.33it/s]


training running loss:  176.5474554519169


2392it [02:34, 15.47it/s]

testing running loss:  19.582651044707745






## Saving Model Weights
Save the final model weights!

In [10]:
model.save_pretrained('weights/GPT2WordnetFinal')