# CS 224N - Training LLMs to Predict Word Embeddings
Using RoBERTa and GPT-2 to predict word embeddings.

## Setting up PyTorch
Using PyTorch on the GPU

In [1]:
import torch
print("Using GPU: " + str(torch.cuda.is_available()))
from torch.utils.data import Dataset, DataLoader

Using GPU: True


## Grab GPT-2 and Word Embeddings
Look at word embeddings GPT-2 has processed

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
word_embeddings = model.transformer.wte.weight

## Import WinoDict Dataset
Used so that we don't have the overlapping definitions from the test set.

In [3]:
import pandas as pd
first_set = pd.read_csv("winodict/prob1_of_5.csv")
winodict_words = first_set['lemma'].tolist()

## Look at WordNet Definitions and Words
Using online package `wn`.

In [4]:
from nltk.corpus import wordnet as wn

def create_wordnet_dataset():
    words, embeddings, definitions = [], [], []

    for ss in wn.all_synsets():
        for lemma in ss.lemmas():
            word = lemma.name()
            if (word is not None and word in winodict_words):
                tokens = tokenizer.encode(word,add_prefix_space=True)
                if (len(tokens) == 1):
                    words.append(word)
                    definitions.append(ss.definition())
                    embeddings.append(word_embeddings[tokens,:])
    return words, embeddings, definitions

## Separate Saving
Separate work to save everything.

In [5]:
words, embeddings, definitions = create_wordnet_dataset()
words = list(set(words))
with open("datasets/wordnet.txt", "w") as txt_file:
    for word in words:
        txt_file.write(word + "\n")

## Look at Wordset Dictionary Definitions
Using `Wordset`, find all of the dictionary words and their definitions. Right now, starting off with all the words from letter `A`.

In [6]:
import json
import numpy as np
import time

def create_wordset_dataset():
    # Create return arrays
    final_embeddings, final_words, final_definitions = [], [], []

    # All letters of alphabet, plus all eventual words and definitions
    letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    words = np.array([])
    definitions = np.array([])

    # Grab all the words
    for letter in letters:
        # Load in the data for each letter
        f = open('dictionary/' + letter + '.json')
        data = json.load(f)

        # Define empty current words and definitions
        curr_words, curr_definitions = [], []
        for word in list(data.keys()):
            if ('meanings' in data[word]):
                curr_words.append(word)
                curr_definitions.append("")
                for index in range(len(data[word]['meanings'])):
                    curr_definitions[-1] += data[word]['meanings'][index]['def'] + ". "

        # Update existing numpy array
        words = np.concatenate((words, np.array(curr_words)))
        definitions = np.concatenate((definitions, np.array(curr_definitions)))
    
    # Get all words that have embeddings
    for i in range(len(words)):
        word = words[i]
        tokens = tokenizer.encode(word,add_prefix_space=True)
        if (len(tokens) == 1):
            final_embeddings.append(word_embeddings[tokens,:])
            final_words.append(word)
            final_definitions.append(definitions[i])
            
    return final_words, final_embeddings, final_definitions

## Shuffle Dataset
Randomize order of words and definitions + embeddings.

In [7]:
import random

def shuffle_dataset(final_words, final_embeddings, final_definitions):
    c = list(zip(final_words, final_definitions, final_embeddings))
    random.shuffle(c)
    final_words, final_definitions, final_embeddings = zip(*c)
    return final_words, final_embeddings, final_definitions

## Create the Custom Dataset
Helpful for extracting embeddings and definitions

In [8]:
class DefinitionAndEmbeddings(Dataset):

    def __init__(self, final_words, final_embeddings, final_definitions, tokenizer):
        self.words = final_words
        self.input = final_definitions
        self.labels = final_embeddings
        self.tokenizer = tokenizer
        self.max_length = 512

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        curr_definition = self.input[idx]
        
        # Tokenized input + padding = length of max_length - 1
        tokenized_input = self.tokenizer(curr_definition, return_tensors="pt", padding='max_length', truncation=True, max_length=self.max_length - 1)
        tokenized_cls = self.tokenizer(" [CLS]", return_tensors="pt")
        
        # Remove first dimension from all the embeddings
        tokenized_input['input_ids'] = tokenized_input['input_ids'].squeeze(0)
        tokenized_cls['input_ids'] = tokenized_cls['input_ids'].squeeze(0)
        curr_embedding = self.labels[idx].squeeze(0)
        
        # Combine the input + padding + CLS token at the end
        tokenized_input['input_ids'] = torch.cat((tokenized_input['input_ids'], tokenized_cls['input_ids']), dim=0)
        return {'input': tokenized_input, 'output': curr_embedding, 'word': self.words[idx]}

## Add an Initial `CLS` Embedding
Per John Hewitt's blog post.

In [8]:
def add_initial_cls(tokenizer, model):
    # Add CLS token
    tokenizer.add_tokens(['[CLS]'])
    model.resize_token_embeddings(len(tokenizer))
    
    # Get all the pre-expansion embeddings
    params = model.state_dict()
    embeddings = params['transformer.wte.weight']
    pre_expansion_embeddings = embeddings[:-1,:]
    
    # Calculate mean, sigma, n
    mu = torch.mean(pre_expansion_embeddings, dim=0)
    n = pre_expansion_embeddings.size()[0]
    sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n
    
    # Calculate the distribution
    dist = torch.distributions.multivariate_normal.MultivariateNormal(
            mu, covariance_matrix=1e-5*sigma)
    
    # Load in the new embedding for the CLS token
    new_embeddings = torch.stack(tuple((dist.sample() for _ in range(1))), dim=0)
    embeddings[-1:,:] = new_embeddings
    params['transformer.wte.weight'][-1:,:] = new_embeddings
    model.load_state_dict(params)

## Construct Train and Test Datasets
Call from above to generate from either WordNet or WordSet.

In [9]:
import torch

# Create the stuff from existing functions
final_words, final_embeddings, final_definitions = create_wordnet_dataset()
final_words, final_embeddings, final_definitions = shuffle_dataset(final_words, final_embeddings, final_definitions)

# Create the new tokenizer (GPT-2 specific)
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
tokenizer.pad_token = tokenizer.eos_token
add_initial_cls(tokenizer, model)

train_words = final_words[:int(0.9 * len(final_words))]
train_definitions = final_definitions[:int(0.9 * len(final_definitions))]
train_embeddings = final_embeddings[:int(0.9 * len(final_embeddings))]

train_dataset = DefinitionAndEmbeddings(train_words, train_embeddings, train_definitions, tokenizer)

test_words = final_words[int(0.9 * len(final_words)):]
test_definitions = final_definitions[int(0.9 * len(final_definitions)):]
test_embeddings = final_embeddings[int(0.9 * len(final_embeddings)):]

test_dataset = DefinitionAndEmbeddings(test_words, test_embeddings, test_definitions, tokenizer)

## Set up Training and Testing `DataLoader`s
For use in iterating and processing through batches.

In [10]:
train_params = {'batch_size': 2,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(train_dataset, **train_params)

test_params = {'batch_size': 2,
                'shuffle': False,
                'num_workers': 0
                }

testing_loader = DataLoader(test_dataset, **test_params)

## Training the Network
Also make sure to save weights after every couple of iterations.

In [11]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Add to GPU
model = GPT2LMHeadModel.from_pretrained("weights/G2G1")
if (torch.cuda.is_available()):
    print("Using GPU")
    model.to('cuda')

# Define loss function and optimizer
mse_loss = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
best = float('inf')
for i in range(1):
    training_running_loss = 0.0
    
    for j, data in tqdm(enumerate(training_loader, 0)):
        # Only optimize after every 10th batch or so -- make training more efficient
        if (j % 10 == 0):
            optimizer.step()
            optimizer.zero_grad()
        
        # Run the model on the inputs
        input_ids = data['input']['input_ids'].to('cuda')
        outputs = model(input_ids=input_ids, output_hidden_states=True)
        
        # Get last hidden state
        last_hidden = outputs.hidden_states[-1][:,511,:]
        
        # Get the original embeddings and calculate the loss
        orig_embeddings = data['output'].to('cuda')
        loss = mse_loss(last_hidden, orig_embeddings)
        loss.backward(retain_graph=True)

        training_running_loss += loss.item()
    
    # Take a step once we get outside the batches
    optimizer.step()
    optimizer.zero_grad()
    
    # Printing and saving
    print("training running loss: ", training_running_loss)
    model.save_pretrained('weights/G2GNext' + str(i+1))

    # evaluate on test set after every epoch:
    testing_running_loss = 0 

    for j, data in tqdm(enumerate(testing_loader, 0)):

        input_ids = data['input']['input_ids'].to('cuda')
        outputs = model(input_ids=input_ids, output_hidden_states=True)

         # Get last hidden state
        last_hidden = outputs.hidden_states[-1][:,511,:]
        
        orig_embeddings = data['output'].to('cuda')
        loss = mse_loss(last_hidden, orig_embeddings)
        testing_running_loss += loss.item()

    if testing_running_loss < best:
        best = testing_running_loss
        model.save_pretrained('weights/GPT2Wordnet')
    print("testing running loss: ", testing_running_loss)
    print("")

Using GPU


20194it [1:22:53,  4.06it/s]


training running loss:  174.81623319908977


2244it [02:59, 12.47it/s]


testing running loss:  18.26175558101386



## Saving Model Weights
Save the final model weights!

In [10]:
model.save_pretrained('weights/GPT2WordnetFinal')