# CS 224N - Training LLMs to Predict Word Embeddings
Using RoBERTa and GPT-2 to predict word embeddings.

## Setting up PyTorch
Using PyTorch on the GPU

In [1]:
import torch
from torch.utils.data import DataLoader, Dataset

# Report whether PyTorch can see a CUDA device; later cells place tensors on "cuda".
print(f"Using GPU: {torch.cuda.is_available()}")

Using GPU: True


## Grab GPT-2 and Word Embeddings
Load GPT-2 (medium) with its tokenizer and grab the model's input word-embedding matrix.

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Both the tokenizer and the language model come from the same pretrained checkpoint.
checkpoint_name = 'gpt2-medium'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_name)

# Load the LM-head model and move it onto the GPU.
model = GPT2LMHeadModel.from_pretrained(checkpoint_name)
model = model.to("cuda")

# Weight matrix of the token-embedding layer (`transformer.wte`).
word_embeddings = model.transformer.wte.weight

## Create Hewitt embeddings
We just tell GPT-2 to predict Hewitt embeddings.

In [3]:
import pandas as pd
# Read the CSV at winodict/prob1_of_5.csv and keep its `lemma` column as a plain Python list.
first_set = pd.read_csv("winodict/prob1_of_5.csv")
winodict_words = first_set.loc[:, 'lemma'].tolist()

## Shuffle Dataset
Randomize order of words and definitions + embeddings.

In [4]:
from nltk.corpus import wordnet as wn

# Look up every WordNet synset for "boat" and print one definition per line.
boat_syns = wn.synsets('boat')
boat_definitions = [synset.definition() for synset in boat_syns]
for definition_text in boat_definitions:
    print(definition_text)

a small vessel for travel on water
a dish (often boat-shaped) for serving gravy or sauce
ride in a boat on water


## Create the Custom Dataset
Helpful for extracting embeddings and definitions

In [5]:
class DefinitionAndEmbeddings(Dataset):
    """Dataset that repeats one (prompt, target embedding) pair `num_samples` times.

    The prompt is tokenized and padded to `max_length` tokens, then the token
    ids of the literal string " [CLS]" are appended. Because they are appended
    after the padded prompt, the " [CLS]" ids start at index `max_length` of
    every `input_ids` row.

    Every index returns the same example; the dataset length only controls how
    many identical samples one pass over the data yields.
    """

    # Prompt used when the caller does not supply one (the original hard-coded sentence).
    DEFAULT_PROMPT = "This is a very normal sentence that will have normal results to give to something. boat"

    def __init__(self, tokenizer, input, prompt=DEFAULT_PROMPT, max_length=511,
                 num_samples=2000, device="cuda"):
        """Tokenize the prompt once and store the target embedding.

        Args:
            tokenizer: Callable tokenizer. It is invoked with
                `return_tensors="pt"` and must return a mapping holding
                "input_ids" and "attention_mask" tensors with a leading batch
                dimension of 1. A pad token must already be configured,
                because the prompt is padded to `max_length`.
            input: Target tensor; returned unchanged under the "label" key.
            prompt: Text fed to the model (defaults to the original sentence).
            max_length: Length the prompt is padded to before " [CLS]" is appended.
            num_samples: Value reported by `len()`.
            device: Device the token tensors are moved to.
        """
        self.input = tokenizer(prompt, return_tensors="pt", padding="max_length",
                               max_length=max_length)
        self.tokenized_cls = tokenizer(" [CLS]", return_tensors="pt")

        # Append the " [CLS]" ids (and their mask) along the sequence dimension.
        self.input["input_ids"] = torch.cat(
            (self.input["input_ids"], self.tokenized_cls["input_ids"]), dim=1).to(device)
        self.input["attention_mask"] = torch.cat(
            (self.input["attention_mask"], self.tokenized_cls["attention_mask"]), dim=1).to(device)

        self.label = input
        self.num_samples = num_samples

    def __len__(self):
        """Return the number of (identical) samples exposed by the dataset."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return the single stored example.

        Raises:
            IndexError: if `idx` is outside `[-len(self), len(self))`. Without
                this, plain iteration (`for x in dataset`) would never stop,
                because Python relies on IndexError to end sequence iteration.
        """
        if not -self.num_samples <= idx < self.num_samples:
            raise IndexError(f"index {idx} out of range for dataset of size {self.num_samples}")
        return {
            # Drop the leading batch dimension; the DataLoader adds its own.
            "input_ids": self.input["input_ids"][0],
            "attention_mask": self.input["attention_mask"][0],
            "label": self.label,
        }


## Add an Initial `CLS` Embedding
Per John Hewitt's blog post.
https://nlp.stanford.edu/~johnhew/vocab-expansion.html

In [6]:
def add_initial_cls(tokenizer, model):
    """Add a `[CLS]` token and initialise its embedding near the vocabulary mean.

    The tokenizer gains the token `[CLS]` and the model's embedding matrix is
    resized to `len(tokenizer)`. Each newly created row of
    `transformer.wte.weight` is then overwritten with a sample from a
    multivariate normal whose mean is the mean of the pre-expansion embeddings
    and whose covariance is 1e-5 times their covariance.

    The call is idempotent: if resizing creates no new row (for example the
    function was already run, or an already-expanded checkpoint was loaded),
    no embedding is overwritten, so an existing or trained `[CLS]` vector is
    left intact.

    Args:
        tokenizer: Tokenizer providing `add_tokens` and `__len__`.
        model: Model providing `resize_token_embeddings`, `state_dict` and
            `load_state_dict`, with an embedding stored under the state-dict
            key `transformer.wte.weight`.

    Returns:
        The mean of the pre-expansion embedding rows (a 1-D tensor).
    """
    weight_key = 'transformer.wte.weight'
    rows_before = model.state_dict()[weight_key].size(0)

    # Add CLS token
    tokenizer.add_tokens(['[CLS]'])
    model.resize_token_embeddings(len(tokenizer))

    # Get all the pre-expansion embeddings
    params = model.state_dict()
    embeddings = params[weight_key]
    num_new = embeddings.size(0) - rows_before

    # When nothing was added, keep the original behaviour of leaving the last
    # row out of the statistics (assumes [CLS] is the last row -- TODO confirm
    # for checkpoints that were expanded elsewhere).
    num_excluded = num_new if num_new > 0 else 1
    pre_expansion_embeddings = embeddings[:-num_excluded, :]

    # Calculate mean, sigma, n
    mu = torch.mean(pre_expansion_embeddings, dim=0)
    if num_new <= 0:
        # No fresh row to initialise; do not clobber existing embeddings.
        return mu

    n = pre_expansion_embeddings.size()[0]
    centered = pre_expansion_embeddings - mu
    sigma = (centered.T @ centered) / n

    # Calculate the distribution
    dist = torch.distributions.multivariate_normal.MultivariateNormal(
            mu, covariance_matrix=1e-5*sigma)

    # Load in the new embedding(s). `embeddings` is the very tensor stored in
    # `params`, so a single in-place write is enough before reloading.
    embeddings[-num_new:, :] = dist.sample((num_new,))
    model.load_state_dict(params)

    return mu

## Construct Train and Test Datasets
Call from above to generate from either WordNet or WordSet.

In [7]:
# GPT-2 ships without a pad token, so reuse end-of-sequence for padding.
tokenizer.pad_token = tokenizer.eos_token

# Target embedding: a detached copy of the embedding row(s) for " boat",
# with the leading dimension squeezed away when it has size 1.
boat_token = tokenizer(" boat")['input_ids']
boat_embedding = (
    model.transformer.wte.weight[boat_token, :]
    .detach()
    .clone()
    .squeeze(0)
)

# NOTE(review): add_initial_cls is defined above but never called in the
# visible cells, so " [CLS]" may be split into several sub-word tokens --
# confirm this is intended.
# The dataset size is temporarily hard-coded inside the dataset class.
train_dataset = DefinitionAndEmbeddings(tokenizer, boat_embedding)

## Set up Training and Testing `DataLoader`s
For use in iterating and processing through batches.

In [8]:
# One sample per batch, fixed order, loading done in the main process.
train_params = dict(batch_size=1, shuffle=False, num_workers=0)

training_loader = DataLoader(dataset=train_dataset, **train_params)


## Training the Network
Also make sure to save weights after every couple of iterations.

In [None]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Optionally resume from a previously saved model/tokenizer instead of the
# ones created earlier in the notebook.
import_model = False
if import_model:
    print("loading in old model and tokenizer")
    model = GPT2LMHeadModel.from_pretrained("weights/G2GMaskingEpoch3M")
    tokenizer = GPT2Tokenizer.from_pretrained('weights/G2GMaskingT')
    if (torch.cuda.is_available()):
        print("Using GPU")
        model.to('cuda')

# Define loss function and optimizer
mse_loss = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for i in range(10):
    training_running_loss = 0.0

    # Wrap the loader itself (not `enumerate`) so tqdm can show the total.
    for data in tqdm(training_loader):
        # One optimisation step per batch: clear old gradients first.
        optimizer.zero_grad()

        # Run the model on the inputs
        outputs = model(input_ids=data["input_ids"], output_hidden_states=True, attention_mask = data["attention_mask"])

        # Last layer's hidden state at sequence index 511, i.e. the first
        # position after the 511-token padded prompt, where the dataset
        # appends the " [CLS]" ids.
        last_hidden = outputs.hidden_states[-1][:,511,:]

        # Regress that hidden state onto the target embedding.
        loss = mse_loss(last_hidden, data["label"].to("cuda"))
        # A fresh graph is built every forward pass, so it need not be retained.
        loss.backward()

        training_running_loss += loss.item()

        optimizer.step()

    # No additional optimizer.step() here: each batch has already been applied,
    # and stepping again would reuse the last batch's stale gradients.

    # Printing and saving
    print("training running loss: ", training_running_loss)
    print("")

In [None]:
# Persist the fine-tuned weights and the tokenizer in sibling directories.
model_dir = 'weights/G2GBoatForceM'
tokenizer_dir = 'weights/G2GBoatForceT'

model.save_pretrained(model_dir)
# Kept as the final expression so the cell still displays the saved file paths.
tokenizer.save_pretrained(tokenizer_dir)

('weights/G2GSamplingT/tokenizer_config.json',
 'weights/G2GSamplingT/special_tokens_map.json',
 'weights/G2GSamplingT/vocab.json',
 'weights/G2GSamplingT/merges.txt',
 'weights/G2GSamplingT/added_tokens.json')