# CS 224N - Training LLMs to Predict Word Embeddings
Using RoBERTa and GPT-2 to predict word embeddings.

## Setting up PyTorch
Using PyTorch on the GPU

In [1]:
import torch
print("Using GPU: " + str(torch.cuda.is_available()))
from torch.utils.data import Dataset, DataLoader

Using GPU: True


## Grab GPT-2 and Word Embeddings
Look at word embeddings GPT-2 has processed

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to("cuda")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
word_embeddings = model.transformer.wte.weight

## Import WinoDict Dataset
Used so that we don't have the overlapping definitions from the test set.

In [3]:
import pandas as pd
first_set = pd.read_csv("winodict/prob1_of_5.csv")
winodict_words = first_set['lemma'].tolist()

## Look at WordNet Definitions and Words
Using online package `wn`.

In [4]:
from nltk.corpus import wordnet as wn

def create_wordnet_dataset():
    words, embeddings, definitions = [], [], []

    for ss in wn.all_synsets():
        for lemma in ss.lemmas():
            word = lemma.name()
            if (word is not None and word in winodict_words):
                tokens = tokenizer.encode(word,add_prefix_space=True)
                if (len(tokens) == 1):
                    words.append(word)
                    definitions.append(ss.definition())
                    embeddings.append(word_embeddings[tokens,:])
    return words, embeddings, definitions

## Separate Saving
Separate work to save everything.

In [5]:
words, embeddings, definitions = create_wordnet_dataset()
words = list(set(words))
with open("datasets/wordnet.txt", "w") as txt_file:
    for word in words:
        txt_file.write(word + "\n")

## Shuffle Dataset
Randomize order of words and definitions + embeddings.

In [6]:
import random

def shuffle_dataset(final_words, final_embeddings, final_definitions):
    c = list(zip(final_words, final_definitions, final_embeddings))
    random.shuffle(c)
    final_words, final_definitions, final_embeddings = zip(*c)
    return final_words, final_embeddings, final_definitions

## Create the Custom Dataset
Helpful for extracting embeddings and definitions

In [7]:
class DefinitionAndEmbeddings(Dataset):

    def __init__(self, final_words, final_embeddings, final_definitions, tokenizer, model_len):
        self.words = final_words

        self.inputs = tokenizer(final_definitions, return_tensors = "pt", padding = "max_length",
                                          max_length = model_len - 1)
        self.tokenized_cls = tokenizer([" [CLS]"] * len(final_definitions), return_tensors = "pt")

        self.inputs["input_ids"] = torch.cat((self.inputs["input_ids"], self.tokenized_cls["input_ids"]), dim = 1).to("cuda")
        self.inputs["attention_mask"] = torch.cat((self.inputs["attention_mask"], self.tokenized_cls["attention_mask"]),dim = 1).to("cuda")
        self.labels = final_embeddings

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "label": self.labels[idx][0],
            #temporary solution where we are storing the labels as a list of unsqueezed torch tensors
            "word": self.words[idx]
        }


## Add an Initial `CLS` Embedding
Per John Hewitt's blog post.

In [8]:
def add_initial_cls(tokenizer, model):
    # Add CLS token
    tokenizer.add_tokens(['[CLS]'])
    model.resize_token_embeddings(len(tokenizer))
    
    # Get all the pre-expansion embeddings
    params = model.state_dict()
    embeddings = params['transformer.wte.weight']
    pre_expansion_embeddings = embeddings[:-1,:]
    
    # Calculate mean, sigma, n
    mu = torch.mean(pre_expansion_embeddings, dim=0)
    n = pre_expansion_embeddings.size()[0]
    sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n
    
    # Calculate the distribution
    dist = torch.distributions.multivariate_normal.MultivariateNormal(
            mu, covariance_matrix=1e-5*sigma)
    
    # Load in the new embedding for the CLS token
    new_embeddings = torch.stack(tuple((dist.sample() for _ in range(1))), dim=0)
    embeddings[-1:,:] = new_embeddings
    params['transformer.wte.weight'][-1:,:] = new_embeddings
    model.load_state_dict(params)

## Construct Train and Test Datasets
Call from above to generate from either WordNet or WordSet.

In [9]:
# Create the stuff from existing functions
final_words, final_embeddings, final_definitions = create_wordnet_dataset()
final_words, final_embeddings, final_definitions = shuffle_dataset(final_words, final_embeddings, final_definitions)

# Create the new tokenizer (GPT-2 specific)
tokenizer.pad_token = tokenizer.eos_token
add_initial_cls(tokenizer, model)

train_words = final_words[:int(0.9 * len(final_words))]
train_definitions = final_definitions[:int(0.9 * len(final_definitions))]
train_embeddings = final_embeddings[:int(0.9 * len(final_embeddings))]

#Note we are temporarily hardcoding the size
train_dataset = DefinitionAndEmbeddings(train_words, train_embeddings, train_definitions, tokenizer, 512)

test_words = final_words[int(0.9 * len(final_words)):]
test_definitions = final_definitions[int(0.9 * len(final_definitions)):]
test_embeddings = final_embeddings[int(0.9 * len(final_embeddings)):]

test_dataset = DefinitionAndEmbeddings(test_words, test_embeddings, test_definitions, tokenizer, 512)

## Set up Training and Testing `DataLoader`s
For use in iterating and processing through batches.

In [10]:
tokenizer.save_pretrained('weights/G2GMaskingT')

('weights/G2GMaskingT/tokenizer_config.json',
 'weights/G2GMaskingT/special_tokens_map.json',
 'weights/G2GMaskingT/vocab.json',
 'weights/G2GMaskingT/merges.txt',
 'weights/G2GMaskingT/added_tokens.json')

In [11]:
train_params = {'batch_size': 2,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(train_dataset, **train_params)

test_params = {'batch_size': 2,
                'shuffle': False,
                'num_workers': 0
                }

testing_loader = DataLoader(test_dataset, **test_params)

## Training the Network
Also make sure to save weights after every couple of iterations.

In [12]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Add to GPU
import_model = True
if import_model:
    print("loading in old model and tokenizer")
    model = GPT2LMHeadModel.from_pretrained("weights/G2GMaskingEpoch3M")
    tokenizer = GPT2Tokenizer.from_pretrained('weights/G2GMaskingT')
    if (torch.cuda.is_available()):
        print("Using GPU")
        model.to('cuda')

# Define loss function and optimizer
mse_loss = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
best = float('inf')
for i in range(15):
    training_running_loss = 0.0
    
    for j, data in tqdm(enumerate(training_loader, 0)):
        # Only optimize after every 10th batch or so -- make training more efficient
        if (j % 10 == 0):
            optimizer.step()
            optimizer.zero_grad()
        
        # Run the model on the inputs
        outputs = model(input_ids=data["input_ids"], output_hidden_states=True, attention_mask = data["attention_mask"])
        
        # Get last hidden state
        last_hidden = outputs.hidden_states[-1][:,511,:]
        
        # Get the original embeddings and calculate the loss
        loss = mse_loss(last_hidden, data["label"].to("cuda"))
        loss.backward(retain_graph=True)

        training_running_loss += loss.item()
    
    # Take a step once we get outside the batches
    optimizer.step()
    optimizer.zero_grad()
    
    # Printing and saving
    print("training running loss: ", training_running_loss)
    model.save_pretrained('weights/G2GMaskingEpoch' + str(i + 4) + 'M')

    # evaluate on test set after every epoch:
    testing_running_loss = 0 

    for j, data in tqdm(enumerate(testing_loader, 0)):

        outputs = model(input_ids=data["input_ids"], output_hidden_states=True, attention_mask = data["attention_mask"])

        # Get last hidden state
        last_hidden = outputs.hidden_states[-1][:,511,:]
        loss = mse_loss(last_hidden, data["label"].to("cuda"))
        testing_running_loss += loss.item()

    if testing_running_loss < best:
        best = testing_running_loss
        model.save_pretrained('weights/G2GMaskingBestM')
    print("testing running loss: ", testing_running_loss)
    print("")

loading in old model and tokenizer
Using GPU


1329it [05:17,  4.18it/s]


training running loss:  93.40311047341675


148it [00:12, 12.09it/s]


testing running loss:  1.2255758973769844



1329it [05:18,  4.17it/s]


training running loss:  10.733476765220985


148it [00:12, 12.07it/s]


testing running loss:  1.1407861835323274



1329it [05:18,  4.18it/s]


training running loss:  9.990708225406706


148it [00:12, 12.06it/s]


testing running loss:  1.0775466617196798



1329it [05:18,  4.18it/s]


training running loss:  9.60497555974871


148it [00:12, 12.07it/s]


testing running loss:  1.0611579371616244



1329it [05:18,  4.18it/s]


training running loss:  9.491591854719445


148it [00:12, 12.07it/s]


testing running loss:  1.0361177208833396



1329it [05:18,  4.18it/s]


training running loss:  9.418701958376914


148it [00:12, 12.08it/s]


testing running loss:  1.0385296198073775



1329it [05:18,  4.18it/s]


training running loss:  9.323146499227732


148it [00:12, 12.07it/s]


testing running loss:  1.0242210845462978



1329it [05:17,  4.18it/s]


training running loss:  9.227712941821665


148it [00:12, 12.06it/s]


testing running loss:  1.0368527872487903



1329it [05:18,  4.17it/s]


training running loss:  9.227445332333446


148it [00:12, 12.06it/s]


testing running loss:  1.0191450798884034



1329it [05:18,  4.18it/s]


training running loss:  9.132990080164745


148it [00:12, 12.05it/s]


testing running loss:  1.0150716756470501



1329it [05:17,  4.18it/s]


training running loss:  9.035422204528004


148it [00:12, 12.07it/s]


testing running loss:  1.04282828560099



1329it [05:18,  4.18it/s]


training running loss:  9.053890063427389


148it [00:12, 12.08it/s]


testing running loss:  0.9796658505219966



1329it [05:17,  4.18it/s]


training running loss:  8.958520439919084


148it [00:12, 12.06it/s]


testing running loss:  0.9781427565030754



1329it [05:18,  4.18it/s]


training running loss:  8.839024084620178


148it [00:12, 12.06it/s]


testing running loss:  0.9655803644564003



1329it [05:17,  4.18it/s]


training running loss:  8.692172870505601


148it [00:12, 12.12it/s]

testing running loss:  0.9658376432489604






## Saving Model Weights
Save the final model weights!

In [13]:
model.save_pretrained('weights/MG2GMaskingFinal')