# CS 224N - Training LLMs to Predict Word Embeddings
Using RoBERTa and GPT-2 to predict word embeddings.

## Setting up PyTorch
Using PyTorch on the GPU

In [1]:
import torch
print("Using GPU: " + str(torch.cuda.is_available()))
from torch.utils.data import Dataset, DataLoader

Using GPU: True


## Grab GPT-2 and Word Embeddings
Look at word embeddings GPT-2 has processed

In [2]:
from transformers import RobertaModel, RobertaTokenizer

# Set up model, tokenizer, and grab weights
model = RobertaModel.from_pretrained('roberta-large').to("cuda")
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
word_embeddings = model.embeddings.word_embeddings.weight

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Import WinoDict Dataset
Used so that we don't have the overlapping definitions from the test set.

In [3]:
import pandas as pd
first_set = pd.read_csv("winodict/prob1_of_5.csv")
winodict_words = first_set['lemma'].tolist()

## Look at WordNet Definitions and Words
Using online package `wn`.

In [4]:
from nltk.corpus import wordnet as wn

def create_wordnet_dataset():
    words, embeddings, definitions = [], [], []

    for ss in wn.all_synsets():
        for lemma in ss.lemmas():
            word = lemma.name()
            if (word is not None and word in winodict_words):
                tokens = tokenizer.encode(word,add_prefix_space=True)
                if (len(tokens) == 3):
                    words.append(word)
                    definitions.append(ss.definition())
                    embeddings.append(word_embeddings[tokens[1],:])
    return words, embeddings, definitions

## Separate Saving
Separate work to save everything.

In [5]:
def save_words():
    words, embeddings, definitions = create_wordnet_dataset()
    words = list(set(words))
    with open("datasets/wordnet.txt", "w") as txt_file:
        for word in words:
            txt_file.write(word + "\n")

## Shuffle Dataset
Randomize order of words and definitions + embeddings.

In [6]:
import random

def shuffle_dataset(final_words, final_embeddings, final_definitions):
    c = list(zip(final_words, final_definitions, final_embeddings))
    random.shuffle(c)
    final_words, final_definitions, final_embeddings = zip(*c)
    return final_words, final_embeddings, final_definitions

## Create the Custom Dataset
Helpful for extracting embeddings and definitions

In [7]:
class DefinitionAndEmbeddings(Dataset):

    def __init__(self, final_words, final_embeddings, final_definitions, tokenizer):
        self.words = final_words

        self.inputs = tokenizer(final_definitions, return_tensors = "pt", padding = "max_length", max_length = 512)

        self.inputs["input_ids"] = self.inputs["input_ids"].to("cuda")
        self.inputs["attention_mask"] = self.inputs["attention_mask"].to("cuda")
        self.labels = final_embeddings

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "label": self.labels[idx],
            "word": self.words[idx]
        }


## Construct Train and Test Datasets
Call from above to generate from either WordNet or WordSet.

In [8]:
# Create the stuff from existing functions
final_words, final_embeddings, final_definitions = create_wordnet_dataset()
final_words, final_embeddings, final_definitions = shuffle_dataset(final_words, final_embeddings, final_definitions)

# Create the train splits
train_words = final_words[:int(0.9 * len(final_words))]
train_definitions = final_definitions[:int(0.9 * len(final_definitions))]
train_embeddings = final_embeddings[:int(0.9 * len(final_embeddings))]

# Create the train dataset
train_dataset = DefinitionAndEmbeddings(train_words, train_embeddings, train_definitions, tokenizer)

# Create the test splits
test_words = final_words[int(0.9 * len(final_words)):]
test_definitions = final_definitions[int(0.9 * len(final_definitions)):]
test_embeddings = final_embeddings[int(0.9 * len(final_embeddings)):]

# Create the test dataset
test_dataset = DefinitionAndEmbeddings(test_words, test_embeddings, test_definitions, tokenizer)

## Set up Training and Testing `DataLoader`s
For use in iterating and processing through batches.

In [9]:
train_params = {'batch_size': 2,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(train_dataset, **train_params)

test_params = {'batch_size': 2,
                'shuffle': False,
                'num_workers': 0
                }

testing_loader = DataLoader(test_dataset, **test_params)

In [10]:
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7fa649d58520>

## Training the Network
Also make sure to save weights after every couple of iterations.

In [10]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Add to GPU
import_model = False
if import_model:
    model = GPT2LMHeadModel.from_pretrained("weights/G2G1")
    if (torch.cuda.is_available()):
        print("Using GPU")
        model.to('cuda')

# Define loss function and optimizer
mse_loss = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
best = float('inf')
aggregated_gradients = torch.zeros(model.transformer.wte.weight.shape).to("cuda")
for i in range(5):
    training_running_loss = 0.0
    
    for j, data in tqdm(enumerate(training_loader, 0)):
        if j % 10 == 0:
            # Finish up optimization
            model.transformer.wte.weight.grad = aggregated_gradients
            optimizer.step()
            aggregated_gradients = torch.zeros(model.transformer.wte.weight.shape).to("cuda")
            optimizer.zero_grad()

        # Run the model on the inputs
        outputs = model(input_ids=data["input_ids"], output_hidden_states=True, attention_mask = data["attention_mask"])
        
        # Get last hidden state
        last_hidden = outputs.hidden_states[-1][:,0,:].detach()
        
        # Get the original embeddings and calculate the loss
        loss = mse_loss(last_hidden, data["label"].to("cuda"))
        loss.backward(retain_graph=True)
        aggregated_gradients += model.transformer.wte.weight.grad
        optimizer.zero_grad()

        training_running_loss += loss.item()

    # Take a step once we get outside the batches
    model.transformer.wte.weight.grad = aggregated_gradients
    optimizer.step()
    aggregated_gradients = torch.zeros(model.transformer.wte.weight.shape).to("cuda")
    optimizer.zero_grad()
    
    # Printing and saving
    print("training running loss: ", training_running_loss)
    model.save_pretrained('weights/R2GNew' + str(i+1))

    # evaluate on test set after every epoch:
    testing_running_loss = 0 

    for j, data in tqdm(enumerate(testing_loader, 0)):

        outputs = model(input_ids=data["input_ids"], output_hidden_states=True, attention_mask = data["attention_mask"])

        # Get last hidden state
        last_hidden = outputs.hidden_states[-1][:,0,:]
        loss = mse_loss(last_hidden, data["label"].to("cuda"))
        testing_running_loss += loss.item()

    if testing_running_loss < best:
        best = testing_running_loss
        model.save_pretrained('weights/R2GNewBest')
    print("testing running loss: ", testing_running_loss)
    print("")

AttributeError: 'RobertaModel' object has no attribute 'transformer'

In [11]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Add to GPU
import_model = False
if import_model:
    model = GPT2LMHeadModel.from_pretrained("weights/G2G1")
    if (torch.cuda.is_available()):
        print("Using GPU")
        model.to('cuda')

# Define loss function and optimizer
mse_loss = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
best = float('inf')
for i in range(5):
    training_running_loss = 0.0
    
    for j, data in tqdm(enumerate(training_loader, 0)):
        if j % 10 == 0:
            # Finish up optimization
            optimizer.step()
            optimizer.zero_grad()

        # Run the model on the inputs
        outputs = model(input_ids=data["input_ids"], output_hidden_states=True, attention_mask = data["attention_mask"])
        
        # Get last hidden state
        last_hidden = outputs.hidden_states[-1][:,0,:].detach()
        
        # Get the original embeddings and calculate the loss
        loss = mse_loss(last_hidden, data["label"].to("cuda"))
        loss.backward(retain_graph=True)

        training_running_loss += loss.item()

    # Take a step once we get outside the batches
    optimizer.step()
    optimizer.zero_grad()
    
    # Printing and saving
    print("training running loss: ", training_running_loss)
    model.save_pretrained('weights/R2GNew' + str(i+1))

    # evaluate on test set after every epoch:
    testing_running_loss = 0 

    for j, data in tqdm(enumerate(testing_loader, 0)):

        outputs = model(input_ids=data["input_ids"], output_hidden_states=True, attention_mask = data["attention_mask"])

        # Get last hidden state
        last_hidden = outputs.hidden_states[-1][:,0,:]
        loss = mse_loss(last_hidden, data["label"].to("cuda"))
        testing_running_loss += loss.item()

    if testing_running_loss < best:
        best = testing_running_loss
        model.save_pretrained('weights/R2GNewBest')
    print("testing running loss: ", testing_running_loss)
    print("")

  File "/opt/conda/envs/pytorch/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/conda/envs/pytorch/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/ubuntu/.local/lib/python3.9/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/ubuntu/.local/lib/python3.9/site-packages/traitlets/config/application.py", line 1043, in launch_instance
    app.start()
  File "/home/ubuntu/.local/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 725, in start
    self.io_loop.start()
  File "/home/ubuntu/.local/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 215, in start
    self.asyncio_loop.run_forever()
  File "/opt/conda/envs/pytorch/lib/python3.9/asyncio/base_events.py", line 601, in run_forever
    self._run_once()
  File "/opt/conda/envs/pytorch/lib/python3.9/asyncio/base_events.py", line 1905, in _run_once
    handle.

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [1024]], which is output 0 of AsStridedBackward0, is at version 4; expected version 3 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

## Saving Model Weights
Save the final model weights!

In [None]:
model.save_pretrained('weights/R2GNewFinal')