## CS224N Project - Fine-Tuning GPT-2 on Example Sentences
Using G2G and R2G to fine-tune on existing embeddings.

In [1]:
import torch
# Report whether a CUDA device is visible to PyTorch
print(f"Using GPU: {torch.cuda.is_available()}")

Using GPU: True


## Get all WinoDict Words
Get all the words from the WinoDict dataset.

In [2]:
import pandas as pd
# Read the first WinoDict split and pull out the real lemmas and their fake
# counterparts as two index-aligned lists
first_set = pd.read_csv("winodict/prob1_of_5.csv")
real_words = first_set['lemma'].tolist()
winodict_words = first_set['fake_lemma'].tolist()

## Function to Get WordNet Definition of WinoDict Word
Look up the WordNet definitions of a word. For a fake WinoDict word, the lookup is done on the real lemma that the fake word replaces.

In [3]:
from nltk.corpus import wordnet as wn

def find_definition(word):
    """Return every WordNet definition of ``word`` joined into one string.

    Each synset's definition is followed by ". ". The result is the empty
    string when WordNet returns no synsets for ``word``.
    """
    return "".join(synset.definition() + ". " for synset in wn.synsets(word))

## Load in G2G/R2G Model for Predicting Embeddings
This model predicts the initial word embeddings for the fake words.

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# G2G/R2G checkpoint used to predict an embedding from a definition. Its tokenizer
# is the gpt2-medium tokenizer extended with a '[CLS]' token, which
# fake_word_into_embedding appends after the definition.
predict_model = GPT2LMHeadModel.from_pretrained("weights/G2GNext1")
predict_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
predict_tokenizer.add_tokens(['[CLS]'])

# GPT-2 model and tokenizer to be fine-tuned; both are mutated later when fake
# words are added (add_tokens / resize_token_embeddings).
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

# Standalone tokenizer loaded separately from `tokenizer`; get_example_sentences
# refers to it when returning to the original vocabulary.
ORIG_TOKENIZER = GPT2Tokenizer.from_pretrained('gpt2-medium')

# Sanity check: print "Yes" when the input embedding matrix and the LM head
# weight hold identical values (nothing is printed otherwise).
params = model.state_dict()
if (torch.equal(params['transformer.wte.weight'], params['lm_head.weight'])):
    print("Yes")

Yes


## Create a New Model with Updated Word Embedding
For each new fake word, create a new word embedding based on the fake word's definition and then update base GPT-2.

In [6]:
def fake_word_into_embedding(replacement, fake_word):
    """Add ``fake_word`` to the tokenizer and seed its embedding in ``model``.

    The WordNet definition of ``replacement`` is padded/truncated to 511 tokens,
    " [CLS]" is appended, and the sequence is run through ``predict_model``. The
    final-layer hidden state at position 511 (the first appended position) is
    written into the input-embedding row that belongs to ``fake_word``.

    Args:
        replacement: Real word whose definition drives the predicted embedding.
        fake_word: Token to register in ``tokenizer`` and ``model``.

    Returns:
        None.

    Side effects:
        Sets ``predict_tokenizer.pad_token``, adds a token to the module-level
        ``tokenizer``, resizes and updates the module-level ``model``, and prints
        debugging messages.
    """
    # Get definition of the word
    definition = find_definition(replacement)

    # Pad/truncate the definition to exactly 511 tokens, then append " [CLS]"
    # so that the appended tokens start at index 511
    predict_tokenizer.pad_token = tokenizer.eos_token
    tokenized_input = predict_tokenizer(definition, return_tensors="pt", padding='max_length', truncation=True, max_length=511)
    tokenized_cls = predict_tokenizer(" [CLS]", return_tensors="pt")
    tokenized_input['input_ids'] = torch.cat((tokenized_input['input_ids'], tokenized_cls['input_ids']), dim=1)

    # Inference only: without no_grad the predicted embedding would carry the
    # predictor's autograd graph along with it into the weight update below
    with torch.no_grad():
        outputs = predict_model(input_ids=tokenized_input['input_ids'], output_hidden_states=True)
    last_hidden = outputs.hidden_states[-1][:,511,:]
    predicted_embedding = last_hidden.squeeze(0)

    # Add the new token and resize the model embedding
    tokenizer.add_tokens([fake_word])
    model.resize_token_embeddings(len(tokenizer))

    # Address the row by the token's own id rather than assuming it is the last
    # row: add_tokens adds nothing for a word that is already known to the
    # tokenizer, and then the last row belongs to a different token
    token_id = tokenizer.convert_tokens_to_ids(fake_word)

    # Update with the new embedding (a single write; the original wrote the same
    # tensor twice through two names)
    params = model.state_dict()
    params['transformer.wte.weight'][token_id] = predicted_embedding
    model.load_state_dict(params)

    # Check that the input embedding and LM head agree on the new token's row
    print("Checking!")
    params = model.state_dict()
    if (torch.equal(params['transformer.wte.weight'][token_id], params['lm_head.weight'][token_id])):
        print("Yes")
    else:
        print("No")

    # Print message for debugging
    print("New word updated")

## Removing Sentences that are not Tokenized Correctly
Tokenize the sentence and the fake word separately, then keep the sentence only if the fake word's first token id appears among the sentence's token ids.

In [7]:
def tokenized_correctly(real, fake, sentence):
    """Check that ``fake`` shows up as its own token inside ``sentence``.

    First registers ``fake`` (with an embedding derived from ``real``) through
    ``fake_word_into_embedding``, then returns True when the first token id of
    ``fake`` tokenized alone is among the token ids of ``sentence``.
    """
    fake_word_into_embedding(real, fake)
    sentence_ids = tokenizer(sentence)['input_ids']
    fake_first_id = tokenizer(fake)['input_ids'][0]
    return fake_first_id in sentence_ids

## Load in Example Sentences
Load in the dataset of example sentences for all WinoDict real lemma words.

In [8]:
import json

def add_fake_words():
    """Collect the distinct fake words whose real lemma has example sentences.

    NOTE(review): the original loop over ``winodict_words`` had no body, which is
    a SyntaxError that prevented every definition in this cell from compiling.
    The per-word work the author intended is not visible in this notebook, so
    this body only records the fake words and does not touch the tokenizer or
    the model. TODO: confirm the intended behaviour.

    Returns:
        dict mapping each distinct fake word to the first real lemma it is
        paired with, restricted to lemmas present in the example-sentence file.
    """
    used_fake_words = {}
    # Context manager so the file handle is closed after loading
    with open('datasets/example_sentences.json') as sentence_file:
        example_sentences = json.load(sentence_file)

    for real_word, fake_word in zip(real_words, winodict_words):
        if fake_word in used_fake_words or real_word not in example_sentences:
            continue
        used_fake_words[fake_word] = real_word

    return used_fake_words


def get_example_sentences(max_examples=21):
    """Build fake-word example sentences that tokenize correctly.

    For every example sentence of every real word, the real word is replaced by
    its fake counterpart (plain substring replacement). The sentence is kept
    only when ``tokenized_correctly`` accepts it. After each sentence the
    module-level ``tokenizer`` and ``model`` are returned to the original
    vocabulary size.

    Args:
        max_examples: Stop once this many sentences have been kept. The default
            of 21 matches the original hard-coded ``i > 20`` cut-off; pass
            ``None`` to process every sentence.

    Returns:
        Four index-aligned lists: real words, fake words, fake sentences, and
        the WordNet definitions of the real words.

    Raises:
        ValueError: if a word with example sentences has no entry in
            ``real_words``.
    """
    import copy

    # The helpers read the module-level tokenizer, so the reset below has to
    # rebind that global; a plain assignment would only create a local name.
    global tokenizer

    # Define all the empty arrays and load in the JSON
    corr_words, corr_fake, all_sentences, definitions = [], [], [], []
    with open('datasets/example_sentences.json') as sentence_file:
        example_sentences = json.load(sentence_file)

    # Real -> fake lookup built once; setdefault keeps the first occurrence,
    # which is what real_words.index(word) would have selected
    fake_by_real = {}
    for real, fake in zip(real_words, winodict_words):
        fake_by_real.setdefault(real, fake)

    def limit_reached():
        return max_examples is not None and len(all_sentences) >= max_examples

    # Iterate through each example sentence
    for word in example_sentences:
        for sentence in example_sentences[word]:
            # Get the corresponding fake word and craft the new sentence
            if word not in fake_by_real:
                raise ValueError(f"{word!r} is not in list")
            fake_word = fake_by_real[word]
            fake_sentence = sentence.replace(word, fake_word)

            # Create the list of real words, fake words, all sentences, and definitions
            if tokenized_correctly(word, fake_word, fake_sentence):
                corr_words.append(word)
                corr_fake.append(fake_word)
                all_sentences.append(fake_sentence)
                definitions.append(find_definition(word))

            # Return tokenizer and model to the original vocabulary. A copy is
            # used so later add_tokens calls cannot mutate ORIG_TOKENIZER itself.
            tokenizer = copy.deepcopy(ORIG_TOKENIZER)
            model.resize_token_embeddings(len(tokenizer))

            if limit_reached():
                print("Here")
                break

        if limit_reached():
            print("Here")
            break

    # Return all of the data
    return corr_words, corr_fake, all_sentences, definitions

## Dataset Class for Example Sentences
Using the PyTorch class for a dataset.

In [9]:
from torch.utils.data import Dataset, DataLoader

class ExampleSentences(Dataset):
    """Map-style dataset over index-aligned lists of example-sentence data.

    Item ``idx`` is a dict with the keys 'sentence', 'definition', 'real' and
    'fake', taken from position ``idx`` of the corresponding lists. The length
    is the number of real words.
    """

    def __init__(self, real_words, fake_words, sentences, definitions):
        self.real_words, self.fake_words = real_words, fake_words
        self.sentences, self.definitions = sentences, definitions
        self.max_length = 512

    def __len__(self):
        return len(self.real_words)

    def __getitem__(self, idx):
        return dict(
            sentence=self.sentences[idx],
            definition=self.definitions[idx],
            real=self.real_words[idx],
            fake=self.fake_words[idx],
        )

## Construct the Dataset
Full end-to-end pipeline that creates both the Datasets and their DataLoaders.

In [10]:
def create_full_dataset():
    """Build the train/test datasets and their DataLoaders.

    Each list returned by ``get_example_sentences`` is cut at 90% of its own
    length: the leading part goes to the train split and the remainder to the
    test split. Both loaders use batch size 1, no shuffling and no worker
    processes.

    Returns:
        (train_dataset, training_loader, test_dataset, testing_loader)
    """
    def split(items):
        # Leading 90% for training, the rest for testing
        cut = int(0.9 * len(items))
        return items[:cut], items[cut:]

    # Getting all of the example content
    corr_words, corr_fake, all_sentences, definitions = get_example_sentences()
    halves = [split(column) for column in (corr_words, corr_fake, all_sentences, definitions)]

    # Datasets take (real, fake, sentences, definitions) in that order
    train_dataset = ExampleSentences(*[train_part for train_part, _ in halves])
    test_dataset = ExampleSentences(*[test_part for _, test_part in halves])

    # Same loader settings for both splits
    loader_kwargs = {
        'batch_size': 1,
        'shuffle': False,
        'num_workers': 0
    }
    training_loader = DataLoader(train_dataset, **loader_kwargs)
    testing_loader = DataLoader(test_dataset, **loader_kwargs)

    # Return!
    return train_dataset, training_loader, test_dataset, testing_loader

## Training Loop
Full, end-to-end loop to train the model!

In [11]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Number of epochs for training
NUM_EPOCHS = 5

# Set up model for training -- put on GPU
model.to('cuda')
#print(model.state_dict()['lm_head.weight'])
#for param_tensor in model.state_dict():
#    print(param_tensor, "\t", model.state_dict()[param_tensor].size())
    
#tokenizer.add_tokens(["efafaef"])
#model.resize_token_embeddings(len(tokenizer))
#one = (model.state_dict()['lm_head.weight'][-1])
#two = (model.state_dict()['transformer.wte.weight'][-1])
#if (torch.equal(one, two)):
#    print("Yes")
    
#for param_tensor in model.state_dict():
#    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

# Get all of the data
train_dataset, training_loader, test_dataset, testing_loader = create_full_dataset()

# Build the optimizer only after the dataset: building the dataset calls
# model.resize_token_embeddings repeatedly, so an optimizer created earlier
# could be holding embedding parameters the model no longer uses.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Getting to the epochs
print("Got it to work")
for i in range(NUM_EPOCHS):
    # tqdm wraps the loader itself (not the enumerate object) so the progress
    # bar knows the number of batches; the original `tdqm` was a NameError.
    for j, data in enumerate(tqdm(training_loader), 0):
        print(data)

# Disabled draft #1: a train() function kept as an inert string literal, so
# nothing below runs.
# NOTE(review): as indented in the draft, the forward/backward pass sits outside
# the inner batch loop, so it would only see the last batch of each epoch. The
# draft also uses `mse_loss` and data['input'] / data['output'], none of which are
# defined by the active cells (ExampleSentences yields 'sentence', 'definition',
# 'real', 'fake').
'''
def train():
    for i in range(NUM_EPOCHS):
        # Keep a running loss
        training_running_loss = 0.0
        
        # Iterate through each example in the training loader
        for j, data in tqdm(enumerate(training_loader, 0)):
            # Only optimize after every 10th batch for efficiency
            if (j % 10 == 0):
                optimizer.step()
                optimizer.zero_grad()
        
        # Run the model on the inputs
        input_ids = data['input']['input_ids'].to('cuda')
        outputs = model(input_ids=input_ids, output_hidden_states=True)
        
        # Get last hidden state
        last_hidden = outputs.hidden_states[-1][:,511,:]
        
        # Get the original embeddings and calculate the loss
        orig_embeddings = data['output'].to('cuda')
        loss = mse_loss(last_hidden, orig_embeddings)
        loss.backward(retain_graph=True)

        training_running_loss += loss.item()
    
    # Take a step once we get outside the batches
    optimizer.step()
    optimizer.zero_grad()
'''

    
# Disabled draft #2: a training/evaluation script for the embedding-prediction
# model ("weights/G2G1"), also kept as an inert string literal. It accumulates an
# MSE loss between the hidden state at position 511 and data['output'], steps the
# optimizer every 10th batch, saves a checkpoint per epoch, and keeps the weights
# with the lowest test loss.
# NOTE(review): it expects data['input'] / data['output'] batches, which the
# active ExampleSentences dataset does not produce.
'''
# Add to GPU
model = GPT2LMHeadModel.from_pretrained("weights/G2G1")
if (torch.cuda.is_available()):
    print("Using GPU")
    model.to('cuda')

# Define loss function and optimizer
mse_loss = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
best = float('inf')
for i in range(1):
    training_running_loss = 0.0
    
    for j, data in tqdm(enumerate(training_loader, 0)):
        # Only optimize after every 10th batch or so -- make training more efficient
        if (j % 10 == 0):
            optimizer.step()
            optimizer.zero_grad()
        
        # Run the model on the inputs
        input_ids = data['input']['input_ids'].to('cuda')
        outputs = model(input_ids=input_ids, output_hidden_states=True)
        
        # Get last hidden state
        last_hidden = outputs.hidden_states[-1][:,511,:]
        
        # Get the original embeddings and calculate the loss
        orig_embeddings = data['output'].to('cuda')
        loss = mse_loss(last_hidden, orig_embeddings)
        loss.backward(retain_graph=True)

        training_running_loss += loss.item()
    
    # Take a step once we get outside the batches
    optimizer.step()
    optimizer.zero_grad()
    
    # Printing and saving
    print("training running loss: ", training_running_loss)
    model.save_pretrained('weights/G2GNext' + str(i+1))

    # evaluate on test set after every epoch:
    testing_running_loss = 0 

    for j, data in tqdm(enumerate(testing_loader, 0)):

        input_ids = data['input']['input_ids'].to('cuda')
        outputs = model(input_ids=input_ids, output_hidden_states=True)

         # Get last hidden state
        last_hidden = outputs.hidden_states[-1][:,511,:]
        
        orig_embeddings = data['output'].to('cuda')
        loss = mse_loss(last_hidden, orig_embeddings)
        testing_running_loss += loss.item()

    if testing_running_loss < best:
        best = testing_running_loss
        model.save_pretrained('weights/GPT2Wordnet')
    print("testing running loss: ", testing_running_loss)
    print("")'''

Checking!
Yes
New word updated
Checking!
Yes
New word updated


KeyboardInterrupt: 