# CS 224N Final Project - Evaluating Reloaded Models on WinoDict Dataset
By: Christopher Pondoc, Joseph Guman, and Joseph O'Brien

In [1]:
import torch
# Report whether PyTorch can see a CUDA device.
print(f"Using GPU: {torch.cuda.is_available()}")

Using GPU: True


## Load in GPT-2 Model
Using HuggingFace Transformers

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, RobertaModel, RobertaTokenizer

# Embedding predictor (G2G): a GPT-2 checkpoint loaded from local weights.
# Its tokenizer is the stock gpt2-medium tokenizer extended with a '[CLS]' token.
predict_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
predict_tokenizer.add_tokens(['[CLS]'])
predict_model = GPT2LMHeadModel.from_pretrained("../weights/G2GMaskingBest")
predict_model = predict_model.to("cuda")

# Embedding predictor (R2G): uncomment to use a RoBERTa checkpoint instead.
#predict_model = RobertaModel.from_pretrained("/home/ubuntu/test/weights/wordnetCrazy").to("cuda")
#predict_tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

# Base GPT-2 (medium) and its tokenizer; later cells add the new word tokens
# and embeddings to this pair and evaluate it.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.to("cuda")

## Load in WinoDict Dataset and Real-to-Fake Words
The WinoDict examples and the previously generated real-to-fake word mapping are both read from local CSV files.

In [3]:
import pandas as pd

# WinoDict examples to evaluate on
first_set = pd.read_csv("../winodict/prob1_of_5.csv")

# Real-word / fake-word table; the two columns are kept as index-aligned lists
rtf_df = pd.read_csv("../datasets/realtofake.csv")
real_words_list = rtf_df["Real"].tolist()
fake_words_list = rtf_df["Fake"].tolist()

# Lookup from each real word to its fake replacement
# (if a real word repeats, the last pairing wins)
real_to_fake_dict = dict(zip(real_words_list, fake_words_list))

## Gather all WordNet Definitions
Use all of the real words to gather all WordNet definitions that will be used.

In [4]:
from nltk.corpus import wordnet as wn

# One string per real word: every WordNet synset definition for that word,
# each followed by ". ", concatenated in synset order. A word with no synsets
# yields an empty string.
final_definitions = [
    "".join(synset.definition() + ". " for synset in wn.synsets(word))
    for word in real_words_list
]

# Quick sanity check: the three lists must stay index-aligned
assert len(final_definitions) == len(real_words_list)
assert len(final_definitions) == len(fake_words_list)

## G2G: Batch All Embeddings
Use the G2G model to predict an embedding for every fake word from its definition, then insert all of them into GPT-2's embedding matrix.

In [5]:
def g2g_embeddings(batch_size=4):
    """Add the fake words to GPT-2 and fill their embeddings with G2G predictions.

    Every entry of ``final_definitions`` is tokenized, padded/truncated to a
    fixed length and followed by a ``[CLS]`` token. ``predict_model`` is run
    over the definitions in batches, and its last-layer hidden state at the
    ``[CLS]`` position is written into the input-embedding row of the fake word
    at the same index in ``fake_words_list``.

    Side effects: sets ``predict_tokenizer.pad_token``, adds tokens to the
    module-level ``tokenizer``, and resizes/updates the embedding matrix of the
    module-level ``model``.

    Args:
        batch_size: Number of definitions sent through ``predict_model`` per
            forward pass. Defaults to 4.

    Raises:
        ValueError: If the tokenizer did not add exactly one new token per
            definition (e.g. duplicate fake words, or the function was already
            run), because the new embedding rows would then not line up with
            the definitions.
    """
    # Helpful Debug Message
    print("Number of total definitions: " + str(len(final_definitions)))

    num_definitions = len(final_definitions)
    max_length = 511            # tokens kept per definition
    cls_position = max_length   # [CLS] is concatenated right after the padded definition

    # Tokenizing all of the definitions at once
    predict_tokenizer.pad_token = tokenizer.eos_token
    tokenized_inputs = predict_tokenizer(final_definitions, return_tensors="pt", padding='max_length', truncation=True, max_length=max_length)
    tokenized_cls = predict_tokenizer([" [CLS]"] * num_definitions, return_tensors="pt")
    input_ids = torch.cat((tokenized_inputs['input_ids'], tokenized_cls['input_ids']), dim=1).to("cuda")
    attention_mask = torch.cat((tokenized_inputs['attention_mask'], tokenized_cls['attention_mask']), dim=1).to("cuda")

    # Add the new tokens and resize the model embeddings matrix
    displacement = len(tokenizer)
    num_added = tokenizer.add_tokens(fake_words_list)
    if num_added != num_definitions:
        # Fail early with a clear message instead of a shape error after the forward passes.
        raise ValueError(
            "Expected to add " + str(num_definitions) + " new tokens but the tokenizer added "
            + str(num_added) + "; embedding rows would not align with the definitions."
        )
    model.resize_token_embeddings(len(tokenizer))
    params = model.state_dict()

    # Predict the new embeddings batch by batch; no gradients are needed for inference
    with torch.no_grad():
        for start in range(0, num_definitions, batch_size):
            stop = min(num_definitions, start + batch_size)
            outputs = predict_model(input_ids=input_ids[start:stop], output_hidden_states=True, attention_mask=attention_mask[start:stop])
            params['transformer.wte.weight'][displacement + start: displacement + stop, :] = outputs.hidden_states[-1][:, cls_position, :].detach().clone()
    model.load_state_dict(params)

## R2G: Batch All Embeddings
Use R2G to predict word embeddings instead.

In [6]:
def r2g_embeddings(batch_size=4):
    """Add the fake words to GPT-2 and fill their embeddings with R2G predictions.

    Each definition is prefixed with ``"[CLS] "``, tokenized and padded/truncated
    to a fixed length. ``predict_model`` is run over the definitions in batches,
    and its last-layer hidden state at position 0 is written into the
    input-embedding row of the fake word at the same index in
    ``fake_words_list``.

    The module-level ``final_definitions`` list is left untouched, so this
    function does not alter the input of the other embedding functions.

    Side effects: sets ``predict_tokenizer.pad_token``, adds tokens to the
    module-level ``tokenizer``, and resizes/updates the embedding matrix of the
    module-level ``model``.

    NOTE(review): presumably meant to be used with the RoBERTa predictor that is
    commented out in the model-loading cell -- confirm before running.

    Args:
        batch_size: Number of definitions sent through ``predict_model`` per
            forward pass. Defaults to 4.

    Raises:
        ValueError: If the tokenizer did not add exactly one new token per
            definition, because the new embedding rows would then not line up
            with the definitions.
    """
    # Build the CLS-prefixed definitions locally instead of rewriting the shared list
    cls_definitions = ["[CLS] " + definition for definition in final_definitions]
    num_definitions = len(cls_definitions)

    # Helpful Debug Message
    print("Number of total definitions: " + str(num_definitions))

    # Tokenizing all of the definitions at once
    predict_tokenizer.pad_token = tokenizer.eos_token
    tokenized_inputs = predict_tokenizer(cls_definitions, return_tensors="pt", padding='max_length', truncation=True, max_length=511).to('cuda')
    input_ids = tokenized_inputs['input_ids']

    # Add the new tokens and resize the model embeddings matrix
    displacement = len(tokenizer)
    num_added = tokenizer.add_tokens(fake_words_list)
    if num_added != num_definitions:
        # Fail early with a clear message instead of a shape error after the forward passes.
        raise ValueError(
            "Expected to add " + str(num_definitions) + " new tokens but the tokenizer added "
            + str(num_added) + "; embedding rows would not align with the definitions."
        )
    model.resize_token_embeddings(len(tokenizer))
    params = model.state_dict()

    # Predict the new embeddings batch by batch; no gradients are needed for inference.
    # NOTE(review): unlike g2g_embeddings, no attention_mask is passed, so padding
    # positions are presumably attended to -- confirm this matches how the predictor was trained.
    with torch.no_grad():
        for start in range(0, num_definitions, batch_size):
            stop = min(num_definitions, start + batch_size)
            outputs = predict_model(input_ids=input_ids[start:stop], output_hidden_states=True)
            params['transformer.wte.weight'][displacement + start: displacement + stop, :] = outputs.hidden_states[-1][:, 0, :].detach().clone()
    model.load_state_dict(params)

## Random Initialization: Batch Add All Embeddings
Create updated embeddings for GPT-2 using Hewitt random initialization.

In [7]:
def random_embeddings():
    """Add the fake words to GPT-2 with randomly sampled embeddings.

    A multivariate normal is fitted to the model's current input embeddings
    (their mean, and their covariance scaled by 1e-5). The fake words are then
    added to the module-level ``tokenizer``, the embedding matrix of the
    module-level ``model`` is resized, and every row appended by the resize is
    overwritten with an independent draw from that distribution.

    The number of draws is taken from the number of rows actually appended, so
    the call also works when the tokenizer adds fewer tokens than there are
    fake words (in particular, a repeated call appends nothing and changes
    nothing).
    """
    # Helpful Debug Message
    print("Number of total definitions: " + str(len(final_definitions)))

    # Use Hewitt code to get embeddings that are average of all other embeddings
    embeddings = model.state_dict()['transformer.wte.weight']
    mu = torch.mean(embeddings, dim=0)
    n = embeddings.size()[0]
    sigma = ((embeddings - mu).T @ (embeddings - mu)) / n
    dist = torch.distributions.multivariate_normal.MultivariateNormal(
            mu, covariance_matrix=1e-5*sigma)

    # Add new tokens and resize the model embeddings matrix
    displacement = len(tokenizer)
    tokenizer.add_tokens(fake_words_list)
    model.resize_token_embeddings(len(tokenizer))
    params = model.state_dict()

    # Sample one embedding per appended row in a single call and write them in
    wte = params['transformer.wte.weight']
    num_new_rows = wte.shape[0] - displacement
    wte[displacement:, :] = dist.sample((num_new_rows,))
    model.load_state_dict(params)

## Evaluating on One Example
A reusable function that scores a single WinoDict example.

In [8]:
def _winodict_option_loss(prefix, suffix):
    """Return the model's language-modeling loss on ``suffix`` given ``prefix``.

    The full string ``prefix + suffix`` is fed to the module-level ``model``.
    Labels are a copy of the input ids with the first ``len(tokens(prefix))``
    positions set to -100 (the label value the Hugging Face loss skips), so the
    labels always have the same shape as the inputs.
    """
    inputs = tokenizer(prefix + suffix, return_tensors="pt").to('cuda')
    prefix_len = tokenizer(prefix, return_tensors="pt")["input_ids"].shape[1]

    labels = inputs["input_ids"].clone()
    labels[:, :prefix_len] = -100

    # Pure scoring: no gradients are needed
    with torch.no_grad():
        return model(**inputs, labels=labels).loss.item()


def evaluate_winodict(example):
    """Check whether the model picks the labeled option for one WinoDict example.

    The example's ``fake_lemma`` is replaced by the fake word registered for its
    ``lemma`` in ``real_to_fake_dict``, in both the definition and the sentence.
    The blank (``'_'``) is filled with each option in turn, the definition is
    prepended, and the model's loss on the text after the blank is compared
    between the two completions. ``example`` itself is not modified.

    Args:
        example: Mapping (e.g. a DataFrame row) with the keys ``sentence``,
            ``option1``, ``option2``, ``lemma``, ``fake_lemma``, ``definition``
            and ``label``.

    Returns:
        ``True`` if the lower-loss option matches ``label`` (0 for the first
        option, 1 for the second; a tie counts as choosing the second),
        ``False`` otherwise, and ``None`` if the sentence contains no ``'_'``.

    Raises:
        KeyError: If the example's ``lemma`` is not in ``real_to_fake_dict``.
    """
    sentence = example['sentence']
    if '_' not in sentence:
        return None

    # Change 'the' to lowercase
    first_choice, second_choice = example['option1'], example['option2']
    if first_choice[:4] == "The ":
        first_choice = "the " + first_choice[4:]
    if second_choice[:4] == "The ":
        second_choice = "the " + second_choice[4:]

    # Get the fake word and substitute it into local copies of the texts
    fake_word = real_to_fake_dict[example['lemma']]
    definition = example['definition'].replace(example['fake_lemma'], fake_word)
    sentence = sentence.replace(example['fake_lemma'], fake_word)

    # Split the sentence around the blank
    pronoun_loc = sentence.index('_')
    context = definition + " " + sentence[:pronoun_loc]
    suffix = sentence[pronoun_loc + 1:]

    # Score the shared suffix under each filled-in option
    first_loss = _winodict_option_loss(context + first_choice, suffix)
    second_loss = _winodict_option_loss(context + second_choice, suffix)

    # The option with the lower loss is the model's prediction
    if first_loss < second_loss:
        return int(example['label']) == 0
    return int(example['label']) == 1

## Evaluating WinoDict on GPT-2
Looking specifically at `WinoDict`, with the first generated examples and adding in the definition and substituting in the word.

In [9]:
# Install the predicted embeddings for every fake word
g2g_embeddings()

# Score every example, skipping rows that merely repeat the CSV header
outcomes = [
    int(evaluate_winodict(row))
    for _row_index, row in first_set.iterrows()
    if row['lemma'] != "lemma"
]
correct, total = sum(outcomes), len(outcomes)

print("GPT-2 Medium achieved a score of: " + str(float(correct) / float(total)))

Number of total definitions: 343
GPT-2 Medium achieved a score of: 0.4678714859437751
