# CS 224N Final Project - Evaluating Fine-Tuned Models on WinoDict Dataset
By: Christopher Pondoc, Joseph Guman, and Joseph O'Brien

In [1]:
import torch

# Report whether PyTorch can see a CUDA device before any model is loaded.
print(f"Using GPU: {torch.cuda.is_available()}")

Using GPU: True


## Load in Off-the-Shelf and Noised GPT-2 Model
Using HuggingFace Transformers and Custom Noising.

In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Get tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium").to('cuda')

# Freeze the model for evaluation. The tensors returned by `state_dict()` are
# detached from autograd, so flipping `requires_grad` on them (and reloading
# them) never touches the real parameters; `requires_grad_` on the module does.
model.requires_grad_(False)
model.eval()

# Get just the embeddings: load the noised checkpoint on the CPU and move only
# its token-embedding matrix onto the evaluation model's device, so the full
# second model never occupies GPU memory.
noised_model = GPT2LMHeadModel.from_pretrained("../weights/noisedInverted10")
noised_embeddings = (
    noised_model.transformer.wte.weight.detach().clone().to(model.transformer.wte.weight.device)
)

# Delete the model itself
del noised_model

## Load in WinoDict Dataset and Real-to-Fake Words
HuggingFace + Previously Generated.

In [3]:
import pandas as pd
# Read the evaluation examples from a CSV located relative to the working directory.
first_set = pd.read_csv("winograd-labels.csv")

## Evaluating on One Example
Writing a reusable function that evaluates a single example.

In [4]:
def _masked_option_loss(sentence, blank_loc, choice, device):
    """Return the LM loss over the text that follows the blank once `choice` fills it.

    The full sentence is tokenized once and the labels are a copy of the real
    input ids with the prefix (everything up to and including `choice`) set to
    -100, so labels and inputs always have the same length and refer to the
    same tokens.
    """
    prefix = sentence[:blank_loc] + choice
    suffix = sentence[blank_loc + 1:]

    inputs = tokenizer(prefix + suffix, return_tensors="pt").to(device)
    prefix_len = tokenizer(prefix, return_tensors="pt")["input_ids"].shape[1]

    # Positions labelled -100 are excluded from the loss.
    labels = inputs["input_ids"].clone()
    labels[:, :prefix_len] = -100
    return model(**inputs, labels=labels).loss


@torch.no_grad()
def evaluate_winodict(example):
    """Score one example and report whether the model picks the labelled option.

    The fake lemma in the sentence is replaced by the real lemma, the lemma's
    token embeddings are temporarily swapped for their noised counterparts, and
    the sentence is scored with each option substituted for the `_` blank. The
    option with the lower loss on the text after the blank is the prediction.

    Args:
        example: mapping (e.g. a DataFrame row) with the keys 'sentence',
            'option1', 'option2', 'lemma', 'fake_lemma' and 'label'. It is
            read but not modified.

    Returns:
        True if the lower-loss option matches `label` (0 for option1, 1 for
        option2), False otherwise, or None when the sentence has no `_` blank.
    """
    sentence = example['sentence']
    if '_' not in sentence:
        return None

    # Change 'the' to lowercase, since the option is inserted mid-sentence
    first_choice, second_choice = example['option1'], example['option2']
    if first_choice.startswith("The "):
        first_choice = "the " + first_choice[4:]
    if second_choice.startswith("The "):
        second_choice = "the " + second_choice[4:]

    # Swap the fake word for the real one on a local copy of the sentence
    sentence = sentence.replace(example['fake_lemma'], example['lemma'])
    blank_loc = sentence.index('_')

    embedding = model.transformer.wte.weight
    device = embedding.device

    # NOTE(review): the lemma is tokenized without a leading space, so its ids
    # may differ from those of a mid-sentence occurrence -- confirm this matches
    # how the noised embeddings were produced.
    tokens = tokenizer(example['lemma'])['input_ids']
    orig_embeddings = embedding[tokens, :].detach().clone()
    embedding[tokens, :] = noised_embeddings[tokens, :].detach().to(device)
    try:
        first_loss = _masked_option_loss(sentence, blank_loc, first_choice, device)
        second_loss = _masked_option_loss(sentence, blank_loc, second_choice, device)
    finally:
        # Always restore the original embeddings, even if scoring fails, so a
        # failed example cannot leave the shared model in its noised state.
        embedding[tokens, :] = orig_embeddings

    # Ties go to the second option
    if first_loss < second_loss:
        return int(example['label']) == 0
    return int(example['label']) == 1

## Evaluating WinoDict on GPT-2
Evaluating specifically on `WinoDict`, using the first set of generated examples, with the definition added and the word substituted in.

In [5]:
with torch.no_grad():
    # Rows whose 'lemma' field is the literal string "lemma" are skipped
    # (presumably stray header rows in the CSV -- verify against the file).
    outcomes = [
        int(evaluate_winodict(row))
        for _, row in first_set.iterrows()
        if row['lemma'] != "lemma"
    ]
    correct, total = sum(outcomes), len(outcomes)

    print("GPT-2 Medium achieved a score of: " + str(float(correct) / float(total)))

GPT-2 Medium achieved a score of: 0.5766666666666667
