# CS 224N Final Project - Evaluating on WinoDict Dataset
By: Christopher Pondoc, Joseph Guman, and Joseph O'Brien

In [5]:
import torch
print("Using GPU: " + str(torch.cuda.is_available()))

Using GPU: True


## Load in GPT-2 Model
Using HuggingFace Transformers

In [6]:
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel
tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

## Load in Winograd Dataset
Also taken from HuggingFace

In [7]:
import pandas as pd
first_set = pd.read_csv("winodict/prob1_of_5.csv")

## Evaluating on One Example
Writing a function that is reusable and works for one example

In [8]:
def evaluate_winodict(example):
    # First, replace the word with each of the options
    if ('_' in example['sentence']):
        # Change 'the' to lowercase
        first_choice, second_choice = example['option1'], example['option2']
        if (first_choice[:4] == "The "):
            first_choice = "the " + first_choice[4:]
        if (second_choice[:4] == "The "):
            second_choice = "the " + second_choice[4:]

        # Replace the text
        first_text, second_text = example['sentence'], example['sentence']
        pronoun_loc = example['sentence'].index('_')
        first_option = example['definition'] + " " + first_text[:pronoun_loc] + first_choice + first_text[pronoun_loc + 1:]
        second_option = example['definition'] + " " + second_text[:pronoun_loc] + second_choice + second_text[pronoun_loc + 1:]

        # Tokenize each string and produce labels
        first_inputs, second_inputs = tokenizer(first_option, return_tensors="pt"), tokenizer(second_option, return_tensors="pt")

        # Create the first token labels
        first_masked_tokens = tokenizer(example['definition'] + " " + first_text[:pronoun_loc] + first_choice, return_tensors="pt")
        first_labels = first_masked_tokens["input_ids"][0]
        first_mask = torch.full((1, first_labels.shape[0]), -100)
        first_fill = tokenizer(first_text[pronoun_loc + 1:], return_tensors="pt")["input_ids"]
        final_first_labels = torch.cat((first_mask, first_fill), dim=1)

        # Create the second token labels
        second_masked_tokens = tokenizer(example['definition'] + " " + second_text[:pronoun_loc] + second_choice, return_tensors="pt")
        second_labels = second_masked_tokens["input_ids"][0]
        second_mask = torch.full((1, second_labels.shape[0]), -100)
        second_fill = tokenizer(second_text[pronoun_loc + 1:], return_tensors="pt")["input_ids"]
        final_second_labels = torch.cat((second_mask, second_fill), dim=1)

        # Evaluate the model on each example and check
        first_loss = model(**first_inputs, labels=final_first_labels).loss
        second_loss = model(**second_inputs, labels=final_second_labels).loss
        
        # Write down the correct value and check
        if (first_loss < second_loss):
            return (int(example['label']) == 0)
        else:
            return (int(example['label']) == 1)

        '''first_option, second_option = example['sentence'], example['sentence']
        pronoun_loc = example['sentence'].index('_')
        first_option = example['definition'] + " " + first_option[:pronoun_loc] + example['option1'] + first_option[pronoun_loc + 1:]
        second_option = example['definition'] + " " + second_option[:pronoun_loc] + example['option2'] + second_option[pronoun_loc + 1:]

        # Tokenize each string and produce labels
        first_inputs, second_inputs = tokenizer(first_option, return_tensors="pt"), tokenizer(second_option, return_tensors="pt")
        first_labels, second_labels = torch.clone(first_inputs["input_ids"]), torch.clone(second_inputs["input_ids"])
        
        # Find positioning of tokens of the underscore to split
        start_str, start_ind, end_ind = "", -1, -1
        original_inputs = tokenizer(example['definition'] + " " + example['sentence'], return_tensors="pt")
        for i in range(len(original_inputs["input_ids"][0])):
            value = original_inputs["input_ids"][0][i]
            if (tokenizer.decode(value).strip()) in "_":
                start_str += tokenizer.decode(value).strip()
                if (start_ind == -1):
                    start_ind = i
                if (start_str == "_"):
                    end_ind = i
                    break
            else:
                if (end_ind == -1):
                    start_ind = -1
                    start_str = ""
        
        # Create masked string for first option
        original_labels = torch.clone(original_inputs["input_ids"])
        first_text_tokens = tokenizer(" " + example['option1'], return_tensors="pt")["input_ids"][0]
        middle_tensor = torch.full((1, len(first_text_tokens)), -100)
        final_first_labels = torch.cat((original_labels[:,0:start_ind], middle_tensor, original_labels[:,end_ind + 1:]), dim=1)

        # Create masked string for second option
        second_text_tokens = tokenizer(" " + example['option2'], return_tensors="pt")["input_ids"][0]
        middle_tensor = torch.full((1, len(second_text_tokens)), -100)
        final_second_labels = torch.cat((original_labels[:,0:start_ind], middle_tensor, original_labels[:,end_ind + 1:]), dim=1)
        
        # Evaluate the model on each example and check
        first_loss = model(**first_inputs, labels=final_first_labels).loss
        second_loss = model(**second_inputs, labels=final_second_labels).loss
        
        # Write down the correct value and check
        if (first_loss < second_loss):
            return (int(example['label']) == 0)
        else:
            return (int(example['label']) == 1)'''

## Evaluating WinoDict on GPT-2
Looking specifically at `WinoDict`, with the first generated examples and adding in the definition and substituting in the word.

In [9]:
correct, total = 0, 0
for index, row in first_set.iterrows():
    if (row['lemma'] != "lemma"):
        total += 1
        correct += evaluate_winodict(row)
        print(correct)
    
print("GPT-2 Large achieved a score of: " + str(float(correct) / float(total)))

GPT-2 Large achieved a score of: 0.0
