# CS 224N Final Project - Evaluating on WinoDict Dataset
By: Christopher Pondoc, Joseph Guman, and Joseph O'Brien

In [1]:
import torch
# Report whether PyTorch can see a CUDA device in this environment.
print(f"Using GPU: {torch.cuda.is_available()}")

Using GPU: True


## Load in GPT-2 Model
Using HuggingFace Transformers

In [2]:
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel
# The tokenizer and the language-model weights are loaded from the same
# pretrained checkpoint, so the checkpoint name is spelled out only once.
checkpoint_name = "gpt2-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = GPT2LMHeadModel.from_pretrained(checkpoint_name)



## Load in Winograd Dataset
Also sourced from HuggingFace; loaded here with pandas from a CSV file on disk (`winodict/prob1_of_5.csv`).

In [6]:
import pandas as pd
# Read winodict/prob1_of_5.csv (path relative to the working directory)
# into a DataFrame; each row is later passed to the evaluation function.
winodict_csv = "winodict/prob1_of_5.csv"
first_set = pd.read_csv(winodict_csv)

                                   id     lemma   fake_lemma   pos  tag  \
0                                   0      fear    hydrubous  VERB  VBD   
1                                   1  advocate    hydrubous  VERB  VBD   
2                                   2     large    ntionessy   ADJ   JJ   
3                                   3     small    ntionessy   ADJ   JJ   
4                                   4   receive    broinking  VERB  VBN   
..                                ...       ...          ...   ...  ...   
494  3ZZAYRN1I857UKRI3FD7KHU86YXOTC-2     light  warditedian   ADJ   JJ   
495  3ZZAYRN1I857UKRI3FD7KHU973UOTM-1   useless    unduodity   ADJ   JJ   
496  3ZZAYRN1I857UKRI3FD7KHU973UOTM-2    useful    unduodity   ADJ   JJ   
497  3ZZAYRN1I857UKRI3FD7KHU989DOTI-1    humble        ntury   ADJ   JJ   
498  3ZZAYRN1I857UKRI3FD7KHU989DOTI-2  arrogant        ntury   ADJ   JJ   

    pronoun                                         definition  \
0      they  The verb to hydrubou

## Evaluating on One Example
A reusable function that scores a single example.

In [72]:
def evaluate_winodict(example):
    """Score a single WinoDict example with the module-level ``model``.

    The first ``_`` in ``example['sentence']`` is replaced by each of the two
    candidate options, the definition is prepended, and the language-model
    loss of both resulting strings is compared. The tokens of the substituted
    option itself are masked out of the loss (label ``-100``), so only the
    surrounding text contributes.

    Args:
        example: Mapping (for instance a DataFrame row) providing the keys
            ``sentence``, ``definition``, ``option1``, ``option2`` and
            ``label``. ``label`` must be convertible with ``int()``.

    Returns:
        ``True`` when the lower-loss candidate agrees with the label
        (``0`` is treated as option1, ``1`` as option2), ``False`` when it
        does not, and ``None`` when the example cannot be scored: the
        sentence has no ``_``, the blank is not a standalone token, or the
        masked labels do not line up with the tokenized candidates.
    """
    sentence = example['sentence']
    if '_' not in sentence:
        # Nothing to substitute; callers must treat None as "not scored".
        return None

    # Build "<definition> <sentence with the blank replaced by the option>".
    blank_pos = sentence.index('_')
    prefix = example['definition'] + " " + sentence[:blank_pos]
    suffix = sentence[blank_pos + 1:]
    first_option = prefix + example['option1'] + suffix
    second_option = prefix + example['option2'] + suffix

    first_inputs = tokenizer(first_option, return_tensors="pt")
    second_inputs = tokenizer(second_option, return_tensors="pt")

    # Locate the token that holds the blank in the un-substituted text.
    # The comparison is an exact match on "_": a substring test such as
    # `decoded in "_"` is also true for the empty string, which would pull a
    # preceding whitespace-only token into the masked span.
    # NOTE(review): the scan starts at the definition, so an underscore token
    # inside the definition would be matched first -- confirm the data never
    # contains one.
    original_ids = tokenizer(
        example['definition'] + " " + sentence, return_tensors="pt"
    )["input_ids"]
    blank_ind = -1
    for i, token_id in enumerate(original_ids[0]):
        if tokenizer.decode(token_id).strip() == "_":
            blank_ind = i
            break
    if blank_ind == -1:
        # The blank was merged with neighbouring characters during
        # tokenization, so there is no span that can be masked reliably.
        return None

    def _masked_labels(option):
        # Labels equal the original token ids, except that the blank token is
        # replaced by one -100 entry per token of " " + option.
        option_len = len(tokenizer(" " + option, return_tensors="pt")["input_ids"][0])
        ignore_span = torch.full((1, option_len), -100)
        return torch.cat(
            (original_ids[:, :blank_ind], ignore_span, original_ids[:, blank_ind + 1:]),
            dim=1,
        )

    first_labels = _masked_labels(example['option1'])
    second_labels = _masked_labels(example['option2'])

    # The option may tokenize differently in context than in isolation; in
    # that case the labels and inputs no longer align and the loss would be
    # meaningless (or the forward pass would fail on a shape mismatch).
    if (first_labels.shape != first_inputs["input_ids"].shape
            or second_labels.shape != second_inputs["input_ids"].shape):
        return None

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        first_loss = model(**first_inputs, labels=first_labels).loss
        second_loss = model(**second_inputs, labels=second_labels).loss

    # A strictly lower loss for the first candidate predicts label 0;
    # ties and everything else predict label 1.
    predicted_label = 0 if first_loss < second_loss else 1
    return int(example['label']) == predicted_label

## Evaluating Winograd on GPT-2
Evaluates GPT-2 on the first set of generated `WinoDict` examples: the definition is prepended to each sentence, and each candidate option is substituted for the blank.

In [73]:
# Accuracy over the loaded split: `total` counts rows that were scored,
# `correct` counts rows where the model's preferred option matched the label,
# and `skipped` counts rows evaluate_winodict could not score.
correct, total, skipped = 0, 0, 0
for _, row in first_set.iterrows():
    # Rows whose 'lemma' field is the literal string "lemma" are ignored
    # (presumably a header line embedded in the CSV data -- confirm).
    if row['lemma'] == "lemma":
        continue

    outcome = evaluate_winodict(row)
    if outcome is None:
        # evaluate_winodict returns None when it cannot score a row (e.g. the
        # sentence has no '_'); adding None to `correct` would raise TypeError.
        skipped += 1
        continue

    total += 1
    correct += int(outcome)

if skipped:
    print("Skipped " + str(skipped) + " example(s) that could not be scored")

if total == 0:
    # Avoid a ZeroDivisionError when nothing was scored.
    print("No examples were scored")
else:
    print("GPT-2 Large achieved a score of: " + str(float(correct) / float(total)))

GPT-2 Large achieved a score of: 0.5100401606425703
