# CS 224N Final Project - Evaluating on Winograd Dataset
By: Christopher Pondoc, Joseph Guman, and Joseph O'Brien

In [1]:
import torch
print("Using GPU: " + str(torch.cuda.is_available()))

Using GPU: True


## Load in GPT-2 Model
Using HuggingFace Transformers

In [2]:
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel
tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
model = GPT2LMHeadModel.from_pretrained("gpt2-large")



## Load in Winograd Dataset
Also taken from HuggingFace

In [7]:
from datasets import load_dataset
dataset = load_dataset("winograd_wsc", 'wsc285')

Downloading and preparing dataset winograd_wsc/wsc285 to /home/ubuntu/.cache/huggingface/datasets/winograd_wsc/wsc285/0.0.0/0651311f3b6dda14889d9a063030a02458395ee50ab9f41cca4cd5a89c0c3dce...


Downloading data:   0%|          | 0.00/113k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/285 [00:00<?, ? examples/s]

Dataset winograd_wsc downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/winograd_wsc/wsc285/0.0.0/0651311f3b6dda14889d9a063030a02458395ee50ab9f41cca4cd5a89c0c3dce. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

## Evaluating on One Example
Writing a function that is reusable and works for one example

In [109]:
def evaluate_winograd(example):
    # First, replace the word with each of the options
    first_option, second_option = example['text'], example['text']
    first_option = first_option[:example['pronoun_loc']] + example['options'][0] + first_option[example['pronoun_loc'] + len(example['pronoun']):]
    second_option = second_option[:example['pronoun_loc']] + example['options'][1] + second_option[example['pronoun_loc'] + len(example['pronoun']):]
    
    # Tokenize each string and produce labels
    first_inputs, second_inputs = tokenizer(first_option, return_tensors="pt"), tokenizer(second_option, return_tensors="pt")
    first_labels, second_labels = torch.clone(first_inputs["input_ids"]), torch.clone(second_inputs["input_ids"])
    
    # Find positioning of tokens of pronoun to split
    pronoun = example['pronoun']
    start_str, start_ind, end_ind = "", -1, -1
    original_inputs = tokenizer(example['text'], return_tensors="pt")
    for i in range(len(original_inputs["input_ids"][0])):
        value = original_inputs["input_ids"][0][i]
        if (tokenizer.decode(value).strip()) in pronoun:
            start_str += tokenizer.decode(value).strip()
            if (start_ind == -1):
                start_ind = i
            if (start_str == pronoun):
                end_ind = i
        else:
            if (end_ind == -1):
                start_ind = -1
                start_str = ""
    
    # Create masked string for first option
    original_labels = torch.clone(original_inputs["input_ids"])
    first_text_tokens = tokenizer(" " + example['options'][0], return_tensors="pt")["input_ids"][0]
    middle_tensor = torch.full((1, len(first_text_tokens)), -100)
    final_first_labels = torch.cat((original_labels[:,0:start_ind], middle_tensor, original_labels[:,end_ind + 1:]), dim=1)
    
    # Create masked string for second option
    second_text_tokens = tokenizer(" " + example['options'][1], return_tensors="pt")["input_ids"][0]
    middle_tensor = torch.full((1, len(second_text_tokens)), -100)
    final_second_labels = torch.cat((original_labels[:,0:start_ind], middle_tensor, original_labels[:,end_ind + 1:]), dim=1)
    
    # Evaluate the model on each example and check
    first_loss = model(**first_inputs, labels=final_first_labels).loss
    second_loss = model(**second_inputs, labels=final_second_labels).loss
    
    # Write down the correct value and check
    if (first_loss < second_loss):
        return (example['label'] == 0)
    else:
        return (example['label'] == 1)

## Evaluating Winograd on GPT-2
Looking specifically at `wsc285`, or the first $285$ examples.

In [110]:
correct = 0
for example in dataset['test']:
    correct += evaluate_winograd(example)
    
print("GPT-2 Large achieved a score of: " + str((float(correct) / float(len(dataset['test'])))))

GPT-2 Large achieved a score of: 0.6807017543859649
