# CS 224N Final Project - Evaluating on Winograd Dataset
By: Christopher Pondoc, Joseph Guman, and Joseph O'Brien

In [1]:
import torch
# Report whether PyTorch can see a CUDA device.
print(f"Using GPU: {torch.cuda.is_available()}")

Using GPU: True


In [19]:
import time

## Load in GPT-2 Model
Using HuggingFace Transformers

In [47]:
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel
# Tokenizer and language-model weights are both loaded from the
# "gpt2-medium" checkpoint so that token ids line up with the model's vocabulary.
tokenizer, model = (
    AutoTokenizer.from_pretrained("gpt2-medium"),
    GPT2LMHeadModel.from_pretrained("gpt2-medium"),
)

## Load in Winograd Dataset
Also taken from HuggingFace

In [4]:
from datasets import load_dataset
# Load the "wsc273" configuration of the "winograd_wsc" dataset.
dataset = load_dataset(path="winograd_wsc", name="wsc273")

Downloading and preparing dataset winograd_wsc/wsc273 to /home/ubuntu/.cache/huggingface/datasets/winograd_wsc/wsc273/0.0.0/0651311f3b6dda14889d9a063030a02458395ee50ab9f41cca4cd5a89c0c3dce...


Generating test split:   0%|          | 0/273 [00:00<?, ? examples/s]

Dataset winograd_wsc downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/winograd_wsc/wsc273/0.0.0/0651311f3b6dda14889d9a063030a02458395ee50ab9f41cca4cd5a89c0c3dce. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

## Evaluating on One Example
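A reusable function that scores a single example: it substitutes each candidate for the pronoun, asks GPT-2 how well it predicts the rest of the sentence, and checks whether the better-scoring candidate is the labeled answer.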

In [45]:
def _lowercase_leading_the(option):
    """Return `option` with a leading "The " rewritten as "the "; otherwise unchanged."""
    if option[:4] == "The ":
        return "the " + option[4:]
    return option


def _continuation_loss(context, continuation):
    """Return the model's loss on `continuation` when it follows `context`.

    Relies on the notebook-level `tokenizer` and `model`.

    Args:
        context: Text whose tokens are fed to the model but excluded from the loss.
        continuation: Text appended to `context`; only its tokens are scored.

    Returns:
        The loss reported by the model, as a Python float.
    """
    encoded = tokenizer(context + continuation, return_tensors="pt")

    # How many leading tokens belong to the context (and so must not be scored).
    num_context_tokens = len(tokenizer(context)["input_ids"])

    # Build the labels from the ids the model actually receives instead of
    # tokenizing the continuation on its own: BPE is not guaranteed to split a
    # fragment the same way it splits the full string, and labels of a different
    # length than the input make the loss computation fail.
    labels = encoded["input_ids"].clone()
    labels[:, :num_context_tokens] = -100  # -100 marks positions the loss ignores

    # Pure inference: do not build an autograd graph.
    with torch.no_grad():
        return model(**encoded, labels=labels).loss.item()


def evaluate_winograd(example):
    """Check whether the language model resolves one Winograd example correctly.

    The pronoun in `example['text']` is replaced by each of the two candidate
    options in turn. For each variant the model is scored only on the text that
    follows the substituted candidate, and the candidate with the lower loss is
    taken as the prediction (the second option wins ties).

    Args:
        example: Mapping with keys 'text', 'pronoun', 'pronoun_loc' (character
            index of the pronoun in 'text'), 'options' (the two candidates) and
            'label' (0 or 1, the index of the correct option).

    Returns:
        True if the predicted option index equals `example['label']`, else False.
    """
    pronoun_start = example['pronoun_loc']
    pronoun_end = pronoun_start + len(example['pronoun'])
    prefix = example['text'][:pronoun_start]
    suffix = example['text'][pronoun_end:]

    # Options are lower-cased from "The ..." to "the ..." before substitution.
    # NOTE(review): this is applied even if the pronoun opens a sentence —
    # confirm whether that is intended.
    first_choice = _lowercase_leading_the(example['options'][0])
    second_choice = _lowercase_leading_the(example['options'][1])

    first_loss = _continuation_loss(prefix + first_choice, suffix)
    second_loss = _continuation_loss(prefix + second_choice, suffix)

    # Lower loss means the model finds that continuation more plausible.
    predicted_label = 0 if first_loss < second_loss else 1
    return example['label'] == predicted_label

## Evaluating GPT-2 on Winograd
Looking specifically at `wsc273`, the $273$-example version of the dataset.

In [46]:
# Run the per-example check over the whole test split and report accuracy.
test_examples = dataset['test']

# evaluate_winograd returns a bool per example, so summing counts the correct ones.
correct = sum(evaluate_winograd(example) for example in test_examples)
accuracy = correct / len(test_examples)

# The label must match the checkpoint loaded above ("gpt2-medium"); keep the
# two in sync if the checkpoint is changed.
print("GPT-2 Medium achieved a score of: " + str(accuracy))

GPT-2 Large achieved a score of: 0.7106227106227107
