# CS 224N - WinoDict Evaluation using RoBERTa Embeddings
Evaluating on WinoDict task using RoBERTa finetuned to predict GPT-2 embeddings.

## Setting up PyTorch
Using PyTorch on the GPU

In [1]:
import torch
print("Using GPU: " + str(torch.cuda.is_available()))

Using GPU: True


## Load in WinoDict Dataset
Load in the first generated set.

In [2]:
import pandas as pd
first_set = pd.read_csv("winodict/prob1_of_5.csv")

## Check Definitions of Words in Wordset
With old Wordset dataset, check if they end up existing.

In [3]:
import json

def find_definition(word):
    # Load in the data for the first letter
    letter = word[0]
    f = open('dictionary/' + letter + '.json')
    data = json.load(f)
    
    # Look through each of the definitions
    definition = ""
    if (word in data.keys()):
        if ('meanings' in data[word]):
            for index in range(len(data[word]['meanings'])):
                definition += data[word]['meanings'][index]['def'] + ". "
    
    return definition

## Grab GPT-2 and RoBERTa
Look at GPT-2 and RoBERTa fine-tuned for downstream task.

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, RobertaModel

# GPT-2 Model and Tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# RoBERTa Model and Tokenizer
ro_model = RobertaModel.from_pretrained("weights/test2")
ro_tokenizer = AutoTokenizer.from_pretrained("roberta-base")



## Turning Fake Words into Embeddings in GPT-2!
Using our old model, put everything together.

In [5]:
replacements = first_set['lemma'].tolist()
fake_words = first_set['fake_lemma'].tolist()

for i in range(len(replacements)):
    # Get definition of the word
    definition = find_definition(replacements[i])
    
    # Adding the next word
    if (definition != ""):
        # Pass into the tokenizer
        tokenized_input = ro_tokenizer(definition, return_tensors="pt", padding='max_length', truncation=True, max_length=512)

        # Pass into the model and extract the predicted embedding
        outputs = ro_model(input_ids=tokenized_input['input_ids'])
        last_hidden = outputs.last_hidden_state[:,0,:]
        predicted_embedding = last_hidden.squeeze(0)
        
        # Add the new token and resize the model embedding
        tokenizer.add_tokens([fake_words[i]])
        model.resize_token_embeddings(len(tokenizer))
        
        # Get model parameters and embeddings
        params = model.state_dict()
        embeddings = params['transformer.wte.weight']
        
        # Update with the new embedding
        embeddings[-1:,:] = predicted_embedding
        params['transformer.wte.weight'][-1:,:] = predicted_embedding
        model.load_state_dict(params)
        
        # Confirmation message
        print("Finished another!")

Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished another!
Finished a

## Evaluating WinoDict on One Example
Writing a function that is reusable and works for one example.

In [17]:
def evaluate_winodict(example):
    # First, replace the word with each of the options
    if ('_' in example['sentence']):
        first_option, second_option = example['sentence'], example['sentence']
        pronoun_loc = example['sentence'].index('_')
        first_option = example['definition'] + " " + first_option[:pronoun_loc] + example['option1'] + first_option[pronoun_loc + 1:]
        second_option = example['definition'] + " " + second_option[:pronoun_loc] + example['option2'] + second_option[pronoun_loc + 1:]

        # Tokenize each string and produce labels
        first_inputs, second_inputs = tokenizer(first_option, return_tensors="pt"), tokenizer(second_option, return_tensors="pt")
        first_labels, second_labels = torch.clone(first_inputs["input_ids"]), torch.clone(second_inputs["input_ids"])
        
        # Find positioning of tokens of the underscore to split
        start_str, start_ind, end_ind = "", -1, -1
        original_inputs = tokenizer(example['definition'] + " " + example['sentence'], return_tensors="pt")
        for i in range(len(original_inputs["input_ids"][0])):
            value = original_inputs["input_ids"][0][i]
            if (tokenizer.decode(value).strip()) in "_":
                start_str += tokenizer.decode(value).strip()
                if (start_ind == -1):
                    start_ind = i
                if (start_str == "_"):
                    end_ind = i
                    break
            else:
                if (end_ind == -1):
                    start_ind = -1
                    start_str = ""
        
        # Create masked string for first option
        original_labels = torch.clone(original_inputs["input_ids"])
        first_text_tokens = tokenizer(" " + example['option1'], return_tensors="pt")["input_ids"][0]
        middle_tensor = torch.full((1, len(first_text_tokens)), -100)
        final_first_labels = torch.cat((original_labels[:,0:start_ind], middle_tensor, original_labels[:,end_ind + 1:]), dim=1)

        # Create masked string for second option
        second_text_tokens = tokenizer(" " + example['option2'], return_tensors="pt")["input_ids"][0]
        middle_tensor = torch.full((1, len(second_text_tokens)), -100)
        final_second_labels = torch.cat((original_labels[:,0:start_ind], middle_tensor, original_labels[:,end_ind + 1:]), dim=1)
        
        # Evaluate the model on each example and check
        if (first_inputs['input_ids'][0].shape[0] == final_first_labels.shape[1] and second_inputs['input_ids'][0].shape[0] == final_second_labels.shape[1]):
            first_loss = model(**first_inputs, labels=final_first_labels).loss
            second_loss = model(**second_inputs, labels=final_second_labels).loss

            # Write down the correct value and check
            if (first_loss < second_loss):
                return (int(example['label']) == 0)
            else:
                return (int(example['label']) == 1)
        else:
            print(first_inputs['input_ids'][0].shape)
            print(final_first_labels.shape)
            print("")
            return 0

## Evaluating Winograd on GPT-2
Looking specifically at `WinoDict`, with the first generated examples and adding in the definition and substituting in the word.

In [18]:
correct, total = 0, 0
for index, row in first_set.iterrows():
    if (row['lemma'] != "lemma"):
        total += 1
        correct += evaluate_winodict(row)
    
print("GPT-2 Large achieved a score of: " + str(float(correct) / float(total)))

torch.Size([43])
torch.Size([1, 83])

torch.Size([32])
torch.Size([1, 61])

torch.Size([41])
torch.Size([1, 79])

torch.Size([32])
torch.Size([1, 61])

torch.Size([25])
torch.Size([1, 24])

torch.Size([24])
torch.Size([1, 23])

torch.Size([41])
torch.Size([1, 41])

torch.Size([46])
torch.Size([1, 46])

GPT-2 Large achieved a score of: 0.5020080321285141
