# CS 224N - WinoDict Evaluation using RoBERTa Embeddings
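Evaluating on the WinoDict task using RoBERTa fine-tuned to predict GPT-2 embeddings. (Note: in this notebook the embedding predictor, `ro_model`, is actually loaded as a `GPT2LMHeadModel` from `weights/G2GNext1`.)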

## Setting up PyTorch
Using PyTorch on the GPU

In [1]:
import torch
# Report whether PyTorch can see a CUDA device; the cells below move models to "cuda".
print(f"Using GPU: {torch.cuda.is_available()}")

Using GPU: True


## Load in WinoDict Dataset
Load in the first generated set.

In [2]:
import pandas as pd
# Location of the first of the generated WinoDict problem sets.
first_set_path = "winodict/prob1_of_5.csv"
first_set = pd.read_csv(first_set_path)

## Check Definitions of Words in Wordset
Using the old Wordset dictionary, look up each word and check whether a definition for it exists.

In [3]:
import json

def find_definition(word):
    """Look up `word` in the per-letter dictionary files and join its definitions.

    The dictionary is read from ``dictionary/<first character of word>.json``.
    The file is expected to map each word to an entry whose optional
    ``'meanings'`` list holds dicts with a ``'def'`` string.

    Args:
        word: The word to look up.

    Returns:
        All definitions of `word`, each followed by ". ", concatenated in
        file order. An empty string is returned when `word` is empty, is not
        present in the file, or its entry has no ``'meanings'`` key.

    Raises:
        FileNotFoundError: If no dictionary file exists for the first character.
    """
    # An empty word has no first character to select a dictionary file with.
    if not word:
        return ""

    # Load in the data for the first letter. The context manager closes the
    # handle; this function is called once per example, so an unclosed file
    # would leak a descriptor on every call.
    with open('dictionary/' + word[0] + '.json', encoding='utf-8') as f:
        data = json.load(f)

    # Look through each of the definitions
    entry = data.get(word)
    if entry is None or 'meanings' not in entry:
        return ""
    return "".join(meaning['def'] + ". " for meaning in entry['meanings'])

## Grab GPT-2 and RoBERTa
Look at GPT-2 and RoBERTa fine-tuned for downstream task.

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, RobertaModel

# Tokenizer for the embedding predictor: the stock GPT-2 Medium vocabulary.
ro_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")

# Fine-tuned predictor weights, loaded as a GPT-2 LM-head model onto the GPU.
# NOTE(review): presumably this checkpoint was saved with an embedding row for
# the extra '[CLS]' token added below -- confirm against the training notebook.
ro_model = GPT2LMHeadModel.from_pretrained("weights/G2GNext1").to("cuda")

# Register '[CLS]' as an extra token; as the cell's last expression, the number
# of tokens added is displayed as the cell output.
ro_tokenizer.add_tokens(['[CLS]'])

1

## Turning Fake Words into Embeddings in GPT-2!
Starting from a standard GPT-2 model, add a new word embedding specifically for the fake word.

In [5]:
def fake_word_into_embedding(replacement, fake_word):
    """Build a GPT-2 Medium model/tokenizer pair that contains `fake_word`.

    The dictionary definition of `replacement` is fed to the fine-tuned
    predictor (the notebook-level `ro_model` / `ro_tokenizer`). The predictor's
    last-layer hidden state at the trailing [CLS] position is then installed as
    the embedding of a new `fake_word` token in a freshly loaded `gpt2-medium`.

    Args:
        replacement: Real word whose definition is looked up with
            `find_definition`.
        fake_word: Made-up word to add to the evaluation tokenizer.

    Returns:
        A `(model, tokenizer)` tuple. When no definition is found for
        `replacement`, or `fake_word` is already in the vocabulary, the
        pretrained embeddings are returned unmodified.

    Side effects:
        Sets `ro_tokenizer.pad_token` and prints a completion message.
    """
    # Number of definition tokens fed to the predictor. The definition is
    # padded/truncated to exactly this length, so the appended [CLS] token
    # sits at this index of the sequence.
    definition_length = 511

    # GPT-2 Model and Tokenizer (a fresh copy per call, because the vocabulary
    # and embedding matrix are extended below)
    model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to("cuda")
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

    # Get definition of the word
    definition = find_definition(replacement)

    # Adding the next word
    if (definition != ""):
        # Pass into the tokenizer
        ro_tokenizer.pad_token = tokenizer.eos_token
        tokenized_input = ro_tokenizer(definition, return_tensors="pt", padding='max_length', truncation=True, max_length=definition_length).to("cuda")
        tokenized_cls = ro_tokenizer(" [CLS]", return_tensors="pt").to("cuda")
        input_ids = torch.cat((tokenized_input['input_ids'], tokenized_cls['input_ids']), dim=1)

        # Inference only: without no_grad the forward pass keeps the whole
        # autograd graph alive and the predicted embedding would carry grad.
        with torch.no_grad():
            # Pass into the model and extract the predicted embedding
            # NOTE(review): no attention_mask is passed, so padding positions
            # are attended to -- presumably this matches how the predictor was
            # fine-tuned; confirm.
            outputs = ro_model(input_ids=input_ids, output_hidden_states=True)
            predicted_embedding = outputs.hidden_states[-1][:, definition_length, :].squeeze(0)

            # Add the new token and resize the model embedding
            tokens_added = tokenizer.add_tokens([fake_word])
            if tokens_added:
                model.resize_token_embeddings(len(tokenizer))

                # Update with the new embedding. The new token always takes the
                # last row. Writing only when a token was really added avoids
                # overwriting the embedding of an existing token (the last row
                # would otherwise belong to the pretrained vocabulary).
                model.get_input_embeddings().weight[len(tokenizer) - 1, :] = predicted_embedding

    # Done!
    print("Finished with creating the new model and tokenizer")
    return model, tokenizer

## Evaluating WinoDict on One Example
Writing a function that is reusable and works for one example.

In [6]:
def _winodict_option_loss(model, tokenizer, prefix, suffix):
    """Return the LM loss of `suffix` given `prefix`, with prefix tokens masked."""
    prefix_ids = tokenizer(prefix, return_tensors="pt")["input_ids"].to(model.device)
    suffix_ids = tokenizer(suffix, return_tensors="pt")["input_ids"].to(model.device)

    # Build the inputs from the same two pieces as the labels. Tokenizing the
    # full string separately can merge tokens across the prefix/suffix boundary,
    # which leaves the labels misaligned or of a different length.
    input_ids = torch.cat((prefix_ids, suffix_ids), dim=1)
    # -100 marks positions that the loss ignores.
    labels = torch.cat((torch.full_like(prefix_ids, -100), suffix_ids), dim=1)

    # Scoring only; no gradients are needed.
    with torch.no_grad():
        return model(input_ids=input_ids, labels=labels).loss.item()


def evaluate_winodict(example):
    """Score one WinoDict example and report whether the prediction is correct.

    Each option is substituted for the first '_' in `example['sentence']`, and
    `example['definition']` is prepended. The option whose sentence remainder
    (the text after the blank) has the lower language-model loss is the
    prediction: option1 is compared against label 0, option2 against label 1.

    Args:
        example: Mapping (e.g. a DataFrame row) with the keys 'sentence',
            'lemma', 'fake_lemma', 'definition', 'option1', 'option2', 'label'.

    Returns:
        True if the predicted option matches `example['label']`, False if it
        does not, and None if the sentence contains no '_' blank.
    """
    sentence = example['sentence']
    if '_' not in sentence:
        # Nothing to fill in; callers should skip this example.
        return None

    # Get the correct evaluation model
    model, tokenizer = fake_word_into_embedding(example['lemma'], example['fake_lemma'])

    def lowercase_leading_the(choice):
        # Change 'the' to lowercase
        return "the " + choice[4:] if choice[:4] == "The " else choice

    first_choice = lowercase_leading_the(example['option1'])
    second_choice = lowercase_leading_the(example['option2'])

    # Split the sentence around the blank; only the remainder is scored.
    pronoun_loc = sentence.index('_')
    context = example['definition'] + " " + sentence[:pronoun_loc]
    remainder = sentence[pronoun_loc + 1:]

    # Evaluate the model on each example and check
    first_loss = _winodict_option_loss(model, tokenizer, context + first_choice, remainder)
    second_loss = _winodict_option_loss(model, tokenizer, context + second_choice, remainder)

    # Write down the correct value and check (ties go to the second option)
    predicted_label = 0 if first_loss < second_loss else 1
    print("Finished Evaluation")
    return int(example['label']) == predicted_label

## Evaluating Winograd on GPT-2
Evaluate on `WinoDict` using the first generated set of examples, prepending the definition and substituting each option into the sentence.

In [7]:
# Running tally of correctly answered and evaluated examples.
correct, total = 0, 0
for index, row in first_set.iterrows():
    # Rows whose 'lemma' cell is the literal string "lemma" are skipped.
    # NOTE(review): presumably these are header lines repeated inside the CSV -- confirm.
    if (row['lemma'] == "lemma"):
        continue

    result = evaluate_winodict(row)
    if result is None:
        # evaluate_winodict returns None when the sentence has no '_' blank.
        # Adding None to the tally would raise a TypeError, so skip the row.
        continue

    total += 1
    correct += int(result)
    print(correct)
    print(total)
    print("")

# Guard the division: an empty or fully skipped dataset leaves total at 0.
if total > 0:
    print("GPT-2 Medium achieved a score of: " + str(float(correct) / float(total)))
else:
    print("GPT-2 Medium achieved a score of: n/a (no examples could be evaluated)")

Finished with creating the new model and tokenizer


AttributeError: 'CausalLMOutputWithCrossAttentions' object has no attribute 'to'