# CS 224N Final Project - Evaluating on CBT Dataset
By: Joseph O'Brien, Christopher Pondoc, Joseph Guman

In [1]:
import torch
# Report whether PyTorch can see a CUDA device
print(f"Using GPU: {torch.cuda.is_available()}")

Using GPU: True


## Load in 1:1 Real to Fake Words
Load the real-to-fake word pairs from the `.csv` file.

In [2]:
import pandas as pd

# Read the real/fake word pairs; the "Real" and "Fake" columns are aligned row by row
rtf_df = pd.read_csv("/home/ubuntu/test/datasets/realtofake.csv")
real_words_list = rtf_df["Real"].tolist()
fake_words_list = rtf_df["Fake"].tolist()

# Map every real word to the fake word found on the same row
real_to_fake_dict = dict(zip(real_words_list, fake_words_list))

## Get Definitions for all of the WinoDict Words
Look up every WordNet definition for each real word and join them into one string per word.

In [3]:
from nltk.corpus import wordnet as wn

# One string per real word: every synset definition, each followed by ". ",
# concatenated in the order WordNet returns them (empty string if no synsets)
final_definitions = [
    "".join(sense.definition() + ". " for sense in wn.synsets(real_word))
    for real_word in real_words_list
]

# Quick sanity check: definitions must stay aligned with both word lists
assert len(final_definitions) == len(real_words_list)
assert len(final_definitions) == len(fake_words_list)

In [4]:
# One copy of the same fixed sentence per real word
same_sentence_list = [
    "This is a very normal sentence that will have normal results to give to someting"
] * len(real_words_list)

assert len(real_words_list) == len(fake_words_list)

## Define Prediction and Actual Model
Load the G2G/R2G prediction model and the baseline GPT-2 model, each with its tokenizer.

In [5]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Checkpoint locations
predict_model_dir = "/home/ubuntu/test/weights/G2GSamplingM"
predict_tokenizer_dir = "/home/ubuntu/test/weights/G2GSamplingT"
baseline_checkpoint = 'gpt2-medium'

# G2G/R2G model (on the GPU) and its tokenizer, used to predict embeddings
predict_model = GPT2LMHeadModel.from_pretrained(predict_model_dir)
predict_model = predict_model.to("cuda")
predict_tokenizer = GPT2Tokenizer.from_pretrained(predict_tokenizer_dir)

# Baseline GPT-2 model (on the GPU) and tokenizer
model = GPT2LMHeadModel.from_pretrained(baseline_checkpoint)
model = model.to("cuda")
tokenizer = GPT2Tokenizer.from_pretrained(baseline_checkpoint)



## Batch All Embeddings
Predict all of the embeddings for each of the definitions.

In [6]:
# Settings for the embedding-prediction pass
DEFINITION_MAX_TOKENS = 511  # padded/truncated input length before " [CLS]" is appended
EMBEDDING_BATCH_SIZE = 4     # rows sent through predict_model per forward pass

num_definitions = len(final_definitions)

# Helpful Debug Message
print("Number of total definitions: " + str(num_definitions))

# Tokenizing all of the inputs at once (the commented line feeds the real definitions instead)
#tokenized_inputs = predict_tokenizer(final_definitions, return_tensors="pt", padding='max_length', truncation=True, max_length=511)
tokenized_inputs = predict_tokenizer(
    same_sentence_list,
    return_tensors="pt",
    padding='max_length',
    truncation=True,
    max_length=DEFINITION_MAX_TOKENS,
)
tokenized_cls = predict_tokenizer([" [CLS]"] * num_definitions, return_tensors="pt")

# Append the " [CLS]" tokens to every row and move input IDs and attention mask to the GPU
tokenized_inputs['input_ids'] = torch.cat(
    (tokenized_inputs['input_ids'], tokenized_cls['input_ids']), dim=1
).to("cuda")
tokenized_inputs['attention_mask'] = torch.cat(
    (tokenized_inputs['attention_mask'], tokenized_cls['attention_mask']), dim=1
).to("cuda")

# Add the new tokens and resize the model embeddings matrix.
# `displacement` is the row index at which the first new token lands.
displacement = len(tokenizer)
num_added = tokenizer.add_tokens(fake_words_list)
# add_tokens skips tokens the tokenizer already knows; if that happens (duplicate
# fake words, or this cell being run twice) row displacement + i no longer
# belongs to fake_words_list[i], so stop instead of writing misaligned rows.
assert num_added == len(fake_words_list), (
    "Expected to add " + str(len(fake_words_list)) + " new tokens but added "
    + str(num_added) + "; reload the baseline model/tokenizer and check for duplicate fake words"
)
model.resize_token_embeddings(len(tokenizer))
params = model.state_dict()

# Predict the new embeddings batch by batch. This is inference only, so autograd
# is disabled to avoid keeping activations for a backward pass that never runs.
with torch.no_grad():
    for start in range(0, num_definitions, EMBEDDING_BATCH_SIZE):
        stop = min(num_definitions, start + EMBEDDING_BATCH_SIZE)
        outputs = predict_model(
            input_ids=tokenized_inputs['input_ids'][start:stop],
            attention_mask=tokenized_inputs['attention_mask'][start:stop],
            output_hidden_states=True,
        )
        # Last-layer hidden state at position DEFINITION_MAX_TOKENS, i.e. the first
        # position after the padded sentence, becomes the embedding row of the new token
        params['transformer.wte.weight'][displacement + start: displacement + stop, :] = (
            outputs.hidden_states[-1][:, DEFINITION_MAX_TOKENS, :].detach().clone()
        )
model.load_state_dict(params)

Number of total definitions: 343


<All keys matched successfully>

In [7]:
# The prediction model and tokenizer are no longer needed: drop both names,
# then ask PyTorch to release its cached GPU memory
del predict_model, predict_tokenizer
torch.cuda.empty_cache()

In [8]:
# Print the embedding row at index 50257 in full
# (presumably the first newly added fake-word row; confirm against `displacement`)
torch.set_printoptions(threshold=10000)
params = model.state_dict()
embeddings = params['transformer.wte.weight'][50257:]
print(embeddings[0])

tensor([-5.1018e-02, -1.0472e-02, -2.6941e-02, -9.4060e-02, -3.8462e-02,
         8.0635e-02, -4.2176e-02, -6.7524e-02, -3.2577e-02,  1.6920e-01,
        -6.7162e-03,  1.1194e-02, -5.2289e-02, -3.4464e-02, -8.0983e-03,
         3.1571e-02,  8.9290e-02, -1.7303e-01, -5.9624e-02,  7.2877e-02,
        -2.6239e-02, -5.4489e-03,  3.9657e-02,  1.9218e-03,  1.2603e-02,
        -1.8843e-02,  4.0793e-02,  4.6155e-02,  1.7052e-02, -4.4290e-02,
        -3.1765e-02,  2.9278e-02,  9.9705e-02, -2.1224e-02,  2.9825e-02,
        -3.3536e-02,  2.1350e-02,  1.2614e-02, -1.1166e-02,  7.4746e-02,
        -4.6766e-02, -6.5998e-03,  2.2396e-02,  1.7701e-02,  1.2839e-02,
        -2.1378e-02,  5.9309e-02, -7.8575e-03,  1.1850e-01,  4.0888e-02,
        -9.7272e-02,  8.8698e-02, -3.2979e-02,  8.4463e-04, -1.5666e-02,
        -4.9736e-02,  7.3885e-02, -1.7835e-02, -8.3907e-02,  4.1915e-02,
        -1.4975e-03,  7.7594e-02,  4.5326e-02, -4.3028e-02,  2.9704e-02,
         3.9824e-02, -5.2617e-02, -1.6799e-02,  1.1

## Load in Children's Book Test Dataset
Also taken from HuggingFace

In [9]:
import pandas
# Load the extracted CBT examples into a DataFrame
cbt_csv_path = "/home/ubuntu/test/cbt/cbt_extract/cbt_csv_files/updated_cbt_info.csv"
cbt_df = pd.read_csv(cbt_csv_path)

## Evaluating on One Example
Writing a function that is reusable and works for one example

In [10]:
def evaluate_cbt(example, verbose=True):
    """Score one CBT example and report whether the model picks the right option.

    The passage and question are concatenated, every WinoDict word listed for
    the example is swapped for its fake counterpart, and each candidate option
    is substituted for the ``XXXXX`` placeholder. The option whose filled-in
    query gets the lowest language-modelling loss is the prediction.

    Args:
        example: Mapping-like row with the string fields ``sentences``,
            ``question``, ``winodict_words``, ``options`` and ``answer``.
        verbose: When True (the default), print the query, the answer and the
            option list.

    Returns:
        True if the lowest-loss option equals the answer, False otherwise, or
        -1 if a filled-in query has more tokens than the model has positions
        (callers skip such examples).
    """
    # extract the sentence: drop quotes and newlines, then the first and last character
    sentence = example["sentences"].replace('"', '').replace("\n", "")
    sentence = sentence[1:-1]

    # make the total query with the sentence and the question
    total_query = sentence + " " + example["question"]

    if verbose:
        print(total_query)

    # Words to replace with WinoDict fake words (all identified words are replaced).
    # Entries are stripped here so the membership tests on the answer and the
    # options below match every listed word, not only the first one.
    raw_words = example["winodict_words"]
    replace_words = [
        word.strip()
        for word in raw_words[1:-1].replace("'", "").split(",")
    ]
    # Ignore empty entries (e.g. from an empty list) and words without a fake counterpart
    replace_words = [word for word in replace_words if word in real_to_fake_dict]

    for word in replace_words:
        padded_real = " " + word + " "
        padded_fake = " " + real_to_fake_dict[word] + " "
        if padded_real in padded_fake:
            # Replacing would re-create the search string and never terminate
            continue
        # str.replace skips a match that shares a space with the previous match
        # ("a a a"), so repeat until none is left
        while padded_real in total_query:
            total_query = total_query.replace(padded_real, padded_fake)

    # get the answer options for the model into a list;
    # split() without arguments never yields empty options
    options = example["options"].replace("'", "").replace("\n", "")
    options = options[1:-1].split()

    # get the answer to the problem
    answer = example["answer"]
    if verbose:
        print(answer)
        print(options)

    # The query now contains the whole fake word, so compare against the whole fake word
    if answer in replace_words:
        answer = real_to_fake_dict[answer]

    # Longest sequence the model can embed positions for
    max_positions = getattr(model.config, "n_positions", 1024)

    # initialize best answer and best loss (will be lowest value)
    best_answer = ""
    best_loss = float("inf")

    # compute the loss for each option (using full scoring); no gradients are needed
    with torch.no_grad():
        for option in options:
            if option in replace_words:
                option = real_to_fake_dict[option]

            updated_query = total_query.replace("XXXXX", option)

            # Tokenize each string; the input IDs double as the labels
            updated_input = tokenizer(updated_query, return_tensors="pt").to("cuda")

            # Too long for the model: signal the caller to skip this example
            if updated_input["input_ids"].shape[1] > max_positions:
                return -1

            current_loss = model(**updated_input, labels=updated_input["input_ids"]).loss.item()

            if current_loss < best_loss:
                best_answer = option
                best_loss = current_loss
    return best_answer == answer



## Evaluating CBT on GPT-2

In [11]:
# Read the evaluation row indices. The file is expected to hold a bracketed,
# comma-separated list such as "[3, 17, 42]".
with open("/home/ubuntu/test/cbt/cbt_eval_info/eval_indices", "r") as f:
    raw_indices = f.read().strip()  # tolerate a trailing newline

# Drop the surrounding brackets, then parse every non-empty entry.
# int() ignores surrounding whitespace, so "1,2" and "1, 2" both work,
# and an empty list ("[]") yields [].
eval_indices = [
    int(entry)
    for entry in raw_indices[1:-1].split(",")
    if entry.strip()
]

In [12]:
# evaluating gpt-2 AFTER REPLACING words and saving indices and stats
MAX_EVAL_EXAMPLES = 200  # stop once this many examples have been scored
RESULTS_PREFIX = "/home/ubuntu/test/cbt/cbt_eval_info/GPT2-CLSTokenOnly/GPT2-CLSTokenOnly"

# Set membership is O(1); scanning the list for every DataFrame row is not
eval_index_set = set(eval_indices)

correct_list = []  # indices of examples answered correctly
total_list = []    # indices of every example that was scored

correct = 0
total = 0
for index, row in cbt_df.iterrows():
    if index not in eval_index_set:
        continue

    update = evaluate_cbt(row)
    if update == -1:
        # example could not be scored; skip it without counting it
        continue
    if update == 1:
        correct_list.append(index)
    total_list.append(index)
    correct += int(update)
    total += 1
    print(total)
    # A leftover unconditional `break` used to end the loop after the first
    # example; only the example cap stops it now.
    if total >= MAX_EVAL_EXAMPLES:
        break
    print("")

correct_list = str(correct_list)
total_list = str(total_list)

with open(RESULTS_PREFIX + "_correct", "w") as file1:
    file1.write(correct_list)

with open(RESULTS_PREFIX + "_total", "w") as file2:
    file2.write(total_list)

# Avoid ZeroDivisionError when no example was scored
accuracy = correct / total if total else 0.0

print("")
print("Number correct: ", correct)
print("Total: ", total)
print("Correct percentage: ", accuracy)

information_list = str([correct, total, accuracy])

with open(RESULTS_PREFIX + "_statistics", "w") as file3:
    file3.write(information_list)


'You must have noticed it , Jaqueline ; you sat up later .' How the dogs howled ! '' `` No ; I mean yes , '' murmured poor Jaqueline , who of course had caused the whole affair by her magic arts , but who had forgotten , in the excitement of the moment , that an eclipse of the moon , especially if entirely unexpected , is likely to attract very general attention . 'Jaqueline could not bear to tell a fib , especially to a king who had been so kind to her ; besides , fibbing would not alter the facts .' `` Yes , I did see it , '' she admitted , blushing . `` Had it not been predicted ? '' `` Not a word about it whispered anywhere , '' said his Majesty . '`` I looked up the almanack at once .' It is the most extraordinary thing I ever saw , and I 've seen a good many . '' `` The astronomers must be duffers , '' said Prince Ricardo . '`` I never thought there was much in physical science of any sort ; most dreary stuff .' Why , they say the earth goes round the sun , whereas any fool can s