In [1]:
import pandas as pd

In [2]:
# load the eight sentence sets (short/long x base/simple/past/ps) from data/
(
    sent_short,
    sent_short_sim,
    sent_short_past,
    sent_short_ps,
    sent_long,
    sent_long_sim,
    sent_long_past,
    sent_long_ps,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/sentence_short.csv',
        'data/sentence_short_simple.csv',
        'data/sentence_short_past.csv',
        'data/sentence_short_ps.csv',
        'data/sentence_long.csv',
        'data/sentence_long_simple.csv',
        'data/sentence_long_past.csv',
        'data/sentence_long_ps.csv',
    )
)

In [3]:
# PyTorch plus the Hugging Face BERT tokenizer / masked-LM classes
import torch
from transformers import BertForMaskedLM
from transformers import BertTokenizer

# the notebook only runs inference, so switch gradient tracking off globally
torch.set_grad_enabled(False)

# name of the pretrained checkpoint shared by tokenizer and model
MODEL_NAME = "bert-base-cased"

# load the pretrained tokenizer and masked language model
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForMaskedLM.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# softmax over dimension 0; applied below to the 1-D logits of the masked position
f_softmax = torch.nn.Softmax(dim = 0)


def computeCorrectness(df):
    """Check whether the model prefers the correct token over the wrong one.

    Each sentence is run through the module-level `tokenizer` and `model`, and
    the logits of the two candidate tokens at the masked position are compared.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'sentence' (text containing exactly one mask token),
        'correct' and 'wrong' (the two candidate tokens). Rows are processed
        by position, so any index (not only 0..n-1) is accepted.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with four extra columns:
        'prediction'   -- 'correct', 'wrong' or 'no preference' (equal logits),
        'accuracy'     -- 1 if the correct token has the higher logit, else 0,
        'correct_prob' -- softmax probability of the correct token (float),
        'wrong_prob'   -- softmax probability of the wrong token (float).

    Raises
    ------
    ValueError
        If a sentence does not contain exactly one mask token.
    """
    import warnings

    df_new = df.copy()
    prediction_list = []
    accuracy_list = []
    correct_probs = []
    wrong_probs = []

    # iterate over values rather than df['col'][i]: the latter is a label
    # lookup and breaks on frames whose index is not 0..n-1
    rows = zip(df_new['sentence'], df_new['correct'], df_new['wrong'])
    for sentence, correct, wrong in rows:
        # convert words to indices
        inputs = tokenizer(sentence, return_tensors = "pt")

        # get model outputs
        outputs = model(**inputs)

        # determine position of the masked token (exactly one is required)
        mask_positions = (inputs['input_ids'][0] == tokenizer.mask_token_id).nonzero()
        if mask_positions.numel() != 1:
            raise ValueError(
                "expected exactly one mask token, found "
                f"{mask_positions.numel()} in sentence: {sentence!r}"
            )
        mask_index = int(mask_positions.item())

        # determine correct and wrong indices
        correct_index = tokenizer.convert_tokens_to_ids(correct)
        wrong_index = tokenizer.convert_tokens_to_ids(wrong)

        # a candidate that is not a single vocabulary entry is mapped to the
        # unknown-token id, which makes the comparison below meaningless
        for label, token, token_id in (('correct', correct, correct_index),
                                       ('wrong', wrong, wrong_index)):
            if token_id == tokenizer.unk_token_id:
                warnings.warn(
                    f"{label} token {token!r} is not in the tokenizer vocabulary; "
                    "its score is that of the unknown token",
                    stacklevel = 2,
                )

        # convert logits to probabilities with softmax
        mask_logits = outputs.logits[0, mask_index, :]
        probs = f_softmax(mask_logits)

        # extract logits and probabilities as plain Python floats so that the
        # result columns are numeric and export cleanly (no 'tensor(...)' text)
        correct_logit = mask_logits[correct_index].item()
        wrong_logit = mask_logits[wrong_index].item()
        correct_probs.append(probs[correct_index].item())
        wrong_probs.append(probs[wrong_index].item())

        # record which candidate the model prefers
        if correct_logit > wrong_logit:
            prediction_list.append('correct')
            accuracy_list.append(1)
        elif wrong_logit > correct_logit:
            prediction_list.append('wrong')
            accuracy_list.append(0)
        else:
            # no clear preference (identical logits) counts as not accurate
            prediction_list.append('no preference')
            accuracy_list.append(0)

    df_new['prediction'] = prediction_list
    df_new['accuracy'] = accuracy_list
    df_new['correct_prob'] = correct_probs
    df_new['wrong_prob'] = wrong_probs

    return df_new

In [5]:
# f_softmax (softmax over dimension 0) is defined once, in the cell that
# defines computeCorrectness, so it is not redefined here.


def getTopPredictions(df):
    """Record the model's top predictions for the masked position of each row.

    Each sentence is run through the module-level `tokenizer` and `model`; the
    five most probable tokens at the masked position are compared with the
    row's 'correct' and 'wrong' candidates.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'sentence' (text containing exactly one mask token),
        'correct' and 'wrong'. Rows are processed by position, so any index
        (not only 0..n-1) is accepted.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with seven extra columns:
        'top predict'      -- most probable token,
        'top 5 predict'    -- list of the five most probable tokens,
        'top 5 prob'       -- their probabilities as a list of floats,
        'top is correct'   -- 1 if the top token equals 'correct', else 0,
        'top is wrong'     -- 1 if the top token equals 'wrong', else 0,
        'correct in top 5' -- 1 if 'correct' is among the top five, else 0,
        'wrong in top 5'   -- 1 if 'wrong' is among the top five, else 0.

    Raises
    ------
    ValueError
        If a sentence does not contain exactly one mask token.
    """
    df_new = df.copy()
    top_list = []
    top5_list = []
    top5_prob = []
    top_is_correct = []
    top_is_wrong = []
    correct_in_top5 = []
    wrong_in_top5 = []

    # iterate over values rather than df['col'][i]: the latter is a label
    # lookup and breaks on frames whose index is not 0..n-1
    rows = zip(df_new['sentence'], df_new['correct'], df_new['wrong'])
    for sentence, correct, wrong in rows:
        # convert words to indices
        inputs = tokenizer(sentence, return_tensors = "pt")

        # get model outputs
        outputs = model(**inputs)

        # determine position of the masked token (exactly one is required)
        mask_positions = (inputs['input_ids'][0] == tokenizer.mask_token_id).nonzero()
        if mask_positions.numel() != 1:
            raise ValueError(
                "expected exactly one mask token, found "
                f"{mask_positions.numel()} in sentence: {sentence!r}"
            )
        mask_index = int(mask_positions.item())

        # convert logits to probabilities with softmax
        probs = f_softmax(outputs.logits[0, mask_index, :])

        # select top 5 predictions (fewer if the vocabulary is smaller)
        top_prediction_ids = torch.argsort(probs, descending = True)[0:5]
        top_prediction_tokens = [tokenizer.convert_ids_to_tokens(int(token_id))
                                 for token_id in top_prediction_ids]
        top_list.append(top_prediction_tokens[0])
        top5_list.append(top_prediction_tokens)
        # plain floats instead of 0-dim tensors so the column exports cleanly
        top5_prob.append(probs[top_prediction_ids].tolist())

        # determine whether the top prediction is the correct / wrong token
        top_is_correct.append(int(correct == top_prediction_tokens[0]))
        top_is_wrong.append(int(wrong == top_prediction_tokens[0]))

        # determine whether the correct / wrong token is in the top five
        correct_in_top5.append(int(correct in top_prediction_tokens))
        wrong_in_top5.append(int(wrong in top_prediction_tokens))

    df_new['top predict'] = top_list
    df_new['top 5 predict'] = top5_list
    df_new['top 5 prob'] = top5_prob
    df_new['top is correct'] = top_is_correct
    df_new['top is wrong'] = top_is_wrong
    df_new['correct in top 5'] = correct_in_top5
    df_new['wrong in top 5'] = wrong_in_top5

    return df_new

In [6]:
# run both analyses on every sentence set: first the correct-vs-wrong
# comparison, then the top-5 prediction columns on top of its output
sent_short_result = sent_short.pipe(computeCorrectness).pipe(getTopPredictions)
sent_short_sim_result = sent_short_sim.pipe(computeCorrectness).pipe(getTopPredictions)
sent_short_past_result = sent_short_past.pipe(computeCorrectness).pipe(getTopPredictions)
sent_short_ps_result = sent_short_ps.pipe(computeCorrectness).pipe(getTopPredictions)
sent_long_result = sent_long.pipe(computeCorrectness).pipe(getTopPredictions)
sent_long_sim_result = sent_long_sim.pipe(computeCorrectness).pipe(getTopPredictions)
sent_long_past_result = sent_long_past.pipe(computeCorrectness).pipe(getTopPredictions)
sent_long_ps_result = sent_long_ps.pipe(computeCorrectness).pipe(getTopPredictions)

In [7]:
# export the results (disabled by default; uncomment the lines below to write the CSV files to result/)
# sent_short_result.to_csv('result/sent_short_result.csv', index = False)
# sent_short_sim_result.to_csv('result/sent_short_sim_result.csv', index = False)
# sent_short_past_result.to_csv('result/sent_short_past_result.csv', index = False)
# sent_short_ps_result.to_csv('result/sent_short_ps_result.csv', index = False)
# sent_long_result.to_csv('result/sent_long_result.csv', index = False)
# sent_long_sim_result.to_csv('result/sent_long_sim_result.csv', index = False)
# sent_long_past_result.to_csv('result/sent_long_past_result.csv', index = False)
# sent_long_ps_result.to_csv('result/sent_long_ps_result.csv', index = False)