# Estimate sentence probability with BERT
## From https://github.com/huggingface/transformers/issues/37, with bugs fixed and updated to newest transformers version

In [2]:
import numpy as np
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Load the pre-trained masked-LM weights and the matching tokenizer.
# NOTE: loading is deliberately NOT wrapped in `torch.no_grad()`. That context
# manager only affects operations executed while it is active, so wrapping
# `from_pretrained` does nothing for forward passes made later; gradient
# tracking has to be disabled around the inference calls themselves.
model = BertForMaskedLM.from_pretrained('bert-large-uncased')
model.eval()  # inference mode (turns off dropout)
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

In [3]:
def print_top_predictions(probs, k=5):
    """Print the `k` highest-scoring vocabulary tokens and their scores.

    Uses the module-level `tokenizer` to map vocabulary indices back to tokens.

    Args:
        probs: 1-D torch tensor indexed by vocabulary id (one score per entry).
        k: Number of top entries to print. Clamped to ``[0, len(probs)]`` so
            an oversized or non-positive `k` neither raises nor dumps the
            whole vocabulary.
    """
    # .cpu() is a no-op for CPU tensors and makes .numpy() valid for tensors
    # living on another device.
    probs = probs.detach().cpu().numpy()
    k = max(0, min(k, len(probs)))
    if k == 0:
        # `[-0:]` would select every index, so handle the empty case explicitly.
        sorted_indexes = np.empty(0, dtype=np.intp)
    else:
        # argpartition finds the k largest in linear time (unordered) ...
        top_indexes = np.argpartition(probs, -k)[-k:]
        # ... then only those k entries are sorted, highest score first.
        sorted_indexes = top_indexes[np.argsort(-probs[top_indexes])]
    top_tokens = tokenizer.convert_ids_to_tokens(sorted_indexes.tolist())
    print(f"Ordered top predicted tokens: {top_tokens}")
    print(f"Ordered top predicted values: {probs[sorted_indexes]}")

In [4]:
# Special tokens used to frame and mask the input.
BOS_TOKEN = '[CLS]'
EOS_TOKEN = '[SEP]'
MASK_TOKEN = '[MASK]'


def get_sentence_prob(sentence, verbose=False):
    """Return the length-normalized log-probability of `sentence`.

    Each non-special token is replaced by `MASK_TOKEN` in turn, the
    module-level `model` is run on the masked sequence, and the log-probability
    it assigns to the original token at that position is accumulated. The sum
    is divided by the number of scored tokens. Progress and the final score
    are printed as a side effect.

    Relies on the module-level `model`, `tokenizer` and
    `print_top_predictions`.

    Args:
        sentence: Raw text to score. `BOS_TOKEN` / `EOS_TOKEN` are added when
            the tokenized text does not already start / end with them.
        verbose: When true, also print every masked token sequence and the
            top predictions for each masked position.

    Returns:
        float: ``sum(log P(token_i | other tokens)) / number_of_scored_tokens``.

    Raises:
        ValueError: If `sentence` contains no tokens to score.
    """
    # Pre-process sentence, adding special tokens
    tokenized_input = tokenizer.tokenize(sentence)
    if not tokenized_input or tokenized_input[0] != BOS_TOKEN:
        tokenized_input.insert(0, BOS_TOKEN)
    if tokenized_input[-1] != EOS_TOKEN:
        tokenized_input.append(EOS_TOKEN)

    # Count exactly the positions scored below (everything between the two
    # special tokens), even when the caller supplied the special tokens.
    sent_len = len(tokenized_input) - 2
    if sent_len <= 0:
        raise ValueError("sentence contains no tokens to score")

    ids_input = tokenizer.convert_tokens_to_ids(tokenized_input)
    mask_id = tokenizer.convert_tokens_to_ids(MASK_TOKEN)  # loop-invariant
    print(f"Processing sentence: {tokenized_input}")

    sum_lp = 0.0
    # Inference only: without no_grad every forward pass would build an
    # autograd graph that is never used.
    with torch.no_grad():
        # Mask non-special tokens and calculate their probabilities
        for i in range(1, len(ids_input) - 1):  # Ignore first and last tokens
            if verbose:
                print(tokenized_input[:i] + [MASK_TOKEN] + tokenized_input[i + 1:])
            masked_ids = list(ids_input)
            masked_ids[i] = mask_id

            outputs = model(torch.tensor([masked_ids]))
            logits = outputs[0]  # first model output: per-position vocabulary scores
            # log_softmax stays finite where log(softmax(x)) would underflow
            # to log(0) = -inf for very unlikely tokens.
            current_log_probs = torch.nn.functional.log_softmax(logits[0, i], dim=0)
            current_log_prob = current_log_probs[ids_input[i]]  # masked word

            sum_lp += current_log_prob.item()

            print(f"Word: {tokenized_input[i]} \t Prob: {current_log_prob.exp().item()}")
            if verbose:
                print_top_predictions(current_log_probs.exp())

    normalized_lp = sum_lp / sent_len
    print(f"\nNormalized sentence prob: log(P(sentence)) / sent_length: {normalized_lp}\n")
    return normalized_lp

In [5]:
# Compare a rare adverb with a common one in the same frame.
# In the recorded run below, "unequivocally" is split into five word pieces
# and each piece scores ~1.0 while its sibling pieces are visible, so the
# rarer sentence ends up with the higher normalized score (-0.78 vs -2.03).
get_sentence_prob("He answered unequivocally.")
get_sentence_prob("He answered quickly.")

Processing sentence: ['[CLS]', 'he', 'answered', 'une', '##qui', '##vo', '##cal', '##ly', '.', '[SEP]']
Word: he 	 Prob: 0.2814375162124634
Word: answered 	 Prob: 0.006721243727952242
Word: une 	 Prob: 0.9973625540733337
Word: ##qui 	 Prob: 0.9999865293502808
Word: ##vo 	 Prob: 0.9999856948852539
Word: ##cal 	 Prob: 0.9999865293502808
Word: ##ly 	 Prob: 0.9979932308197021
Word: . 	 Prob: 0.9998167157173157

Normalized sentence prob: log(P(sentence)) / sent_length: -0.784400124636818

Processing sentence: ['[CLS]', 'he', 'answered', 'quickly', '.', '[SEP]']
Word: he 	 Prob: 0.2151750773191452
Word: answered 	 Prob: 0.026344342157244682
Word: quickly 	 Prob: 0.05330450460314751
Word: . 	 Prob: 0.9981406927108765

Normalized sentence prob: log(P(sentence)) / sent_length: -2.0266001676791348



-2.0266001676791348

In [12]:
# Compare a fixed multi-token idiom ("quid pro quo") with a plain paraphrase.
# In the recorded run below the idiom's tokens each score ~1.0 and the first
# sentence scores -1.46; the cell's final echoed value is -2.37.
get_sentence_prob("The guy with small hands demanded a quid pro quo.")
get_sentence_prob("The guy with small hands demanded an exchange.")

Processing sentence: ['[CLS]', 'the', 'guy', 'with', 'small', 'hands', 'demanded', 'a', 'qui', '##d', 'pro', 'quo', '.', '[SEP]']
Word: the 	 Prob: 0.6742717027664185
Word: guy 	 Prob: 0.006106184795498848
Word: with 	 Prob: 0.9959086179733276
Word: small 	 Prob: 0.001629635225981474
Word: hands 	 Prob: 0.20016466081142426
Word: demanded 	 Prob: 0.03818148002028465
Word: a 	 Prob: 0.5014763474464417
Word: qui 	 Prob: 0.9985383749008179
Word: ##d 	 Prob: 0.9992328882217407
Word: pro 	 Prob: 0.9958876967430115
Word: quo 	 Prob: 0.9983682036399841
Word: . 	 Prob: 0.9850805401802063

Normalized sentence prob: log(P(sentence)) / sent_length: -1.4586090460895018

Processing sentence: ['[CLS]', 'the', 'guy', 'with', 'small', 'hands', 'demanded', 'an', 'exchange', '.', '[SEP]']
Word: the 	 Prob: 0.6960532069206238
Word: guy 	 Prob: 0.002731535118073225
Word: with 	 Prob: 0.9953562617301941
Word: small 	 Prob: 0.001821734826080501
Word: hands 	 Prob: 0.21409577131271362
Word: demanded 	 Prob: 0

-2.3705652872514396

In [None]:
# Score a few short sentences; three share the "This is a ___." frame.
# verbose=False is the function's default and is only spelled out here.
get_sentence_prob("This is a sentence.")
get_sentence_prob("This is a macrame.", verbose=False)
get_sentence_prob("This is a joke.", verbose=False)
get_sentence_prob("Are you kidding.", verbose=False)


In [None]:
# Score one longer sentence on its own (it is also the first probe of the next cell).
get_sentence_prob("Rachel was wearing a lovely satin dress last night.")

In [None]:
# Keep the sentence fixed and vary only the subject (names, kinship terms,
# pronouns) to see how the score changes with the subject alone.
get_sentence_prob("Rachel was wearing a lovely satin dress last night.")
get_sentence_prob("Grandma was wearing a lovely satin dress last night.")
get_sentence_prob("Mother was wearing a lovely satin dress last night.")
get_sentence_prob("She was wearing a lovely satin dress last night.")
get_sentence_prob("He was wearing a lovely satin dress last night.")
get_sentence_prob("I was wearing a lovely satin dress last night.")
get_sentence_prob("Angela was wearing a lovely satin dress last night.")
get_sentence_prob("Roberta was wearing a lovely satin dress last night.")

In [None]:
# Variations on one event: a short active sentence, a longer active one,
# the passive form, and a role-swapped sentence that uses "stake" instead
# of "steak".
get_sentence_prob("The man ate the steak.")
get_sentence_prob("The man who arrived late ate the steak with a glass of wine.")
get_sentence_prob("The steak was eaten by the man.")
get_sentence_prob("The stake ate the man.")

In [None]:
# Vary only the final word of "He was born in ___.": two cities, a country,
# an unrelated noun and a verb form.
#get_sentence_prob("I fed my cat some of it and he damn near passed out")
get_sentence_prob("He was born in Berlin.")
get_sentence_prob("He was born in Santiago.")
get_sentence_prob("He was born in France.")
get_sentence_prob("He was born in window.")
get_sentence_prob("He was born in was.")


In [None]:
# Vary only the object of "I fed my ___": two animals, an inanimate noun
# and a determiner.
get_sentence_prob("I fed my cat some of it and he damn near passed out.")
get_sentence_prob("I fed my dog some of it and he damn near passed out.")
get_sentence_prob("I fed my window some of it and he damn near passed out.")
get_sentence_prob("I fed my the some of it and he damn near passed out.")

In [None]:
# Vary only the final word of "I forgot to take my ___." in two groups; the
# printed labels state the score the author expects for each group.
print("Should have similar/high probs\n")
get_sentence_prob("I forgot to take my medicine.")
get_sentence_prob("I forgot to take my medicines.")
get_sentence_prob("I forgot to take my medication.")
get_sentence_prob("I forgot to take my pills.")
print("Should have low probs\n")
get_sentence_prob("I forgot to take my turn.")
get_sentence_prob("I forgot to take my medical.")
get_sentence_prob("I forgot to take my medically.")
get_sentence_prob("I forgot to take my turned.")

In [None]:
# Rebind `model` and `tokenizer` to the smaller bert-base-uncased checkpoint;
# code that reads these module-level names uses it from here on.
# NOTE: loading is deliberately NOT wrapped in `torch.no_grad()`. That context
# manager only affects operations executed while it is active, so wrapping
# `from_pretrained` does nothing for forward passes made later; gradient
# tracking has to be disabled around the inference calls themselves.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()  # inference mode (turns off dropout)
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Same "He was born in ___." probes as the earlier cell. When the cells are
# run in order, `model` and `tokenizer` now refer to bert-base-uncased.
#get_sentence_prob("I fed my cat some of it and he damn near passed out")
get_sentence_prob("He was born in Berlin.")
get_sentence_prob("He was born in Santiago.")
get_sentence_prob("He was born in France.")
get_sentence_prob("He was born in window.")
get_sentence_prob("He was born in was.")

In [None]:
# Same "I fed my ___" probes as the earlier cell. When the cells are run in
# order, `model` and `tokenizer` now refer to bert-base-uncased.
get_sentence_prob("I fed my cat some of it and he damn near passed out.")
get_sentence_prob("I fed my dog some of it and he damn near passed out.")
get_sentence_prob("I fed my window some of it and he damn near passed out.")
get_sentence_prob("I fed my the some of it and he damn near passed out.")

In [None]:
# Same "I forgot to take my ___." probes as the earlier cell. When the cells
# are run in order, `model` and `tokenizer` now refer to bert-base-uncased.
print("Should have similar/high probs\n")
get_sentence_prob("I forgot to take my medicine.")
get_sentence_prob("I forgot to take my medicines.")
get_sentence_prob("I forgot to take my medication.")
get_sentence_prob("I forgot to take my pills.")
print("Should have low probs\n")
get_sentence_prob("I forgot to take my turn.")
get_sentence_prob("I forgot to take my medical.")
get_sentence_prob("I forgot to take my medically.")
get_sentence_prob("I forgot to take my turned.")