# Estimate sentence probability with BERT
## Adapted from https://github.com/huggingface/transformers/issues/37 and updated for the newest transformers version

In [1]:
import numpy as np
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Checkpoint name shared by the model and its tokenizer so the two cannot drift apart.
MODEL_NAME = 'bert-large-uncased'

# Load pre-trained model (weights).
# No torch.no_grad() wrapper here: that context manager only affects operations
# executed inside it, so wrapping from_pretrained() would not make later
# forward passes gradient-free.
model = BertForMaskedLM.from_pretrained(MODEL_NAME)
model.eval()  # evaluation mode (e.g. dropout is turned off) for repeatable scores

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=314.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1344997306.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [2]:
def print_top_predictions(probs, index, k=5):
    """Print the ``k`` most probable vocabulary tokens, most probable first.

    Args:
        probs: Torch tensor of probabilities indexed by vocabulary id
            (expected to be 1-D).
        index: Vocabulary id of the token that was masked. Currently unused;
            kept so existing calls keep working.
        k: Number of top tokens to print. Clamped to ``[0, len(probs)]``.
    """
    # .cpu() lets the conversion succeed for tensors that live on an accelerator.
    probs = probs.detach().cpu().numpy()
    k = max(0, min(k, probs.shape[0]))
    if k == 0:
        # argpartition(...)[-0:] would select *every* entry, so handle this explicitly.
        sorted_indexes = np.array([], dtype=np.int64)
    else:
        # Pick the k largest entries first, then sort only those k in descending order.
        top_indexes = np.argpartition(probs, -k)[-k:]
        sorted_indexes = top_indexes[np.argsort(-probs[top_indexes])]
    top_tokens = tokenizer.convert_ids_to_tokens(sorted_indexes)
    print(f"Ordered top predicted tokens: {top_tokens}")
    print(f"Ordered top predicted values: {probs[sorted_indexes]}")

In [3]:
# Special vocabulary tokens used when scoring a sentence.
BOS_TOKEN = "[CLS]"    # placed at the start of the token sequence
EOS_TOKEN = "[SEP]"    # placed at the end of the token sequence
MASK_TOKEN = "[MASK]"  # substituted for the token currently being predicted

def get_sentence_prob(sentence):
    """Score a sentence by masking each of its tokens in turn.

    Every non-special token is replaced by ``MASK_TOKEN`` one at a time, the
    module-level ``model`` predicts the masked position, and the probability
    it assigns to the original token is multiplied into a running product.
    The per-token probabilities and the final product are printed.

    The result is a product of masked-token probabilities, so it is useful for
    comparing similar sentences rather than as a normalized sentence
    probability.

    Args:
        sentence: Raw text; tokenized with the module-level ``tokenizer``.
            May be empty.

    Returns:
        A 0-dim torch tensor (detached from autograd) holding the product of
        the per-token probabilities; ``1.0`` when there is no token to score.
    """
    # Pre-process sentence, adding special tokens
    tokenized_input = tokenizer.tokenize(sentence)
    # The emptiness check avoids an IndexError for sentences with no tokens.
    if not tokenized_input or tokenized_input[0] != BOS_TOKEN:
        tokenized_input.insert(0, BOS_TOKEN)
    if tokenized_input[-1] != EOS_TOKEN:
        tokenized_input.append(EOS_TOKEN)
    ids_input = tokenizer.convert_tokens_to_ids(tokenized_input)
    # Looked up once; each masked variant is then built by swapping a single id.
    mask_id = tokenizer.convert_tokens_to_ids(MASK_TOKEN)
    print(f"Processing sentence: {tokenized_input}")
    #print(f"Sentence ids: {ids_input}")

    # Build inputs on whatever device the model's weights live on.
    device = next(model.parameters()).device

    sent_prob = 1
    # Inference only: without no_grad() every forward pass records an autograd
    # graph, which costs memory and leaks a grad_fn into the returned tensor.
    with torch.no_grad():
        # Mask non-special tokens and calculate their probabilities
        for i in range(1, len(tokenized_input) - 1):  # Ignore first and last tokens
            masked_ids = list(ids_input)
            masked_ids[i] = mask_id
            masked_input = torch.tensor([masked_ids], device=device)
            outputs = model(masked_input)
            predictions = outputs[0]  # prediction scores over the vocabulary
            # Softmax over the vocabulary scores at the masked position
            current_probs = torch.softmax(predictions[0, i], dim=0)
            current_prob = current_probs[ids_input[i]]  # Prediction for masked word
            sent_prob = sent_prob * current_prob

            print(f"Word: {tokenized_input[i]} \t Prob: {current_prob}")
            #print_top_predictions(current_probs, ids_input[i])

    if not torch.is_tensor(sent_prob):
        # Nothing was scored (e.g. empty sentence): return the empty product as
        # a tensor so .item() below and callers see a consistent type.
        sent_prob = torch.tensor(1.0)

    print(f"Sentence probability: {sent_prob.item()}")
    return sent_prob

In [4]:
# A longer example to try:
#get_sentence_prob("I fed my cat some of it and he damn near passed out")

# Score the same sentence frame with completions of varying plausibility.
for place in ("Berlin", "Santiago", "Chile"):
    get_sentence_prob(f"He was born in {place}.")
# The final call stays a bare expression so the notebook displays its return value.
get_sentence_prob("He was born in window.")


Processing sentence: ['[CLS]', 'he', 'was', 'born', 'in', 'berlin', '.', '[SEP]']
Word: he 	 Prob: 0.7967859506607056
Word: was 	 Prob: 0.9999992847442627
Word: born 	 Prob: 0.9977497458457947
Word: in 	 Prob: 0.9979470372200012
Word: berlin 	 Prob: 0.02355594001710415
Word: . 	 Prob: 0.9999347925186157
Sentence probability: 0.018687129020690918
Processing sentence: ['[CLS]', 'he', 'was', 'born', 'in', 'santiago', '.', '[SEP]']
Word: he 	 Prob: 0.7612152695655823
Word: was 	 Prob: 0.9999862909317017
Word: born 	 Prob: 0.9960402250289917
Word: in 	 Prob: 0.997549831867218
Word: santiago 	 Prob: 0.0008775214664638042
Word: . 	 Prob: 0.9998825788497925
Sentence probability: 0.0006636204198002815
Processing sentence: ['[CLS]', 'he', 'was', 'born', 'in', 'chile', '.', '[SEP]']
Word: he 	 Prob: 0.6504054069519043
Word: was 	 Prob: 0.9999675750732422
Word: born 	 Prob: 0.9977337121963501
Word: in 	 Prob: 0.9998539686203003
Word: chile 	 Prob: 0.0006050239317119122
Word: . 	 Prob: 0.9999569654

tensor(6.0044e-11, grad_fn=<MulBackward0>)