# Estimate sentence probability with BERT
## Adapted from https://github.com/huggingface/transformers/issues/37, with bug fixes and updates for a newer version of transformers

In [7]:
import numpy as np
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Load the pre-trained masked-LM weights and switch the model to inference
# mode (eval() turns off training-only layers such as dropout).
# NOTE: no torch.no_grad() guard is used here. That context manager only
# affects operations executed inside it, so wrapping the loading code does
# not disable gradient tracking for forward passes that run later.
model = BertForMaskedLM.from_pretrained('bert-large-uncased')
model.eval()
# Load the matching pre-trained tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

In [8]:
def print_top_predictions(probs, k=5):
    """Print the k highest-scoring vocabulary tokens and their scores.

    Args:
        probs: torch tensor of scores, expected to be 1-D with one entry per
            vocabulary id (it is indexed directly with token ids).
        k: number of top entries to show. Values outside ``[0, len(probs)]``
            are clamped, so ``k=0`` prints empty results instead of the whole
            vocabulary and an oversized ``k`` no longer raises.

    Relies on the module-level ``tokenizer`` to map ids back to tokens.
    """
    # .cpu() makes the conversion work for tensors living on an accelerator.
    probs = probs.detach().cpu().numpy()
    k = max(0, min(k, probs.shape[0]))
    if k == 0:
        # Slicing with [-0:] would select everything, so handle it explicitly.
        sorted_indexes = np.empty(0, dtype=np.int64)
    else:
        # argpartition finds the k largest in linear time (unordered) ...
        top_indexes = np.argpartition(probs, -k)[-k:]
        # ... then only those k entries are sorted, highest first.
        sorted_indexes = top_indexes[np.argsort(-probs[top_indexes])]
    top_tokens = tokenizer.convert_ids_to_tokens(sorted_indexes.tolist())
    print(f"Ordered top predicted tokens: {top_tokens}")
    print(f"Ordered top predicted values: {probs[sorted_indexes]}")

In [9]:
BOS_TOKEN = '[CLS]'
EOS_TOKEN = '[SEP]'
MASK_TOKEN = '[MASK]'

def get_sentence_prob(sentence, verbose=False):
    """Return the mean masked-token log-probability of a sentence.

    Every token between the leading ``[CLS]`` and trailing ``[SEP]`` is
    replaced by ``[MASK]`` in turn; the model's log-probability of the
    original token at that position is accumulated, and the sum is divided
    by the number of positions scored.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        sentence: text to score.
        verbose: when True, also print each masked token sequence and the
            top predictions for the masked position.

    Returns:
        float: sum of per-token log-probabilities divided by the number of
        scored tokens (closer to 0 means more probable).

    Raises:
        ValueError: if the sentence yields no tokens to score.
    """
    # Pre-process sentence, adding special tokens if they are not present
    tokenized_input = tokenizer.tokenize(sentence)
    if not tokenized_input:
        raise ValueError("sentence produced no tokens; nothing to score")
    if tokenized_input[0] != BOS_TOKEN:
        tokenized_input.insert(0, BOS_TOKEN)
    if tokenized_input[-1] != EOS_TOKEN:
        tokenized_input.append(EOS_TOKEN)

    # Normalise by the positions actually scored (everything except the first
    # and last token), so pre-supplied special tokens do not skew the mean.
    sent_len = len(tokenized_input) - 2
    if sent_len < 1:
        raise ValueError("sentence contains only special tokens; nothing to score")

    ids_input = tokenizer.convert_tokens_to_ids(tokenized_input)
    mask_id = tokenizer.convert_tokens_to_ids(MASK_TOKEN)
    # Build inputs on whatever device the model's weights live on.
    device = next(model.parameters()).device
    print(f"Processing sentence: {tokenized_input}")

    sum_lp = 0.0
    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        # Mask non-special tokens and calculate their probabilities
        for i in range(1, len(tokenized_input) - 1):  # Ignore first and last tokens
            masked_ids = list(ids_input)
            masked_ids[i] = mask_id
            if verbose:
                print(tokenized_input[:i] + [MASK_TOKEN] + tokenized_input[i + 1:])
            masked_input = torch.tensor([masked_ids], device=device)
            predictions = model(masked_input)[0]  # logits: [batch, position, vocab id]
            # log_softmax avoids the underflow of log(softmax(x)) for tiny probabilities.
            log_probs = torch.log_softmax(predictions[0, i], dim=0)
            target_lp = log_probs[ids_input[i]]  # log-prob of the original token
            sum_lp += target_lp.item()

            print(f"Word: {tokenized_input[i]} \t Prob: {target_lp.exp().item()}")
            if verbose:
                print_top_predictions(log_probs.exp().cpu())

    print(f"\nNormalized sentence prob: log(P(sentence)) / sent_length: {sum_lp / sent_len}\n")
    return sum_lp / sent_len

In [10]:
# Score three five-word sentences built on the same word-class pattern.
get_sentence_prob("Colorless green ideas sleep furiously.")
get_sentence_prob("Genius dumb mirrors eat endlessly.")
get_sentence_prob("Hairless ugly men complain constantly.")

Processing sentence: ['[CLS]', 'color', '##less', 'green', 'ideas', 'sleep', 'furiously', '.', '[SEP]']
Word: color 	 Prob: 0.07855580002069473
Word: ##less 	 Prob: 0.26501238346099854
Word: green 	 Prob: 0.007004397921264172
Word: ideas 	 Prob: 7.900826858531218e-06
Word: sleep 	 Prob: 4.1728594624146353e-07
Word: furiously 	 Prob: 0.00012131692346883938
Word: . 	 Prob: 0.9802482724189758

Normalized sentence prob: log(P(sentence)) / sent_length: -6.329747859122498

Processing sentence: ['[CLS]', 'genius', 'dumb', 'mirrors', 'eat', 'endless', '##ly', '.', '[SEP]']
Word: genius 	 Prob: 1.0253714322061569e-07
Word: dumb 	 Prob: 2.9548760949182906e-07
Word: mirrors 	 Prob: 0.0001510217844042927
Word: eat 	 Prob: 3.409974306123331e-05
Word: endless 	 Prob: 0.2609398365020752
Word: ##ly 	 Prob: 0.304989218711853
Word: . 	 Prob: 0.9387707710266113

Normalized sentence prob: log(P(sentence)) / sent_length: -7.543730827314513

Processing sentence: ['[CLS]', 'hair', '##less', 'ugly', 'men', 'c

-2.635146169524108

In [6]:
# Vary the subject noun (test/party/plan), then scramble the word order,
# then score a longer negated sentence for comparison.
get_sentence_prob("The test was a success.")
get_sentence_prob("The party was a success.")
get_sentence_prob("The plan was a success.")
get_sentence_prob("The was test a success.")
get_sentence_prob("The test was success a.")
get_sentence_prob("The farewell party was definitely not a success.")

Processing sentence: ['[CLS]', 'the', 'test', 'was', 'a', 'success', '.', '[SEP]']
Word: the 	 Prob: 0.9590326547622681
Word: test 	 Prob: 0.006375086028128862
Word: was 	 Prob: 0.9607516527175903
Word: a 	 Prob: 0.9966248273849487
Word: success 	 Prob: 0.4098852872848511
Word: . 	 Prob: 0.9452541470527649

Normalized sentence prob: log(P(sentence)) / sent_length: -1.0147979485336691

Processing sentence: ['[CLS]', 'the', 'party', 'was', 'a', 'success', '.', '[SEP]']
Word: the 	 Prob: 0.9431305527687073
Word: party 	 Prob: 0.005764061585068703
Word: was 	 Prob: 0.9452731013298035
Word: a 	 Prob: 0.9702098369598389
Word: success 	 Prob: 0.40087464451789856
Word: . 	 Prob: 0.9529269337654114

Normalized sentence prob: log(P(sentence)) / sent_length: -1.04391859130313

Processing sentence: ['[CLS]', 'the', 'plan', 'was', 'a', 'success', '.', '[SEP]']
Word: the 	 Prob: 0.8995306491851807
Word: plan 	 Prob: 0.0032632441725581884
Word: was 	 Prob: 0.9528480172157288
Word: a 	 Prob: 0.9953371

-2.1912700758677803

In [4]:
# Compare an adverb that the tokenizer splits into several word pieces
# (see the printed token list) with one that stays a single token.
get_sentence_prob("He answered unequivocally.")
get_sentence_prob("He answered quickly.")

Processing sentence: ['[CLS]', 'he', 'answered', 'une', '##qui', '##vo', '##cal', '##ly', '.', '[SEP]']
Word: he 	 Prob: 0.2814375162124634
Word: answered 	 Prob: 0.006721243727952242
Word: une 	 Prob: 0.9973625540733337
Word: ##qui 	 Prob: 0.9999865293502808
Word: ##vo 	 Prob: 0.9999856948852539
Word: ##cal 	 Prob: 0.9999865293502808
Word: ##ly 	 Prob: 0.9979932308197021
Word: . 	 Prob: 0.9998167157173157

Normalized sentence prob: log(P(sentence)) / sent_length: -0.784400124636818

Processing sentence: ['[CLS]', 'he', 'answered', 'quickly', '.', '[SEP]']
Word: he 	 Prob: 0.2151750773191452
Word: answered 	 Prob: 0.026344342157244682
Word: quickly 	 Prob: 0.05330450460314751
Word: . 	 Prob: 0.9981406927108765

Normalized sentence prob: log(P(sentence)) / sent_length: -2.0266001676791348



-2.0266001676791348

In [5]:
# Same sentence stem with two different endings: the multi-token phrase
# "a quid pro quo" versus the shorter "an exchange".
get_sentence_prob("The guy with small hands demanded a quid pro quo.")
get_sentence_prob("The guy with small hands demanded an exchange.")

Processing sentence: ['[CLS]', 'the', 'guy', 'with', 'small', 'hands', 'demanded', 'a', 'qui', '##d', 'pro', 'quo', '.', '[SEP]']
Word: the 	 Prob: 0.6742717027664185
Word: guy 	 Prob: 0.006106184795498848
Word: with 	 Prob: 0.9959086179733276
Word: small 	 Prob: 0.001629635225981474
Word: hands 	 Prob: 0.20016466081142426
Word: demanded 	 Prob: 0.03818148002028465
Word: a 	 Prob: 0.5014763474464417
Word: qui 	 Prob: 0.9985383749008179
Word: ##d 	 Prob: 0.9992328882217407
Word: pro 	 Prob: 0.9958876967430115
Word: quo 	 Prob: 0.9983682036399841
Word: . 	 Prob: 0.9850805401802063

Normalized sentence prob: log(P(sentence)) / sent_length: -1.4586090460895018

Processing sentence: ['[CLS]', 'the', 'guy', 'with', 'small', 'hands', 'demanded', 'an', 'exchange', '.', '[SEP]']
Word: the 	 Prob: 0.6960532069206238
Word: guy 	 Prob: 0.002731535118073225
Word: with 	 Prob: 0.9953562617301941
Word: small 	 Prob: 0.001821734826080501
Word: hands 	 Prob: 0.21409577131271362
Word: demanded 	 Prob: 0

-2.3705652872514396

In [6]:
# "This is a ..." with different final nouns, plus one unrelated short sentence.
# verbose=False is already the default; it is passed explicitly here.
get_sentence_prob("This is a sentence.")
get_sentence_prob("This is a macrame.", verbose=False)
get_sentence_prob("This is a joke.", verbose=False)
get_sentence_prob("Are you kidding.", verbose=False)


Processing sentence: ['[CLS]', 'this', 'is', 'a', 'sentence', '.', '[SEP]']
Word: this 	 Prob: 0.060409143567085266
Word: is 	 Prob: 0.71123206615448
Word: a 	 Prob: 0.3749244213104248
Word: sentence 	 Prob: 0.00016662826237734407
Word: . 	 Prob: 0.966092050075531

Normalized sentence prob: log(P(sentence)) / sent_length: -2.572528720647097

Processing sentence: ['[CLS]', 'this', 'is', 'a', 'mac', '##ram', '##e', '.', '[SEP]']
Word: this 	 Prob: 0.055456843227148056
Word: is 	 Prob: 0.8841150999069214
Word: a 	 Prob: 0.35453030467033386
Word: mac 	 Prob: 0.04465892165899277
Word: ##ram 	 Prob: 0.19538208842277527
Word: ##e 	 Prob: 0.9994390606880188
Word: . 	 Prob: 0.9644730091094971

Normalized sentence prob: log(P(sentence)) / sent_length: -1.2615019382376755

Processing sentence: ['[CLS]', 'this', 'is', 'a', 'joke', '.', '[SEP]']
Word: this 	 Prob: 0.9431005716323853
Word: is 	 Prob: 0.5840965509414673
Word: a 	 Prob: 0.6737552881240845
Word: joke 	 Prob: 0.018048686906695366
Word: 

-4.337637407676084

In [7]:
# Score a single longer sentence.
get_sentence_prob("Rachel was wearing a lovely satin dress last night.")

Processing sentence: ['[CLS]', 'rachel', 'was', 'wearing', 'a', 'lovely', 'satin', 'dress', 'last', 'night', '.', '[SEP]']
Word: rachel 	 Prob: 0.0008810244617052376
Word: was 	 Prob: 0.9967688322067261
Word: wearing 	 Prob: 0.9564428329467773
Word: a 	 Prob: 0.9443942904472351
Word: lovely 	 Prob: 0.00043303932761773467
Word: satin 	 Prob: 0.002335268072783947
Word: dress 	 Prob: 0.3856201171875
Word: last 	 Prob: 0.013361506164073944
Word: night 	 Prob: 0.9468490481376648
Word: . 	 Prob: 0.9899153709411621

Normalized sentence prob: log(P(sentence)) / sent_length: -2.6276748973177746



-2.6276748973177746

In [None]:
# Vary only the subject (names, kinship terms, pronouns); the rest of the
# sentence is identical in every call.
get_sentence_prob("Rachel was wearing a lovely satin dress last night.")
get_sentence_prob("Grandma was wearing a lovely satin dress last night.")
get_sentence_prob("Mother was wearing a lovely satin dress last night.")
get_sentence_prob("She was wearing a lovely satin dress last night.")
get_sentence_prob("He was wearing a lovely satin dress last night.")
get_sentence_prob("I was wearing a lovely satin dress last night.")
get_sentence_prob("Angela was wearing a lovely satin dress last night.")
get_sentence_prob("Roberta was wearing a lovely satin dress last night.")

In [5]:
# Active, extended, and passive phrasings of the same event, followed by a
# role-swapped sentence (note it is spelled "stake", not "steak").
get_sentence_prob("The man ate the steak.")
get_sentence_prob("The man who arrived late ate the steak with a glass of wine.")
get_sentence_prob("The steak was eaten by the man.")
get_sentence_prob("The stake ate the man.")

Processing sentence: ['[CLS]', 'the', 'man', 'ate', 'the', 'steak', '.', '[SEP]']
Word: the 	 Prob: 0.9430958032608032
Word: man 	 Prob: 0.15097321569919586
Word: ate 	 Prob: 0.11828337609767914
Word: the 	 Prob: 0.10330334305763245
Word: steak 	 Prob: 0.004455209709703922
Word: . 	 Prob: 0.9944341778755188

Normalized sentence prob: log(P(sentence)) / sent_length: -1.9622100537332396

Processing sentence: ['[CLS]', 'the', 'man', 'who', 'arrived', 'late', 'ate', 'the', 'steak', 'with', 'a', 'glass', 'of', 'wine', '.', '[SEP]']
Word: the 	 Prob: 0.9333303570747375
Word: man 	 Prob: 0.06445129215717316
Word: who 	 Prob: 0.9256716966629028
Word: arrived 	 Prob: 0.10185236483812332
Word: late 	 Prob: 0.003638619789853692
Word: ate 	 Prob: 0.15281958878040314
Word: the 	 Prob: 0.005081328563392162
Word: steak 	 Prob: 0.013520020060241222
Word: with 	 Prob: 0.37167930603027344
Word: a 	 Prob: 0.9855746030807495
Word: glass 	 Prob: 0.8360260725021362
Word: of 	 Prob: 0.9999445676803589
Word: 

-3.6060804301135554

In [6]:
# Vary only the final word after "He was born in": three place names,
# then "window" and "was".
#get_sentence_prob("I fed my cat some of it and he damn near passed out")
get_sentence_prob("He was born in Berlin.")
get_sentence_prob("He was born in Santiago.")
get_sentence_prob("He was born in France.")
get_sentence_prob("He was born in window.")
get_sentence_prob("He was born in was.")


Processing sentence: ['[CLS]', 'he', 'was', 'born', 'in', 'berlin', '.', '[SEP]']
Word: he 	 Prob: 0.7967859506607056
Word: was 	 Prob: 0.9999992847442627
Word: born 	 Prob: 0.9977497458457947
Word: in 	 Prob: 0.9979470372200012
Word: berlin 	 Prob: 0.02355594001710415
Word: . 	 Prob: 0.9999347925186157

Normalized sentence prob: log(P(sentence)) / sent_length: -0.6633200494943973

Processing sentence: ['[CLS]', 'he', 'was', 'born', 'in', 'santiago', '.', '[SEP]']
Word: he 	 Prob: 0.7612152695655823
Word: was 	 Prob: 0.9999862909317017
Word: born 	 Prob: 0.9960402250289917
Word: in 	 Prob: 0.997549831867218
Word: santiago 	 Prob: 0.0008775214664638042
Word: . 	 Prob: 0.9998825788497925

Normalized sentence prob: log(P(sentence)) / sent_length: -1.2196333849568266

Processing sentence: ['[CLS]', 'he', 'was', 'born', 'in', 'france', '.', '[SEP]']
Word: he 	 Prob: 0.7930527329444885
Word: was 	 Prob: 0.9999958276748657
Word: born 	 Prob: 0.9916587471961975
Word: in 	 Prob: 0.9998917579650

-3.478431378123787

In [None]:
# Vary only the word after "I fed my" (cat / dog / window / the).
get_sentence_prob("I fed my cat some of it and he damn near passed out.")
get_sentence_prob("I fed my dog some of it and he damn near passed out.")
get_sentence_prob("I fed my window some of it and he damn near passed out.")
get_sentence_prob("I fed my the some of it and he damn near passed out.")

In [None]:
# Vary only the final word after "I forgot to take my"; the printed headers
# state the outcome expected for each group of sentences.
print("Should have similar/high probs\n")
get_sentence_prob("I forgot to take my medicine.")
get_sentence_prob("I forgot to take my medicines.")
get_sentence_prob("I forgot to take my medication.")
get_sentence_prob("I forgot to take my pills.")
print("Should have low probs\n")
get_sentence_prob("I forgot to take my turn.")
get_sentence_prob("I forgot to take my medical.")
get_sentence_prob("I forgot to take my medically.")
get_sentence_prob("I forgot to take my turned.")

In [None]:
# Rebind the module-level model and tokenizer to the smaller
# 'bert-base-uncased' checkpoint and switch the model to inference mode
# (eval() turns off training-only layers such as dropout).
# NOTE: no torch.no_grad() guard is used here. That context manager only
# affects operations executed inside it, so wrapping the loading code does
# not disable gradient tracking for forward passes that run later.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()
# Load the matching pre-trained tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Same "He was born in ..." sentences as the earlier cell.
# NOTE(review): presumably meant to be re-run after the bert-base-uncased
# reload so the two checkpoints can be compared — confirm run order.
#get_sentence_prob("I fed my cat some of it and he damn near passed out")
get_sentence_prob("He was born in Berlin.")
get_sentence_prob("He was born in Santiago.")
get_sentence_prob("He was born in France.")
get_sentence_prob("He was born in window.")
get_sentence_prob("He was born in was.")

In [None]:
# Same "I fed my ..." sentences as the earlier cell.
# NOTE(review): presumably meant to be re-run after the bert-base-uncased
# reload — confirm run order.
get_sentence_prob("I fed my cat some of it and he damn near passed out.")
get_sentence_prob("I fed my dog some of it and he damn near passed out.")
get_sentence_prob("I fed my window some of it and he damn near passed out.")
get_sentence_prob("I fed my the some of it and he damn near passed out.")

In [None]:
# Same "I forgot to take my ..." sentences as the earlier cell; the printed
# headers state the outcome expected for each group.
# NOTE(review): presumably meant to be re-run after the bert-base-uncased
# reload — confirm run order.
print("Should have similar/high probs\n")
get_sentence_prob("I forgot to take my medicine.")
get_sentence_prob("I forgot to take my medicines.")
get_sentence_prob("I forgot to take my medication.")
get_sentence_prob("I forgot to take my pills.")
print("Should have low probs\n")
get_sentence_prob("I forgot to take my turn.")
get_sentence_prob("I forgot to take my medical.")
get_sentence_prob("I forgot to take my medically.")
get_sentence_prob("I forgot to take my turned.")