# Estimate sentence probability with BERT
## Growing sentence word by word:
Normalized log P(S) = log(P(w_0) * P(w_1 | w_0) * P(w_2 | w_0, w_1) * ... * P(w_N | w_0, ..., w_{N-1})) / len(S)

NOTE: After trying the algorithm below, I noticed that it doesn't work: it builds the sentence word by word, which gives bad probabilities.

In [8]:
import numpy as np
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Load pre-trained model (weights)
# NOTE: torch.no_grad() only affects operations executed inside this block
# (the loading below); forward passes made elsewhere are not covered by it.
with torch.no_grad():
    model = BertForMaskedLM.from_pretrained('bert-large-uncased')
    model.eval()  # evaluation mode (e.g. dropout disabled)
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

In [9]:
def print_top_predictions(probs, k=5):
    """Print the ``k`` highest-scoring tokens and their scores, best first.

    Args:
        probs: 1-D torch tensor of scores indexed by vocabulary id
            (assumed to be on the CPU, since it is converted to NumPy).
        k: number of top entries to display.
    """
    scores = probs.detach().numpy()
    # argpartition leaves the k largest entries (in no particular order)
    # in the last k slots, avoiding a full sort of the vocabulary.
    candidates = np.argpartition(scores, -k)[-k:]
    # Sort only those k candidates from highest to lowest score.
    descending = np.argsort(-scores[candidates])
    ranked_ids = candidates[descending]
    ranked_tokens = tokenizer.convert_ids_to_tokens(ranked_ids)
    print(f"Ordered top predicted tokens: {ranked_tokens}")
    print(f"Ordered top predicted values: {scores[ranked_ids]}")

In [32]:
def get_forward_prob(tokenized_input, verbose=False):
    """Score a tokenized sentence left to right with the masked language model.

    For every position ``i`` between the first and last token, the model is
    fed ``tokenized_input[:i] + [MASK_TOKEN, EOS_TOKEN]`` and the probability
    it assigns to the true token at the masked position is read off. The
    natural logs of those probabilities are summed and divided by the total
    number of tokens.

    Args:
        tokenized_input: list of token strings. The first and last entries
            are skipped (the caller is expected to put the special
            begin/end tokens there).
        verbose: when true, also print each masked input and the top
            predictions for the masked position.

    Returns:
        The summed log-probability divided by ``len(tokenized_input)``.
        Note that this length includes the two skipped tokens.

    Raises:
        ZeroDivisionError: if ``tokenized_input`` is empty.
    """
    sent_len = len(tokenized_input)
    # Convert once; every masked prefix below is built from these ids.
    ids_input = tokenizer.convert_tokens_to_ids(tokenized_input)
    mask_id, eos_id = tokenizer.convert_tokens_to_ids([MASK_TOKEN, EOS_TOKEN])
    print(f"Processing sentence: {tokenized_input}")

    sum_lp = 0
    # Inference only: without no_grad every forward pass would build an
    # autograd graph, wasting memory and time.
    with torch.no_grad():
        for i in range(1, sent_len - 1):  # Ignore first and last tokens
            if verbose:
                print(tokenized_input[:i] + [MASK_TOKEN, EOS_TOKEN])
            # Left context, then the mask at position i, then end-of-sequence.
            masked_input = torch.tensor([ids_input[:i] + [mask_id, eos_id]])
            # First model output is indexed as [batch, position] and yields
            # one score per vocabulary id (presumably the MLM logits).
            predictions = model(masked_input)[0]
            current_probs = torch.softmax(predictions[0, i], dim=0)
            current_prob = current_probs[ids_input[i]]  # Prob. of the true token

            sum_lp += np.log(current_prob.numpy())

            print(f"Word: {tokenized_input[i]} \t Prob: {current_prob}")
            if verbose:
                print_top_predictions(current_probs)

    normalized = sum_lp / sent_len
    print(f"\nNormalized forward sentence prob: log(P(sentence)) / sent_length: {normalized}\n")
    return normalized

In [31]:
# Quick check that slicing a prefix and appending [EOS_TOKEN] produces a new
# list that ends with the end-of-sequence token.
a = [0,1,2,3,4]
b = a[:3] + [EOS_TOKEN]
print(b)

[0, 1, 2, '[SEP]']


In [33]:
# Special token strings used when building model inputs.
BOS_TOKEN = '[CLS]'    # placed at the start of every tokenized sentence
EOS_TOKEN = '[SEP]'    # placed at the end of every tokenized sentence
MASK_TOKEN = '[MASK]'  # stands in for the token whose probability is queried

def get_sentence_prob(sentence, verbose=False):
    """Tokenize ``sentence``, add the special tokens, and score it.

    Args:
        sentence: raw sentence string.
        verbose: forwarded to ``get_forward_prob``.

    Returns:
        The normalized forward log-probability computed by
        ``get_forward_prob``.
    """
    # Pre-process sentence, adding special tokens
    tokenized_input = tokenizer.tokenize(sentence)
    # The emptiness check keeps an empty sentence from raising IndexError.
    if not tokenized_input or tokenized_input[0] != BOS_TOKEN:
        tokenized_input.insert(0, BOS_TOKEN)
    if tokenized_input[-1] != EOS_TOKEN:
        tokenized_input.append(EOS_TOKEN)

    # NOTE: only the forward score is computed; a backward counterpart
    # (get_backward_prob) was sketched here but never wired in.
    return get_forward_prob(tokenized_input, verbose=verbose)
    


In [34]:
# Same frame with two different adverbs; the second call also prints each
# masked input and the top predictions at every step (verbose=True).
get_sentence_prob("He answered unequivocally.")
get_sentence_prob("He answered quickly.", verbose=True)

Processing sentence: ['[CLS]', 'he', 'answered', 'une', '##qui', '##vo', '##cal', '##ly', '.', '[SEP]']
Word: he 	 Prob: 2.8220553005553484e-08
Word: answered 	 Prob: 5.180499917933901e-10
Word: une 	 Prob: 1.6526494799398717e-12
Word: ##qui 	 Prob: 1.4257877012369136e-07
Word: ##vo 	 Prob: 0.00012647721450775862
Word: ##cal 	 Prob: 0.9067971110343933
Word: ##ly 	 Prob: 5.350564720174589e-07
Word: . 	 Prob: 0.9998167157173157

Normalized forward sentence prob: log(P(sentence)) / sent_length: -10.517053819636931

Processing sentence: ['[CLS]', 'he', 'answered', 'quickly', '.', '[SEP]']
['[CLS]', '[MASK]', '[SEP]']
Word: he 	 Prob: 2.8220553005553484e-08
Ordered top predicted tokens: ['.', '|', ';', '!', '?']
Ordered top predicted values: [8.5319912e-01 1.4615905e-01 3.6399809e-04 1.6816972e-04 6.2668012e-05]
['[CLS]', 'he', '[MASK]', '[SEP]']
Word: answered 	 Prob: 5.180499917933901e-10
Ordered top predicted tokens: ['.', '|', ';', '!', '?']
Ordered top predicted values: [0.68385    0.1

In [35]:
# Same sentence ending in "a quid pro quo" versus "an exchange".
get_sentence_prob("The guy with small hands demanded a quid pro quo.")
get_sentence_prob("The guy with small hands demanded an exchange.")

Processing sentence: ['[CLS]', 'the', 'guy', 'with', 'small', 'hands', 'demanded', 'a', 'qui', '##d', 'pro', 'quo', '.', '[SEP]']
Word: the 	 Prob: 3.751507620108896e-07
Word: guy 	 Prob: 3.2015158524245635e-08
Word: with 	 Prob: 2.52594538167894e-12
Word: small 	 Prob: 6.447998401881705e-08
Word: hands 	 Prob: 5.316450613079837e-10
Word: demanded 	 Prob: 1.0696359354887519e-17
Word: a 	 Prob: 2.2182765846423536e-08
Word: qui 	 Prob: 2.487666073136552e-09
Word: ##d 	 Prob: 2.7058053092332557e-05
Word: pro 	 Prob: 6.704656968503642e-11
Word: quo 	 Prob: 2.463259041185495e-10
Word: . 	 Prob: 0.9850805401802063

Normalized forward sentence prob: log(P(sentence)) / sent_length: -16.376029768226935

Processing sentence: ['[CLS]', 'the', 'guy', 'with', 'small', 'hands', 'demanded', 'an', 'exchange', '.', '[SEP]']
Word: the 	 Prob: 3.751507620108896e-07
Word: guy 	 Prob: 3.2015158524245635e-08
Word: with 	 Prob: 2.52594538167894e-12
Word: small 	 Prob: 6.447998401881705e-08
Word: hands 	 Prob

In [39]:
# Short sentences that differ only in the final noun, plus a short question.
get_sentence_prob("This is a sentence.")
get_sentence_prob("This is a macrame.", verbose=False)
get_sentence_prob("This is a joke.", verbose=False)
get_sentence_prob("Are you kidding?", verbose=False)


Processing sentence: ['[CLS]', 'this', 'is', 'a', 'sentence', '.', '[SEP]']
Word: this 	 Prob: 6.5930851889106634e-09
Word: is 	 Prob: 3.7655132700820104e-07
Word: a 	 Prob: 1.6818952417452238e-06
Word: sentence 	 Prob: 2.2497308691526996e-06
Word: . 	 Prob: 0.966092050075531

Normalized forward sentence prob: log(P(sentence)) / sent_length: -8.566319989838771

Processing sentence: ['[CLS]', 'this', 'is', 'a', 'mac', '##ram', '##e', '.', '[SEP]']
Word: this 	 Prob: 6.5930851889106634e-09
Word: is 	 Prob: 3.7655132700820104e-07
Word: a 	 Prob: 1.6818952417452238e-06
Word: mac 	 Prob: 9.552974944426751e-08
Word: ##ram 	 Prob: 7.288339376465558e-11
Word: ##e 	 Prob: 4.372761264193059e-09
Word: . 	 Prob: 0.9644730091094971

Normalized forward sentence prob: log(P(sentence)) / sent_length: -11.746119774050182

Processing sentence: ['[CLS]', 'this', 'is', 'a', 'joke', '.', '[SEP]']
Word: this 	 Prob: 6.5930851889106634e-09
Word: is 	 Prob: 3.7655132700820104e-07
Word: a 	 Prob: 1.68189524174

In [40]:
# Score a single longer sentence.
get_sentence_prob("Rachel was wearing a lovely satin dress last night.")

Processing sentence: ['[CLS]', 'rachel', 'was', 'wearing', 'a', 'lovely', 'satin', 'dress', 'last', 'night', '.', '[SEP]']
Word: rachel 	 Prob: 4.6616849269653926e-11
Word: was 	 Prob: 7.485188142052068e-13
Word: wearing 	 Prob: 1.0644669173887067e-10
Word: a 	 Prob: 1.1416012313247847e-07
Word: lovely 	 Prob: 1.0477195644398307e-07
Word: satin 	 Prob: 2.1791837667706204e-08
Word: dress 	 Prob: 3.190381847595347e-10
Word: last 	 Prob: 1.5465513797992964e-13
Word: night 	 Prob: 8.449734661963859e-11
Word: . 	 Prob: 0.9899153709411621

Normalized forward sentence prob: log(P(sentence)) / sent_length: -16.578309947780024



In [41]:
# Vary only the subject of the sentence and compare the resulting scores.
get_sentence_prob("Rachel was wearing a lovely satin dress last night.")
get_sentence_prob("Grandma was wearing a lovely satin dress last night.")
get_sentence_prob("Mother was wearing a lovely satin dress last night.")
get_sentence_prob("She was wearing a lovely satin dress last night.")
get_sentence_prob("He was wearing a lovely satin dress last night.")
get_sentence_prob("I was wearing a lovely satin dress last night.")
get_sentence_prob("Angela was wearing a lovely satin dress last night.")
get_sentence_prob("Roberta was wearing a lovely satin dress last night.")

Processing sentence: ['[CLS]', 'rachel', 'was', 'wearing', 'a', 'lovely', 'satin', 'dress', 'last', 'night', '.', '[SEP]']
Word: rachel 	 Prob: 4.6616849269653926e-11
Word: was 	 Prob: 7.485188142052068e-13
Word: wearing 	 Prob: 1.0644669173887067e-10
Word: a 	 Prob: 1.1416012313247847e-07
Word: lovely 	 Prob: 1.0477195644398307e-07
Word: satin 	 Prob: 2.1791837667706204e-08
Word: dress 	 Prob: 3.190381847595347e-10
Word: last 	 Prob: 1.5465513797992964e-13
Word: night 	 Prob: 8.449734661963859e-11
Word: . 	 Prob: 0.9899153709411621

Normalized forward sentence prob: log(P(sentence)) / sent_length: -16.578309947780024

Processing sentence: ['[CLS]', 'grandma', 'was', 'wearing', 'a', 'lovely', 'satin', 'dress', 'last', 'night', '.', '[SEP]']
Word: grandma 	 Prob: 4.464792251190897e-12
Word: was 	 Prob: 6.6749254032578455e-15
Word: wearing 	 Prob: 2.2237471863917335e-09
Word: a 	 Prob: 9.947718808689388e-07
Word: lovely 	 Prob: 2.2267769850259356e-07
Word: satin 	 Prob: 1.434624721241562

In [42]:
# Active, longer, passive, and word-swapped ("stake ate the man") variants.
get_sentence_prob("The man ate the steak.")
get_sentence_prob("The man who arrived late ate the steak with a glass of wine.")
get_sentence_prob("The steak was eaten by the man.")
get_sentence_prob("The stake ate the man.")

Processing sentence: ['[CLS]', 'the', 'man', 'ate', 'the', 'steak', '.', '[SEP]']
Word: the 	 Prob: 3.751507620108896e-07
Word: man 	 Prob: 7.494988949474646e-07
Word: ate 	 Prob: 7.185866485182046e-13
Word: the 	 Prob: 9.38571442787861e-10
Word: steak 	 Prob: 4.187582476333773e-07
Word: . 	 Prob: 0.9944341778755188

Normalized forward sentence prob: log(P(sentence)) / sent_length: -11.542438088566996

Processing sentence: ['[CLS]', 'the', 'man', 'who', 'arrived', 'late', 'ate', 'the', 'steak', 'with', 'a', 'glass', 'of', 'wine', '.', '[SEP]']
Word: the 	 Prob: 3.751507620108896e-07
Word: man 	 Prob: 7.494988949474646e-07
Word: who 	 Prob: 1.4188390046188104e-11
Word: arrived 	 Prob: 1.234168323094309e-09
Word: late 	 Prob: 2.5717524897855837e-12
Word: ate 	 Prob: 3.3093672116915607e-14
Word: the 	 Prob: 3.6328151598041813e-09
Word: steak 	 Prob: 8.559856723877601e-06
Word: with 	 Prob: 1.4159082795117683e-08
Word: a 	 Prob: 5.7526767704985105e-06
Word: glass 	 Prob: 2.21381674236909e-

In [43]:
# Vary only the final word after "He was born in": three place names,
# then "window" and "was".
#get_sentence_prob("I fed my cat some of it and he damn near passed out")
get_sentence_prob("He was born in Berlin.")
get_sentence_prob("He was born in Santiago.")
get_sentence_prob("He was born in France.")
get_sentence_prob("He was born in window.")
get_sentence_prob("He was born in was.")


Processing sentence: ['[CLS]', 'he', 'was', 'born', 'in', 'berlin', '.', '[SEP]']
Word: he 	 Prob: 2.8220553005553484e-08
Word: was 	 Prob: 8.548587615564429e-09
Word: born 	 Prob: 1.3297593026706522e-09
Word: in 	 Prob: 1.9216821556256036e-07
Word: berlin 	 Prob: 9.004227467812598e-05
Word: . 	 Prob: 0.9999347925186157

Normalized forward sentence prob: log(P(sentence)) / sent_length: -10.147396967081477

Processing sentence: ['[CLS]', 'he', 'was', 'born', 'in', 'santiago', '.', '[SEP]']
Word: he 	 Prob: 2.8220553005553484e-08
Word: was 	 Prob: 8.548587615564429e-09
Word: born 	 Prob: 1.3297593026706522e-09
Word: in 	 Prob: 1.9216821556256036e-07
Word: santiago 	 Prob: 8.016954211598204e-07
Word: . 	 Prob: 0.9998825788497925

Normalized forward sentence prob: log(P(sentence)) / sent_length: -10.737566606071596

Processing sentence: ['[CLS]', 'he', 'was', 'born', 'in', 'france', '.', '[SEP]']
Word: he 	 Prob: 2.8220553005553484e-08
Word: was 	 Prob: 8.548587615564429e-09
Word: born 	 P

In [44]:
# Vary only the word after "I fed my": cat, dog, window, the.
get_sentence_prob("I fed my cat some of it and he damn near passed out.")
get_sentence_prob("I fed my dog some of it and he damn near passed out.")
get_sentence_prob("I fed my window some of it and he damn near passed out.")
get_sentence_prob("I fed my the some of it and he damn near passed out.")

Processing sentence: ['[CLS]', 'i', 'fed', 'my', 'cat', 'some', 'of', 'it', 'and', 'he', 'damn', 'near', 'passed', 'out', '.', '[SEP]']
Word: i 	 Prob: 1.1460726057066495e-07
Word: fed 	 Prob: 2.969326824350804e-12
Word: my 	 Prob: 1.7347387903324935e-13
Word: cat 	 Prob: 6.480977976508484e-09
Word: some 	 Prob: 2.6508239132666967e-15
Word: of 	 Prob: 1.4431978832482595e-12
Word: it 	 Prob: 5.614536803477677e-06
Word: and 	 Prob: 2.412100741366885e-08
Word: he 	 Prob: 1.6422809778759984e-08
Word: damn 	 Prob: 4.356678573458339e-08
Word: near 	 Prob: 1.3371936802286655e-07
Word: passed 	 Prob: 2.0931690158931815e-08
Word: out 	 Prob: 4.988439661346433e-10
Word: . 	 Prob: 0.9991564750671387

Normalized forward sentence prob: log(P(sentence)) / sent_length: -16.938915602757334

Processing sentence: ['[CLS]', 'i', 'fed', 'my', 'dog', 'some', 'of', 'it', 'and', 'he', 'damn', 'near', 'passed', 'out', '.', '[SEP]']
Word: i 	 Prob: 1.1460726057066495e-07
Word: fed 	 Prob: 2.969326824350804e-12

In [45]:
# Vary only the final word of "I forgot to take my ..."; the printed headers
# state the expectation for each group of sentences.
print("Should have similar/high probs\n")
get_sentence_prob("I forgot to take my medicine.")
get_sentence_prob("I forgot to take my medicines.")
get_sentence_prob("I forgot to take my medication.")
get_sentence_prob("I forgot to take my pills.")
print("Should have low probs\n")
get_sentence_prob("I forgot to take my turn.")
get_sentence_prob("I forgot to take my medical.")
get_sentence_prob("I forgot to take my medically.")
get_sentence_prob("I forgot to take my turned.")

Should have similar/high probs

Processing sentence: ['[CLS]', 'i', 'forgot', 'to', 'take', 'my', 'medicine', '.', '[SEP]']
Word: i 	 Prob: 1.1460726057066495e-07
Word: forgot 	 Prob: 1.1656596521358864e-10
Word: to 	 Prob: 2.567655831876847e-13
Word: take 	 Prob: 9.262473600613053e-12
Word: my 	 Prob: 1.7483587910760434e-08
Word: medicine 	 Prob: 1.6859313944905807e-08
Word: . 	 Prob: 0.9979066848754883

Normalized forward sentence prob: log(P(sentence)) / sent_length: -14.334716087000238

Processing sentence: ['[CLS]', 'i', 'forgot', 'to', 'take', 'my', 'medicines', '.', '[SEP]']
Word: i 	 Prob: 1.1460726057066495e-07
Word: forgot 	 Prob: 1.1656596521358864e-10
Word: to 	 Prob: 2.567655831876847e-13
Word: take 	 Prob: 9.262473600613053e-12
Word: my 	 Prob: 1.7483587910760434e-08
Word: medicines 	 Prob: 6.6740408755094904e-09
Word: . 	 Prob: 0.989775538444519

Normalized forward sentence prob: log(P(sentence)) / sent_length: -14.438589340903693

Processing sentence: ['[CLS]', 'i', 'fo

In [None]:
# Load pre-trained model (weights)
# Rebinds the global `model` and `tokenizer` to the 'bert-base-uncased'
# checkpoint, so later get_sentence_prob calls use it.
# NOTE: torch.no_grad() only affects operations executed inside this block.
with torch.no_grad():
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()  # evaluation mode (e.g. dropout disabled)
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# "He was born in ..." comparison again; presumably meant to be run with the
# 'bert-base-uncased' model loaded in the preceding cell.
#get_sentence_prob("I fed my cat some of it and he damn near passed out")
get_sentence_prob("He was born in Berlin.")
get_sentence_prob("He was born in Santiago.")
get_sentence_prob("He was born in France.")
get_sentence_prob("He was born in window.")
get_sentence_prob("He was born in was.")

In [None]:
# "I fed my ..." comparison again (cat, dog, window, the); presumably meant
# for the 'bert-base-uncased' model.
get_sentence_prob("I fed my cat some of it and he damn near passed out.")
get_sentence_prob("I fed my dog some of it and he damn near passed out.")
get_sentence_prob("I fed my window some of it and he damn near passed out.")
get_sentence_prob("I fed my the some of it and he damn near passed out.")

In [None]:
# "I forgot to take my ..." comparison again; the printed headers state the
# expectation for each group. Presumably meant for 'bert-base-uncased'.
print("Should have similar/high probs\n")
get_sentence_prob("I forgot to take my medicine.")
get_sentence_prob("I forgot to take my medicines.")
get_sentence_prob("I forgot to take my medication.")
get_sentence_prob("I forgot to take my pills.")
print("Should have low probs\n")
get_sentence_prob("I forgot to take my turn.")
get_sentence_prob("I forgot to take my medical.")
get_sentence_prob("I forgot to take my medically.")
get_sentence_prob("I forgot to take my turned.")