# Estimate sentence probability with BERT
## Calculating probability more properly:
`P_f`, `P_b`: probability of the sentence computed in the forward (left-to-right) and backward (right-to-left) direction, respectively
```
P_f = P(w_0) * P(w_1|w_0) * P(w_2|w_0, w_1) * ... * P(w_N|w_0, w_1, ..., w_N-1)
P_b = P(w_N) * P(w_N-1|w_N) * P(w_N-2|w_N-1, w_N) * ... * P(w_0|w_1, w_2, ..., w_N)
```
In this notebook, probabilities are not normalized by sentence length, so `P_f`, `P_b` become smaller as the sentence length increases.

Finally, the sentence probability P(S) is the geometric mean of the forward and backward probabilities:
```
P(S) = (P_f(S) * P_b(S)) ^ (1/2)
```

In [1]:
import numpy as np
import torch
from transformers import BertTokenizer, BertForMaskedLM

# BERT special tokens: sentence start marker, sentence end marker, and the
# placeholder substituted for hidden words.
BOS_TOKEN, EOS_TOKEN, MASK_TOKEN = '[CLS]', '[SEP]', '[MASK]'

with torch.no_grad():
    # Pre-trained tokenizer (vocabulary).
    # Alternative, smaller checkpoint: 'bert-base-uncased'
    # (use the same checkpoint name for the tokenizer and the model).
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    # Pre-trained masked-LM weights, switched to evaluation mode.
    model = BertForMaskedLM.from_pretrained('bert-large-uncased').eval()

In [2]:
def print_top_predictions(probs, k=5):
    """Print the ``k`` highest-scoring tokens and their scores, best first.

    Args:
        probs: 1-D torch tensor of scores; each index is treated as a token id
            and decoded with the global ``tokenizer``.
        k: Number of top entries to print.
    """
    scores = probs.detach().numpy()
    # argpartition moves the k largest entries to the tail without a full sort;
    # only those k candidates are then ordered by descending score.
    candidates = np.argpartition(scores, -k)[-k:]
    descending = np.argsort(-scores[candidates])
    best_ids = candidates[descending]
    best_tokens = tokenizer.convert_ids_to_tokens(best_ids)
    print(f"Ordered top predicted tokens: {best_tokens}")
    print(f"Ordered top predicted values: {scores[best_ids]}")

In [3]:
def get_sentence_prob(sentence, verbose=False):
    """Estimate the probability of ``sentence`` with the masked language model.

    Every non-special token is scored twice through ``get_directional_prob``:
    once in the 'forward' direction and once in the 'backwards' direction.
    The log10 probabilities of each direction are summed over the sentence and
    the result is the geometric mean of the two directional sentence
    probabilities. Nothing is normalized by sentence length.

    Args:
        sentence: Raw text, tokenized with the global ``tokenizer``.
            ``BOS_TOKEN`` / ``EOS_TOKEN`` are added when not already present.
        verbose: When True, print per-token and per-direction log10
            probabilities (also forwarded to ``get_directional_prob``).

    Returns:
        ``10 ** (0.5 * (log10 P_f + log10 P_b))``. A sentence that yields no
        tokens returns 1.0 (empty product).

    Side effects:
        Always prints the tokenized sentence and the resulting probability.
    """
    # Pre-process sentence, adding special tokens
    tokenized_input = tokenizer.tokenize(sentence)
    # Test for emptiness first: the tokenizer may return an empty list (e.g.
    # for an empty sentence) and indexing it would raise IndexError.
    if not tokenized_input or tokenized_input[0] != BOS_TOKEN:
        tokenized_input.insert(0, BOS_TOKEN)
    if tokenized_input[-1] != EOS_TOKEN:
        tokenized_input.append(EOS_TOKEN)
    ids_input = tokenizer.convert_tokens_to_ids(tokenized_input)
    print(f"Processing sentence: {tokenized_input}\n")

    # Softmax over a 1-D score vector; handed to get_directional_prob.
    sm = torch.nn.Softmax(dim=0)

    log_sent_prob_forward = 0
    log_sent_prob_backwards = 0
    # Mask non-special tokens in forward and backwards directions; calculate their probabilities
    for i in range(1, len(tokenized_input) - 1):  # Don't loop first and last tokens
        probs_forward = get_directional_prob(sm, tokenized_input, i, 'forward', verbose=verbose)
        probs_backwards = get_directional_prob(sm, tokenized_input, i, 'backwards', verbose=verbose)
        # Probability assigned to the token actually present at position i
        target_id = ids_input[i]
        log_prob_forward = np.log10(probs_forward[target_id].detach().numpy())
        log_prob_backwards = np.log10(probs_backwards[target_id].detach().numpy())
        log_sent_prob_forward += log_prob_forward
        log_sent_prob_backwards += log_prob_backwards

        if verbose:
            print(f"Word: {tokenized_input[i]} \t Log-Prob_forward: {log_prob_forward}; Log-Prob_backwards: {log_prob_backwards}")

    # Geometric mean of the two sentence probabilities == arithmetic mean in log space
    log_geom_mean_sent_prob = 0.5 * (log_sent_prob_forward + log_sent_prob_backwards)
    if verbose:
        # These values are log10 probabilities, so label them as such.
        print(f"Log10 forward sentence probability: {log_sent_prob_forward}")
        print(f"Log10 backward sentence probability: {log_sent_prob_backwards}\n")
        print(f"Log10 geometric-mean sentence probability: {log_geom_mean_sent_prob}\n")
    sentence_prob = np.power(10, log_geom_mean_sent_prob)
    print(sentence_prob)
    return sentence_prob

In [4]:
def get_directional_prob(sm, tokenized_input, i, direction, verbose=False):
    """Return the model's distribution for position ``i`` under directional masking.

    A copy of ``tokenized_input`` is masked so that only one side of position
    ``i`` stays visible, the global ``model`` is run on it, and ``sm`` is
    applied to the model scores at position ``i``.

    Args:
        sm: Callable that turns the score vector of one position into
            probabilities (the caller passes ``torch.nn.Softmax(dim=0)``).
        tokenized_input: Token list whose first and last entries are the
            special boundary tokens; it is not modified.
        i: Index of the token to score.
        direction: 'forward' hides position ``i`` and everything after it
            (except the last token); 'backwards' hides positions ``1..i``.
        verbose: When True, print the masked tokens and the top predictions.

    Returns:
        The tensor ``sm(scores[0, i])``, produced without gradient tracking.

    Raises:
        ValueError: If ``direction`` is neither 'forward' nor 'backwards'.
    """
    current_tokens = tokenized_input[:]  # copy: the caller's list stays intact
    if direction == 'backwards':
        # Only the tokens to the right of position i remain visible.
        current_tokens[1:i+1] = [MASK_TOKEN] * i
    elif direction == 'forward':
        # Only the tokens to the left of position i remain visible.
        current_tokens[i:-1] = [MASK_TOKEN] * (len(tokenized_input) - 1 - i)
    else:
        # Raise instead of exit(): exiting would terminate the running
        # interpreter/kernel rather than report a bad argument.
        raise ValueError(
            f"Direction can only be 'forward' or 'backwards', got {direction!r}"
        )
    if verbose:
        print()
        print(current_tokens)

    masked_input = torch.tensor([tokenizer.convert_tokens_to_ids(current_tokens)])
    # Inference only: skip building an autograd graph for every forward pass.
    with torch.no_grad():
        predictions = model(masked_input)
        # First element of the model output — presumably the per-position
        # vocabulary scores shaped (batch, seq_len, vocab); TODO confirm.
        scores = predictions[0]
        probs = sm(scores[0, i])  # Softmax to get probabilities
    if verbose:
        print_top_predictions(probs)

    return probs  # Model predictions

In [24]:
# Score four unrelated sentences; the four printed probabilities are the
# values combined into a geometric mean in the next cell.
get_sentence_prob("The fat cat ate the last mouse quickly.", verbose=False)
get_sentence_prob("There are many health risks associated with fat.")
get_sentence_prob("They will fly out of Santiago tomorrow morning.")
get_sentence_prob("She bought the last Microsoft mouse for Santiago.")

Processing sentence: ['[CLS]', 'the', 'fat', 'cat', 'ate', 'the', 'last', 'mouse', 'quickly', '.', '[SEP]']

1.5699080230392615e-23
Processing sentence: ['[CLS]', 'there', 'are', 'many', 'health', 'risks', 'associated', 'with', 'fat', '.', '[SEP]']

8.689618845171052e-20
Processing sentence: ['[CLS]', 'they', 'will', 'fly', 'out', 'of', 'santiago', 'tomorrow', 'morning', '.', '[SEP]']

3.506387431159266e-22
Processing sentence: ['[CLS]', 'she', 'bought', 'the', 'last', 'microsoft', 'mouse', 'for', 'santiago', '.', '[SEP]']

2.933483879831623e-28


2.933483879831623e-28

In [18]:
# Geometric mean (4th root of the product) of the four sentence probabilities
# printed by the previous cell. The values are pasted in by hand, so they must
# be updated manually whenever that cell is re-run with a different model.
np.power((1.5699080230392615e-23 * 8.689618845171052e-20 * 3.506387431159266e-22 * 2.933483879831623e-28), 1/4)

1.9354396584371938e-23

In [30]:
# Ratio of this sentence's probability to the hand-copied geometric-mean
# baseline (1.9354396584371938e-23) computed above.
get_sentence_prob("The deteriorated cat ate the last mouse quickly.")/1.9354396584371938e-23

Processing sentence: ['[CLS]', 'the', 'deteriorated', 'cat', 'ate', 'the', 'last', 'mouse', 'quickly', '.', '[SEP]']

1.552863016010471e-26


0.000802330886029461

In [31]:
# Ratio of this sentence's probability to the hand-copied geometric-mean
# baseline (1.9354396584371938e-23) computed above.
get_sentence_prob("There are many health risks associated with time.")/1.9354396584371938e-23

Processing sentence: ['[CLS]', 'there', 'are', 'many', 'health', 'risks', 'associated', 'with', 'time', '.', '[SEP]']

5.7637240939881745e-22


29.779921419209725

In [None]:
# Three short declarative sentences.
get_sentence_prob("penguins are birds.")
get_sentence_prob("penguins have wings.")
get_sentence_prob("wings are useful.")

In [None]:
# Minimal edits to one sentence: object noun (candy/apple), verb tense
# (eat/ate), and trailing adverb (none/quickly/slowly).
get_sentence_prob("The kids eat the candy.")
get_sentence_prob("The kids eat the apple.")
get_sentence_prob("The kids ate the apple.")
get_sentence_prob("The kids ate the apple quickly.")
get_sentence_prob("The kids ate the apple slowly.")

In [None]:
# Very short sentences, including a word-order swap ("kids eat." vs
# "eat kids."). The first call repeats one from the previous cell.
get_sentence_prob("The kids eat the candy.")
get_sentence_prob("kids eat.")
get_sentence_prob("eat kids.")
get_sentence_prob("kids eat candy.")
get_sentence_prob("the kids eat.")
get_sentence_prob("small kids eat.")

In [None]:
# One sentence followed by four word-order permutations of the same words.
get_sentence_prob("Smurfs eat the ancient nuns ungracefully.")
get_sentence_prob("Smurfs eat ancient the nuns ungracefully.")
get_sentence_prob("eat smurfs the ancient nuns ungracefully.")
get_sentence_prob("Smurfs eat the ancient ungracefully nuns.")
get_sentence_prob("Smurfs the eat ancient nuns ungracefully.")

In [None]:
# Variations of "kids eat the red grapes quickly.": word-order permutations,
# an added leading article, and a different subject noun.
get_sentence_prob("kids eat the red grapes quickly.")
get_sentence_prob("quickly eat the red grapes kids.")
get_sentence_prob("the kids eat the red grapes quickly.")
get_sentence_prob("girls eat the red grapes quickly.")
get_sentence_prob("the red grapes eat kids quickly.")
get_sentence_prob("kids eat red the grapes quickly.")
get_sentence_prob("eat kids the red grapes quickly.")
get_sentence_prob("kids eat the red quickly grapes.")
get_sentence_prob("kids the eat red grapes quickly.")

In [None]:
# Three sentences that share the structure of
# "Colorless green ideas sleep furiously."
get_sentence_prob("Colorless green ideas sleep furiously.")
get_sentence_prob("Confused dumb benches eat endlessly.")
get_sentence_prob("Hairless ugly men complain constantly.")

In [None]:
# "The test was a success." with two word-order permutations, a subject swap
# (test -> party), and a longer negated variant.
get_sentence_prob("The test was a success.")
get_sentence_prob("The was test a success.")
get_sentence_prob("The test was success a.")
get_sentence_prob("The party was a success.")
get_sentence_prob("The farewell party was definitely not a success.")

In [None]:
# Same sentence with two different adverbs; the second call also prints the
# per-token details (verbose=True).
get_sentence_prob("He answered unequivocally.")
get_sentence_prob("He answered quickly.", verbose=True)

In [None]:
# Same sentence ending in "a quid pro quo" vs. "an exchange".
get_sentence_prob("The guy with small hands demanded a quid pro quo.")
get_sentence_prob("The guy with small hands demanded an exchange.")

In [None]:
# "This is a ___." with different final nouns, plus one unrelated question.
get_sentence_prob("This is a sentence.")
get_sentence_prob("This is a macrame.", verbose=False)
get_sentence_prob("This is a joke.", verbose=False)
get_sentence_prob("Are you kidding me?", verbose=False)


In [None]:
# Single sentence; the same call is repeated as the first line of the next cell.
get_sentence_prob("Rachel was wearing a lovely satin dress last night.")

In [None]:
# Same sentence with only the subject changed: names, kinship terms, pronouns,
# and the word "Running".
get_sentence_prob("Rachel was wearing a lovely satin dress last night.")
get_sentence_prob("Grandma was wearing a lovely satin dress last night.")
get_sentence_prob("Mother was wearing a lovely satin dress last night.")
get_sentence_prob("She was wearing a lovely satin dress last night.")
get_sentence_prob("He was wearing a lovely satin dress last night.")
get_sentence_prob("I was wearing a lovely satin dress last night.")
get_sentence_prob("Angela was wearing a lovely satin dress last night.")
get_sentence_prob("Roberta was wearing a lovely satin dress last night.")
get_sentence_prob("Running was wearing a lovely satin dress last night.")

In [None]:
# steak/stake and man swapped between subject and object, in active and
# passive voice, plus one longer variant of the first sentence.
get_sentence_prob("The man ate the steak.")
get_sentence_prob("The stake ate the man.")
get_sentence_prob("The man who arrived late ate the steak with a glass of wine.")
get_sentence_prob("The steak was eaten by the man.")
get_sentence_prob("The man was eaten by the stake.")

In [None]:
# "He was born in ___." with place names and with non-place words.
get_sentence_prob("He was born in Berlin.")
get_sentence_prob("He was born in Santiago.")
get_sentence_prob("He was born in France.")
get_sentence_prob("He was born in window.")
get_sentence_prob("He was born in was.")


In [None]:
# "I fed my ___ some of it ..." with cat / dog / window / the in the slot.
get_sentence_prob("I fed my cat some of it and he damn near passed out.")
get_sentence_prob("I fed my dog some of it and he damn near passed out.")
get_sentence_prob("I fed my window some of it and he damn near passed out.")
get_sentence_prob("I fed my the some of it and he damn near passed out.")

In [None]:
# "I forgot to take my ___." split into two groups by expected probability.
# NOTE(review): "I forgot to take my turn." is in the high group here but in
# the low group in the later bert-base-uncased copy of this cell — confirm
# which grouping is intended.
print("Should have similar/high probs\n")
get_sentence_prob("I forgot to take my medicine.")
get_sentence_prob("I forgot to take my medicines.")
get_sentence_prob("I forgot to take my medication.")
get_sentence_prob("I forgot to take my pills.")
get_sentence_prob("I forgot to take my turn.")
print("Should have low probs\n")
get_sentence_prob("I forgot to take my medical.")
get_sentence_prob("I forgot to take my medically.")
get_sentence_prob("I forgot to take my turned.")

In [None]:
# Three long sentences. Probabilities are not normalized by sentence length,
# so these are not directly comparable to the short-sentence results.
get_sentence_prob("We will explore the elements used to construct sentences, and what parts of speech are used to expand and elaborate on them.")
get_sentence_prob("Wikipedia is a multilingual online encyclopedia created and maintained as an open collaboration project by a community of volunteer editors.")
get_sentence_prob("Once she gave her a little cap of red velvet, which suited her so well that she would never wear anything else.")

In [None]:
# Rebind the global `model` / `tokenizer` to the 'bert-base-uncased' checkpoint;
# every get_sentence_prob call executed after this cell uses this pair.
with torch.no_grad():
    # Pre-trained tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Pre-trained masked-LM weights, switched to evaluation mode
    model = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()

In [None]:
# Re-run of the "He was born in ___." probe after the preceding cell rebinds
# `model` / `tokenizer` to 'bert-base-uncased'.
#get_sentence_prob("I fed my cat some of it and he damn near passed out")
get_sentence_prob("He was born in Berlin.")
get_sentence_prob("He was born in Santiago.")
get_sentence_prob("He was born in France.")
get_sentence_prob("He was born in window.")
get_sentence_prob("He was born in was.")

In [None]:
# Re-run of the "I fed my ___ ..." probe; it follows the cell that rebinds
# `model` / `tokenizer` to 'bert-base-uncased'.
get_sentence_prob("I fed my cat some of it and he damn near passed out.")
get_sentence_prob("I fed my dog some of it and he damn near passed out.")
get_sentence_prob("I fed my window some of it and he damn near passed out.")
get_sentence_prob("I fed my the some of it and he damn near passed out.")

In [None]:
# Re-run of the "I forgot to take my ___." probe; it follows the cell that
# rebinds `model` / `tokenizer` to 'bert-base-uncased'.
# NOTE(review): "I forgot to take my turn." is in the low group here but in
# the high group in the earlier copy of this cell — confirm which grouping is
# intended.
print("Should have similar/high probs\n")
get_sentence_prob("I forgot to take my medicine.")
get_sentence_prob("I forgot to take my medicines.")
get_sentence_prob("I forgot to take my medication.")
get_sentence_prob("I forgot to take my pills.")
print("Should have low probs\n")
get_sentence_prob("I forgot to take my turn.")
get_sentence_prob("I forgot to take my medical.")
get_sentence_prob("I forgot to take my medically.")
get_sentence_prob("I forgot to take my turned.")