In [23]:
import numpy as np
import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForMaskedLM


In [14]:
# Checkpoint identifier shared by the tokenizer and the masked-LM model.
model_name = "bert-base-cased"

# Both objects are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# Mask placeholder string as defined by the loaded tokenizer.
mask = tokenizer.mask_token

In [16]:
# Example input with one masked position for the model to fill in.
sentence = f"I want to {mask} pizza for tonight."

In [17]:
# Token strings of the sentence, kept for inspecting the sequence later.
tokens = tokenizer.tokenize(sentence)
# Encode the sentence as PyTorch tensors that can be unpacked into the model call.
encoded_inputs = tokenizer(sentence, return_tensors='pt')
# Inference only: disable autograd so no computation graph is built for the forward pass.
with torch.no_grad():
    outputs = model(**encoded_inputs)
outputs

MaskedLMOutput(loss=None, logits=tensor([[[ -7.3723,  -7.2489,  -7.4421,  ...,  -6.3119,  -5.9369,  -6.4257],
         [ -7.9311,  -8.2282,  -8.0326,  ...,  -6.7387,  -6.4877,  -6.9525],
         [-12.0500, -11.7972, -12.5776,  ...,  -8.4518,  -6.7310,  -8.2586],
         ...,
         [-10.2204, -10.4315,  -9.9993,  ...,  -7.9570,  -6.7194,  -9.3618],
         [-12.4471, -12.5367, -12.5614,  ...,  -9.9086,  -9.4219, -11.1770],
         [-14.3657, -14.5227, -15.0017,  ..., -11.9715, -11.6569, -13.4498]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

The logits are the raw, unnormalized predictions for each token position; you can think of them as scores for all possible words that could fill in the mask token.

In [19]:
# Take the only sequence in the batch, then convert it to a NumPy array.
logits = outputs.logits[0].detach().numpy()
logits

array([[ -7.3722935,  -7.2488627,  -7.4421425, ...,  -6.311863 ,
         -5.9368925,  -6.425681 ],
       [ -7.9311185,  -8.2282095,  -8.032589 , ...,  -6.738744 ,
         -6.487724 ,  -6.952523 ],
       [-12.05001  , -11.79721  , -12.577608 , ...,  -8.451774 ,
         -6.7310195,  -8.258565 ],
       ...,
       [-10.220409 , -10.431485 ,  -9.999258 , ...,  -7.956992 ,
         -6.719399 ,  -9.3617935],
       [-12.4471245, -12.536709 , -12.561404 , ...,  -9.90855  ,
         -9.4219055, -11.17695  ],
       [-14.365709 , -14.5227165, -15.001675 , ..., -11.971545 ,
        -11.6569195, -13.449782 ]], dtype=float32)

In [20]:
# Dimensions of the logits array; the recorded run shows (10, 28996),
# presumably (input positions, vocabulary size).
logits.shape

(10, 28996)

In [21]:
# Number of tokens returned by tokenizer.tokenize(). The recorded run shows 8 here
# versus 10 logits rows, presumably because the tokenizer call used for the model
# input adds special tokens (e.g. [CLS] and [SEP]).
len(tokens)

8

In [25]:
# Locate the mask directly in the ids that were fed to the model, instead of
# offsetting an index into `tokens` by a hard-coded 1. The offset only holds when
# the tokenizer prepends exactly one special token; looking up the mask id works
# regardless of how many special tokens are added. Raises ValueError if the
# sentence contains no mask token.
input_ids = encoded_inputs["input_ids"][0].tolist()
mask_position = input_ids.index(tokenizer.mask_token_id)

# Scores over the vocabulary at the masked position, normalized to probabilities.
mask_logits = logits[mask_position]
confidence_scores = softmax(mask_logits)
confidence_scores

array([2.9159813e-10, 4.0784948e-10, 5.2928040e-10, ..., 8.4445934e-10,
       6.2026295e-09, 1.6282753e-09], dtype=float32)

The top 5 predictions for the masked token are:

In [27]:
# Vocabulary ids ordered from highest to lowest probability.
ranked_ids = np.argsort(confidence_scores)[::-1]

for i in ranked_ids[:5]:
    pred_token, score = tokenizer.decode(i), confidence_scores[i]
    # Show the candidate substituted into the sentence, followed by its probability.
    print(sentence.replace(mask, pred_token), score)

I want to have pizza for tonight. 0.25729144
I want to get pizza for tonight. 0.17849562
I want to eat pizza for tonight. 0.1555553
I want to make pizza for tonight. 0.114223786
I want to order pizza for tonight. 0.09823061
