In [89]:
import torch as t
from torch import einsum
from einops import rearrange, reduce, repeat
import transformers
import bert_tests

# Part Zero: Tokenization

In [15]:
import transformers

cased_tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")
uncased_tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")

In [16]:
tokens = tokenizer(['Hi, my name is bert'])
tokenizer.decode(tokens['input_ids'][0])

'[CLS] Hi, my name is bert [SEP]'

In [17]:
tokens = cased_tokenizer(['Hi, my name is bert'])
cased_tokenizer.decode(tokens['input_ids'][0])

'[CLS] colleges 天 largest happened smile donation [SEP]'

# Part One: Inference

In [18]:
pretrained_bert = bert_tests.get_pretrained_bert()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
tokens['input_ids']

{'input_ids': [[101, 7632, 1010, 2026, 2171, 2003, 14324, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1]]}

In [63]:
tokens = cased_tokenizer(['The firetruck was painted bright [MASK].'])

mask_tokens = cased_tokenizer(['[MASK]'])

probabilities = pretrained_bert.eval()(t.tensor(tokens['input_ids'])).logits.softmax(dim=1)

mask_id = mask_tokens["input_ids"][0][1]

print(mask_tokens)
print(tokens)


{'input_ids': [[101, 103, 102]], 'token_type_ids': [[0, 0, 0]], 'attention_mask': [[1, 1, 1]]}
{'input_ids': [[101, 1109, 1783, 18062, 8474, 1108, 4331, 3999, 103, 119, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [78]:
def get_mask(tokens):
    token_numbers = t.tensor(tokens["input_ids"])
    mask_index = (token_numbers == mask_id).nonzero(as_tuple = True)[1]
    probabilities = pretrained_bert.eval()(t.tensor(tokens['input_ids'])).logits.softmax(dim=2)

    topkprobs, topktokens = probabilities.topk(5, dim=2)

    topkprobs = topkprobs[0,mask_index,:]
    topktokens = topktokens[0,mask_index,:]

    topkguesses = cased_tokenizer.decode(topktokens[0]).split(' ')

    return zip(topkguesses, topkprobs.tolist()[0])

list(get_mask(tokens))

[('red', 0.5602872371673584),
 ('yellow', 0.12122748047113419),
 ('white', 0.07108743488788605),
 ('blue', 0.069109246134758),
 ('green', 0.05439625307917595)]

In [111]:
def ascii_art_probs(string):
    tokens = cased_tokenizer([string])
    guesses = get_mask(tokens)

    indent = string.index('[MASK]')

    string = string.replace('[MASK]', '___')

    print(string)

    for word, prob in guesses:
        prob_string = "{}%".format(round(prob * 100)) 
        # print(prob_string)
        prob_string = " " * (indent - len(prob_string) - 1) + prob_string + " " + word

        print(prob_string)

# ascii_art_probs('The firetruck was painted bright [MASK].')
# ascii_art_probs('The fish loves to eat [MASK].')
# ascii_art_probs('The fish loves to eat [MASK]')
ascii_art_probs('The British Prime Minister is [MASK].')
ascii_art_probs('The capital of Illinois is [MASK].')

ascii_art_probs('17 plus 80 equals [MASK].')

ascii_art_probs('1, 1, 2, 3, 5, [MASK].')

The British Prime Minister is ___.
                          17% elected
                          12% British
                          10% Hon
                           9% appointed
                           4% Conservative
The capital of Illinois is ___.
                       43% Chicago
                       20% Springfield
                        2% Cairo
                        1% Madison
                        1% Quincy
17 plus 80 equals ___.
               4% 10
               4% 16
               4% 20
               3% 15
               3% 12
1, 1, 2, 3, 5, 6, ___.
              43% 7
              15% 8
               5% 10
               4% 12
               4% 6


In [42]:
topk = probabilities.topk(5, dim=2)

topk

torch.return_types.topk(
values=tensor([[[0.7243, 0.7186, 0.6659, 0.6492, 0.6321],
         [0.4452, 0.4371, 0.4316, 0.4289, 0.4204],
         [0.5841, 0.5715, 0.5522, 0.5305, 0.5068],
         [0.9977, 0.9940, 0.9618, 0.9562, 0.9484],
         [0.9658, 0.9607, 0.9572, 0.9535, 0.9525],
         [1.0000, 0.9999, 0.9999, 0.9988, 0.9987],
         [1.0000, 1.0000, 0.9971, 0.9967, 0.9965],
         [0.9999, 0.9993, 0.9963, 0.9912, 0.9899],
         [0.9482, 0.9479, 0.9333, 0.9326, 0.9138],
         [0.9845, 0.9832, 0.9801, 0.9770, 0.9696],
         [0.9527, 0.9515, 0.9405, 0.9300, 0.9293]]], grad_fn=<TopkBackward0>),
indices=tensor([[[ 2870,  3398,  2932, 19230, 10163],
         [23764,  1368,  1333,  1371,  1381],
         [ 2543, 26566,  9363, 13836, 23630],
         [16344, 10845, 12514, 10497,  6703],
         [22071, 24243, 12722, 27525, 15514],
         [ 2001,  2495, 14557, 19569, 16920],
         [ 4993, 14429, 28236,  3715,  2118],
         [ 4408, 20292, 12301,  6455,  5260],
   

# Part Two: Fine Tuning on Classification