### References

BERT LM prediction
https://github.com/huggingface/transformers/blob/master/docs/source/quickstart.md#bert-example

Masking script
https://github.com/huggingface/pytorch-pretrained-BERT/blob/f9cde97b313c3218e1b29ea73a42414dfefadb40/examples/lm_finetuning/simple_lm_finetuning.py#L276-L301

In [1]:
import random
import torch
from transformers import BertTokenizer, BertForMaskedLM


In [48]:
from typing import List, Union

In [46]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [47]:
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [37]:
def mask_sentence(tokens, tokenizer):
    """
    Masking some random tokens for Language Model task with probabilities as in the original BERT paper.
    :param tokens: list of str, tokenized sentence.
    :param tokenizer: Tokenizer, object used for tokenization (we need it's vocab here)
    :return: (list of str, list of int), masked tokens and related labels for LM prediction
    """
    output_label = []

    for i, token in enumerate(tokens):
        prob = random.random()
        # mask token with 15% probability
        if prob < 0.15:
            prob /= 0.15

            # 80% randomly change token to mask token
            if prob < 0.8:
                tokens[i] = "[MASK]"

            # 10% randomly change token to random token
            elif prob < 0.9:
                tokens[i] = random.choice(list(tokenizer.vocab.items()))[0]

            # -> rest 10% randomly keep current token

            # append current token to output (we will predict these later)
            try:
                output_label.append(tokenizer.vocab[token])
            except KeyError:
                # For unknown words (should not occur with BPE vocab)
                output_label.append(tokenizer.vocab["[UNK]"])
                print("Cannot find token '{}' in vocab. Using [UNK] insetad".format(token))
        else:
            # no masking token (will be ignored by loss function later)
            output_label.append(-1)

    return tokens, output_label



In [38]:
print(tokenizer.encode(f"Hello, I am a great guy", add_special_tokens=True))
len(tokenizer.encode(f"Hello, I am a great guy", add_special_tokens=True))

[101, 7592, 1010, 1045, 2572, 1037, 2307, 3124, 102]


9

In [39]:
model.parameters

<bound method Module.parameters of BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,

In [275]:
text = "America saw the third highest daily increase of new cases on Sunday. The president is ashamed to mention the casualities."

def tokenize_text(text):
    # Tokenize input
    tokenized_text = tokenizer.tokenize(text)
    return tokenized_text

In [276]:
def add_special_tokens(tokenlist: List[str]):
    """TODO"""

In [376]:
def prepare_input(model:str, text: str):
    if model=="BERT":
        # Tokenize input
        tokenized_text = tokenizer.tokenize(text)
        
        tokenized_text = ['[CLS]']  +  tokenized_text +['[SEP]']
        masktokenized_text, output_labels = mask_sentence(tokenized_text,tokenizer)
        
        # Convert token to vocabulary indices
        indexed_tokens = tokenizer.convert_tokens_to_ids(masktokenized_text)
        
        print(indexed_tokens)
        word_indexed_token = tokenizer.convert_ids_to_tokens(indexed_tokens)
        print(word_indexed_token)
        
        # Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
        # segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
        segments_ids = [0]* len(indexed_tokens)

        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        
        return tokenized_text, tokens_tensor, output_labels, segments_tensors

In [377]:
text = "America saw the third highest daily increase of new cases on Sunday. The president is ashamed to mention the casualities."

prepare_input('BERT',text)

[101, 18295, 2387, 1996, 2353, 3284, 3679, 103, 1997, 2047, 3572, 2006, 4465, 1012, 1996, 2343, 103, 14984, 2000, 5254, 1996, 10017, 6447, 103, 7339]
['[CLS]', 'brightness', 'saw', 'the', 'third', 'highest', 'daily', '[MASK]', 'of', 'new', 'cases', 'on', 'sunday', '.', 'the', 'president', '[MASK]', 'ashamed', 'to', 'mention', 'the', 'casual', '##ities', '[MASK]', 'perspective']


(['[CLS]',
  'brightness',
  'saw',
  'the',
  'third',
  'highest',
  'daily',
  '[MASK]',
  'of',
  'new',
  'cases',
  'on',
  'sunday',
  '.',
  'the',
  'president',
  '[MASK]',
  'ashamed',
  'to',
  'mention',
  'the',
  'casual',
  '##ities',
  '[MASK]',
  'perspective'],
 tensor([[  101, 18295,  2387,  1996,  2353,  3284,  3679,   103,  1997,  2047,
           3572,  2006,  4465,  1012,  1996,  2343,   103, 14984,  2000,  5254,
           1996, 10017,  6447,   103,  7339]]),
 [-1,
  2637,
  -1,
  -1,
  -1,
  -1,
  -1,
  3623,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  2003,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  1012,
  102],
 tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0]]))

In [378]:
import numpy as np

In [391]:
def predict(model, tokenizedtext, tokens_tensor, output_labels, segments_tensors=None):
    input_sentence = tokenizer.convert_ids_to_tokens(tokens_tensor[0].tolist())    
    sent_length = len(input_sentence)
    print(input_sentence)
    
    # If you have a GPU, put everything on cuda
    tokens_tensor = tokens_tensor.to('cuda')
    if not segments_tensors == None:
        segments_tensors = segments_tensors.to('cuda')
    
    model.to('cuda')

    # Predict all tokens
    with torch.no_grad():
        if not segments_tensors==None:
            outputs = model(tokens_tensor,token_type_ids=segments_tensors)
        else:
            outputs = model(tokens_tensor)
        
        #outputs = Tuple((batchsize, seq_len, vocab_size))
        predictions = outputs[0]

    # using list comprehension + enumerate() 
    # index of matching element 
    mask_positions = [idx for idx, val in enumerate(output_labels) if val > 0] 
    print(mask_positions)

    actual_words = [val for idx, val in enumerate(tokenizedtext) if idx in mask_positions]
    
    word_indexed_token = [input_sentence[mp] for mp in mask_positions]
    
    for mask_position in list(mask_positions):
        # confirm we were able to predict the actual words
        predicted_index = torch.argmax(predictions[0, mask_position]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        input_sentence[mask_position] = predicted_token
    
#     print("\n-------------------------------")
#     for mask_position in range(sent_length):
#         # confirm we were able to predict the actual words
#         predicted_index = torch.argmax(predictions[0, mask_position]).item()
#         predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

    print(input_sentence)
    
    print("Actual one is ",actual_words)
    print("Fake one is  ",word_indexed_token)
    print(tokenizedtext)

In [392]:
text = "America saw the third highest daily increase of new cases on Sunday. The president is ashamed to mention the casualities."

tokenizedtext, input_tensor, output_labels, segments_tensors = prepare_input('BERT', text)

[101, 2637, 2387, 1996, 2353, 103, 3679, 103, 1997, 2047, 3572, 2006, 4465, 1012, 1996, 2343, 2003, 14984, 2000, 5254, 1996, 10017, 6447, 103, 103]
['[CLS]', 'america', 'saw', 'the', 'third', '[MASK]', 'daily', '[MASK]', 'of', 'new', 'cases', 'on', 'sunday', '.', 'the', 'president', 'is', 'ashamed', 'to', 'mention', 'the', 'casual', '##ities', '[MASK]', '[MASK]']


In [393]:
predict(model, tokenized_text, input_tensor, output_labels)

['[CLS]', 'america', 'saw', 'the', 'third', '[MASK]', 'daily', '[MASK]', 'of', 'new', 'cases', 'on', 'sunday', '.', 'the', 'president', 'is', 'ashamed', 'to', 'mention', 'the', 'casual', '##ities', '[MASK]', '[MASK]']
[5, 7, 23, 24]
['[CLS]', 'america', 'saw', 'the', 'third', '"', 'daily', '"', 'of', 'new', 'cases', 'on', 'sunday', '.', 'the', 'president', 'is', 'ashamed', 'to', 'mention', 'the', 'casual', '##ities', 'of', 'the']
Actual one is  ['highest', 'increase', '.', '[SEP]']
Fake one is   ['[MASK]', '[MASK]', '[MASK]', '[MASK]']
