In [1]:
from transformers import BertTokenizer
import random

In [2]:
def random_word(tokens, tokenizer):
    """
    Function:
        Mask random tokens for the masked-language-model task, using the probabilities from the
        original BERT paper. Every token is selected with 15% probability; a selected token is
        replaced with "[MASK]" 80% of the time, replaced with a random vocabulary entry 10% of
        the time, and left unchanged the remaining 10% of the time.
    Input:
        param tokens: list of str, tokenized sentence. NOTE: the list is modified in place.
        param tokenizer: Tokenizer, object used for tokenization (we need its vocab here)
    Output:
        return: (list of str, list of int), masked tokens (the same list object that was passed
            in) and related labels for LM prediction: the vocab id of the ORIGINAL token at every
            selected position (the "[UNK]" id if the token is not in the vocab) and -1 at every
            other position.
    """
    vocab = tokenizer.vocab
    # Candidate replacement tokens. Built lazily and at most once per call: materialising the
    # whole vocabulary is O(vocab size), so it must not be redone for every replaced token.
    vocab_tokens = None
    output_label = []

    for i, token in enumerate(tokens):
        prob = random.random()

        if prob >= 0.15:
            # token not selected for masking (will be ignored by loss function later)
            output_label.append(-1)
            continue

        # Rescale the draw from [0, 0.15) to [0, 1) so it can be reused for the 80/10/10 split.
        prob /= 0.15

        if prob < 0.8:
            # 80%: replace the token with the mask token
            tokens[i] = "[MASK]"
        elif prob < 0.9:
            # 10%: replace the token with a random vocabulary entry
            if vocab_tokens is None:
                vocab_tokens = list(vocab)
            tokens[i] = random.choice(vocab_tokens)
        # remaining 10%: keep the current token unchanged

        # The label is always the id of the original token (`token` was read before any
        # replacement above); this is what the model has to predict.
        try:
            output_label.append(vocab[token])
        except KeyError:
            # Token is not a vocabulary entry (e.g. it did not come from the tokenizer):
            # fall back to the [UNK] id and report it.
            output_label.append(vocab["[UNK]"])
            print("Cannot find token '{}' in vocab. Using [UNK] insetad".format(token))

    return tokens, output_label



In [3]:
# Load the pretrained 'bert-base-uncased' tokenizer; random_word() reads its `.vocab` mapping.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# Show what the tokenizer produces for a sample sentence.
sample_tokens = tokenizer.tokenize('Hello, I am a achievement guy but I hate it')
print(sample_tokens)

# Run the masking on a hand-split, still-cased word list (NOT on the tokenizer output above).
# Words that are not vocabulary entries, such as 'Hello', fall back to the [UNK] label
# whenever they are selected for masking.
hand_split_words = ['Hello', 'I', 'am', 'a', 'great', 'guy', 'but', 'I', 'hate', 'it']
random_word(hand_split_words, tokenizer)

['hello', ',', 'i', 'am', 'a', 'achievement', 'guy', 'but', 'i', 'hate', 'it']
Cannot find token 'Hello' in vocab. Using [UNK] insetad


(['[MASK]', 'I', 'am', 'a', 'great', 'guy', 'but', 'I', 'hate', '[MASK]'],
 [100, -1, -1, -1, -1, -1, -1, -1, -1, 2009])

In [5]:
# Next step: check how to make it work on the sub-word (BPE/WordPiece) tokens produced by the tokenizer, instead of hand-split words.