In [42]:
from transformers import BertModel, BertTokenizer, BertForMaskedLM
import torch

import random

In [47]:
def random_word(tokens, tokenizer):
    """
    Mask some random tokens for the Language Model task with probabilities as in the original BERT paper.

    Each token is selected with 15% probability. A selected token is replaced by "[MASK]" 80% of the
    time, replaced by a random vocabulary token 10% of the time, and kept unchanged the remaining 10%.

    Note: ``tokens`` is modified in place; the returned list is the very same object that was passed in.

    :param tokens: list of str, tokenized sentence (mutated in place).
    :param tokenizer: Tokenizer, object used for tokenization (we need its vocab here)
    :return: (list of str, list of int), masked tokens and related labels for LM prediction.
        At selected positions the label is the vocab id of the original token (or the id of "[UNK]"
        when the token is missing from the vocab); at all other positions it is -1.
    """
    output_label = []
    # Candidate pool for random replacement. Built lazily and at most once per call, instead of
    # re-materialising the whole vocabulary for every replaced token.
    vocab_tokens = None

    for i, token in enumerate(tokens):
        prob = random.random()

        # 85% of tokens are not selected (label -1 will be ignored by the loss function later)
        if prob >= 0.15:
            output_label.append(-1)
            continue

        # rescale the draw to [0, 1) so it can be reused for the 80/10/10 split
        prob /= 0.15

        if prob < 0.8:
            # 80%: change token to mask token
            tokens[i] = "[MASK]"
        elif prob < 0.9:
            # 10%: change token to a random vocabulary token
            if vocab_tokens is None:
                vocab_tokens = list(tokenizer.vocab)
            tokens[i] = random.choice(vocab_tokens)
        # -> remaining 10%: keep the current token

        # the label is always the id of the ORIGINAL token (we will predict these later)
        try:
            output_label.append(tokenizer.vocab[token])
        except KeyError:
            # For unknown words (should not occur with BPE vocab)
            output_label.append(tokenizer.vocab["[UNK]"])
            print("Cannot find token '{}' in vocab. Using [UNK] insetad".format(token))

    return tokens, output_label



In [56]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [62]:
# Show how the tokenizer splits a sample sentence (first line of the output below).
print(tokenizer.tokenize('Hello, I am a achievement guy but I hate it'))
# NOTE(review): this list is hand-written rather than the tokenizer output printed above
# (different wording, original casing, no ',' token). random_word looks every selected token
# up in tokenizer.vocab, so confirm that passing untokenized words here is intended.
random_word(['Hello','I','am','a','great','guy','but','I','hate','it'], tokenizer)

['hello', ',', 'i', 'am', 'a', 'achievement', 'guy', 'but', 'i', 'hate', 'it']


(['Hello', 'I', 'am', 'a', 'great', 'guy', '[MASK]', 'I', 'hate', 'it'],
 [-1, -1, -1, -1, -1, -1, 2021, -1, -1, -1])

In [55]:
# NOTE(review): encode() is called without its required `text` argument, so this cell raises
# the TypeError shown below. Looks like a leftover exploratory call — confirm before keeping.
tokenizer.encode()

TypeError: encode() missing 1 required positional argument: 'text'

In [7]:
# Load a pretrained BertModel for the 'bert-base-uncased' checkpoint.
model = BertModel.from_pretrained('bert-base-uncased')

In [8]:
# Encode a sentence into token ids with default arguments; the displayed list starts with 101
# and ends with 102.
tokenizer.encode("Hello, I am a great guy")

[101, 7592, 1010, 1045, 2572, 1037, 2307, 3124, 102]

In [26]:
# Encode the sentence once (explicitly requesting special tokens) and reuse the result for both
# the printed ids and the displayed length. The string has no placeholders, so it is a plain
# literal rather than an f-string.
encoded_ids = tokenizer.encode("Hello, I am a great guy", add_special_tokens=True)
print(encoded_ids)
len(encoded_ids)

[101, 7592, 1010, 1045, 2572, 1037, 2307, 3124, 102]


9

In [None]:
# Inspect the tokenizer method that builds model inputs by adding special tokens.
# (The original line was missing the '.' and referenced an undefined name.)
tokenizer.build_inputs_with_special_tokens

In [14]:
# Referencing `parameters` without calling it displays the bound method's repr, which embeds
# the printed module structure of the model (see the output below).
model.parameters

<bound method Module.parameters of BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropo

In [15]:
# Encode one sentence, add a leading batch dimension, and run it through `model`.
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids)

# NOTE(review): `model` is a BertModel, and the shapes displayed in the following cells are
# [1, 8, 768] and [1, 768]. These look like per-token hidden states and a single pooled vector
# rather than LM prediction / next-sentence scores — the names may be misleading; confirm.
prediction_scores, seq_relationship_scores = outputs[:2]

In [17]:
# Shape of the first model output (displayed below as [1, 8, 768]).
prediction_scores.shape

torch.Size([1, 8, 768])

In [37]:
# Shape of the second model output (displayed below as [1, 768]).
seq_relationship_scores.shape

torch.Size([1, 768])

In [40]:
# Reload the tokenizer and load BertForMaskedLM from the same 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
lm_model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Encode one sentence, add a leading batch dimension, and run it through the masked-LM model.
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs = lm_model(input_ids)

# Print at most the first two outputs; the output below shows a tuple holding a single tensor.
print(outputs[:2])

(tensor([[[ -7.8962,  -7.8105,  -7.7903,  ...,  -7.0694,  -7.1693,  -4.3590],
         [ -8.4461,  -8.4401,  -8.5044,  ...,  -8.0625,  -7.9909,  -5.7160],
         [-15.2953, -15.4727, -15.5865,  ..., -12.9857, -11.7038, -11.4293],
         ...,
         [-14.0628, -14.2535, -14.3645,  ..., -12.7151, -11.1621, -10.2317],
         [-10.6576, -10.7892, -11.0402,  ..., -10.3233, -10.1578,  -3.7721],
         [-11.3383, -11.4590, -11.1767,  ...,  -9.2152,  -9.5209,  -9.5571]]],
       grad_fn=<AddBackward0>),)
