In [2]:
import torch
from torch import nn
from transformers.models.bert import BertTokenizer,BertModel,BertForMaskedLM

#### 1. load model and data processing

In [3]:
model_name = "../../models/bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
bert = BertModel.from_pretrained(model_name)
mlm = BertForMaskedLM.from_pretrained(model_name, output_hidden_states=True)

Some weights of the model checkpoint at ../../models/bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
bert

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [7]:
text = ("After Abraham Lincoln won the November 1860 presidential "
        "election on an anti-slavery platform, an initial seven "
        "slave states declared their secession from the country "
        "to form the Confederacy. War broke out in April 1861 "
        "when secessionist forces attacked Fort Sumter in South "
        "Carolina, just over a month after Lincoln's "
        "inauguration.")

In [8]:
text

"After Abraham Lincoln won the November 1860 presidential election on an anti-slavery platform, an initial seven slave states declared their secession from the country to form the Confederacy. War broke out in April 1861 when secessionist forces attacked Fort Sumter in South Carolina, just over a month after Lincoln's inauguration."

In [51]:
inputs = tokenizer(text, return_tensors='pt')

In [52]:
inputs

{'input_ids': tensor([[  101,  2044,  8181,  5367,  2180,  1996,  2281,  7313,  4883,  2602,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
          1996, 18179,  1012,  2162,  3631,  2041,  1999,  2258,  6863,  2043,
         22965,  2923,  2749,  4457,  3481,  7680,  3334,  1999,  2148,  3792,
          1010,  2074,  2058,  1037,  3204,  2044,  5367,  1005,  1055, 17331,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [53]:
inputs["input_ids"].shape

torch.Size([1, 62])

In [54]:
' '.join(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]))

"[CLS] after abraham lincoln won the november 1860 presidential election on an anti - slavery platform , an initial seven slave states declared their secession from the country to form the confederacy . war broke out in april 1861 when secession ##ist forces attacked fort sum ##ter in south carolina , just over a month after lincoln ' s inauguration . [SEP]"

#### 2. masking

In [55]:
inputs["labels"] = inputs['input_ids'].detach().clone()

In [56]:
inputs["labels"]

tensor([[  101,  2044,  8181,  5367,  2180,  1996,  2281,  7313,  4883,  2602,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
          1996, 18179,  1012,  2162,  3631,  2041,  1999,  2258,  6863,  2043,
         22965,  2923,  2749,  4457,  3481,  7680,  3334,  1999,  2148,  3792,
          1010,  2074,  2058,  1037,  3204,  2044,  5367,  1005,  1055, 17331,
          1012,   102]])

In [57]:
mask = torch.rand(inputs["input_ids"].shape) < 0.15

In [58]:
mask

tensor([[False, False, False,  True,  True, False, False, False, False,  True,
         False, False, False,  True, False,  True, False, False,  True, False,
         False, False, False,  True, False, False, False, False, False, False,
         False, False, False, False,  True, False, False, False, False,  True,
         False, False, False, False, False, False,  True, False,  True, False,
         False, False, False, False, False, False, False, False, False, False,
          True,  True]])

In [59]:
sum(mask[0])

tensor(13)

In [60]:
10/62

0.16129032258064516

In [61]:
mask_arr = (torch.rand(inputs['input_ids'].shape) < 0.15) * (inputs['input_ids'] != 101) * (inputs['input_ids'] != 102)

In [62]:
mask_arr

tensor([[False, False, False, False, False, False, False, False,  True, False,
         False, False, False, False, False,  True, False, False, False,  True,
         False, False, False, False, False, False, False, False, False, False,
         False, False,  True,  True, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False,  True, False, False,  True, False, False, False,
         False, False]])

In [63]:
sum(mask_arr[0])

tensor(7)

In [64]:
selection = torch.flatten(mask_arr[0].nonzero()).tolist()

In [65]:
selection

[8, 15, 19, 32, 33, 53, 56]

In [66]:
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [67]:
tokenizer.vocab["[MASK]"]

103

In [68]:
inputs['input_ids'][0, selection] = 103

In [69]:
inputs

{'input_ids': tensor([[  101,  2044,  8181,  5367,  2180,  1996,  2281,  7313,   103,  2602,
          2006,  2019,  3424,  1011,  8864,   103,  1010,  2019,  3988,   103,
          6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
          1996, 18179,   103,   103,  3631,  2041,  1999,  2258,  6863,  2043,
         22965,  2923,  2749,  4457,  3481,  7680,  3334,  1999,  2148,  3792,
          1010,  2074,  2058,   103,  3204,  2044,   103,  1005,  1055, 17331,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([

In [70]:
" ".join(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]))

"[CLS] after abraham lincoln won the november 1860 [MASK] election on an anti - slavery [MASK] , an initial [MASK] slave states declared their secession from the country to form the confederacy [MASK] [MASK] broke out in april 1861 when secession ##ist forces attacked fort sum ##ter in south carolina , just over [MASK] month after [MASK] ' s inauguration . [SEP]"

In [71]:
" ".join(tokenizer.convert_ids_to_tokens(inputs['labels'][0]))

"[CLS] after abraham lincoln won the november 1860 presidential election on an anti - slavery platform , an initial seven slave states declared their secession from the country to form the confederacy . war broke out in april 1861 when secession ##ist forces attacked fort sum ##ter in south carolina , just over a month after lincoln ' s inauguration . [SEP]"

#### 3. forward and calculate loss

In [72]:
mlm.eval()
with torch.no_grad():
        output = mlm(**inputs)

In [73]:
output.keys()

odict_keys(['loss', 'logits', 'hidden_states'])

In [74]:
output.logits

tensor([[[ -7.1467,  -7.0860,  -7.1372,  ...,  -6.3451,  -6.2599,  -4.3199],
         [-12.2118, -12.0553, -12.3142,  ..., -11.6241, -10.9304,  -9.4274],
         [ -6.4680,  -6.5631,  -6.0842,  ...,  -6.2033,  -6.2894,  -4.8912],
         ...,
         [ -3.3077,  -3.5327,  -3.2557,  ...,  -2.5559,  -2.6619,  -6.6899],
         [-13.9408, -13.8837, -13.9085,  ..., -10.9444, -10.8564,  -9.8597],
         [-12.4091, -12.7295, -12.5287,  ..., -11.6290, -10.9041,  -8.2796]]])

In [75]:
output.loss

tensor(0.6831)

In [78]:
type(output["hidden_states"])

tuple

In [79]:
len(output["hidden_states"])

13

In [77]:
output["hidden_states"][-1]

tensor([[[-0.3972,  0.0693, -0.2755,  ..., -0.2881, -0.1247,  0.4333],
         [-0.6269, -0.0257,  0.2558,  ..., -0.4460,  0.1589,  0.5990],
         [-0.5807,  1.0084, -0.7404,  ..., -0.4955, -0.3250,  0.4461],
         ...,
         [-0.3943,  0.2768, -0.8203,  ..., -0.4461, -0.7303,  1.1885],
         [ 0.5834,  0.0123, -0.3591,  ...,  0.0985, -0.6580, -0.0154],
         [ 0.0499,  0.1532, -0.5158,  ..., -0.2261, -0.8722, -0.0483]]])

#### 4. from scratch

In [80]:
mlm.cls(output['hidden_states'][-1])

tensor([[[ -7.1467,  -7.0860,  -7.1372,  ...,  -6.3451,  -6.2599,  -4.3199],
         [-12.2118, -12.0553, -12.3142,  ..., -11.6241, -10.9304,  -9.4274],
         [ -6.4680,  -6.5631,  -6.0842,  ...,  -6.2033,  -6.2894,  -4.8912],
         ...,
         [ -3.3077,  -3.5327,  -3.2557,  ...,  -2.5559,  -2.6619,  -6.6899],
         [-13.9408, -13.8837, -13.9085,  ..., -10.9444, -10.8564,  -9.8597],
         [-12.4091, -12.7295, -12.5287,  ..., -11.6290, -10.9041,  -8.2796]]],
       grad_fn=<ViewBackward0>)

In [81]:
last_hidden_state = output['hidden_states'][-1]

In [83]:
last_hidden_state.shape

torch.Size([1, 62, 768])

In [85]:
mlm.eval()
with torch.no_grad():
        transformed = mlm.cls.predictions.transform(last_hidden_state)
        print(transformed.shape)
        logits = mlm.cls.predictions.decoder(transformed)
        print(logits.shape)
logits

torch.Size([1, 62, 768])
torch.Size([1, 62, 30522])


tensor([[[ -7.1467,  -7.0860,  -7.1372,  ...,  -6.3451,  -6.2599,  -4.3199],
         [-12.2118, -12.0553, -12.3142,  ..., -11.6241, -10.9304,  -9.4274],
         [ -6.4680,  -6.5631,  -6.0842,  ...,  -6.2033,  -6.2894,  -4.8912],
         ...,
         [ -3.3077,  -3.5327,  -3.2557,  ...,  -2.5559,  -2.6619,  -6.6899],
         [-13.9408, -13.8837, -13.9085,  ..., -10.9444, -10.8564,  -9.8597],
         [-12.4091, -12.7295, -12.5287,  ..., -11.6290, -10.9041,  -8.2796]]])

In [86]:
output.loss

tensor(0.6831)

#### 5. loss and translate

In [87]:

ce = nn.CrossEntropyLoss()

In [88]:
logits.shape

torch.Size([1, 62, 30522])

In [89]:
inputs["labels"].shape

torch.Size([1, 62])

In [93]:
inputs["labels"][0].view(-1).shape

torch.Size([62])

In [94]:
ce(logits[0], inputs["labels"][0].view(-1))

tensor(0.6831)

In [95]:
torch.argmax(logits[0], dim=-1)

tensor([ 1012,  2044,  8181,  5367,  2180,  1996,  2281,  7313,  4883,  2602,
         2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
         6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
         1996, 18179,  1012,  4808, 12591, 16591,  1999,  2258,  6863,  2043,
        22965,  2923,  2749,  4457,  3481,  7680,  3334,  1999,  2148,  3792,
         1010,  2074,  2058,  1037,  3204,  2044,  5367,  1005,  1055, 17331,
         1012,  1055])

In [96]:
" ".join(tokenizer.convert_ids_to_tokens(torch.argmax(logits[0], dim=-1)))

". after abraham lincoln won the november 1860 presidential election on an anti - slavery platform , an initial seven slave states declared their secession from the country to form the confederacy . violence erupted unrest in april 1861 when secession ##ist forces attacked fort sum ##ter in south carolina , just over a month after lincoln ' s inauguration . s"

In [97]:
" ".join(tokenizer.convert_ids_to_tokens(inputs["labels"][0]))

"[CLS] after abraham lincoln won the november 1860 presidential election on an anti - slavery platform , an initial seven slave states declared their secession from the country to form the confederacy . war broke out in april 1861 when secession ##ist forces attacked fort sum ##ter in south carolina , just over a month after lincoln ' s inauguration . [SEP]"