<a href="https://colab.research.google.com/github/erikapaceep/NLP/blob/main/MLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MLM Training Logic 

In [1]:
!pip install transformers
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m94.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m128.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://u

In [2]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

In [3]:
#initialize the tokenizer and the model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
text = ("After Abraham Lincoln won the November 1860 presidential [MASK] on an "
        "anti-slavery platform, an initial seven slave states declared their "
        "secession from the country to form the Confederacy. War broke out in "
        "April 1861 when secessionist forces [MASK] Fort Sumter in South "
        "Carolina, just over a month after Lincoln's inauguration.")

In [5]:
inputs = tokenizer(text, return_tensors='pt')

In [6]:
outputs = model(**inputs)

In [7]:
# we are just getting the prediction logit
outputs.keys()

odict_keys(['logits'])

In [8]:
outputs.logits.shape

torch.Size([1, 62, 30522])

In [9]:
# in reality we need to mask tokens randomly which is not what we have done and we also need an unmasked vesion of our input as label to feed the model peer and calcualte our loss

In [10]:
text = ("After Abraham Lincoln won the November 1860 presidential election on an "
        "anti-slavery platform, an initial seven slave states declared their "
        "secession from the country to form the Confederacy. War broke out in "
        "April 1861 when secessionist forces attacked Fort Sumter in South "
        "Carolina, just over a month after Lincoln's inauguration.")

In [11]:
# and again we need to convert this into our tokenized tensor
inputs = tokenizer(text, return_tensors='pt')

In [12]:
inputs

{'input_ids': tensor([[  101,  2044,  8181,  5367,  2180,  1996,  2281,  7313,  4883,  2602,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
          1996, 18179,  1012,  2162,  3631,  2041,  1999,  2258,  6863,  2043,
         22965,  2923,  2749,  4457,  3481,  7680,  3334,  1999,  2148,  3792,
          1010,  2074,  2058,  1037,  3204,  2044,  5367,  1005,  1055, 17331,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [13]:
# in order to mask we care about the input ids tensors and what we want is to mask around 15% of the input ids
# rather than than masking 15% of the token we want to have each single token to be masked with 15% of probs of being mask
rand = torch.rand(inputs.input_ids.shape)
rand
# we have a random value form 0 to 1 so we can create a masking rate base if those are

tensor([[0.3682, 0.8260, 0.5363, 0.8361, 0.0736, 0.8251, 0.7306, 0.5325, 0.2379,
         0.6783, 0.2203, 0.2434, 0.0333, 0.4861, 0.0246, 0.5890, 0.2620, 0.2663,
         0.9935, 0.3228, 0.4050, 0.7845, 0.9514, 0.5221, 0.0854, 0.1203, 0.9121,
         0.4440, 0.2491, 0.4823, 0.2525, 0.5367, 0.9694, 0.2839, 0.4496, 0.0106,
         0.8889, 0.3681, 0.7191, 0.7662, 0.1570, 0.3686, 0.2559, 0.2897, 0.7628,
         0.0742, 0.2650, 0.3102, 0.2988, 0.0445, 0.5293, 0.6473, 0.5592, 0.6302,
         0.4618, 0.3068, 0.0862, 0.3319, 0.4716, 0.8258, 0.7603, 0.6342]])

In [14]:
inputs.input_ids != 102

tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True, False]])

In [15]:
mask_arr = (rand < 0.15) & (inputs.input_ids != 102)
mask_arr = (rand < 0.15) * (inputs.input_ids != 102)
mask_arr

tensor([[False, False, False, False,  True, False, False, False, False, False,
         False, False,  True, False,  True, False, False, False, False, False,
         False, False, False, False,  True,  True, False, False, False, False,
         False, False, False, False, False,  True, False, False, False, False,
         False, False, False, False, False,  True, False, False, False,  True,
         False, False, False, False, False, False,  True, False, False, False,
         False, False]])

In [17]:
#index where we have prob < 0.15 
mask_arr[0].nonzero()

tensor([[ 4],
        [12],
        [14],
        [24],
        [25],
        [35],
        [45],
        [49],
        [56]])

In [18]:
# to get this into a list
selection = torch.flatten(mask_arr[0].nonzero()).tolist()
selection

[4, 12, 14, 24, 25, 35, 45, 49, 56]

In [19]:
# copy the original tensor into a new one called label
inputs['labels']=inputs.input_ids.detach().clone()
inputs

{'input_ids': tensor([[  101,  2044,  8181,  5367,  2180,  1996,  2281,  7313,  4883,  2602,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
          1996, 18179,  1012,  2162,  3631,  2041,  1999,  2258,  6863,  2043,
         22965,  2923,  2749,  4457,  3481,  7680,  3334,  1999,  2148,  3792,
          1010,  2074,  2058,  1037,  3204,  2044,  5367,  1005,  1055, 17331,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([

In [20]:
# now we can mask our input ids 
inputs.input_ids[0, selection] = 103

In [21]:
inputs

{'input_ids': tensor([[  101,  2044,  8181,  5367,   103,  1996,  2281,  7313,  4883,  2602,
          2006,  2019,   103,  1011,   103,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,  4161,  2037,   103,   103,  1996,  2406,  2000,  2433,
          1996, 18179,  1012,  2162,  3631,   103,  1999,  2258,  6863,  2043,
         22965,  2923,  2749,  4457,  3481,   103,  3334,  1999,  2148,   103,
          1010,  2074,  2058,  1037,  3204,  2044,   103,  1005,  1055, 17331,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([

In [22]:
outputs = model(**inputs)

In [23]:
outputs.keys()

odict_keys(['loss', 'logits'])

Now we have loss on the top of it, because we have also added labels:

In [24]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [25]:
outputs.loss

tensor(0.6674, grad_fn=<NllLossBackward0>)

In [26]:
import requests

In [27]:
data = requests.get('https://raw.githubusercontent.com/jamescalam/transformers/main/data/text/meditations/clean.txt')

In [43]:
text = data.text

In [29]:
#using text to fine tune our model

In [44]:
# 1. tokenize out model
# we need to split by new line of character, which will give us a list rather than a single string here
inputs = tokenizer(text.split('\n'), return_tensors='pt', max_length=512, truncation=True, padding='max_length')

In [45]:
inputs.input_ids.shape

torch.Size([507, 512])

In [48]:
# we need to create our label tensor by cloning the input_id
inputs['labels'] = inputs.input_ids.detach().clone()

In [50]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [54]:
# now we need to mask our input ids: create a random tensor
rand = torch.rand(inputs.input_ids.shape)
# this time we add another condition as we don't want to mask our padding token either (which is our padding token)
rand_mask = (rand < 0.15) & (inputs.input_ids != 102) & (inputs.input_ids != 101) & & (inputs.input_ids != 0)
rand_mask

tensor([[False, False, False,  ..., False, False, False],
        [ True, False, False,  ...,  True, False,  True],
        [False, False, False,  ..., False,  True, False],
        ...,
        [False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [70]:
rand_mask.shape[0]

507

In [None]:
rand_mask[0].nonzero()

In [None]:
torch.flatten(rand_mask[0].nonzero()).tolist()

In [72]:
# since now we have multiple list we need a single array for each one of those vectors
# we initialize our selection as a list and then rather then loop through all the vectors here
seleciton = []

for i in range(inputs.input_ids.shape[0]):
  sel = torch.flatten(rand_mask[i].nonzero()).tolist()
  selection.append(sel)

In [75]:
for i in range(inputs.input_ids.shape[0]):
  inputs.input_ids[i, selection[i]] = 103

In [76]:
inputs.input_ids

tensor([[  101,  2013,  2026,  ...,     0,     0,     0],
        [  101,  2013,  1996,  ...,     0,     0,     0],
        [  101,  2013,  2026,  ...,     0,     0,     0],
        ...,
        [  101,  3459,  2185,  ...,     0,     0,     0],
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,  7887,   103,  ...,     0,   103,     0]])

In [63]:
inputs

{'input_ids': tensor([[  101,  2013,  2026,  ...,     0,     0,     0],
        [  101,  2013,  1996,  ...,     0,     0,     0],
        [  101,  2013,  2026,  ...,     0,     0,     0],
        ...,
        [  101,  3459,  2185,  ...,     0,     0,     0],
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,  7887,  3288,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[  101,  2013,  2026,  ...,     0,     0,     0],
        [  101,  2013,  1996,  ...,     0,     0,     0],
        [  101,  2013, 