### myBERT   
Create and train my own BERT language model.  
Article: https://www.kdnuggets.com/2021/08/train-bert-model-scratch.html?utm_source=pocket_mylist    
Christoph Windheuser    
August 19, 2021

pip install datasets

In [1]:
from datasets import load_dataset
from tqdm.auto import tqdm
from pathlib import Path
import os
import pickle

import torch
from tokenizers import ByteLevelBPETokenizer
from transformers import RobertaTokenizer
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from transformers import AdamW
from transformers import pipeline


In [19]:
dataset = load_dataset('oscar', 'unshuffled_deduplicated_it')


Reusing dataset oscar (/home/christoph/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_it/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2)


In [20]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 28522082
    })
})

In [21]:
dataset['train']

Dataset({
    features: ['id', 'text'],
    num_rows: 28522082
})

In [22]:
dataset['train'].features

{'id': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None)}

In [23]:
dataset['train'][0]

{'id': 0,
 'text': "La estrazione numero 48 del 10 e LOTTO ogni 5 minuti e' avvenuta sabato 15 settembre 2018 alle ore 04:00 a Roma, nel Centro Elaborazione Dati della Lottomatica Italia (ora GTech SpA), con la supervisione della Amministrazione Autonoma dei Monopoli di Stato (AAMS), incaricata di vigilare sulla regolarità delle operazioni di sorteggio.\nIl Montepremi della 48ª estrazione viene ripartito tra i vincitori delle singole categorie di premio.\nRicorda di controllare il Numero ORO 53. E, se lo hai giocato, anche il DOPPIO ORO 53 e 66. Se indovini puoi vincere premi più ricchi.\nIl nostro sito web impiega cookies per migliorare la navigazione del visitatore. L’utente è consapevole che, continuando a visitare il nostro sito web, accetta l’utilizzo dei cookies Accetto Informazioni\n(C) Copyright 2013-2017 10elotto.biz | Il presente sito è da considerarsi un sito indipendente, NON collegato alla rete ufficiale Gtech SpA."}

In [24]:
# Write about 2852 text_<#>.txt files with 10.000 samples each. (# = 0 until 2852)
# Only do it onece!

# text_data  = []
# file_count = 0

# !pwd

# for sample in tqdm(dataset['train']):
#    sample = sample['text'].replace('\n', '')
#    text_data.append(sample)
#    if len(text_data) == 10_000:

# once we git the 10K mark, save to file

#        with open(f'data/text/oscar_it/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
#            fp.write('\n'.join(text_data))
#        text_data = []
#        file_count += 1

# after saving in 10K chunks, we will have ~2082 leftover samples, we save those now too
# with open(f'data/text/oscar_it/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
#     fp.write('\n'.join(text_data))


In [25]:
paths = [str(x) for x in Path('data/text/oscar_it').glob('**/*.txt')]


In [26]:
paths[:5]

['data/text/oscar_it/text_1732.txt',
 'data/text/oscar_it/text_1927.txt',
 'data/text/oscar_it/text_2374.txt',
 'data/text/oscar_it/text_469.txt',
 'data/text/oscar_it/text_2340.txt']

In [27]:
tokenizer = ByteLevelBPETokenizer()


In [28]:
tokenizer.train(files=paths[:5], vocab_size=30_522, min_frequency=2,
                special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])


In [29]:
# Save tokenizer model at ./filiberto

# os.mkdir('./filiberto')
tokenizer.save_model('filiberto')

['filiberto/vocab.json', 'filiberto/merges.txt']

In [30]:
# initialize the tokenizer using the tokenizer we initialized and saved to file
tokenizer = RobertaTokenizer.from_pretrained('filiberto', max_len=512)


In [31]:
# test our tokenizer on a simple sentence
tokens = tokenizer('ciao, come va?')

In [32]:
print (tokens)

{'input_ids': [0, 15253, 16, 478, 589, 35, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [33]:
tokens.input_ids

[0, 15253, 16, 478, 589, 35, 2]

### MLM Training Input Pipeline (MLM = Masked Language Modeling)    
youTube Video: https://www.youtube.com/watch?v=heTYbpr9mD8


In [17]:
# Masked Language Model
# return a attention mask vector for training, mask ID = 4

def mlm(tensor):
    rand = torch.rand(tensor.shape)
    mask_arr = (rand < 0.15) * (tensor > 2)
    for i in range(tensor.shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        tensor[i, selection] = 4
    return tensor


In [34]:
paths = [str(x) for x in Path('data/text/oscar_it').glob('**/*.txt')]
paths[:5]


['data/text/oscar_it/text_1732.txt',
 'data/text/oscar_it/text_1927.txt',
 'data/text/oscar_it/text_2374.txt',
 'data/text/oscar_it/text_469.txt',
 'data/text/oscar_it/text_2340.txt']

In [19]:
# one iteration path takes 55 seconds

input_ids = []
mask      = []
labels    = []

for path in tqdm(paths[:50]):
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    sample = tokenizer(lines, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
    labels.append(sample.input_ids)
    mask.append(sample.attention_mask)
    input_ids.append(mlm(sample.input_ids.detach().clone()))


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [20]:
# Save objects as pickle files
#

f_input_ids = open("input_ids.pkl", "wb")
pickle.dump(input_ids, f_input_ids)
f_input_ids.close()

f_mask = open("mask.pkl", "wb")
pickle.dump(mask, f_mask)
f_mask.close()

f_labels = open("labels.pkl", "wb")
pickle.dump(labels, f_labels)
f_labels.close()


In [35]:
# Load objects back from the pickle files
# The notebook can be started at this point

f_input_ids = open("input_ids.pkl", "rb")
input_ids   = pickle.load(f_input_ids)
f_input_ids.close()

f_mask = open("mask.pkl", "rb")
mask   = pickle.load(f_mask)
f_mask.close()

f_labels = open("labels.pkl", "rb")
labels   = pickle.load(f_labels)
f_labels.close()


In [36]:
input_ids[:2]

[tensor([[    0,    37,    42,  ...,     1,     1,     1],
         [    0,  2938,   707,  ...,     1,     1,     1],
         [    0, 28423,   290,  ...,     1,     1,     1],
         ...,
         [    0,  1414,  6342,  ...,  1227,     4,     2],
         [    0,    37, 18027,  ...,     4,     4,     2],
         [    0,  1414,  3790,  ...,     1,     1,     1]]),
 tensor([[    0,    12, 21935,  ...,     1,     1,     1],
         [    0,     4,   956,  ...,     1,     1,     1],
         [    0,    48,   324,  ...,   452, 19806,     2],
         ...,
         [    0,  2314,    18,  ...,     1,     1,     1],
         [    0,    52,   520,  ...,     1,     1,     1],
         [    0,  1596,  5261,  ...,     1,     1,     1]])]

In [37]:
 # torch.cat(input_ids[:2])

In [38]:
input_ids = torch.cat(input_ids)
mask      = torch.cat(mask)
labels    = torch.cat(labels)


In [39]:
input_ids[1][:10]

tensor([    0,  2938,   707,   339,   324,  7958,  8426,     4, 16614,   376])

In [40]:
labels[1][:10]

tensor([    0,  2938,   707,   339,   324,  7958,  8426,   343, 16614,   376])

In [41]:
mask[1][:10]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Training the Model

In [42]:
encodings = {
    'input_ids': input_ids,
    'attention_mask': mask,
    'labels': labels
}


In [43]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    def __getitem__(self, i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}


In [44]:
dataset = Dataset(encodings)

In [45]:
# dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)
# this caused an Cuda Out-of-Memory Error!

dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)


In [46]:
config = RobertaConfig(
    vocab_size=tokenizer.vocab_size,       # vocab_size = 30_522
    max_position_embeddings=514,           # = max_length + 2 (for special tokens)
    hidden_size=768, 
    num_attention_heads=12,
#    num_hidden_layers=6,
    num_hidden_layers=6,
    type_vocab_size=1
)

In [47]:
model = RobertaForMaskedLM(config)


In [48]:
torch.cuda.is_available()

True

In [49]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [50]:
torch.cuda.empty_cache()

In [51]:
# move our model over to the selected device
model.to(device)


RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [52]:
optim = AdamW(model.parameters(), lr=1e-4)

## Training Loop

In [54]:
epochs = 1
step = 0

for epoch in range(epochs):
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        
        # pull all tensor batches required for training
        input_ids      = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels         = batch['labels'].to(device)
    
        # process
        outputs = model(input_ids, attention_mask=_attention_mask, labels=labels)
        loss    = outputs.loss
        loss.backward()
        optim.step()
    
        loop.set_description(f'Epoch: {epoch}')
        loop.set_postfix(loss=loss.item())
 

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=125000.0), HTML(value='')))




In [55]:
model.save_pretrained('./filiberto')  # and don't forget to save filiBERTo!


# Testing the Model

In [2]:
fill = pipeline('fill-mask', model='filiberto', tokenizer='filiberto')

Some weights of RobertaModel were not initialized from the model checkpoint at filiberto and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
fill(f'ciao {fill.tokenizer.mask_token} va?')

[{'sequence': 'ciao? va?',
  'score': 0.13754770159721375,
  'token': 35,
  'token_str': '?'},
 {'sequence': "ciao' va?",
  'score': 0.012243658304214478,
  'token': 11,
  'token_str': "'"},
 {'sequence': 'ciao che va?',
  'score': 0.010065736249089241,
  'token': 313,
  'token_str': ' che'},
 {'sequence': 'ciao una va?',
  'score': 0.009317081421613693,
  'token': 376,
  'token_str': ' una'},
 {'sequence': 'ciao la va?',
  'score': 0.009047871455550194,
  'token': 306,
  'token_str': ' la'}]

In [4]:
fill(f'buongiorno, {fill.tokenizer.mask_token} va?')

[{'sequence': 'buongiorno,? va?',
  'score': 0.10432537645101547,
  'token': 35,
  'token_str': '?'},
 {'sequence': 'buongiorno,, va?',
  'score': 0.07070033252239227,
  'token': 16,
  'token_str': ','},
 {'sequence': 'buongiorno, che va?',
  'score': 0.016522590070962906,
  'token': 313,
  'token_str': ' che'},
 {'sequence': 'buongiorno, la va?',
  'score': 0.012377028353512287,
  'token': 306,
  'token_str': ' la'},
 {'sequence': 'buongiorno, di va?',
  'score': 0.011870961636304855,
  'token': 275,
  'token_str': ' di'}]