# Treinando um modelo de linguagem do zero usando Transformers e Tokenizers


O modelo Transformer deste notebook é um modelo chamado ***Bumbabert***.

***Bumbabert*** foi pré-treinado como um modelo pequeno de 84 milhões de parâmetros, usando o mesmo número de camadas e cabeças que o DistilBert, ou seja, 6 camadas, tamanho oculto de 768 e 12 cabeças de atenção. ***Bumbabert*** é então ajustado para uma tarefa downstream de modelagem de linguagem mascarada.


***

# Fase 1: Carregando os dados

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can read and write
# files under that path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

In [None]:
# Load the corpus CSV from the mounted Drive; later cells read its 'text' column.
data = pd.read_csv('/content/drive/MyDrive/projetos/dados/pos-bumbabert/jf_p3d_pp.csv')

In [None]:
# Work on a random subset of at most 1000 rows. The fixed seed makes every
# "Restart & Run All" pick the same documents, and the min() guard avoids the
# ValueError that sample() raises when asked for more rows than exist.
df = data.sample(n=min(1000, len(data)), random_state=42)

# Fase 2: Instalando Hugging Face transformers


In [None]:
!pip install Transformers
!pip install --upgrade accelerate
from accelerate import Accelerator



# Fase 3: Treinando o tokenizer do zero


In [None]:
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

# Every .txt file below the current working directory (recursive search).
# NOTE(review): later cells rebuild `text` from the DataFrame and retrain
# `tokenizer`, so the results of this cell look superseded - confirm whether
# it is still needed.
paths = list(map(str, Path(".").rglob("*.txt")))

# Load each file as UTF-8; undecodable bytes are replaced instead of raising.
# A file that cannot be read is reported and skipped.
file_contents = []
for path in paths:
    try:
        file_contents.append(Path(path).read_text(encoding='utf-8', errors='replace'))
    except Exception as e:
        print(f"Error reading {path}: {e}")

# One newline-separated corpus string.
text = "\n".join(file_contents)

# Fresh byte-level BPE tokenizer, trained on the corpus (passed as a
# one-element iterable).
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    [text],
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

In [None]:
# Concatenate the 'text' column into one newline-separated corpus string.
# Missing values are dropped and every entry is coerced to str, because
# str.join raises TypeError on NaN or other non-string items.
text = "\n".join(df['text'].dropna().astype(str))

In [None]:
# Size of the training corpus, in characters.
len(text)

15155908

In [None]:
from tokenizers import ByteLevelBPETokenizer

# Start from an untrained byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()

# Learn the vocabulary from the in-memory corpus, passed as a one-element
# iterable; the five special tokens are registered during training.
tokenizer.train_from_iterator(
    [text],
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)


# Fase 4: Salvando os arquivos em disco


In [None]:
import os
# Directory that receives the tokenizer files (vocab.json and merges.txt).
token_dir = '/content/drive/MyDrive/projetos/bumbabert/drRoBerta/model1'
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check.
os.makedirs(token_dir, exist_ok=True)
# Reuse the variable so the created directory and the save target cannot drift apart.
tokenizer.save_model(token_dir)

['/content/drive/MyDrive/projetos/bumbabert/drRoBerta/model1/vocab.json',
 '/content/drive/MyDrive/projetos/bumbabert/drRoBerta/model1/merges.txt']

# Step 5: Loading the trained tokenizer files

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
# Directory holding the vocab.json / merges.txt pair written by save_model().
path = "/content/drive/MyDrive/projetos/bumbabert/drRoBerta/model1"
# Rebuild the tokenizer from the two saved files.
tokenizer = ByteLevelBPETokenizer(vocab=f"{path}/vocab.json", merges=f"{path}/merges.txt")

In [None]:
# Sanity check: show the sub-word tokens produced for a sample sentence.
tokenizer.encode("Excelentissimo senhor juiz").tokens

['Ex', 'cel', 'en', 'tis', 'si', 'mo', 'Ġsen', 'h', 'or', 'Ġjuiz']

In [None]:
# Display the full Encoding object (ids, offsets, attention mask, ...) for the
# same sample sentence.
tokenizer.encode("Excelentissimo senhor juiz")

Encoding(num_tokens=10, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [None]:
# (token, id) pairs for the separator and the classifier/start token, looked
# up in the trained vocabulary.
sep_pair = ("</s>", tokenizer.token_to_id("</s>"))
cls_pair = ("<s>", tokenizer.token_to_id("<s>"))

# BertProcessing takes the pairs in (sep, cls) order and adds them around
# each encoded sequence.
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

# Cut encodings that exceed 512 tokens.
tokenizer.enable_truncation(max_length=512)

# Step 6: Checking Resource Constraints: GPU and NVIDIA

In [None]:
# Shell command: report the GPU model, driver/CUDA version and current memory usage.
!nvidia-smi

Fri Jun 28 13:28:26 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
#@title Checking that PyTorch Sees CUDA
import torch
# True when PyTorch can use a CUDA device in this runtime.
torch.cuda.is_available()

True

# Step 7: Defining the configuration of the model

In [None]:
from transformers import RobertaConfig

# Architecture of the model: 6 hidden layers, 12 attention heads, 514 position
# embeddings and a single token-type embedding.
config = RobertaConfig(
    # The embedding table must have one row per token id the tokenizer can
    # emit. The tokenizer is trained with a budget of up to 52_000 tokens, so
    # the size is read from the trained vocabulary instead of being
    # hard-coded; a smaller fixed value risks out-of-range embedding lookups.
    vocab_size=len(tokenizer.get_vocab()),
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [None]:
# Show the full configuration, including the values left at their defaults.
print(config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 30000
}



# Step 8: Reloading the tokenizer in transformers

In [None]:
from transformers import RobertaTokenizer
# Reload the trained vocabulary as a transformers tokenizer.
# `model_max_length` is the argument that sets the tokenizer's maximum
# sequence length; `max_length` is an encode-time argument and does not
# configure that limit here.
tokenizer = RobertaTokenizer.from_pretrained(path, model_max_length=512)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.


# Step 9: Initializing a model from scratch

In [None]:
from transformers import RobertaForMaskedLM

# Build the masked-language model from the configuration alone, i.e. without
# loading any pretrained weights.
model = RobertaForMaskedLM(config=config)
# Print the module tree to inspect the architecture.
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

## Exploring the parameters

In [None]:
# Total number of parameters as reported by the model itself.
print(model.num_parameters())

66586416


In [None]:
# Materialise the parameter tensors once; `LP` and `lp` are reused by later cells.
LP = list(model.parameters())
lp = len(LP)

# Number of parameter tensors, followed by the content of every tensor.
print(lp)
for tensor in LP:
    print(tensor)

106
Parameter containing:
tensor([[-0.0036, -0.0253,  0.0339,  ...,  0.0336,  0.0152,  0.0237],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0038,  0.0013,  0.0162,  ..., -0.0270, -0.0049,  0.0301],
        ...,
        [ 0.0144,  0.0028, -0.0038,  ..., -0.0245, -0.0187,  0.0044],
        [-0.0094,  0.0050,  0.0095,  ..., -0.0070,  0.0265, -0.0209],
        [ 0.0114,  0.0312,  0.0154,  ...,  0.0200, -0.0064, -0.0232]],
       requires_grad=True)
Parameter containing:
tensor([[-0.0263,  0.0030, -0.0003,  ...,  0.0140,  0.0139,  0.0466],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0172,  0.0364, -0.0148,  ...,  0.0260, -0.0221,  0.0099],
        ...,
        [ 0.0149,  0.0072,  0.0051,  ..., -0.0118,  0.0014,  0.0195],
        [ 0.0028, -0.0130,  0.0207,  ...,  0.0089, -0.0126, -0.0328],
        [ 0.0219,  0.0021, -0.0087,  ..., -0.0403, -0.0113, -0.0119]],
       requires_grad=True)
Parameter containing:
tensor([

In [None]:
# List the shape of every parameter tensor, in model order.
LP = list(model.parameters())
for i in range(len(LP)):
    print(f"Shape of tensor {i}: {LP[i].shape}")

Shape of tensor 0: torch.Size([30000, 768])
Shape of tensor 1: torch.Size([514, 768])
Shape of tensor 2: torch.Size([1, 768])
Shape of tensor 3: torch.Size([768])
Shape of tensor 4: torch.Size([768])
Shape of tensor 5: torch.Size([768, 768])
Shape of tensor 6: torch.Size([768])
Shape of tensor 7: torch.Size([768, 768])
Shape of tensor 8: torch.Size([768])
Shape of tensor 9: torch.Size([768, 768])
Shape of tensor 10: torch.Size([768])
Shape of tensor 11: torch.Size([768, 768])
Shape of tensor 12: torch.Size([768])
Shape of tensor 13: torch.Size([768])
Shape of tensor 14: torch.Size([768])
Shape of tensor 15: torch.Size([3072, 768])
Shape of tensor 16: torch.Size([3072])
Shape of tensor 17: torch.Size([768, 3072])
Shape of tensor 18: torch.Size([768])
Shape of tensor 19: torch.Size([768])
Shape of tensor 20: torch.Size([768])
Shape of tensor 21: torch.Size([768, 768])
Shape of tensor 22: torch.Size([768])
Shape of tensor 23: torch.Size([768, 768])
Shape of tensor 24: torch.Size([768])
Sh

In [None]:
# Count the model's parameters tensor by tensor.
# For each tensor this prints its index, every dimension size and its element
# count; the grand total is printed at the end.
total_params = 0  # not named `np`, which would shadow the usual numpy alias
for index, tensor in enumerate(LP):
    # numel() gives the element count for a tensor of any rank, so there is no
    # need to probe the dimensionality with a bare try/except.
    size = tensor.numel()
    total_params += size
    print(index, *tensor.shape, size)  # displaying the sizes of the parameters

print(total_params)  # total number of parameters

0 30000 768 23040000
1 514 768 394752
2 1 768 768
3 768 768
4 768 768
5 768 768 589824
6 768 768
7 768 768 589824
8 768 768
9 768 768 589824
10 768 768
11 768 768 589824
12 768 768
13 768 768
14 768 768
15 3072 768 2359296
16 3072 3072
17 768 3072 2359296
18 768 768
19 768 768
20 768 768
21 768 768 589824
22 768 768
23 768 768 589824
24 768 768
25 768 768 589824
26 768 768
27 768 768 589824
28 768 768
29 768 768
30 768 768
31 3072 768 2359296
32 3072 3072
33 768 3072 2359296
34 768 768
35 768 768
36 768 768
37 768 768 589824
38 768 768
39 768 768 589824
40 768 768
41 768 768 589824
42 768 768
43 768 768 589824
44 768 768
45 768 768
46 768 768
47 3072 768 2359296
48 3072 3072
49 768 3072 2359296
50 768 768
51 768 768
52 768 768
53 768 768 589824
54 768 768
55 768 768 589824
56 768 768
57 768 768 589824
58 768 768
59 768 768 589824
60 768 768
61 768 768
62 768 768
63 3072 768 2359296
64 3072 3072
65 768 3072 2359296
66 768 768
67 768 768
68 768 768
69 768 768 589824
70 768 768
71 768 768

# Step 10: Building the dataset

In [None]:
# Random subset of at most 300 documents used to build the training file.
# The fixed seed keeps the selection identical across re-runs.
d5 = df.sample(n=min(300, len(df)), random_state=42)

In [None]:
# Display the sampled documents.
d5

Unnamed: 0,text
20749,"10ª Turma, Ementa: PODER JUDICIÁRIOTribunal Re..."
34688,"10ª Turma, Ementa: PODER JUDICIÁRIOTribunal Re..."
21056,"10ª Turma, Ementa: PODER JUDICIÁRIOTribunal Re..."
36378,"10ª Turma, Ementa: PODER JUDICIÁRIOTribunal Re..."
45247,"10ª Turma, Ementa: PODER JUDICIÁRIOTribunal Re..."
...,...
32066,"10ª Turma, Ementa: PODER JUDICIÁRIOTribunal Re..."
46451,"10ª Turma, Ementa: PODER JUDICIÁRIOTribunal Re..."
800,"10ª Turma, Ementa: PODER JUDICIÁRIOTribunal Re..."
25205,"10ª Turma, Ementa: PODER JUDICIÁRIOTribunal Re..."


In [None]:
# Merge the sampled documents into a single newline-separated string.
# Missing values are dropped and every entry is coerced to str, because
# str.join raises TypeError on NaN or other non-string items.
docs_join = '\n'.join(d5['text'].dropna().astype(str))

In [None]:
# Preview only the beginning of the concatenated corpus; echoing the whole
# string would flood the notebook output.
docs_join[:1000]

In [None]:
# Persist the concatenated corpus as a plain-text file for the dataset loader.
# The encoding is set explicitly so the accented Portuguese text is always
# written as UTF-8, independent of the platform's default encoding.
with open('/content/drive/MyDrive/projetos/dados/pos-bumbabert/temp_texto_concatenado2.txt', 'w', encoding='utf-8') as arquivo:
    arquivo.write(docs_join)

print("Texto salvo com sucesso no arquivo 'temp_texto_concatenado2.txt'")

Texto salvo com sucesso no arquivo 'temp_texto_concatenado2.txt'


In [None]:
%%time
from transformers import LineByLineTextDataset

# Build the training set from the saved corpus file, encoded with the
# tokenizer reloaded above; block_size=128 is the sequence-length setting
# handed to the dataset.
dataset = LineByLineTextDataset(
    file_path='/content/drive/MyDrive/projetos/dados/pos-bumbabert/temp_texto_concatenado2.txt',
    block_size=128,
    tokenizer=tokenizer,
)



CPU times: user 8.25 s, sys: 10.8 ms, total: 8.26 s
Wall time: 9.8 s


# Step 11: Defining a data collator

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch builder for masked language modelling (mlm=True) with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Step 12: Initializing the trainer

In [None]:
from transformers import Trainer, TrainingArguments

# Training hyper-parameters and checkpoint policy.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/projetos/bumbabert/drRoBerta/model4",
    overwrite_output_dir=True,       # reuse the directory even if it already has content
    num_train_epochs=60,
    per_device_train_batch_size=32,
    save_steps=100,                  # checkpoint interval, in steps
    save_total_limit=2,              # cap on the number of checkpoints kept
)

# Tie together the model, the training set and the masking collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Step 13: Pretraining the model

In [None]:
%%time
# Run the full training loop configured in the Trainer; the returned
# TrainOutput (global step, training loss, metrics) is displayed as the cell result.
trainer.train()

Step,Training Loss
500,1.6873


CPU times: user 4min 35s, sys: 10.6 s, total: 4min 45s
Wall time: 5min 9s


TrainOutput(global_step=600, training_loss=1.5686592229207357, metrics={'train_runtime': 309.145, 'train_samples_per_second': 58.225, 'train_steps_per_second': 1.941, 'total_flos': 596517986304000.0, 'train_loss': 1.5686592229207357, 'epoch': 60.0})

# Step 14: Saving the final model (+tokenizer + config) to disk

> Adicionar aspas



In [None]:
# Final artefacts go next to the training checkpoints.
model_dir = "/content/drive/MyDrive/projetos/bumbabert/drRoBerta/model4"

# Save the trained model weights and configuration.
trainer.save_model(model_dir)

# The Trainer was created without a tokenizer, so save_model() does not write
# the tokenizer files. Store them explicitly so the directory is a
# self-contained checkpoint (model + tokenizer + config).
tokenizer.save_pretrained(model_dir)

# Step 15: Language modeling with FillMaskPipeline

In [None]:
from transformers import pipeline

# Fill-mask pipeline: model weights come from the training output directory,
# the tokenizer from the directory where the trained tokenizer files were saved.
fill_mask = pipeline(
    task="fill-mask",
    tokenizer="/content/drive/MyDrive/projetos/bumbabert/drRoBerta/model1",
    model="/content/drive/MyDrive/projetos/bumbabert/drRoBerta/model4",
)

In [None]:
# Ask the model for the most likely fillers of <mask> in a passage written in
# the style of the training corpus. The accented characters were mis-encoded
# in the prompt and are restored here; the missing spaces between some words
# are left as they were.
fill_mask("Corte Superior de <mask>, a fim de evitar o deslocamentoda competência da Justiça Federal para a Estadual, ou vice-versa, apósdecorrida toda a instrução processual, sufragou entendimento segundo o quala competência é definida, ab initio, em razão do pedido e da causa de pedirpresentes na peça vestibular, e não por sua procedência ou improcedência,legitimidade ou ilegitimidade das partes, ou qualquer outro juízo a respeitoda própria demanda.4. Incompetência da Justiça Federal para julgar a presente demanda que se reconhece.")