<a href="https://colab.research.google.com/github/eldercamposds/LLM_Bert_Roberta/blob/main/LLM_ROBERTA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
!pip install transformers[torch] # Hugging Face



In [29]:
!wget -O ./sample_data/crepusculoDosIdolos.txt https://raw.githubusercontent.com/eldercamposds/LLM_Bert_Roberta/refs/heads/main/crepusculoDosIdolos.txt

--2025-04-04 17:05:25--  https://raw.githubusercontent.com/eldercamposds/LLM_Bert_Roberta/refs/heads/main/crepusculoDosIdolos.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 162098 (158K) [text/plain]
Saving to: ‘./sample_data/crepusculoDosIdolos.txt’


2025-04-04 17:05:25 (4.60 MB/s) - ‘./sample_data/crepusculoDosIdolos.txt’ saved [162098/162098]



In [30]:
PATH = './sample_data/'
data_file = 'crepusculoDosIdolos.txt'

In [31]:
from tokenizers import ByteLevelBPETokenizer

In [32]:
tokenizer = ByteLevelBPETokenizer()

In [33]:
tokenizer.train(files=[PATH+data_file], vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",

]) # construção do vocabulário

In [34]:
print(tokenizer.encode("Hoje é um novo dia!").ids) # tokenizando o texto

[44, 83, 570, 306, 300, 1714, 556, 5]


In [35]:
tokenizer.decode([44, 83, 78, 73, 225, 132, 107, 225, 89, 81, 225, 82, 83, 90, 83, 225, 72, 77, 69, 5]) # revertendo tokenização

'Hoje é um novo dia!'

In [36]:
#vocab.json -> Tokens ordenados por frequência - Converte para ids
#merges.txt -> mapeia o texto

!rm -r ./sample_data/RAW_MODEL
!mkdir ./sample_data/RAW_MODEL
tokenizer.save_model(PATH+"RAW_MODEL")

['./sample_data/RAW_MODEL/vocab.json', './sample_data/RAW_MODEL/merges.txt']

# Building Tokenizer

In [37]:
from transformers import RobertaTokenizer


In [38]:
tokenizer = RobertaTokenizer.from_pretrained(PATH+'RAW_MODEL', max_len=512)

# Creating Transformer

In [39]:
from transformers import RobertaConfig

In [40]:
config = RobertaConfig(

    vocab_size=52_000,
    max_position_embeddings=512,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [41]:
from transformers import RobertaForMaskedLM

In [42]:
model = RobertaForMaskedLM(config=config)

In [43]:
model.num_parameters() #quantidade de parametros

83502880

# Creating Tokenized Dataset

In [44]:
from transformers import LineByLineTextDataset

In [45]:
# loading the file as a dataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=PATH+data_file,
    block_size=128,
)



In [46]:
# verify

dataset.examples[:2]

[{'input_ids': tensor([   0,   69,  276, 1154,  341,  306,  277,  273,   73,  271,  446, 1058,
            18,  352,   35, 1155,  262, 1058,  300,  527, 2240,   35,    2])},
 {'input_ids': tensor([   0,   83,  358, 1142, 3664, 1816,  272,  687, 2688,  781, 4651,  315,
          2377,  271, 2768, 1635,  285,  811, 2375,  527,    2])}]

In [47]:
tokenizer.decode(dataset.examples[0]['input_ids'])


'<s>a ociosidade é mãe de toda psicologia. como? seria a psicologia um... vício?</s>'

In [48]:
tokenizer.decode(dataset.examples[1]['input_ids'])

'<s>o mais corajoso dentre nós dispõe apenas raramente da coragem de afirmar aquilo que sabe verdadeiramente...</s>'

# Training the model



*   Data Collators are strategies for building batches of data to train the model. They create lists of samples from the dataset and allow Pytorch to apply backpropagation appropriately.
*   Probability = probability of masking input tokens



In [49]:
from transformers import DataCollatorForLanguageModeling

In [50]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [51]:
from transformers import Trainer, TrainingArguments

In [52]:
training_args = TrainingArguments(
    output_dir=PATH+"RAW_MODEL",
    overwrite_output_dir=True,
    num_train_epochs=1200,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

In [53]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [54]:
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33meldercamposdev[0m ([33meldercamposdev-zerelder[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,6.4302
1000,4.9149
1500,3.6797
2000,2.522
2500,1.5767
3000,0.9133
3500,0.5248
4000,0.3298
4500,0.2432


TrainOutput(global_step=4800, training_loss=2.2149046881993613, metrics={'train_runtime': 5534.127, 'train_samples_per_second': 49.005, 'train_steps_per_second': 0.867, 'total_flos': 8992119855513600.0, 'train_loss': 2.2149046881993613, 'epoch': 1200.0})

In [55]:
trainer.save_model(PATH+'RAW_MODEL')

# Test Model

In [56]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=PATH+'RAW_MODEL',
    tokenizer=PATH+'RAW_MODEL'
)

Device set to use cuda:0


In [57]:
texto = "o amor é <mask>!"
fill_mask(texto)

[{'score': 0.11748901009559631,
  'token': 1603,
  'token_str': ' causalidade',
  'sequence': 'o amor é causalidade!'},
 {'score': 0.07969565689563751,
  'token': 1490,
  'token_str': ' causas',
  'sequence': 'o amor é causas!'},
 {'score': 0.06356725096702576,
  'token': 1196,
  'token_str': 'elo',
  'sequence': 'o amor éelo!'},
 {'score': 0.056110233068466187,
  'token': 644,
  'token_str': 'pos',
  'sequence': 'o amor épos!'},
 {'score': 0.03202644735574722,
  'token': 700,
  'token_str': ' er',
  'sequence': 'o amor é er!'}]