In [1]:
%config IPCompleter.use_jedi=False

# Fine Tune ModernBERT

## Referências
1. Warner, Benjamin, et al. ["Finally, a Replacement for BERT." Hugging Face, 19 Dec. 2024, huggingface.co/blog/modernbert](https://huggingface.co/blog/modernbert).
2. Stijn Smits. ["Fine-tuning ModernBERT on a Dutch Dataset with Custom Tokenizer Training" GitHub, 14 Fev. 2025, https://github.com/s-smits/modernbert-finetune](https://github.com/s-smits/modernbert-finetune).


## Training a WordPiece tokenizer

Para treinar um novo tokenizador, é preciso seguir os seguintes passos

A. Configure os parâmetros `DATASET_NAME`, `TOKENIZER_SAVE_PATH`, `VOCAB_SIZE` e `NUM_EXAMPLES_TO_TRAIN`

In [2]:
DATASET_NAME = None
TOKENIZER_SAVE_PATH = "domain_tokenizer"
VOCAB_SIZE = 32768
NUM_EXAMPLES_TO_TRAIN = 3_634_908
MODEL_TYPE = "bpe"
BATCH_SIZE = 1_000

**Importando o dataset**

In [3]:
import re
import pandas as pd
from datasets import Dataset

#  baixar arquivo de https://github.com/emdemor/News-of-the-Brazilian-Newspaper/blob/main/data/brazilian-news.parquet
df = pd.read_parquet("../data/brazilian-news.parquet")

temp = df.sample(min(NUM_EXAMPLES_TO_TRAIN, len(df)))
texts = temp["text"].to_list() + temp["title"].to_list()



def dividir_em_frases(texto):
    frases = re.split(r'(?<=[.!?])\s+', texto)
    return [frase.strip() for frase in frases if frase.strip()]

texts = []
for string in temp["text"].to_list() + temp["title"].to_list():
    if string:
        frases = dividir_em_frases(string)
        texts.extend(frases)

texts = list(set(texts))
len(texts)

3634908

In [4]:
import os

if os.environ.get("TRANSFORMERS_CACHE"):
    os.environ["HF_HOME"] = os.environ.pop("TRANSFORMERS_CACHE")

import json
from itertools import islice

import pandas as pd
from datasets import Dataset
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer
from transformers import AutoTokenizer
from transformers.models.bert.tokenization_bert import BertTokenizer, BasicTokenizer

dataset = Dataset.from_dict({"text": texts})
dataset_iterator = iter(dataset)

# Cria o tokenizer com o modelo WordPiece
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
tokenizer.normalizer = normalizers.Sequence([])

trainer = WordPieceTrainer(
    vocab_size=VOCAB_SIZE,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    min_frequency=2,
)


def batch_iterator(batch_size=BATCH_SIZE):
    total_batches = (NUM_EXAMPLES_TO_TRAIN + batch_size - 1) // batch_size
    from tqdm import (
        tqdm,  # Certifique-se de importar o tqdm se ainda não estiver importado
    )

    with tqdm(
        total=NUM_EXAMPLES_TO_TRAIN, desc="Treinando tokenizer", unit="exemplos"
    ) as pbar:
        for i in range(0, NUM_EXAMPLES_TO_TRAIN, batch_size):
            batch_texts = dataset[i : i + batch_size]["text"]
            pbar.update(len(batch_texts))
            yield batch_texts


# Treina o tokenizer
tokenizer.train_from_iterator(
    batch_iterator(), trainer=trainer, length=NUM_EXAMPLES_TO_TRAIN
)

# Cria o diretório se não existir e salva o tokenizer
os.makedirs(TOKENIZER_SAVE_PATH, exist_ok=True)
tokenizer_file = os.path.join(TOKENIZER_SAVE_PATH, "tokenizer.json")
tokenizer.save(tokenizer_file)
print(f"Tokenizer trained and saved to {TOKENIZER_SAVE_PATH}")

# Cria automaticamente o arquivo config.json se não existir, informando o model_type
config_path = os.path.join(TOKENIZER_SAVE_PATH, "config.json")
if not os.path.exists(config_path):
    config = {"model_type": "bert"}
    with open(config_path, "w") as f:
        json.dump(config, f)


Treinando tokenizer: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3634908/3634908 [00:17<00:00, 210689.46exemplos/s]





Tokenizer trained and saved to domain_tokenizer


In [None]:

# # Agora o AutoTokenizer conseguirá carregar o tokenizer corretamente
# tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_SAVE_PATH)

In [6]:
# import os

# if os.environ.get("TRANSFORMERS_CACHE"):
#     os.environ["HF_HOME"] = os.environ.pop("TRANSFORMERS_CACHE")

# import json
# from itertools import islice

# import pandas as pd
# from datasets import Dataset
# from tokenizers import Tokenizer
# from tokenizers.models import WordPiece
# from tokenizers.pre_tokenizers import Whitespace
# from tokenizers.trainers import WordPieceTrainer
# from transformers import AutoTokenizer

# #  baixar arquivo de https://github.com/emdemor/News-of-the-Brazilian-Newspaper/blob/main/data/brazilian-news.parquet
# df = pd.read_parquet("../data/brazilian-news.parquet")

# temp = df.sample(min(NUM_EXAMPLES_TO_TRAIN, len(df)))
# texts = temp["text"].to_list() + temp["title"].to_list()
# texts = [x[:MAX_CHAR_LENGTH] for x in texts if x]

# dataset = Dataset.from_dict({"text": texts})
# dataset_iterator = iter(dataset)

# tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
# tokenizer.pre_tokenizer = Whitespace()

# trainer = WordPieceTrainer(
#     vocab_size=VOCAB_SIZE,
#     special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
#     min_frequency=2,
# )


# def batch_iterator(batch_size=BATCH_SIZE):
#     total_batches = (NUM_EXAMPLES_TO_TRAIN + batch_size - 1) // batch_size
#     with tqdm(
#         total=NUM_EXAMPLES_TO_TRAIN, desc="Treinando tokenizer", unit="exemplos"
#     ) as pbar:
#         for i in range(0, NUM_EXAMPLES_TO_TRAIN, batch_size):
#             batch_texts = dataset[i : i + batch_size]["text"]
#             pbar.update(len(batch_texts))
#             yield batch_texts


# tokenizer.train_from_iterator(
#     batch_iterator(), trainer=trainer, length=NUM_EXAMPLES_TO_TRAIN
# )
# os.makedirs(TOKENIZER_SAVE_PATH, exist_ok=True)
# tokenizer.save(os.path.join(TOKENIZER_SAVE_PATH, "tokenizer.json"))
# print(f"Tokenizer trained and saved to {TOKENIZER_SAVE_PATH}")

# # Cria automaticamente o arquivo config.json se não existir
# config_path = os.path.join(TOKENIZER_SAVE_PATH, "config.json")
# if not os.path.exists(config_path):
#     config = {"model_type": "bert"}  # Altere para o tipo adequado se necessário
#     with open(config_path, "w") as f:
#         json.dump(config, f)


# tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

## Fine-tuning the ModernBERT-base model 

In [8]:
# !pip uninstall torchvision -y
# !pip install torchvision==0.18.0 -f https://download.pytorch.org/whl/torch_stable.html -qqq
# !pip install --upgrade 'optree>=0.13.0' -qqq
# !pip install -U torch torch-adopt torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
# !pip install -U torch-adopt -qqq

# !pip uninstall transformers -y
# !pip install git+https://github.com/huggingface/transformers.git

# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

[0mCollecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-llqjm_pk
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-llqjm_pk
  Resolved https://github.com/huggingface/transformers.git to commit 336dc69d63d56f232a183a3e7f52790429b871ef
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers==4.49.0.dev0)
  Downloading huggingface_hub-0.28.1-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers==4.49.0.dev0)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading huggingface_hub-0.28.1-py3-none-any.whl (464 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [3]:
import re
import pandas as pd
from datasets import Dataset

#  baixar arquivo de https://github.com/emdemor/News-of-the-Brazilian-Newspaper/blob/main/data/brazilian-news.parquet
df = pd.read_parquet("../data/brazilian-news.parquet")

temp = df.sample(min(NUM_EXAMPLES_TO_TRAIN, len(df)))
texts = temp["text"].to_list() + temp["title"].to_list()



def dividir_em_frases(texto):
    frases = re.split(r'(?<=[.!?])\s+', texto)
    return [frase.strip() for frase in frases if frase.strip()]

texts = []
for string in temp["text"].to_list() + temp["title"].to_list():
    if string:
        frases = dividir_em_frases(string)
        texts.extend(frases)

texts = list(set(texts))
len(texts)

dataset = Dataset.from_dict({"text": texts})
dataset_iterator = iter(dataset)

In [4]:
import os

if os.environ.get("TRANSFORMERS_CACHE"):
    os.environ["HF_HOME"] = os.environ.pop("TRANSFORMERS_CACHE")


import math
import shutil
from typing import Any, Dict, List

import torch
import torch.nn as nn
from datasets import load_dataset
from huggingface_hub import Repository, whoami
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import (
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    get_linear_schedule_with_warmup,
)

In [5]:
# --- Configuration ---
model_checkpoint = "answerdotai/ModernBERT-base"
username = "emdemor"
tokenizer_path = "domain_tokenizer"  # Path to custom tokenizer directory

# --- Dataset size (in rows) ---
estimated_dataset_size_in_rows = 3_500_000

# --- Training Config ---
num_train_epochs = 1
# Reduce or remove chunk size to allow for dynamic batching
chunk_size = None  # Remove chunk size
per_device_train_batch_size = 4
gradient_accumulation_steps = 2
eval_size_ratio = 0.05
total_save_limit = 2

effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps
total_steps_per_epoch = math.ceil(estimated_dataset_size_in_rows / effective_batch_size)
total_train_steps = total_steps_per_epoch * num_train_epochs
eval_size_per_chunk = int(estimated_dataset_size_in_rows * eval_size_ratio)

# --- Testing Mode ---
TESTING = True  # Set to True for testing, False for full training
FLASH_ATTENTION = True

if TESTING:
    push_interval = 10_000
else:
    push_interval = 100_000

In [6]:
if FLASH_ATTENTION:
    try:
        import flash_attn

        print("FlashAttention is already installed.")
    except ImportError:
        print("FlashAttention is not installed. Installing...")
        try:
            import subprocess

            subprocess.run(
                ["pip", "install", "flash-attn", "--no-build-isolation"], check=True
            )
            import flash_attn

            print("FlashAttention installed successfully.")
        except Exception as e:
            print(f"Error installing FlashAttention: {e}")
            exit()

FlashAttention is already installed.


In [7]:
# --- Flash-attn Integration Check ---
try:
    from flash_attn.flash_attention import FlashAttention

    print("FlashAttention is available.")
    flash_attn_available = True
except ImportError:
    print("FlashAttention is not available. Using standard attention.")
    flash_attn_available = False

FlashAttention is not available. Using standard attention.


In [8]:
# --- Load Tokenizer and Model ---
print(f"Loading model and tokenizer from {model_checkpoint}...")

# Check if custom tokenizer exists, otherwise use default
if os.path.exists(tokenizer_path) and any(
    fname.startswith("spm") for fname in os.listdir(tokenizer_path)
):
    print(f"Loading custom SentencePiece tokenizer from {tokenizer_path}...")
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    # Add the pad_token if it's not already in the tokenizer
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        model.resize_token_embeddings(len(tokenizer))
elif os.path.exists(tokenizer_path) and os.path.isfile(
    os.path.join(tokenizer_path, "tokenizer.json")
):
    print(f"Loading custom tokenizer from {tokenizer_path}...")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
else:
    print(f"Using default tokenizer from {model_checkpoint}...")
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint, use_auth_token=huggingface_token
    )

print(f"Loading model config from {model_checkpoint}...")
config = AutoConfig.from_pretrained(
    model_checkpoint,  # use_auth_token=huggingface_token
)
config.torch_dtype = "float16"
print(f"Model config loaded and modified: {config}")

model = AutoModelForMaskedLM.from_pretrained(
    model_checkpoint,
    config=config,  # use_auth_token=huggingface_token
)

# --- Device Configuration ---
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
model.to(device)

print("Model and tokenizer loaded.")

Loading model and tokenizer from answerdotai/ModernBERT-base...
Loading custom tokenizer from domain_tokenizer...
Loading model config from answerdotai/ModernBERT-base...
Model config loaded and modified: ModernBertConfig {
  "_name_or_path": "answerdotai/ModernBERT-base",
  "architectures": [
    "ModernBertForMaskedLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 50281,
  "classifier_activation": "gelu",
  "classifier_bias": false,
  "classifier_dropout": 0.0,
  "classifier_pooling": "mean",
  "cls_token_id": 50281,
  "decoder_bias": true,
  "deterministic_flash_attn": false,
  "embedding_dropout": 0.0,
  "eos_token_id": 50282,
  "global_attn_every_n_layers": 3,
  "global_rope_theta": 160000.0,
  "gradient_checkpointing": false,
  "hidden_activation": "gelu",
  "hidden_size": 768,
  "initializer_cutoff_factor": 2.0,
  "initializer_range": 0.02,
  "intermediate_size": 1152,
  "layer_norm_eps": 1e-05,
  "local_attention": 128,
  "local_rope_theta": 1000

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Using device: cuda:0
Model and tokenizer loaded.


In [9]:
flash_attn_available = True

# --- Integrate Flash-attn (if available) ---
if flash_attn_available:
    print("Replacing standard attention with FlashAttention...")
    for module in model.modules():
        if isinstance(module, nn.MultiheadAttention):
            module.attention = FlashAttention()
    print("FlashAttention integrated.")

Replacing standard attention with FlashAttention...
FlashAttention integrated.


In [10]:
# --- Tokenization Function ---
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        # No truncation and max_length to allow dynamic padding truncation=True, max_length=chunk_size, padding="longest",
        return_special_tokens_mask=True,
    )


In [11]:
# --- Tokenize Dataset ---
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)
print("Dataset tokenized.")

Tokenizing dataset...


Map:   0%|          | 0/3634908 [00:00<?, ? examples/s]

Dataset tokenized.


In [12]:
model_name = model_checkpoint.split("/")[-1]
output_dir = f"{model_name}-ptbr-{'test' if TESTING else 'full'}"
repo_name = f"{username}/{output_dir}"


In [13]:
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)

In [14]:
from torch.optim import AdamW


# --- Optimizer and Scheduler ---
optimizer = AdamW(model.parameters(), lr=5e-4, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_train_steps
)

# --- AMP scaler for mixed precision ---
scaler = torch.amp.GradScaler('cuda', enabled=(device.type == "cuda"))

In [15]:
# --- Helper Function to Fix Batch Inputs ---
def fix_batch_inputs(inputs: dict) -> dict:
    """
    Ensures that input tensors have the correct shape and dtype.
    - Removes any extra dimensions (e.g., [1, batch, seq_len] -> [batch, seq_len]).
    - Casts input_ids to torch.long.
    """
    for key in ["input_ids", "attention_mask", "token_type_ids"]:
        if key in inputs:
            if inputs[key].dim() == 3 and inputs[key].shape[0] == 1:
                inputs[key] = inputs[key].squeeze(0)
            elif inputs[key].dim() > 2:
                raise ValueError(
                    f"Unexpected tensor shape for {key}: {inputs[key].shape}"
                )
    if "input_ids" in inputs and inputs["input_ids"].dtype != torch.long:
        inputs["input_ids"] = inputs["input_ids"].long()
    return inputs

# --- Forward Pass Function ---
def forward_pass(model, inputs):
    """
    Performs a forward pass with autocast for FP16.
    Returns the loss.
    """
    inputs = fix_batch_inputs(inputs)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.amp.autocast('cuda', enabled=(device.type == "cuda")):
        outputs = model(**inputs, return_dict=True)
    if outputs.loss is None:
        raise ValueError("Model did not return a loss.")
    return outputs.loss

# --- Evaluation Function ---
def evaluate(model, eval_dataset, data_collator):
    """
    Evaluates the model on the evaluation dataset.
    Returns the average loss.
    """
    model.eval()
    losses = []
    eval_iterator = eval_dataset.iter(batch_size=per_device_train_batch_size)
    for batch in tqdm(eval_iterator, desc="Evaluating"):
        with torch.no_grad(), torch.amp.autocast('cuda',
            enabled=(device.type == "cuda")
        ):
            inputs = data_collator(batch)
            try:
                loss = forward_pass(model, inputs)
                losses.append(loss.item())
            except Exception as e:
                print(f"Evaluation batch failed: {e}. Skipping.")
                continue
    model.train()
    average_loss = sum(losses) / len(losses) if losses else float("inf")
    return average_loss

In [16]:
class DynamicPaddingDataCollator(DataCollatorForLanguageModeling):
    """
    Data collator that dynamically pads the inputs for language modeling.
    This ensures that all sequences within a batch have the same length,
    but the overall length can vary between batches.
    """

    def __call__(self, examples: Dict[str, Any]) -> Dict[str, torch.Tensor]:
        # Find the maximum length within the current batch
        max_length = max(len(input_ids) for input_ids in examples['input_ids'])

        # Pad or truncate each example to the max_length
        batch = []
        input_ids = examples["input_ids"]
        attention_mask = examples["attention_mask"]

        for ids, mask in zip(input_ids, attention_mask):
            padding_length = max_length - len(ids)
            if padding_length > 0:
                # Pad
                ids = torch.tensor(ids + [self.tokenizer.pad_token_id] * padding_length)
                mask = torch.tensor(mask + [0] * padding_length)
            elif padding_length <= 0:
                # Truncate (if enabled in your tokenizer)
                ids = torch.tensor(ids[:max_length])
                mask = torch.tensor(mask[:max_length])

            batch.append({"input_ids": ids, "attention_mask": mask})

        # Apply the rest of the data collation logic (MLM masking, etc.)
        batch = self.torch_call(batch)  # Use torch_call instead of __call__ to call the parent's method

        # Ensure correct shapes and dtypes
        batch = fix_batch_inputs(batch)

        return batch


In [17]:
mlm_probabilities = [0.3, 0.2, 0.18, 0.16, 0.14]

chunk_size_dataset = len(dataset) // len(mlm_probabilities)

In [18]:
model.train()
global_step = 0

In [None]:
for epoch in range(num_train_epochs):
    for i, mlm_probability in enumerate(mlm_probabilities):
        print(
            f"\nEpoch {epoch + 1}/{num_train_epochs}, MLM Probability: {mlm_probability}"
        )

        data_collator = DynamicPaddingDataCollator(
            tokenizer=tokenizer, mlm_probability=mlm_probability
        )

        train_dataset = (
            tokenized_dataset.skip(
                i * chunk_size_dataset + eval_size_per_chunk
            )
            .take(chunk_size_dataset)
            .shuffle(seed=42)
        )
        eval_dataset = tokenized_dataset.skip(i * chunk_size_dataset).take(
            eval_size_per_chunk
        )

        train_iterator = train_dataset.iter(batch_size=per_device_train_batch_size)
        for step, batch in enumerate(
            tqdm(train_iterator, desc=f"Training (MLM {mlm_probability})")
        ):
            try:
                inputs = data_collator(batch)
                loss = forward_pass(model, inputs)
            except Exception as e:
                print(f"Training batch failed: {e}. Skipping.")
                continue

            scaler.scale(loss / gradient_accumulation_steps).backward()

            if (step + 1) % gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()
                torch.cuda.empty_cache()  # Clear cache
                global_step += 1


                # Evaluation
                eval_interval = total_steps_per_epoch // (num_train_epochs * 4)
                if eval_interval > 0 and (global_step % eval_interval == 0):
                    eval_loss = evaluate(model, eval_dataset, data_collator)
                    print(f"Evaluation loss at step {global_step}: {eval_loss}")

                # Push to hub incl TESTING
                if global_step % push_interval == 0:
                    print(f"Saving and pushing model at step {global_step}...")
                    model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    print(f"Model saved and pushed at step {global_step}.")


Epoch 1/1, MLM Probability: 0.3


Training (MLM 0.3): 0it [00:00, ?it/s]

Saving and pushing model at step 10000...
Model saved and pushed at step 10000.
Saving and pushing model at step 20000...
Model saved and pushed at step 20000.
Saving and pushing model at step 30000...
Model saved and pushed at step 30000.
Saving and pushing model at step 40000...
Model saved and pushed at step 40000.
Saving and pushing model at step 50000...
Model saved and pushed at step 50000.
Saving and pushing model at step 60000...
Model saved and pushed at step 60000.
Saving and pushing model at step 70000...
Model saved and pushed at step 70000.
Saving and pushing model at step 80000...
Model saved and pushed at step 80000.
Saving and pushing model at step 90000...
Model saved and pushed at step 90000.

Epoch 1/1, MLM Probability: 0.2


Training (MLM 0.2): 0it [00:00, ?it/s]

Saving and pushing model at step 100000...
Model saved and pushed at step 100000.


Evaluating: 0it [00:00, ?it/s]

Evaluation loss at step 109375: 3.5834869703536376
Saving and pushing model at step 110000...
Model saved and pushed at step 110000.
Saving and pushing model at step 120000...
Model saved and pushed at step 120000.
Saving and pushing model at step 130000...
Model saved and pushed at step 130000.
Saving and pushing model at step 140000...
Model saved and pushed at step 140000.
Saving and pushing model at step 150000...
Model saved and pushed at step 150000.
Saving and pushing model at step 160000...
Model saved and pushed at step 160000.
Saving and pushing model at step 170000...
Model saved and pushed at step 170000.
Saving and pushing model at step 180000...
Model saved and pushed at step 180000.

Epoch 1/1, MLM Probability: 0.18


Training (MLM 0.18): 0it [00:00, ?it/s]

Saving and pushing model at step 190000...
Model saved and pushed at step 190000.
Saving and pushing model at step 200000...
Model saved and pushed at step 200000.
Saving and pushing model at step 210000...
Model saved and pushed at step 210000.


Evaluating: 0it [00:00, ?it/s]

Evaluation loss at step 218750: 3.2397417649450686
Saving and pushing model at step 220000...
Model saved and pushed at step 220000.
Saving and pushing model at step 230000...
Model saved and pushed at step 230000.
Saving and pushing model at step 240000...
Model saved and pushed at step 240000.
Saving and pushing model at step 250000...
Model saved and pushed at step 250000.


In [None]:
# Final Save and Push
print("\nSaving and pushing final model...")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("Final model saved and pushed.")