# 1. Imports

In [1]:
import argparse
import logging
import math
import os
import re
import shutil
from dataclasses import dataclass
from itertools import product
from typing import Any, Dict, List, Optional

import pandas as pd
import tabulate
import torch
import torch.nn as nn
from datasets import Dataset, load_dataset
from pydantic import BaseModel, field_validator
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import (
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    get_linear_schedule_with_warmup,
)

# 2. Constantes e Configurações

In [2]:
# Logging configuration: INFO level and above, with timestamped records.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    # Each record goes both to the file training.log and to the console stream.
    handlers=[logging.FileHandler("training.log"), logging.StreamHandler()],
)
# Module-level logger used by the helper functions and cells below.
logger = logging.getLogger(__name__)

In [3]:
# def parse_args():
#     """Parse command line arguments."""
#     parser = argparse.ArgumentParser(description="Treinar modelo MLM em português")

#     # Argumentos principais
#     parser.add_argument("--model_id", default="answerdotai/ModernBERT-base",
#                         help="ID do modelo base no Hugging Face Hub")
#     parser.add_argument("--dataset_id", default="emdemor/news-of-the-brazilian-newspaper",
#                         help="ID do dataset no Hugging Face Hub")
#     parser.add_argument("--tokenizer_path", default="domain_tokenizer",
#                         help="Caminho para o tokenizador")
#     parser.add_argument("--output_dir", default=None,
#                         help="Diretório para salvar o modelo treinado")

#     # Configurações de treinamento
#     parser.add_argument("--num_train_epochs", type=int, default=3,
#                         help="Número de épocas de treinamento")
#     parser.add_argument("--train_batch_size", type=int, default=4,
#                         help="Tamanho do batch por dispositivo")
#     parser.add_argument("--gradient_accumulation_steps", type=int, default=2,
#                         help="Passos de acumulação de gradiente")
#     parser.add_argument("--learning_rate", type=float, default=5e-3,
#                         help="Taxa de aprendizado inicial")
#     parser.add_argument("--weight_decay", type=float, default=0.01,
#                         help="Peso de decaimento")
#     parser.add_argument("--warmup_steps", type=int, default=0,
#                         help="Passos de aquecimento para o scheduler")

#     # Configurações de dados
#     parser.add_argument("--max_news", type=int, default=3535,
#                         help="Número máximo de notícias a usar")
#     parser.add_argument("--max_sentences", type=int, default=3456,
#                         help="Número máximo de sentenças")
#     parser.add_argument("--eval_ratio", type=float, default=0.1,
#                         help="Fração dos dados usada para validação")

#     # Otimizações
#     parser.add_argument("--use_flash_attention", action="store_true",
#                         help="Usar Flash Attention se disponível")
#     parser.add_argument("--fp16", action="store_true",
#                         help="Usar treinamento em precisão mista")
#     parser.add_argument("--push_interval", type=int, default=10000,
#                         help="Intervalo de steps para push para o Hub")
#     parser.add_argument("--num_workers", type=int, default=4,
#                         help="Número de workers para processamento de dados")

#     # Modo de teste
#     parser.add_argument("--testing", action="store_true",
#                         help="Executar em modo de teste")

#     args = parser.parse_args()

#     # Configurar diretório de saída se não especificado
#     if args.output_dir is None:
#         model_name = args.model_id.split("/")[-1]
#         args.output_dir = f"{model_name}-ptbr-{'test' if args.testing else 'full'}"

#     return args

# 3. Classes utilitárias

In [4]:
class TrainingParams(BaseModel):
    """Validated sizing inputs for chunked training, plus quantities derived from them.

    The derived values (batch size, step counts, per-chunk train/eval sizes) are
    exposed as read-only properties computed from the fields below.
    """

    # Total number of examples in the dataset.
    dataset_size: int
    # Number of passes over the data.
    num_train_epochs: int
    # Number of chunks the dataset is divided into.
    num_chunks: int
    # Examples per forward/backward pass.
    train_batch_size_per_device: int
    # Micro-batches accumulated before each optimizer step.
    gradient_accumulation_steps: int
    # Fraction of the dataset reserved for evaluation.
    eval_size_ratio: float
    # Maximum number of checkpoints to keep.
    total_save_limit: int

    @field_validator("num_chunks")
    @classmethod
    def validate_num_chunks(cls, v, info):
        """Reject non-positive chunk counts; every derived size divides by this value."""
        if v <= 0:
            raise ValueError(f"num_chunks ({v}) deve ser maior que zero")
        return v

    @field_validator("eval_size_ratio")
    @classmethod
    def validate_eval_size_ratio(cls, v, info):
        """Ensure every chunk keeps at least one training example.

        This cross-field check is attached to ``eval_size_ratio`` because field
        validators only see fields declared *before* the one being validated;
        ``dataset_size`` and ``num_chunks`` are both available here. The formula
        mirrors the ``eval_size_per_chunk`` / ``available_size`` properties.
        """
        data = info.data
        # A missing key means that field already failed its own validation.
        if "dataset_size" in data and "num_chunks" in data:
            dataset_size = data["dataset_size"]
            num_chunks = data["num_chunks"]
            eval_size_per_chunk = int(dataset_size * v / num_chunks)
            available_size = dataset_size - eval_size_per_chunk * num_chunks
            if available_size < num_chunks:
                raise ValueError(
                    f"available_size ({available_size}) deve ser maior ou igual a num_chunks ({num_chunks})"
                )
        return v

    @property
    def effective_batch_size(self):
        """Batch size per optimizer step, accounting for gradient accumulation."""
        return self.train_batch_size_per_device * self.gradient_accumulation_steps

    @property
    def total_steps_per_epoch(self):
        """Optimizer steps per epoch (dataset size over effective batch size, rounded up)."""
        return math.ceil(self.dataset_size / self.effective_batch_size)

    @property
    def total_train_steps(self):
        """Optimizer steps over all epochs."""
        return self.total_steps_per_epoch * self.num_train_epochs

    @property
    def eval_size_per_chunk(self):
        """Number of evaluation examples taken from each chunk."""
        return int(self.dataset_size * self.eval_size_ratio / self.num_chunks)

    @property
    def available_size(self):
        """Examples left for training once every chunk's evaluation slice is removed."""
        return self.dataset_size - self.eval_size_per_chunk * self.num_chunks

    @property
    def eval_size(self):
        """Total number of evaluation examples across all chunks."""
        return self.dataset_size - self.available_size

    @property
    def chunk_size(self):
        """Number of examples in each chunk (floor division)."""
        return self.dataset_size // self.num_chunks

    @property
    def chunk_train_size(self):
        """Number of training examples in each chunk (floor division)."""
        return self.available_size // self.num_chunks

    def _as_table(self):
        """Render fields and derived values as a grid-formatted table."""
        # Imported locally: a later notebook cell rebinds the module-level name
        # `tabulate` to the function, which would break `tabulate.tabulate`.
        from tabulate import tabulate as render_table

        data = [
            ["num_train_epochs", self.num_train_epochs],
            ["dataset_size", self.dataset_size],
            ["num_chunks", self.num_chunks],
            ["chunk_size", self.chunk_size],
            ["chunk_train_size", self.chunk_train_size],
            ["eval_size_per_chunk", self.eval_size_per_chunk],
            ["eval_size_ratio", self.eval_size_ratio],
            ["available_size", self.available_size],
            ["eval_size", self.eval_size],
            ["train_batch_size_per_device", self.train_batch_size_per_device],
            ["gradient_accumulation_steps", self.gradient_accumulation_steps],
            ["total_save_limit", self.total_save_limit],
            ["effective_batch_size", self.effective_batch_size],
            ["total_steps_per_epoch", self.total_steps_per_epoch],
            ["total_train_steps", self.total_train_steps],
        ]

        return render_table(data, headers=["Attribute", "Value"], tablefmt="grid")

    def __repr__(self):
        return self._as_table()

    def __str__(self):
        return self._as_table()

In [5]:
def split_into_sentences(text: str) -> List[str]:
    """
    Split a text into sentences.

    A boundary is any run of whitespace that directly follows ".", "!" or "?".
    Each piece is stripped of surrounding whitespace and empty pieces are dropped.

    Args:
        text: Text to split

    Returns:
        List of non-empty, stripped sentences
    """
    pieces = re.split(r"(?<=[.!?])\s+", text)
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]


def set_attention(model, use_flash_attention=False):
    """
    Optionally attach FlashAttention to the model's ``nn.MultiheadAttention`` modules.

    The model is returned untouched when ``use_flash_attention`` is false, when
    CUDA or the ``flash_attn`` package is unusable, or when the swap fails.
    Success is only logged if at least one module was actually patched.

    Args:
        model: Model to configure
        use_flash_attention: Whether to try enabling Flash Attention

    Returns:
        The same model instance
    """
    if not use_flash_attention:
        return model

    def check_flash_attention_support():
        """Smoke-test a tiny flash_attn kernel call on the GPU."""
        if not torch.cuda.is_available():
            return False
        try:
            from flash_attn import flash_attn_qkvpacked_func

            qkv = torch.randn(1, 1, 3, 16, 64, dtype=torch.float16, device="cuda")
            flash_attn_qkvpacked_func(qkv, causal=False)
            return True
        except (ImportError, RuntimeError) as e:
            logger.warning(f"Flash Attention não é compatível: {str(e)}")
            return False

    if not check_flash_attention_support():
        return model

    logger.info("Replacing standard attention with FlashAttention...")
    try:
        # NOTE(review): recent flash_attn releases may not export `FlashAttention`,
        # and nn.MultiheadAttention.forward may never consult an `attention`
        # attribute. Loading the model with attn_implementation="flash_attention_2"
        # is presumably the supported route; confirm.
        from flash_attn import FlashAttention

        # Snapshot first: assigning a submodule mutates the tree being walked.
        targets = [
            module
            for module in model.modules()
            if isinstance(module, nn.MultiheadAttention)
        ]
        for module in targets:
            module.attention = FlashAttention()
    except Exception as e:
        logger.error(f"Failed to integrate FlashAttention: {str(e)}")
        return model

    if targets:
        logger.info("FlashAttention integrated successfully.")
    else:
        # Do not claim success when nothing was patched.
        logger.warning(
            "No nn.MultiheadAttention modules found; FlashAttention was not applied."
        )

    return model


def check_vocab_size(tokenizer, model):
    """
    Verify that every tokenizer id fits inside the model's vocabulary.

    Args:
        tokenizer: Tokenizer to check (must expose ``get_vocab()``)
        model: Model to check (must expose ``config.vocab_size``)

    Raises:
        AssertionError: If the largest token id is not below the model's vocab size
    """
    max_token_id = max(tokenizer.get_vocab().values())
    vocab_size = model.config.vocab_size
    logger.info(f"Maior ID no tokenizador: {max_token_id}")
    logger.info(f"Tamanho do vocabulário do modelo: {vocab_size}")
    # Raised explicitly instead of via `assert`, so the check still runs under
    # `python -O`; the exception type is unchanged for callers.
    if max_token_id >= vocab_size:
        raise AssertionError("IDs de tokens fora do intervalo!")


def tokenize_function(examples, tokenizer, target_column="text"):
    """
    Tokenize one batch of dataset examples.

    Args:
        examples: Batch of examples, indexable by column name
        tokenizer: Callable tokenizer applied to the selected column
        target_column: Name of the column holding the text

    Returns:
        Whatever the tokenizer returns, requested with the special-tokens mask
    """
    texts = examples[target_column]
    encoded = tokenizer(texts, return_special_tokens_mask=True)
    return encoded


def tokenize_dataset(dataset, tokenizer, num_proc=4):
    """
    Tokenize an entire dataset with a batched, multi-process ``map``.

    All original columns are removed, so only the tokenizer's outputs remain.

    Args:
        dataset: Dataset to tokenize
        tokenizer: Tokenizer to apply
        num_proc: Number of worker processes passed to ``map``

    Returns:
        The tokenized dataset
    """

    def _tokenize_batch(examples):
        return tokenize_function(examples, tokenizer)

    original_columns = dataset.column_names
    return dataset.map(
        _tokenize_batch,
        batched=True,
        remove_columns=original_columns,
        num_proc=num_proc,
    )


class DynamicPaddingDataCollator(DataCollatorForLanguageModeling):
    """
    MLM data collator that pads every batch to the length of its longest sequence.
    """

    def __call__(self, examples: Dict[str, Any]) -> Dict[str, torch.Tensor]:
        """Pad a column-oriented batch, apply the parent's collation, and normalize shapes.

        Args:
            examples: Mapping with "input_ids" and "attention_mask", each a
                sequence of per-example token lists.

        Returns:
            The collated batch after `torch_call` and `fix_batch_inputs`.
        """
        # The longest sequence in this batch sets the common length.
        target_len = max(len(seq) for seq in examples["input_ids"])

        features = []
        for ids, mask in zip(examples["input_ids"], examples["attention_mask"]):
            missing = target_len - len(ids)
            if missing > 0:
                # Right-pad ids with the pad token and the mask with zeros.
                ids_tensor = torch.tensor(
                    ids + [self.tokenizer.pad_token_id] * missing
                )
                mask_tensor = torch.tensor(mask + [0] * missing)
            else:
                # Already at the common length; the slice is just a copy.
                ids_tensor = torch.tensor(ids[:target_len])
                mask_tensor = torch.tensor(mask[:target_len])
            features.append({"input_ids": ids_tensor, "attention_mask": mask_tensor})

        # Parent collation (MLM masking etc.), then shape/dtype normalization.
        return fix_batch_inputs(self.torch_call(features))


def fix_batch_inputs(inputs: dict) -> dict:
    """
    Normalize the shape and dtype of model input tensors, in place.

    For "input_ids", "attention_mask" and "token_type_ids": a 3-D tensor with a
    leading dimension of 1 is squeezed to 2-D; any other tensor with more than
    two dimensions is rejected. "input_ids" is then cast to ``torch.long``.

    Args:
        inputs: Mapping of input names to tensors (mutated and returned)

    Returns:
        The same mapping with corrected tensors

    Raises:
        ValueError: If a checked tensor has an unexpected shape
    """
    for name in ("input_ids", "attention_mask", "token_type_ids"):
        if name not in inputs:
            continue
        tensor = inputs[name]
        if tensor.dim() <= 2:
            continue
        if tensor.dim() == 3 and tensor.shape[0] == 1:
            inputs[name] = tensor.squeeze(0)
        else:
            raise ValueError(f"Unexpected tensor shape for {name}: {inputs[name].shape}")

    needs_cast = "input_ids" in inputs and inputs["input_ids"].dtype != torch.long
    if needs_cast:
        inputs["input_ids"] = inputs["input_ids"].long()
    return inputs


def forward_pass(model, inputs, device):
    """
    Run one forward pass and return the model's loss.

    Inputs are normalized with `fix_batch_inputs`, moved to `device`, and fed to
    the model under CUDA autocast (enabled only when `device.type` is "cuda").

    Args:
        model: Model to run
        inputs: Mapping of model inputs
        device: Device the tensors are moved to

    Returns:
        The loss reported by the model

    Raises:
        ValueError: If the model output carries no loss
    """
    prepared = fix_batch_inputs(inputs)
    on_device = {name: tensor.to(device) for name, tensor in prepared.items()}

    use_amp = device.type == "cuda"
    with torch.amp.autocast("cuda", enabled=use_amp):
        outputs = model(**on_device, return_dict=True)

    loss = outputs.loss
    if loss is None:
        raise ValueError("Model did not return a loss.")
    return loss


def evaluate(model, eval_dataset, data_collator, batch_size, device):
    """
    Compute the mean loss of the model over the evaluation dataset.

    The model is switched to eval mode for the duration and always switched back
    to train mode afterwards, even if evaluation is interrupted. Batches that
    raise are logged and skipped.

    Args:
        model: Model to evaluate
        eval_dataset: Evaluation dataset (iterated with ``.iter(batch_size=...)``)
        data_collator: Collator that turns a raw batch into model inputs
        batch_size: Evaluation batch size
        device: Device the model lives on

    Returns:
        Mean evaluation loss, or ``float("inf")`` when no batch succeeded
    """
    model.eval()
    losses = []
    skipped = 0

    try:
        # No autocast here: forward_pass already wraps the model call in it.
        with torch.no_grad():
            for batch in tqdm(
                eval_dataset.iter(batch_size=batch_size), desc="Evaluating"
            ):
                try:
                    inputs = data_collator(batch)
                    loss = forward_pass(model, inputs, device)
                    losses.append(loss.item())
                except Exception as e:
                    skipped += 1
                    logger.warning(f"Evaluation batch failed: {e}. Skipping.")
    finally:
        # Restore training mode even on KeyboardInterrupt or other escapes.
        model.train()

    if skipped:
        logger.warning(
            f"Evaluation skipped {skipped} of {skipped + len(losses)} batches."
        )

    return sum(losses) / len(losses) if losses else float("inf")


def save_checkpoint(
    model, optimizer, scheduler, tokenizer, global_step, epoch, output_dir
):
    """
    Write a training checkpoint under ``<output_dir>/checkpoint-<global_step>``.

    The directory receives ``training_state.pt`` (epoch, global step, and the
    model/optimizer/scheduler state dicts) plus the model and tokenizer saved
    via ``save_pretrained``.

    Args:
        model: Model to save
        optimizer: Optimizer whose state is saved
        scheduler: Scheduler whose state is saved
        tokenizer: Tokenizer to save
        global_step: Current global step (used in the directory name)
        epoch: Current epoch
        output_dir: Base directory for checkpoints
    """
    target_dir = os.path.join(output_dir, f"checkpoint-{global_step}")
    os.makedirs(target_dir, exist_ok=True)

    # Everything needed to resume training.
    training_state = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "scheduler_state_dict": scheduler.state_dict(),
        "global_step": global_step,
    }
    state_file = os.path.join(target_dir, "training_state.pt")
    torch.save(training_state, state_file)

    # Hugging Face format, model first and then tokenizer.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(target_dir)

    logger.info(f"Saved checkpoint at step {global_step} to {target_dir}")


def load_checkpoint(model, optimizer, scheduler, checkpoint_path):
    """
    Restore model, optimizer and scheduler state from a checkpoint directory.

    Args:
        model: Model to load state into
        optimizer: Optimizer to load state into
        scheduler: Scheduler to load state into
        checkpoint_path: Directory containing ``training_state.pt``

    Returns:
        Tuple of (model, optimizer, scheduler, global_step, epoch)
    """
    logger.info(f"Loading checkpoint from {checkpoint_path}")
    state_file = os.path.join(checkpoint_path, "training_state.pt")
    # map_location="cpu" lets a checkpoint saved on GPU load on a CPU-only host;
    # the load_state_dict calls below copy the values into the objects' own tensors.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(state_file, map_location="cpu")

    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    scheduler.load_state_dict(checkpoint["scheduler_state_dict"])

    return model, optimizer, scheduler, checkpoint["global_step"], checkpoint["epoch"]

# Configuration schema (loaded from config.yml)

In [28]:
import argparse
import sys
from typing import List

import yaml
from pydantic import BaseModel, Field


class ModelConfig(BaseModel):
    """`model` section of the config: base checkpoint, tokenizer path and output directory."""

    # All three fields are required (Field(...) declares no default).
    base_id: str = Field(..., description="ID do modelo base no Hugging Face Hub")
    tokenizer_path: str = Field(..., description="Caminho para o tokenizador")
    output_dir: str = Field(..., description="Diretório para salvar o modelo treinado")


class DatasetConfig(BaseModel):
    """`dataset` section of the config: source dataset id, sampling caps and eval fraction."""

    id: str = Field(..., description="ID do dataset no HF Hub")
    # Upper bounds on how many news items / sentences are used.
    max_news: int = Field(..., description="Número máximo de notícias a usar")
    max_sentences: int = Field(..., description="Número máximo de sentenças")
    # Fraction of the data used for validation.
    eval_ratio: float = Field(..., description="Fração dos dados usada para validação")


class TrainingConfig(BaseModel):
    """`training` section of the config: epochs, batching, optimizer and MLM schedule.

    NOTE(review): a class with this name is defined again later in the notebook
    with additional fields; the later definition shadows this one.
    """

    num_train_epochs: int = Field(..., description="Número de épocas de treinamento")
    train_batch_size: int = Field(..., description="Tamanho do batch por dispositivo")
    gradient_accumulation_steps: int = Field(
        ..., description="Passos de acumulação de gradiente"
    )
    learning_rate: float = Field(..., description="Taxa de aprendizado inicial")
    weight_decay: float = Field(..., description="Peso de decaimento")
    warmup_steps: int = Field(..., description="Passos de aquecimento para o scheduler")
    # One masking probability per data chunk.
    mlm_probabilities: List[float] = Field(
        ..., description="Probabilidades de mascaramento para diferentes chunks"
    )
    # Maximum number of checkpoints to keep.
    total_save_limit: int = Field(
        ..., description="Número máximo de checkpoints a manter"
    )


class OptimizationConfig(BaseModel):
    """`optimization` section of the config: attention/precision switches and intervals."""

    use_flash_attention: bool = Field(
        ..., description="Usar Flash Attention se disponível"
    )
    fp16: bool = Field(..., description="Usar treinamento em precisão mista")
    # Interval, in steps, described as the push-to-Hub interval.
    push_interval: int = Field(
        ..., description="Intervalo de steps para push para o Hub"
    )
    # Number of workers for data processing.
    num_workers: int = Field(
        ..., description="Número de workers para processamento de dados"
    )


class ExecutionConfig(BaseModel):
    """`execution` section of the config: test-mode flag, random seed and device choice."""

    testing: bool = Field(..., description="Executar em modo de teste")
    seed: int = Field(..., description="Semente para reprodutibilidade")
    # Described as one of: auto, cuda or cpu (not validated here).
    device: str = Field(..., description="Dispositivo de execução: auto, cuda ou cpu")


class MLMTrainingConfig(BaseModel):
    """Root of the training configuration; one required sub-model per config section."""

    model: ModelConfig
    dataset: DatasetConfig
    training: TrainingConfig
    optimization: OptimizationConfig
    execution: ExecutionConfig


def parse_yaml(file_path: str) -> MLMTrainingConfig:
    """Read a YAML file and validate it into an ``MLMTrainingConfig``.

    Args:
        file_path: Path to the YAML configuration file.

    Returns:
        The validated configuration object.

    Raises:
        OSError: If the file cannot be opened.
        pydantic.ValidationError: If sections or fields are missing or invalid
            (including an empty file).
    """
    # Explicit UTF-8 so parsing does not depend on the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)
    # An empty document loads as None; validate an empty mapping instead so the
    # failure is a validation error listing the missing sections.
    return MLMTrainingConfig(**(data or {}))


# Load the notebook-wide configuration from the working directory.
config = parse_yaml("config.yml")

# Training pipeline

In [6]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
logger.info(f"Using device: {device}")

2025-02-19 03:37:50,621 - __main__ - INFO - Using device: cuda


In [7]:
# Load dataset
logger.info("Loading dataset...")
raw_dataset = load_dataset(config.dataset.id, split="train")
# Shuffle and subsample with the configured seed so the selection is reproducible.
df = (
    raw_dataset.to_pandas()
    .sample(frac=1, random_state=config.execution.seed)
    .reset_index(drop=True)
)
sample_df = df.sample(
    min(config.dataset.max_news, len(df)), random_state=config.execution.seed
)

logger.info("Preparing sentences...")
# Article bodies and titles are both used as sentence sources.
combined_texts = sample_df["text"].to_list() + sample_df["title"].to_list()
# Skip missing values: pandas represents them as NaN/None, which are not
# splittable strings (NaN is even truthy).
sentences = [
    phrase
    for text in combined_texts
    if isinstance(text, str) and text
    for phrase in split_into_sentences(text)
]
# Cap the request at the population size: sampling more rows than exist
# (without replacement) raises ValueError.
sentences_sample = (
    pd.Series(sentences)
    .sample(
        min(config.dataset.max_sentences, len(sentences)),
        random_state=config.execution.seed,
    )
    .to_list()
)
dataset = Dataset.from_dict({"text": sentences_sample})

# Setup model and tokenizer
logger.info("Loading tokenizer and model...")
# The tokenizer comes from its own path, separate from the base model id.
tokenizer = AutoTokenizer.from_pretrained(
    config.model.tokenizer_path, clean_up_tokenization_spaces=False
)


model_config = AutoConfig.from_pretrained(config.model.base_id)

# NOTE(review): this only sets an attribute on the config object. Whether
# from_pretrained honours it without an explicit torch_dtype argument, and
# whether fp16 weights are compatible with the GradScaler created later, should
# be confirmed.
if config.optimization.fp16:
    model_config.torch_dtype = torch.float16

model = AutoModelForMaskedLM.from_pretrained(config.model.base_id, config=model_config)
# Resize the embedding matrix to the tokenizer's length, since the tokenizer is
# loaded from a different location than the base model.
model.resize_token_embeddings(len(tokenizer))
model.to(device)

# Setup attention mechanism
model = set_attention(model, config.optimization.use_flash_attention)

# Check vocabulary compatibility
check_vocab_size(tokenizer, model)

# Create training config
training_params = TrainingParams(
    num_train_epochs=config.training.num_train_epochs,
    dataset_size=len(dataset),
    # One chunk per configured MLM masking probability.
    num_chunks=len(config.training.mlm_probabilities),
    train_batch_size_per_device=config.training.train_batch_size,
    gradient_accumulation_steps=config.training.gradient_accumulation_steps,
    eval_size_ratio=config.dataset.eval_ratio,
    # Taken from the config instead of a hard-coded 2, so the YAML setting is honoured.
    total_save_limit=config.training.total_save_limit,
)

logger.info(f"Training configuration:\n{training_params}")

# Tokenize dataset
logger.info("Tokenizing dataset...")
tokenized_dataset = tokenize_dataset(
    dataset, tokenizer, num_proc=config.optimization.num_workers
)

# Setup optimizer, scheduler and scaler
optimizer = AdamW(
    model.parameters(),
    lr=config.training.learning_rate,
    weight_decay=config.training.weight_decay,
)
# Gradient scaling is active only on CUDA with fp16 enabled; otherwise the
# scaler is constructed with enabled=False.
scaler = torch.amp.GradScaler(
    enabled=(device.type == "cuda" and config.optimization.fp16)
)

# Linear schedule with warmup over the step count derived in TrainingParams.
# NOTE(review): total_train_steps is computed from the full dataset size,
# evaluation rows included; confirm that this is the intended horizon.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config.training.warmup_steps,
    num_training_steps=training_params.total_train_steps,
)

# Check for existing checkpoints
global_step = 0
start_epoch = 0
latest_checkpoint = None

if os.path.exists(config.model.output_dir):
    # Accept only "checkpoint-<digits>" entries, so a stray name such as
    # "checkpoint-best" cannot crash the int() parse below.
    checkpoints = [
        d
        for d in os.listdir(config.model.output_dir)
        if re.fullmatch(r"checkpoint-\d+", d)
    ]
    if checkpoints:
        # Resume from the highest step number.
        latest_checkpoint = max(checkpoints, key=lambda x: int(x.split("-")[1]))
        checkpoint_path = os.path.join(config.model.output_dir, latest_checkpoint)
        # NOTE(review): the stored epoch is the one in progress when the checkpoint
        # was written, so resuming from an end-of-epoch checkpoint repeats that
        # epoch; confirm whether that is intended.
        model, optimizer, scheduler, global_step, start_epoch = load_checkpoint(
            model, optimizer, scheduler, checkpoint_path
        )
else:
    os.makedirs(config.model.output_dir, exist_ok=True)

# Training loop
logger.info("Starting training...")
model.train()

# Loop-invariant intervals, both measured in optimizer steps.
eval_interval = max(1, training_params.total_steps_per_epoch // 4)
save_interval = config.optimization.push_interval

# Bind `epoch` up front so the KeyboardInterrupt handler can always write a
# checkpoint, even if the interrupt arrives before the first iteration.
epoch = start_epoch

try:
    for epoch in range(start_epoch, training_params.num_train_epochs):
        for chunk_number, mlm_probability in enumerate(
            config.training.mlm_probabilities
        ):
            logger.info(
                f"Epoch {epoch + 1}/{training_params.num_train_epochs} | "
                f"MLM Probability: {mlm_probability}"
            )

            # Setup data collator with current MLM probability
            data_collator = DynamicPaddingDataCollator(
                tokenizer, mlm_probability=mlm_probability
            )

            # Split dataset for this chunk: the evaluation slice comes first,
            # followed by the training slice. The *_end_idx values are inclusive
            # and used only for logging.
            eval_start_idx = chunk_number * training_params.chunk_size
            eval_end_idx = eval_start_idx + training_params.eval_size_per_chunk - 1
            train_start_idx = eval_start_idx + training_params.eval_size_per_chunk
            train_end_idx = train_start_idx + training_params.chunk_train_size - 1

            logger.info(
                f"Splitting | "
                f"chunk: {eval_start_idx}-{train_end_idx} | "
                f"eval: {eval_start_idx}-{eval_end_idx} | "
                f"train: {train_start_idx}-{train_end_idx}"
            )

            train_dataset = (
                tokenized_dataset.skip(train_start_idx)
                .take(training_params.chunk_train_size)
                .shuffle(seed=42)
            )

            eval_dataset = (
                tokenized_dataset.skip(eval_start_idx)
                .take(training_params.eval_size_per_chunk)
                .shuffle(seed=42)
            )

            # Train on this chunk
            train_iterator = train_dataset.iter(
                batch_size=training_params.train_batch_size_per_device
            )

            # NOTE(review): gradients from an incomplete accumulation group at the
            # end of a chunk are neither applied nor cleared, so they carry over
            # into the next chunk's first update; confirm that this is acceptable.
            for step, batch in tqdm(
                enumerate(train_iterator), desc=f"Training (MLM {mlm_probability})"
            ):
                # Check if accumulation step is complete
                accumulation_step_complete = (
                    step + 1
                ) % training_params.gradient_accumulation_steps == 0

                try:
                    # Forward pass
                    inputs = data_collator(batch)
                    loss = forward_pass(model, inputs, device)

                    # Backward pass with gradient scaling; the loss is divided so
                    # the accumulated gradient averages over the micro-batches.
                    scaler.scale(
                        loss / training_params.gradient_accumulation_steps
                    ).backward()

                    if accumulation_step_complete:
                        # Update model parameters
                        scaler.step(optimizer)
                        scaler.update()
                        scheduler.step()
                        optimizer.zero_grad()

                        # Update global step
                        global_step += 1

                        # Evaluate periodically
                        if global_step % eval_interval == 0:
                            eval_loss = evaluate(
                                model,
                                eval_dataset,
                                data_collator,
                                batch_size=training_params.train_batch_size_per_device,
                                device=device,
                            )
                            logger.info(
                                f"Evaluation loss at step {global_step}: {eval_loss}"
                            )

                        # Save checkpoint periodically
                        if global_step % save_interval == 0:
                            save_checkpoint(
                                model,
                                optimizer,
                                scheduler,
                                tokenizer,
                                global_step,
                                epoch,
                                config.model.output_dir,
                            )

                        # Clear CUDA cache periodically
                        if device.type == "cuda" and global_step % 100 == 0:
                            torch.cuda.empty_cache()

                except Exception as e:
                    # Best effort: a failing batch is logged and skipped.
                    logger.error(f"Training batch failed: {e}. Skipping.")
                    continue

            # Evaluate at the end of each chunk
            logger.info(f"Evaluating at the end of chunk {chunk_number}...")
            eval_loss = evaluate(
                model,
                eval_dataset,
                data_collator,
                batch_size=training_params.train_batch_size_per_device,
                device=device,
            )
            logger.info(f"Chunk {chunk_number} evaluation loss: {eval_loss}")

        # Save checkpoint at the end of each epoch
        save_checkpoint(
            model,
            optimizer,
            scheduler,
            tokenizer,
            global_step,
            epoch,
            config.model.output_dir,
        )

    # Save final model
    logger.info("Training complete. Saving final model...")
    model.save_pretrained(
        config.model.output_dir,
    )
    tokenizer.save_pretrained(
        config.model.output_dir,
    )

except KeyboardInterrupt:
    logger.info("Training interrupted by user. Saving checkpoint...")
    save_checkpoint(
        model,
        optimizer,
        scheduler,
        tokenizer,
        global_step,
        epoch,
        config.model.output_dir,
    )

# Logged on both the normal and the interrupted path (it used to sit inside the
# except branch and so appeared only after an interrupt).
logger.info("Training process completed.")

2025-02-19 03:37:52,223 - __main__ - INFO - Loading dataset...


NameError: name 'config' is not defined

In [8]:
# Display the configured per-chunk MLM masking probabilities.
# Requires the cell that defines `config` to have been run first.
config.training.mlm_probabilities

NameError: name 'config' is not defined

# Refactor (work in progress)

In [9]:
import argparse
import sys
from typing import List

import yaml
from pydantic import BaseModel, Field


class ModelConfig(BaseModel):
    """`model` section of the config: base checkpoint, tokenizer path and output directory.

    NOTE(review): redefines the identically named class from an earlier cell.
    """

    base_id: str = Field(..., description="ID do modelo base no Hugging Face Hub")
    tokenizer_path: str = Field(..., description="Caminho para o tokenizador")
    output_dir: str = Field(..., description="Diretório para salvar o modelo treinado")


class DatasetConfig(BaseModel):
    """`dataset` section of the config: source dataset id, sampling caps and eval fraction.

    NOTE(review): redefines the identically named class from an earlier cell.
    """

    id: str = Field(..., description="ID do dataset no HF Hub")
    max_news: int = Field(..., description="Número máximo de notícias a usar")
    max_sentences: int = Field(..., description="Número máximo de sentenças")
    eval_ratio: float = Field(..., description="Fração dos dados usada para validação")


class TrainingConfig(BaseModel):
    """`training` section of the refactored config.

    Compared with the earlier TrainingConfig in this notebook, this version also
    carries the fields that previously lived in OptimizationConfig (fp16,
    use_flash_attention, push_interval, num_workers) plus two memory-management
    intervals. It shadows the earlier definition.
    """

    num_train_epochs: int = Field(..., description="Número de épocas de treinamento")
    train_batch_size: int = Field(..., description="Tamanho do batch por dispositivo")
    gradient_accumulation_steps: int = Field(
        ..., description="Passos de acumulação de gradiente"
    )
    learning_rate: float = Field(..., description="Taxa de aprendizado inicial")
    weight_decay: float = Field(..., description="Peso de decaimento")
    warmup_steps: int = Field(..., description="Passos de aquecimento para o scheduler")
    # One masking probability per data chunk.
    mlm_probabilities: List[float] = Field(
        ..., description="Probabilidades de mascaramento para diferentes chunks"
    )
    # Maximum number of checkpoints to keep.
    total_save_limit: int = Field(
        ..., description="Número máximo de checkpoints a manter"
    )
    fp16: bool = Field(..., description="Usar treinamento em precisão mista")
    use_flash_attention: bool = Field(
        ..., description="Usar Flash Attention se disponível"
    )
    push_interval: int = Field(
        ..., description="Intervalo de steps para push para o Hub"
    )
    num_workers: int = Field(
        ..., description="Número de workers para processamento de dados"
    )
    # Number of steps between memory clean-ups.
    clear_memory_steps: int = Field(
        ..., description="Número de passos antes de limpar a memória"
    )
    # Number of steps between memory-usage log entries.
    memory_log_steps: int = Field(
        ..., description="Número de passos para retornar um log de uso da memória"
    )


class ExecutionConfig(BaseModel):
    """`execution` section of the config: test-mode flag, random seed and device choice.

    NOTE(review): redefines the identically named class from an earlier cell.
    """

    testing: bool = Field(..., description="Executar em modo de teste")
    seed: int = Field(..., description="Semente para reprodutibilidade")
    # Described as one of: auto, cuda or cpu (not validated here).
    device: str = Field(..., description="Dispositivo de execução: auto, cuda ou cpu")


class MLMTrainingConfig(BaseModel):
    """Root of the refactored training configuration.

    Unlike the earlier MLMTrainingConfig in this notebook, there is no
    `optimization` section; code that reads `config.optimization` will not work
    with objects of this class.
    """

    model: ModelConfig
    dataset: DatasetConfig
    training: TrainingConfig
    execution: ExecutionConfig


def parse_yaml(file_path: str) -> MLMTrainingConfig:
    """Read a YAML file and validate it into an ``MLMTrainingConfig``.

    Args:
        file_path: Path to the YAML configuration file.

    Returns:
        The validated configuration object.

    Raises:
        OSError: If the file cannot be opened.
        pydantic.ValidationError: If sections or fields are missing or invalid
            (including an empty file).
    """
    # Explicit UTF-8 so parsing does not depend on the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)
    # An empty document loads as None; validate an empty mapping instead so the
    # failure is a validation error listing the missing sections.
    return MLMTrainingConfig(**(data or {}))


# Reload the configuration against the schema defined in this cell.
config = parse_yaml("config.yml")

In [33]:
import argparse
import logging
import os
import sys
from typing import List, Tuple

import pandas as pd
import torch
import yaml
from datasets import Dataset, load_dataset
from pydantic import BaseModel, Field
from torch.cuda.amp import GradScaler
from torch.optim import AdamW
from tqdm import tqdm
from tabulate import tabulate
from transformers import (
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)


def log_gpu_memory_usage():
    """Log a grid table with the memory figures of the current CUDA device.

    The table lists total, allocated and reserved memory in MB (rounded to
    whole numbers) together with the share of total memory. When CUDA is not
    available a single informational message is logged instead.
    """
    if not torch.cuda.is_available():
        logger.info("CUDA não está disponível.")
        return

    gpu_index = torch.cuda.current_device()
    total_mb = torch.cuda.get_device_properties(gpu_index).total_memory / 1024**2
    allocated_mb = torch.cuda.memory_allocated(gpu_index) / 1024**2
    reserved_mb = torch.cuda.memory_reserved(gpu_index) / 1024**2

    def as_whole(value):
        # Same rendering as formatting int(round(value)) inside an f-string.
        return str(int(round(value)))

    rows = [["Total de Memória (MB)", as_whole(total_mb), "100%"]]
    for label, used_mb in (
        ("Memória Alocada (MB)", allocated_mb),
        ("Memória Reservada (MB)", reserved_mb),
    ):
        share = (used_mb / total_mb) * 100
        rows.append([label, as_whole(used_mb), f"{as_whole(share)}%"])

    table = tabulate(
        rows, headers=["Descrição", "Valor", "Percentual"], tablefmt="grid"
    )
    logger.info(f"\nUso de VRAM pela GPU:\n{table}")


import gc
def clear_vram():
    """Run garbage collection and release cached CUDA memory.

    Only memory that is no longer referenced can be returned: tensors that are
    still alive (for example a loaded model) stay allocated. On machines
    without CUDA the function only runs the garbage collector, so it is safe to
    call unconditionally.
    """
    gc.collect()

    # torch.cuda.ipc_collect() initialises CUDA and raises when no GPU is
    # usable, so every CUDA call has to sit behind the availability check.
    if not torch.cuda.is_available():
        logger.info("CUDA is not available; skipping VRAM cleanup.")
        return

    torch.cuda.empty_cache()
    gc.collect()
    if hasattr(torch.cuda, "ipc_collect"):
        torch.cuda.ipc_collect()
    gc.collect()
    logger.info("VRAM cleared definitively.")

# def clear_vram():
#     """Limpa a memória da GPU antes de iniciar o treinamento."""
#     if torch.cuda.is_available():
#         torch.cuda.empty_cache()
#         logger.info("VRAM cleared before training.")

class TrainingParams:
    """Sizing parameters derived from the configuration and the dataset size.

    The dataset is divided into one chunk per entry of
    ``config.training.mlm_probabilities``; every chunk is split into an
    evaluation slice followed by a training slice.
    """

    def __init__(self, config: MLMTrainingConfig):
        """Copy the settings needed for the derived calculations.

        Args:
            config: Full training configuration. Only ``config.training`` and
                ``config.dataset.eval_ratio`` are read.
        """
        self.num_train_epochs = config.training.num_train_epochs
        self.train_batch_size_per_device = config.training.train_batch_size
        self.gradient_accumulation_steps = config.training.gradient_accumulation_steps
        self.eval_size_ratio = config.dataset.eval_ratio
        self.total_save_limit = config.training.total_save_limit
        self.clear_memory_steps = config.training.clear_memory_steps
        self.memory_log_steps = config.training.memory_log_steps

        self.num_chunks = len(config.training.mlm_probabilities)

        # Dataset-dependent values; filled in by calculate_derived_params().
        self.dataset_size = None
        self.chunk_size = None
        self.eval_size_per_chunk = None
        self.chunk_train_size = None
        self.total_train_steps = None
        self.effective_batch_size = None
        self.total_steps_per_epoch = None

    def calculate_derived_params(self, dataset_size: int):
        """Compute chunk sizes and step counts for ``dataset_size`` examples.

        Args:
            dataset_size: Number of examples in the full dataset.

        Raises:
            ValueError: If no MLM probability (and therefore no chunk) is
                configured.
        """
        if self.num_chunks == 0:
            raise ValueError(
                "mlm_probabilities must contain at least one value to define a chunk"
            )

        batch_size = self.train_batch_size_per_device
        accumulation = self.gradient_accumulation_steps

        self.dataset_size = dataset_size
        self.chunk_size = dataset_size // self.num_chunks
        self.eval_size_per_chunk = int(self.chunk_size * self.eval_size_ratio)
        self.chunk_train_size = self.chunk_size - self.eval_size_per_chunk
        self.effective_batch_size = batch_size * accumulation

        # total_train_steps is handed to the LR scheduler, which the Trainer
        # steps once per optimizer update. An update happens every
        # `accumulation` micro-batches, for every chunk of every epoch, so all
        # three factors are needed; otherwise the linear schedule decays to
        # zero long before training ends. The trailing partial micro-batch is
        # counted (assumes the batch iterator does not drop it).
        micro_batches_per_chunk = math.ceil(self.chunk_train_size / batch_size)
        updates_per_chunk = micro_batches_per_chunk // accumulation
        self.total_train_steps = (
            updates_per_chunk * self.num_chunks * self.num_train_epochs
        )

        self.total_steps_per_epoch = math.ceil(
            dataset_size / self.effective_batch_size
        )


class DataPreprocessor:
    """Loads the raw dataset and turns it into a dataset of sampled sentences."""

    def __init__(self, config: DatasetConfig, seed: Optional[int] = None):
        """Store the dataset settings.

        Args:
            config: Dataset section of the configuration (``id``, ``max_news``
                and ``max_sentences`` are read).
            seed: Optional random state for the two sampling steps. ``None``
                (the default) keeps the sampling non-deterministic; pass e.g.
                ``config.execution.seed`` for reproducible runs.
        """
        self.config = config
        self.seed = seed

    def load_and_prepare_data(self) -> Dataset:
        """Download the ``train`` split and build the sentence dataset.

        At most ``max_news`` rows are sampled; their ``text`` and ``title``
        values are split into sentences, and at most ``max_sentences`` of
        those sentences are sampled.

        Returns:
            A ``Dataset`` with a single ``text`` column of sentences.
        """
        logger.info("Loading dataset...")
        raw_dataset = load_dataset(self.config.id, split="train")
        df = raw_dataset.to_pandas()
        # sample() already draws rows at random, so no separate full shuffle
        # of the frame is needed beforehand.
        sample_df = df.sample(
            n=min(self.config.max_news, len(df)), random_state=self.seed
        )

        logger.info("Preparing sentences...")
        combined_texts = sample_df["text"].to_list() + sample_df["title"].to_list()
        sentences = [
            phrase
            for text in combined_texts
            if text
            for phrase in split_into_sentences(text)
        ]

        # Cap the request at what is available: sampling more items than
        # exist raises ValueError when sampling without replacement.
        sentence_count = min(self.config.max_sentences, len(sentences))
        sentences_sample = (
            pd.Series(sentences)
            .sample(n=sentence_count, random_state=self.seed)
            .to_list()
        )
        return Dataset.from_dict({"text": sentences_sample})


# endregion


# region Gerenciamento de Modelo
class ModelManager:
    """Loads the tokenizer and the masked-LM model and moves the model to a device."""

    def __init__(self, model_config: ModelConfig, training_config: TrainingConfig):
        """Keep both configuration sections and pick CUDA when available, else CPU."""
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)
        self.model_config = model_config
        self.training_config = training_config

    def initialize_model(self) -> Tuple[AutoModelForMaskedLM, AutoTokenizer]:
        """Build the tokenizer and model pair used for training.

        The tokenizer comes from ``tokenizer_path`` and the model from
        ``base_id``; the embedding matrix is resized to the tokenizer length
        before the model is moved to ``self.device``.

        Returns:
            ``(model, tokenizer)``.
        """
        logger.info("Loading tokenizer and model...")
        tokenizer = AutoTokenizer.from_pretrained(
            self.model_config.tokenizer_path, clean_up_tokenization_spaces=False
        )

        base_id = self.model_config.base_id
        hf_config = AutoConfig.from_pretrained(base_id)
        if self.training_config.fp16:
            hf_config.torch_dtype = torch.float16

        model = AutoModelForMaskedLM.from_pretrained(base_id, config=hf_config)
        model.resize_token_embeddings(len(tokenizer))
        model.to(self.device)

        # Helpers defined elsewhere in the notebook: attention setup and a
        # tokenizer/model vocabulary check.
        model = set_attention(model, self.training_config.use_flash_attention)
        check_vocab_size(tokenizer, model)
        return model, tokenizer


# endregion


# region Treinamento
class TrainingComponents:
    """Factory for the optimizer, gradient scaler and LR scheduler."""

    def __init__(
        self,
        model: AutoModelForMaskedLM,
        training_config: TrainingConfig,
        total_steps: int,
    ):
        """Remember the model, the training settings and the scheduler horizon."""
        self.total_steps = total_steps
        self.training_config = training_config
        self.model = model

    def setup_components(self) -> Tuple[AdamW, GradScaler, Any]:
        """Create the training components.

        Returns:
            ``(optimizer, scaler, scheduler)``: an AdamW optimizer over all
            model parameters, a GradScaler that is enabled only when the model
            is on CUDA and fp16 is requested, and a linear warmup/decay
            schedule spanning ``total_steps``.
        """
        settings = self.training_config

        optimizer = AdamW(
            self.model.parameters(),
            lr=settings.learning_rate,
            weight_decay=settings.weight_decay,
        )

        scaling_enabled = self.model.device.type == "cuda" and settings.fp16
        scaler = GradScaler(enabled=scaling_enabled)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=settings.warmup_steps,
            num_training_steps=self.total_steps,
        )

        return optimizer, scaler, scheduler


class CheckpointManager:
    """Saves and restores training checkpoints under ``output_dir``.

    Every checkpoint lives in ``<output_dir>/checkpoint-<global_step>`` and
    holds ``training_state.pt`` (step, epoch, model/optimizer/scheduler state)
    next to the ``save_pretrained`` output of the model and tokenizer.
    """

    # Only directories named exactly ``checkpoint-<digits>`` are resumable.
    _CHECKPOINT_NAME = re.compile(r"checkpoint-(\d+)")
    _STATE_FILE = "training_state.pt"

    def __init__(
        self,
        output_dir: str,
        model: AutoModelForMaskedLM,
        optimizer: AdamW,
        scheduler: Any,
    ):
        """Keep references to the objects whose state is saved and restored."""
        self.output_dir = output_dir
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler

    def save_checkpoint(self, global_step: int, epoch: int, tokenizer: AutoTokenizer):
        """Write a checkpoint for ``global_step``.

        Args:
            global_step: Number of optimizer updates done so far; also names
                the checkpoint directory.
            epoch: Epoch stored in the checkpoint and returned on resume.
            tokenizer: Tokenizer saved alongside the model.
        """
        checkpoint_dir = os.path.join(self.output_dir, f"checkpoint-{global_step}")
        os.makedirs(checkpoint_dir, exist_ok=True)

        torch.save(
            {
                "global_step": global_step,
                "epoch": epoch,
                "model_state_dict": self.model.state_dict(),
                "optimizer_state_dict": self.optimizer.state_dict(),
                "scheduler_state_dict": self.scheduler.state_dict(),
            },
            os.path.join(checkpoint_dir, self._STATE_FILE),
        )

        self.model.save_pretrained(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)
        logger.info(f"Checkpoint saved at {checkpoint_dir}")

    def load_latest_checkpoint(self) -> Tuple[int, int]:
        """Restore the checkpoint with the highest step number, if any.

        Directories whose name is not ``checkpoint-<number>`` or that lack
        ``training_state.pt`` (for example an interrupted save) are ignored
        instead of aborting the resume.

        Returns:
            ``(global_step, epoch)`` of the restored checkpoint, or ``(0, 0)``
            when nothing can be restored.
        """
        if not os.path.isdir(self.output_dir):
            return 0, 0

        resumable = []
        for entry in os.listdir(self.output_dir):
            name_match = self._CHECKPOINT_NAME.fullmatch(entry)
            if name_match is None:
                continue
            state_path = os.path.join(self.output_dir, entry, self._STATE_FILE)
            if os.path.isfile(state_path):
                resumable.append((int(name_match.group(1)), state_path))

        if not resumable:
            return 0, 0

        _, state_path = max(resumable)

        # Load on the CPU first: the load_state_dict calls copy the values to
        # wherever the live parameters are, so a checkpoint written on a GPU
        # can be resumed on a CPU-only machine and no second copy of the
        # state is materialised in GPU memory.
        checkpoint = torch.load(state_path, map_location="cpu")
        self.model.load_state_dict(checkpoint["model_state_dict"])
        self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
        logger.info(f"Resumed training state from {state_path}")

        return checkpoint["global_step"], checkpoint["epoch"]


class Trainer:
    """Orchestrates chunked masked-language-model training.

    Every epoch iterates over ``config.training.mlm_probabilities``: chunk
    ``i`` of the tokenized dataset is trained with the ``i``-th masking
    probability and evaluated on its own evaluation slice. Gradients are
    accumulated over ``gradient_accumulation_steps`` micro-batches per
    optimizer update; ``global_step`` counts those updates.
    """

    def __init__(
        self,
        config: MLMTrainingConfig,
        model: AutoModelForMaskedLM,
        tokenizer: AutoTokenizer,
    ):
        """Store the run objects and call ``clear_vram()``.

        Args:
            config: Full training configuration.
            model: Model to train; its current device is used for training.
            tokenizer: Tokenizer matching ``model``.
        """
        self.config = config
        self.model = model
        self.tokenizer = tokenizer
        self.device = model.device
        self.global_step = 0
        self.start_epoch = 0
        # Epoch being trained right now; this is the value written into
        # checkpoints so that a resumed run restarts from the right epoch.
        self.current_epoch = 0

        clear_vram()

    def train(self, dataset: Dataset):
        """Train on ``dataset``, resuming from the latest checkpoint if one exists.

        On ``KeyboardInterrupt`` a checkpoint is written before returning;
        after a complete run the final model and tokenizer are saved to
        ``config.model.output_dir``.

        Args:
            dataset: Untokenized dataset; it is tokenized at the start of
                every epoch.
        """
        training_params = TrainingParams(self.config)
        training_params.calculate_derived_params(len(dataset))

        optimizer, scaler, scheduler = TrainingComponents(
            self.model, self.config.training, training_params.total_train_steps
        ).setup_components()

        checkpoint_manager = CheckpointManager(
            self.config.model.output_dir, self.model, optimizer, scheduler
        )

        # Try to resume from an existing checkpoint.
        self.global_step, self.start_epoch = checkpoint_manager.load_latest_checkpoint()
        self.current_epoch = self.start_epoch

        try:
            for epoch in range(self.start_epoch, training_params.num_train_epochs):
                self.current_epoch = epoch
                self._train_epoch(
                    epoch,
                    training_params,
                    optimizer,
                    scaler,
                    scheduler,
                    checkpoint_manager,
                    dataset,
                )

            self._save_final_model()

        except KeyboardInterrupt:
            logger.info("Training interrupted. Saving final checkpoint...")
            # Use the tracked epoch: the loop variable is unbound if the
            # interrupt arrives before the first iteration starts.
            checkpoint_manager.save_checkpoint(
                self.global_step, self.current_epoch, self.tokenizer
            )

    def _train_epoch(
        self,
        epoch: int,
        params: TrainingParams,
        optimizer: AdamW,
        scaler: GradScaler,
        scheduler: Any,
        checkpoint_manager: CheckpointManager,
        dataset: Dataset,
    ):
        """Tokenize ``dataset`` and train one chunk per configured MLM probability."""
        tokenized_dataset = tokenize_dataset(
            dataset, self.tokenizer, num_proc=self.config.training.num_workers
        )

        for chunk_number, mlm_probability in enumerate(
            self.config.training.mlm_probabilities
        ):
            data_collator = DynamicPaddingDataCollator(
                self.tokenizer, mlm_probability=mlm_probability
            )

            train_dataset, eval_dataset = self._prepare_datachunk(
                chunk_number, params, tokenized_dataset
            )

            self._train_chunk(
                epoch,
                chunk_number,
                mlm_probability,
                params,
                train_dataset,
                eval_dataset,
                data_collator,
                optimizer,
                scaler,
                scheduler,
                checkpoint_manager,
            )

    def _prepare_datachunk(
        self, chunk_number: int, params: TrainingParams, tokenized_dataset
    ) -> Tuple[Dataset, Dataset]:
        """Slice chunk ``chunk_number`` into shuffled train and eval datasets.

        Within a chunk the first ``eval_size_per_chunk`` rows form the
        evaluation slice and the next ``chunk_train_size`` rows the training
        slice.

        Returns:
            ``(train_dataset, eval_dataset)``.
        """
        eval_start_idx = chunk_number * params.chunk_size
        train_start_idx = eval_start_idx + params.eval_size_per_chunk

        train_dataset = (
            tokenized_dataset.skip(train_start_idx)
            .take(params.chunk_train_size)
            .shuffle(seed=42)
        )

        eval_dataset = (
            tokenized_dataset.skip(eval_start_idx)
            .take(params.eval_size_per_chunk)
            .shuffle(seed=42)
        )

        return train_dataset, eval_dataset

    def _train_chunk(
        self,
        epoch: int,
        chunk_number: int,
        mlm_probability: float,
        params: TrainingParams,
        train_dataset: Dataset,
        eval_dataset: Dataset,
        data_collator: DynamicPaddingDataCollator,
        optimizer: AdamW,
        scaler: GradScaler,
        scheduler: Any,
        checkpoint_manager: CheckpointManager,
    ):
        """Train on one chunk, then evaluate it.

        A batch that raises is logged and skipped so a single bad batch does
        not abort the run.
        """
        logger.info(
            f"Epoch {epoch + 1}/{params.num_train_epochs} | MLM Probability: {mlm_probability}"
        )

        # Discard gradients left over from a previous chunk whose batch count
        # was not a multiple of gradient_accumulation_steps; otherwise they
        # would be folded into this chunk's first update.
        optimizer.zero_grad()

        train_iterator = train_dataset.iter(
            batch_size=params.train_batch_size_per_device
        )

        for step, batch in tqdm(
            enumerate(train_iterator), desc=f"Training (MLM {mlm_probability})"
        ):
            accumulation_step_complete = (
                step + 1
            ) % params.gradient_accumulation_steps == 0

            try:
                self._training_step(batch, data_collator, scaler)

                if accumulation_step_complete:
                    self._update_model(optimizer, scaler, scheduler)
                    self.global_step += 1

                    self._periodic_operations(
                        params,
                        eval_dataset,
                        data_collator,
                        optimizer,
                        scaler,
                        scheduler,
                        checkpoint_manager,
                    )

            except Exception as e:
                logger.error(f"Training batch failed: {e}. Skipping.")
                continue

        self._evaluate_chunk(chunk_number, eval_dataset, data_collator, params)

    def _training_step(
        self, batch: dict, data_collator: DynamicPaddingDataCollator, scaler: GradScaler
    ) -> float:
        """Run forward and backward for one micro-batch.

        The loss is divided by ``gradient_accumulation_steps`` before the
        backward pass so the accumulated gradient is an average.

        Returns:
            The unscaled loss value of the micro-batch.
        """
        # from_pretrained() returns models in evaluation mode, and evaluate()
        # (defined elsewhere) may leave the model in eval mode as well; make
        # sure dropout is active whenever gradients are computed.
        self.model.train()
        inputs = data_collator(batch)
        loss = forward_pass(self.model, inputs, self.device)
        scaler.scale(loss / self.config.training.gradient_accumulation_steps).backward()
        return loss.item()

    def _update_model(self, optimizer: AdamW, scaler: GradScaler, scheduler: Any):
        """Apply the accumulated gradients, advance the LR schedule and reset gradients."""
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        optimizer.zero_grad()

    def _periodic_operations(
        self,
        params: TrainingParams,
        eval_dataset: Dataset,
        data_collator: DynamicPaddingDataCollator,
        optimizer: AdamW,
        scaler: GradScaler,
        scheduler: Any,
        checkpoint_manager: CheckpointManager,
    ):
        """Run the step-based housekeeping after an optimizer update.

        Depending on ``global_step`` this evaluates (four times per
        ``total_steps_per_epoch``), saves a checkpoint (every
        ``push_interval`` steps), logs GPU memory (every ``memory_log_steps``)
        and empties the CUDA cache (every ``clear_memory_steps``).
        ``optimizer``, ``scaler`` and ``scheduler`` are accepted but not used.
        """
        eval_interval = max(1, params.total_steps_per_epoch // 4)
        if self.global_step % eval_interval == 0:
            eval_loss = evaluate(
                self.model,
                eval_dataset,
                data_collator,
                batch_size=params.train_batch_size_per_device,
                device=self.device,
            )
            logger.info(f"Evaluation loss at step {self.global_step}: {eval_loss}")

        if self.global_step % self.config.training.push_interval == 0:
            # Record the epoch in progress, not the epoch the run started
            # from, so that resuming does not repeat finished epochs.
            checkpoint_manager.save_checkpoint(
                self.global_step, self.current_epoch, self.tokenizer
            )

        if self.global_step % params.memory_log_steps == 0:
            log_gpu_memory_usage()

        if (
            self.device.type == "cuda"
            and self.global_step % params.clear_memory_steps == 0
        ):
            torch.cuda.empty_cache()

    def _evaluate_chunk(
        self,
        chunk_number: int,
        eval_dataset: Dataset,
        data_collator: DynamicPaddingDataCollator,
        params: TrainingParams,
    ):
        """Evaluate on the chunk's evaluation slice and log the loss."""
        logger.info(f"Evaluating at the end of chunk {chunk_number}...")
        eval_loss = evaluate(
            self.model,
            eval_dataset,
            data_collator,
            batch_size=params.train_batch_size_per_device,
            device=self.device,
        )
        logger.info(f"Chunk evaluation loss: {eval_loss}")

    def _save_final_model(self):
        """Save the trained model and tokenizer to ``config.model.output_dir``."""
        logger.info("Training complete. Saving final model...")
        self.model.save_pretrained(self.config.model.output_dir)
        self.tokenizer.save_pretrained(self.config.model.output_dir)

In [34]:
# Load the configuration from config.yml (path relative to the working directory).
config_path = "config.yml"
config = parse_yaml(config_path)

# Data preprocessing
data_preprocessor = DataPreprocessor(config.dataset)
dataset = data_preprocessor.load_and_prepare_data()

# Model initialization
model_manager = ModelManager(config.model, config.training)
model, tokenizer = model_manager.initialize_model()

2025-02-19 03:48:14,663 - __main__ - INFO - Loading dataset...
2025-02-19 03:48:18,744 - __main__ - INFO - Preparing sentences...
2025-02-19 03:48:18,835 - __main__ - INFO - Loading tokenizer and model...
Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2025-02-19 03:48:19,958 - __main__ - INFO - Maior ID no tokenizador: 32767
2025-02-19 03

In [35]:
# VRAM usage after the model has been loaded (reading taken before clear_vram()).
log_gpu_memory_usage()

2025-02-19 03:48:24,297 - __main__ - INFO - 
Uso de VRAM pela GPU:
+------------------------+---------+--------------+
| Descrição              |   Valor | Percentual   |
| Total de Memória (MB)  |    3904 | 100%         |
+------------------------+---------+--------------+
| Memória Alocada (MB)   |    1200 | 31%          |
+------------------------+---------+--------------+
| Memória Reservada (MB) |    1270 | 33%          |
+------------------------+---------+--------------+


In [36]:
# Run garbage collection and release cached CUDA memory.
clear_vram()

2025-02-19 03:48:30,692 - __main__ - INFO - VRAM cleared definitively.


In [37]:
# Re-check VRAM after clear_vram(). The recorded output shows the same figures
# as the previous reading (1200 MB allocated, 1270 MB reserved).
# NOTE(review): presumably because the loaded model is still referenced — confirm.
log_gpu_memory_usage()

2025-02-19 03:48:34,891 - __main__ - INFO - 
Uso de VRAM pela GPU:
+------------------------+---------+--------------+
| Descrição              |   Valor | Percentual   |
| Total de Memória (MB)  |    3904 | 100%         |
+------------------------+---------+--------------+
| Memória Alocada (MB)   |    1200 | 31%          |
+------------------------+---------+--------------+
| Memória Reservada (MB) |    1270 | 33%          |
+------------------------+---------+--------------+


In [None]:
# Training
# Trainer.__init__ calls clear_vram(); train() first tries to resume from the
# latest checkpoint under config.model.output_dir.
trainer = Trainer(config, model, tokenizer)
trainer.train(dataset)

2025-02-19 03:48:38,387 - __main__ - INFO - VRAM cleared definitively.


Map (num_proc=4):   0%|          | 0/1001 [00:00<?, ? examples/s]

2025-02-19 03:48:39,509 - __main__ - INFO - Epoch 1/3 | MLM Probability: 0.05
2025-02-19 03:48:40,288 - __main__ - INFO - 
Uso de VRAM pela GPU:
+------------------------+---------+--------------+
| Descrição              |   Valor | Percentual   |
| Total de Memória (MB)  |    3904 | 100%         |
+------------------------+---------+--------------+
| Memória Alocada (MB)   |    1296 | 33%          |
+------------------------+---------+--------------+
| Memória Reservada (MB) |    2252 | 58%          |
+------------------------+---------+--------------+
Training (MLM 0.05): 3it [00:01,  2.43it/s]
[Aluating: 0it [00:00, ?it/s]
[Aluating: 1it [00:00,  6.68it/s]
[Aluating: 2it [00:00,  6.38it/s]
[Aluating: 3it [00:00,  2.75it/s]
[Aluating: 4it [00:01,  3.80it/s]
Evaluating: 5it [00:01,  4.20it/s]
2025-02-19 03:48:42,464 - __main__ - INFO - Evaluation loss at step 31: 10.999372100830078
2025-02-19 03:48:50,058 - __main__ - INFO - 
Uso de VRAM pela GPU:
+------------------------+-----

In [47]:
# Display the TrainingConfig class object (inspection cell; the recorded
# execution count is out of sequence with the cells around it).
TrainingConfig

__main__.TrainingConfig

In [43]:
# Display the configuration held by the trainer.
# NOTE(review): the recorded output contains an `optimization=OptimizationConfig(...)`
# section that MLMTrainingConfig as defined in this notebook does not declare;
# the output appears to come from an earlier schema — re-run to refresh.
trainer.config

MLMTrainingConfig(model=ModelConfig(base_id='neuralmind/bert-base-portuguese-cased', tokenizer_path='domain_tokenizer', output_dir='bert-base-portuguese-cased-ptbr-test'), dataset=DatasetConfig(id='emdemor/news-of-the-brazilian-newspaper', max_news=1234, max_sentences=1001, eval_ratio=0.1), training=TrainingConfig(num_train_epochs=3, train_batch_size=4, gradient_accumulation_steps=2, learning_rate=0.005, weight_decay=0.01, warmup_steps=0, mlm_probabilities=[0.05, 0.1, 0.15, 0.2, 0.3], total_save_limit=2), optimization=OptimizationConfig(use_flash_attention=False, fp16=True, push_interval=10000, num_workers=4), execution=ExecutionConfig(testing=False, seed=42, device='auto'))