# 1. Imports

In [2]:
import os
import logging
import math
import re
import shutil
import argparse
from dataclasses import dataclass
from itertools import product
from typing import Any, Dict, List, Optional

import pandas as pd
import tabulate
import torch
import torch.nn as nn
from datasets import Dataset, load_dataset
from pydantic import BaseModel, field_validator
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import (
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    get_linear_schedule_with_warmup,
)




# 2. Constantes e Configurações

In [3]:
# Logging setup: every record at INFO level or above is written both to the
# "training.log" file and to the console stream.
logging.basicConfig(
    handlers=[
        logging.FileHandler("training.log"),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by the helpers and the training script below.
logger = logging.getLogger(__name__)

In [7]:
def parse_args(argv=None):
    """Parse the command-line options of the Portuguese MLM training script.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) ``argparse`` falls back to ``sys.argv[1:]``, which is the
            original behaviour. Passing an explicit list (e.g. ``[]``) makes
            the function usable from a notebook kernel, whose own ``sys.argv``
            contains arguments this parser does not know, and from tests.

    Returns:
        argparse.Namespace with one attribute per option. ``output_dir`` is
        always populated: when it is not given it is derived from the last
        path component of ``model_id`` and from the ``--testing`` flag.
    """
    parser = argparse.ArgumentParser(description="Treinar modelo MLM em português")

    # Main arguments
    parser.add_argument("--model_id", default="answerdotai/ModernBERT-base",
                        help="ID do modelo base no Hugging Face Hub")
    parser.add_argument("--dataset_id", default="emdemor/news-of-the-brazilian-newspaper",
                        help="ID do dataset no Hugging Face Hub")
    parser.add_argument("--tokenizer_path", default="domain_tokenizer",
                        help="Caminho para o tokenizador")
    parser.add_argument("--output_dir", default=None,
                        help="Diretório para salvar o modelo treinado")

    # Training settings
    parser.add_argument("--num_train_epochs", type=int, default=3,
                        help="Número de épocas de treinamento")
    parser.add_argument("--train_batch_size", type=int, default=4,
                        help="Tamanho do batch por dispositivo")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=2,
                        help="Passos de acumulação de gradiente")
    parser.add_argument("--learning_rate", type=float, default=5e-3,
                        help="Taxa de aprendizado inicial")
    parser.add_argument("--weight_decay", type=float, default=0.01,
                        help="Peso de decaimento")
    parser.add_argument("--warmup_steps", type=int, default=0,
                        help="Passos de aquecimento para o scheduler")

    # Data settings
    parser.add_argument("--max_news", type=int, default=3535,
                        help="Número máximo de notícias a usar")
    parser.add_argument("--max_sentences", type=int, default=3456,
                        help="Número máximo de sentenças")
    parser.add_argument("--eval_ratio", type=float, default=0.1,
                        help="Fração dos dados usada para validação")

    # Optimizations
    parser.add_argument("--use_flash_attention", action="store_true",
                        help="Usar Flash Attention se disponível")
    parser.add_argument("--fp16", action="store_true",
                        help="Usar treinamento em precisão mista")
    parser.add_argument("--push_interval", type=int, default=10000,
                        help="Intervalo de steps para push para o Hub")
    parser.add_argument("--num_workers", type=int, default=4,
                        help="Número de workers para processamento de dados")

    # Test mode
    parser.add_argument("--testing", action="store_true",
                        help="Executar em modo de teste")

    args = parser.parse_args(argv)

    # Derive the output directory from the model name when it was not given.
    if args.output_dir is None:
        model_name = args.model_id.split("/")[-1]
        args.output_dir = f"{model_name}-ptbr-{'test' if args.testing else 'full'}"

    return args

# 3. Classes e funções utilitárias

In [4]:
class TrainingConfig(BaseModel):
    """Sizes and step counts that drive the chunked MLM training run.

    The dataset is divided into ``num_chunks`` equally sized chunks; each chunk
    reserves ``eval_size_per_chunk`` examples for evaluation and uses the rest
    for training. All derived quantities are exposed as read-only properties.
    """

    dataset_size: int
    num_train_epochs: int
    num_chunks: int
    train_batch_size_per_device: int
    gradient_accumulation_steps: int
    eval_size_ratio: float
    total_save_limit: int

    @field_validator("num_chunks")
    @classmethod
    def validate_num_chunks(cls, v, info):
        """Reject non-positive chunk counts, which the properties divide by."""
        if v <= 0:
            raise ValueError(f"num_chunks ({v}) deve ser maior que zero")
        return v

    @field_validator("eval_size_ratio")
    @classmethod
    def validate_eval_size_ratio(cls, v, info):
        """Ensure every chunk keeps at least one training example.

        The check lives on ``eval_size_ratio`` because pydantic validates
        fields in declaration order and ``info.data`` only holds fields that
        were already validated; this is the last of the three fields involved.
        The arithmetic mirrors the ``eval_size_per_chunk`` and
        ``available_size`` properties.
        """
        data = info.data
        # Either key is absent when that field failed its own validation.
        if "dataset_size" in data and "num_chunks" in data:
            dataset_size = data["dataset_size"]
            num_chunks = data["num_chunks"]
            eval_size_per_chunk = int(dataset_size * v / num_chunks)
            available_size = dataset_size - eval_size_per_chunk * num_chunks
            if available_size < num_chunks:
                raise ValueError(
                    f"available_size ({available_size}) deve ser maior ou igual a num_chunks ({num_chunks})"
                )
        return v

    @property
    def effective_batch_size(self):
        """Per-device batch size multiplied by the gradient accumulation steps."""
        return self.train_batch_size_per_device * self.gradient_accumulation_steps

    @property
    def total_steps_per_epoch(self):
        """``dataset_size`` divided by the effective batch size, rounded up."""
        return math.ceil(self.dataset_size / self.effective_batch_size)

    @property
    def total_train_steps(self):
        """Steps per epoch multiplied by the number of epochs."""
        return self.total_steps_per_epoch * self.num_train_epochs

    @property
    def eval_size_per_chunk(self):
        """Number of evaluation examples reserved in each chunk (truncated)."""
        return int(self.dataset_size * self.eval_size_ratio / self.num_chunks)

    @property
    def available_size(self):
        """Examples left for training once every chunk's eval slice is removed."""
        return self.dataset_size - self.eval_size_per_chunk * self.num_chunks

    @property
    def eval_size(self):
        """Total number of examples reserved for evaluation across all chunks."""
        return self.dataset_size - self.available_size

    @property
    def chunk_size(self):
        """Number of examples in each chunk (floor division)."""
        return self.dataset_size // self.num_chunks

    @property
    def chunk_train_size(self):
        """Number of training examples in each chunk (floor division)."""
        return self.available_size // self.num_chunks

    def _format_table(self):
        """Render the stored fields and derived values as a grid table."""
        data = [
            ["num_train_epochs", self.num_train_epochs],
            ["dataset_size", self.dataset_size],
            ["num_chunks", self.num_chunks],
            ["chunk_size", self.chunk_size],
            ["chunk_train_size", self.chunk_train_size],
            ["eval_size_per_chunk", self.eval_size_per_chunk],
            ["eval_size_ratio", self.eval_size_ratio],
            ["available_size", self.available_size],
            ["eval_size", self.eval_size],
            ["train_batch_size_per_device", self.train_batch_size_per_device],
            ["gradient_accumulation_steps", self.gradient_accumulation_steps],
            ["total_save_limit", self.total_save_limit],
            ["effective_batch_size", self.effective_batch_size],
            ["total_steps_per_epoch", self.total_steps_per_epoch],
            ["total_train_steps", self.total_train_steps],
        ]

        return tabulate.tabulate(data, headers=["Attribute", "Value"], tablefmt="grid")

    def __repr__(self):
        return self._format_table()

    def __str__(self):
        return self._format_table()

In [5]:
def split_into_sentences(text: str) -> List[str]:
    """
    Split a text into sentences.

    A boundary is any run of whitespace that immediately follows ``.``, ``!``
    or ``?``. Each piece is stripped and empty pieces are dropped.

    Args:
        text: Text to split

    Returns:
        List of non-empty, stripped sentences in their original order
    """
    sentences: List[str] = []
    for fragment in re.split(r"(?<=[.!?])\s+", text):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences

def set_attention(model, use_flash_attention=False):
    """
    Optionally patch the model's attention modules to use Flash Attention.

    When ``use_flash_attention`` is false the model is returned untouched.
    Otherwise a small probe checks that CUDA and ``flash_attn`` work, and every
    ``nn.MultiheadAttention`` sub-module gets a ``FlashAttention`` instance
    assigned to its ``attention`` attribute. Failures are logged, never raised.

    Args:
        model: Model to configure
        use_flash_attention: Whether Flash Attention should be attempted

    Returns:
        The same model object (patched in place when possible)
    """
    if not use_flash_attention:
        return model

    def check_flash_attention_support():
        # Probe with a tiny fp16 tensor; any import or kernel failure means
        # Flash Attention cannot be used in this environment.
        if not torch.cuda.is_available():
            return False
        try:
            from flash_attn import flash_attn_qkvpacked_func
            qkv = torch.randn(1, 1, 3, 16, 64, dtype=torch.float16, device="cuda")
            flash_attn_qkvpacked_func(qkv, causal=False)
            return True
        except (ImportError, RuntimeError) as e:
            logger.warning(f"Flash Attention não é compatível: {str(e)}")
            return False

    if not check_flash_attention_support():
        return model

    logger.info("Replacing standard attention with FlashAttention...")
    replaced = 0
    try:
        # NOTE(review): confirm the installed flash_attn release exposes
        # `FlashAttention` at the package top level; if it does not, this
        # import fails and is reported by the handler below.
        from flash_attn import FlashAttention
        for module in model.modules():
            if isinstance(module, nn.MultiheadAttention):
                module.attention = FlashAttention()
                replaced += 1
    except Exception as e:
        logger.error(f"Failed to integrate FlashAttention: {str(e)}")
        return model

    if replaced:
        logger.info("FlashAttention integrated successfully.")
    else:
        # Only nn.MultiheadAttention is patched. A model that implements its
        # own attention classes is left unchanged, so do not claim success.
        # NOTE(review): for Hugging Face models, presumably the supported route
        # is the `attn_implementation` argument of `from_pretrained` - confirm.
        logger.warning(
            "No nn.MultiheadAttention modules found; FlashAttention was not applied."
        )

    return model

def check_vocab_size(tokenizer, model):
    """
    Verify that every tokenizer ID fits inside the model's vocabulary.

    The largest ID in ``tokenizer.get_vocab()`` and ``model.config.vocab_size``
    are logged, then compared.

    Args:
        tokenizer: Tokenizer to check
        model: Model to check

    Raises:
        AssertionError: If the largest token ID is not smaller than the
            model's vocabulary size
    """
    max_token_id = max(tokenizer.get_vocab().values())
    vocab_size = model.config.vocab_size
    logger.info(f"Maior ID no tokenizador: {max_token_id}")
    logger.info(f"Tamanho do vocabulário do modelo: {vocab_size}")
    # Raise explicitly instead of using `assert`, so the check still runs when
    # Python is started with -O (which strips assert statements).
    if max_token_id >= vocab_size:
        raise AssertionError("IDs de tokens fora do intervalo!")

def tokenize_function(examples, tokenizer, target_column="text"):
    """
    Tokenize one batch of dataset examples.

    Args:
        examples: Batch of examples, indexable by column name
        tokenizer: Callable tokenizer applied to the selected column
        target_column: Name of the column that holds the text

    Returns:
        Whatever the tokenizer returns for that column, requested with
        ``return_special_tokens_mask=True``
    """
    texts = examples[target_column]
    encoded = tokenizer(texts, return_special_tokens_mask=True)
    return encoded

def tokenize_dataset(dataset, tokenizer, num_proc=4):
    """
    Tokenize a whole dataset through a batched, parallel ``map``.

    All original columns are removed, so the result only carries what
    ``tokenize_function`` produces.

    Args:
        dataset: Dataset to tokenize
        tokenizer: Tokenizer to use
        num_proc: Number of worker processes handed to ``map``

    Returns:
        Tokenized dataset
    """
    def _tokenize_batch(examples):
        # Bind the tokenizer so `map` only has to supply the batch.
        return tokenize_function(examples, tokenizer)

    original_columns = dataset.column_names
    return dataset.map(
        _tokenize_batch,
        batched=True,
        remove_columns=original_columns,
        num_proc=num_proc,
    )

class DynamicPaddingDataCollator(DataCollatorForLanguageModeling):
    """
    MLM data collator that pads every batch to its own longest sequence.

    It is called with a column-oriented batch: a mapping whose "input_ids"
    and "attention_mask" entries each hold one sequence per example.
    """

    def __call__(self, examples: Dict[str, Any]) -> Dict[str, torch.Tensor]:
        all_ids = examples["input_ids"]
        # The longest sequence in this batch sets the padded length.
        longest = max(len(seq) for seq in all_ids)
        all_masks = examples["attention_mask"]

        features = []
        for seq, seq_mask in zip(all_ids, all_masks):
            missing = longest - len(seq)
            if missing > 0:
                # Right-pad the ids with the pad token and the mask with zeros.
                seq_tensor = torch.tensor(seq + [self.tokenizer.pad_token_id] * missing)
                mask_tensor = torch.tensor(seq_mask + [0] * missing)
            else:
                # Already at the batch maximum: the slice keeps it unchanged.
                seq_tensor = torch.tensor(seq[:longest])
                mask_tensor = torch.tensor(seq_mask[:longest])
            features.append({"input_ids": seq_tensor, "attention_mask": mask_tensor})

        # The parent collator's torch_call does the rest of the collation,
        # then shapes and dtypes are normalised.
        collated = self.torch_call(features)
        return fix_batch_inputs(collated)

def fix_batch_inputs(inputs: dict) -> dict:
    """
    Normalise the shape and dtype of the model input tensors, in place.

    For "input_ids", "attention_mask" and "token_type_ids" (when present), a
    3-D tensor whose first dimension is 1 is squeezed to 2-D; any other tensor
    with more than two dimensions is rejected. "input_ids" is cast to
    ``torch.long`` if it has another dtype.

    Args:
        inputs: Mapping with the input tensors

    Returns:
        The same mapping, with corrected tensors

    Raises:
        ValueError: If a tensor has an unexpected shape
    """
    for name in ("input_ids", "attention_mask", "token_type_ids"):
        if name not in inputs:
            continue
        tensor = inputs[name]
        has_leading_singleton = tensor.dim() == 3 and tensor.shape[0] == 1
        if has_leading_singleton:
            inputs[name] = tensor.squeeze(0)
        elif tensor.dim() > 2:
            raise ValueError(
                f"Unexpected tensor shape for {name}: {tensor.shape}"
            )

    if "input_ids" in inputs:
        ids = inputs["input_ids"]
        if ids.dtype != torch.long:
            inputs["input_ids"] = ids.long()
    return inputs

def forward_pass(model, inputs, device):
    """
    Run one forward pass and return the loss.

    The inputs are normalised with ``fix_batch_inputs``, moved to ``device``
    and fed to the model under CUDA autocast (enabled only when the device
    type is "cuda").

    Args:
        model: Model to run
        inputs: Model inputs
        device: Device the model lives on

    Returns:
        Loss computed by the model

    Raises:
        ValueError: If the model does not return a loss
    """
    normalised = fix_batch_inputs(inputs)
    on_device = {name: value.to(device) for name, value in normalised.items()}
    use_autocast = device.type == "cuda"
    with torch.amp.autocast("cuda", enabled=use_autocast):
        outputs = model(**on_device, return_dict=True)
    loss = outputs.loss
    if loss is None:
        raise ValueError("Model did not return a loss.")
    return loss

def evaluate(model, eval_dataset, data_collator, batch_size, device):
    """
    Compute the mean loss of the model over the evaluation dataset.

    The model is switched to eval mode for the duration of the call and is
    always switched back to train mode afterwards, even when iteration is
    interrupted or raises outside the per-batch handler.

    Args:
        model: Model to evaluate
        eval_dataset: Evaluation dataset (iterated with ``.iter(batch_size=...)``)
        data_collator: Collator that turns a raw batch into model inputs
        batch_size: Batch size used for evaluation
        device: Device the model lives on

    Returns:
        Mean of the per-batch losses, or ``float("inf")`` when no batch
        produced a loss
    """
    model.eval()
    losses = []
    try:
        # forward_pass already applies autocast, so only gradient tracking
        # needs to be disabled here.
        with torch.no_grad():
            eval_iterator = eval_dataset.iter(batch_size=batch_size)
            for batch in tqdm(eval_iterator, desc="Evaluating"):
                try:
                    inputs = data_collator(batch)
                    loss = forward_pass(model, inputs, device)
                    losses.append(loss.item())
                except Exception as e:
                    # Best effort: one bad batch must not abort the evaluation.
                    logger.warning(f"Evaluation batch failed: {e}. Skipping.")
                    continue
    finally:
        # Restore train mode on every exit path (including KeyboardInterrupt),
        # so the caller never resumes training with the model in eval mode.
        model.train()

    average_loss = sum(losses) / len(losses) if losses else float("inf")
    return average_loss

def save_checkpoint(model, optimizer, scheduler, tokenizer, global_step, epoch, output_dir):
    """
    Write a training checkpoint to ``<output_dir>/checkpoint-<global_step>``.

    The directory receives a "training_state.pt" file (epoch, global step and
    the model, optimizer and scheduler state dicts) plus the model and the
    tokenizer saved through their ``save_pretrained`` methods.

    Args:
        model: Model to save
        optimizer: Optimizer to save
        scheduler: Scheduler to save
        tokenizer: Tokenizer to save
        global_step: Current global step
        epoch: Current epoch
        output_dir: Base directory for checkpoints
    """
    checkpoint_dir = os.path.join(output_dir, f"checkpoint-{global_step}")
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Everything load_checkpoint needs to resume training.
    training_state = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'global_step': global_step,
    }
    state_path = os.path.join(checkpoint_dir, "training_state.pt")
    torch.save(training_state, state_path)

    # Hugging Face format copies of the model and the tokenizer.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(checkpoint_dir)

    logger.info(f"Saved checkpoint at step {global_step} to {checkpoint_dir}")

def load_checkpoint(model, optimizer, scheduler, checkpoint_path, map_location="cpu"):
    """
    Restore model, optimizer and scheduler state from a training checkpoint.

    Args:
        model: Model to restore into
        optimizer: Optimizer to restore into
        scheduler: Scheduler to restore into
        checkpoint_path: Checkpoint directory containing "training_state.pt"
        map_location: Passed to ``torch.load``. The default "cpu" lets a
            checkpoint written on a GPU be restored on a host without one;
            the ``load_state_dict`` calls below then copy the values into the
            existing objects.

    Returns:
        Tuple of (model, optimizer, scheduler, global_step, epoch)
    """
    logger.info(f"Loading checkpoint from {checkpoint_path}")
    state_path = os.path.join(checkpoint_path, "training_state.pt")
    checkpoint = torch.load(state_path, map_location=map_location)

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    return model, optimizer, scheduler, checkpoint['global_step'], checkpoint['epoch']


# 4. Main

In [21]:
import yaml
import argparse
import sys

# Step 1: load the run configuration from the YAML file.
# The encoding is explicit so the file is read the same way on every platform.
with open('config.yml', 'r', encoding='utf-8') as file:
    args = yaml.safe_load(file)

# An empty or non-mapping file would only fail later with an obscure
# TypeError on the first args[...] lookup, so fail early with a clear message.
if not isinstance(args, dict):
    raise ValueError("config.yml must contain a top-level mapping of settings")


In [None]:

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Setup MLM probabilities (one data chunk is trained per probability)
mlm_probabilities = [0.05, 0.10, 0.15, 0.20, 0.30]

# Seed every sampling step so that a restarted or resumed run rebuilds the
# same dataset, and therefore the same per-chunk train/eval split.
# An optional "seed" key under "dataset" in the config overrides the default.
data_seed = args["dataset"].get("seed", 42)

# Load dataset
logger.info("Loading dataset...")
raw_dataset = load_dataset(args["dataset"]["id"], split="train")
df = raw_dataset.to_pandas().sample(frac=1, random_state=data_seed).reset_index(drop=True)
sample_df = df.sample(min(args["dataset"]["max_news"], len(df)), random_state=data_seed)

logger.info("Preparing sentences...")
combined_texts = sample_df["text"].to_list() + sample_df["title"].to_list()
sentences = [
    phrase for text in combined_texts if text for phrase in split_into_sentences(text)
]
# Cap the request at what is available: sampling more rows than exist
# (without replacement) raises a ValueError.
num_sentences = min(args["dataset"]["max_sentences"], len(sentences))
sentences_sample = (
    pd.Series(sentences).sample(num_sentences, random_state=data_seed).to_list()
)
dataset = Dataset.from_dict({"text": sentences_sample})

# Setup model and tokenizer
# The tokenizer is loaded from its own path while the model weights come from
# the base model id, so the two vocabularies may differ in size.
logger.info("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(
    args["model"]["tokenizer_path"], clean_up_tokenization_spaces=False
)


config = AutoConfig.from_pretrained(args["model"]["base_id"])

# NOTE(review): confirm that setting `torch_dtype` on the config (rather than
# passing it to `from_pretrained`) actually changes the dtype the weights are
# loaded in, and that fp16 weights are compatible with the GradScaler below.
if args["optimization"]["fp16"]:
    config.torch_dtype = torch.float16

model = AutoModelForMaskedLM.from_pretrained(args["model"]["base_id"], config=config)
# Resize the embedding matrix to the tokenizer's length before training.
model.resize_token_embeddings(len(tokenizer))
model.to(device)

# Setup attention mechanism
model = set_attention(model, args["optimization"]["use_flash_attention"])

# Check vocabulary compatibility
check_vocab_size(tokenizer, model)

# Create training config
# One chunk per MLM probability; `total_save_limit` is stored on the config but
# is not consulted by the checkpoint code visible in this notebook.
training_config = TrainingConfig(
    num_train_epochs=args["training"]["num_train_epochs"],
    dataset_size=len(dataset),
    num_chunks=len(mlm_probabilities),
    train_batch_size_per_device=args["training"]["train_batch_size"],
    gradient_accumulation_steps=args["training"]["gradient_accumulation_steps"],
    eval_size_ratio=args["dataset"]["eval_ratio"],
    total_save_limit=2,
)

logger.info(f"Training configuration:\n{training_config}")

# Tokenize dataset
logger.info("Tokenizing dataset...")
tokenized_dataset = tokenize_dataset(dataset, tokenizer, num_proc=args["optimization"]["num_workers"])

# Setup optimizer, scheduler and scaler
optimizer = AdamW(model.parameters(), lr=args["training"]["learning_rate"], weight_decay=args["training"]["weight_decay"])
# Gradient scaling is active only on CUDA with the fp16 option enabled.
scaler = torch.amp.GradScaler(enabled=(device.type == "cuda" and args["optimization"]["fp16"]))

# Linear warmup/decay over `total_train_steps`, which TrainingConfig derives
# from the full dataset size (evaluation examples included).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=args["training"]["warmup_steps"],
    num_training_steps=training_config.total_train_steps,
)

# Check for existing checkpoints
# Defaults for a fresh run; overwritten below when a checkpoint is found.
global_step = 0
start_epoch = 0
latest_checkpoint = None

# NOTE(review): assumes args["model"]["output_dir"] is a path string set in
# config.yml - confirm it cannot be null.
if os.path.exists(args["model"]["output_dir"]):
    checkpoints = [d for d in os.listdir(args["model"]["output_dir"]) if d.startswith("checkpoint-")]
    if checkpoints:
        # Pick the checkpoint with the highest step number. The int() parse
        # assumes every matching entry is named "checkpoint-<integer>".
        latest_checkpoint = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))[-1]
        checkpoint_path = os.path.join(args["model"]["output_dir"], latest_checkpoint)
        # Training restarts at the first chunk of the epoch stored in the checkpoint.
        model, optimizer, scheduler, global_step, start_epoch = load_checkpoint(
            model, optimizer, scheduler, checkpoint_path
        )
else:
    os.makedirs(args["model"]["output_dir"], exist_ok=True)

# Training loop
logger.info("Starting training...")
model.train()

# Both intervals are loop-invariant, so they are computed once up front.
eval_interval = max(1, training_config.total_steps_per_epoch // 4)
save_interval = args["optimization"]["push_interval"]

# Bind `epoch` before the loop so the interrupt handler below can still write
# a checkpoint when the interrupt arrives before the first iteration.
epoch = start_epoch

try:
    for epoch in range(start_epoch, training_config.num_train_epochs):
        for chunk_number, mlm_probability in enumerate(mlm_probabilities):
            logger.info(
                f"Epoch {epoch + 1}/{training_config.num_train_epochs} | "
                f"MLM Probability: {mlm_probability}"
            )

            # Setup data collator with current MLM probability
            data_collator = DynamicPaddingDataCollator(
                tokenizer, mlm_probability=mlm_probability
            )

            # Split dataset for this chunk: the first `eval_size_per_chunk`
            # rows of the chunk are held out, the rest are used for training.
            eval_start_idx = chunk_number * training_config.chunk_size
            eval_end_idx = eval_start_idx + training_config.eval_size_per_chunk - 1
            train_start_idx = (
                chunk_number * training_config.chunk_size + training_config.eval_size_per_chunk
            )
            train_end_idx = train_start_idx + training_config.chunk_train_size - 1

            logger.info(
                f"Splitting | "
                f"chunk: {eval_start_idx}-{train_end_idx} | "
                f"eval: {eval_start_idx}-{eval_end_idx} | "
                f"train: {train_start_idx}-{train_end_idx}"
            )

            train_dataset = (
                tokenized_dataset.skip(train_start_idx)
                .take(training_config.chunk_train_size)
                .shuffle(seed=42)
            )

            eval_dataset = (
                tokenized_dataset.skip(eval_start_idx)
                .take(training_config.eval_size_per_chunk)
                .shuffle(seed=42)
            )

            # The accumulation counter restarts with every chunk, so drop any
            # gradients left from an incomplete window of the previous chunk
            # instead of mixing them into this chunk's first update.
            optimizer.zero_grad()

            # Train on this chunk
            train_iterator = train_dataset.iter(
                batch_size=training_config.train_batch_size_per_device
            )

            for step, batch in tqdm(
                enumerate(train_iterator), desc=f"Training (MLM {mlm_probability})"
            ):
                # Check if accumulation step is complete
                accumulation_step_complete = (
                    step + 1
                ) % training_config.gradient_accumulation_steps == 0

                try:
                    # Forward pass
                    inputs = data_collator(batch)
                    loss = forward_pass(model, inputs, device)

                    # Backward pass with gradient scaling
                    scaler.scale(
                        loss / training_config.gradient_accumulation_steps
                    ).backward()

                    if accumulation_step_complete:
                        # Update model parameters
                        scaler.step(optimizer)
                        scaler.update()
                        scheduler.step()
                        optimizer.zero_grad()

                        # Update global step
                        global_step += 1

                        # Evaluate periodically
                        if global_step % eval_interval == 0:
                            eval_loss = evaluate(
                                model,
                                eval_dataset,
                                data_collator,
                                batch_size=training_config.train_batch_size_per_device,
                                device=device
                            )
                            logger.info(f"Evaluation loss at step {global_step}: {eval_loss}")

                        # Save checkpoint periodically. A missing or zero
                        # interval disables periodic saving instead of raising
                        # ZeroDivisionError on every step.
                        if save_interval and global_step % save_interval == 0:
                            save_checkpoint(
                                model, optimizer, scheduler, tokenizer,
                                global_step, epoch, args["model"]["output_dir"],
                            )

                        # Clear CUDA cache periodically
                        if device.type == "cuda" and global_step % 100 == 0:
                            torch.cuda.empty_cache()

                except Exception as e:
                    # Best effort: skip the failing batch, but keep the
                    # traceback in the log so repeated failures can be diagnosed.
                    logger.exception(f"Training batch failed: {e}. Skipping.")
                    continue

            # Evaluate at the end of each chunk
            logger.info(f"Evaluating at the end of chunk {chunk_number}...")
            eval_loss = evaluate(
                model,
                eval_dataset,
                data_collator,
                batch_size=training_config.train_batch_size_per_device,
                device=device
            )
            logger.info(f"Chunk {chunk_number} evaluation loss: {eval_loss}")

        # Save checkpoint at the end of each epoch. The stored epoch is the
        # NEXT one to run, so resuming from this checkpoint does not repeat
        # the epoch that has just finished.
        save_checkpoint(
            model, optimizer, scheduler, tokenizer,
            global_step, epoch + 1, args["model"]["output_dir"],
        )

    # Save final model
    logger.info("Training complete. Saving final model...")
    model.save_pretrained(args["model"]["output_dir"])
    tokenizer.save_pretrained(args["model"]["output_dir"])

except KeyboardInterrupt:
    # Mid-epoch interruption: store the current epoch so a resumed run
    # restarts it from its first chunk.
    logger.info("Training interrupted by user. Saving checkpoint...")
    save_checkpoint(
        model, optimizer, scheduler, tokenizer,
        global_step, epoch, args["model"]["output_dir"],
    )

# Logged on both exit paths (normal completion and user interruption).
logger.info("Training process completed.")


2025-02-16 17:26:29,367 - __main__ - INFO - Using device: cuda
2025-02-16 17:26:29,367 - __main__ - INFO - Loading dataset...
2025-02-16 17:26:33,399 - __main__ - INFO - Preparing sentences...
2025-02-16 17:26:33,572 - __main__ - INFO - Loading tokenizer and model...
2025-02-16 17:26:34,467 - __main__ - INFO - Maior ID no tokenizador: 32767
2025-02-16 17:26:34,467 - __main__ - INFO - Tamanho do vocabulário do modelo: 32768
2025-02-16 17:26:34,468 - __main__ - INFO - Training configuration:
+-----------------------------+---------+
| Attribute                   |   Value |
| num_train_epochs            |     3   |
+-----------------------------+---------+
| dataset_size                |  3456   |
+-----------------------------+---------+
| num_chunks                  |     5   |
+-----------------------------+---------+
| chunk_size                  |   691   |
+-----------------------------+---------+
| chunk_train_size            |   622   |
+-----------------------------+---------+
|

Map (num_proc=4):   0%|          | 0/3456 [00:00<?, ? examples/s]

2025-02-16 17:26:35,017 - __main__ - INFO - Starting training...
2025-02-16 17:26:35,018 - __main__ - INFO - Epoch 1/3 | MLM Probability: 0.05
2025-02-16 17:26:35,023 - __main__ - INFO - Splitting | chunk: 0-690 | eval: 0-68 | train: 69-690


Training (MLM 0.05): 0it [00:00, ?it/s]

2025-02-16 17:26:45,108 - __main__ - INFO - Evaluating at the end of chunk 0...


Evaluating: 0it [00:00, ?it/s]

2025-02-16 17:26:45,475 - __main__ - INFO - Chunk 0 evaluation loss: nan
2025-02-16 17:26:45,476 - __main__ - INFO - Epoch 1/3 | MLM Probability: 0.1
2025-02-16 17:26:45,481 - __main__ - INFO - Splitting | chunk: 691-1381 | eval: 691-759 | train: 760-1381


Training (MLM 0.1): 0it [00:00, ?it/s]

Evaluating: 0it [00:00, ?it/s]

2025-02-16 17:26:49,417 - __main__ - INFO - Evaluation loss at step 108: 8.594348033269247
2025-02-16 17:26:55,390 - __main__ - INFO - Evaluating at the end of chunk 1...


Evaluating: 0it [00:00, ?it/s]

2025-02-16 17:26:55,763 - __main__ - INFO - Chunk 1 evaluation loss: 8.538917594485813
2025-02-16 17:26:55,763 - __main__ - INFO - Epoch 1/3 | MLM Probability: 0.15
2025-02-16 17:26:55,764 - __main__ - INFO - Splitting | chunk: 1382-2072 | eval: 1382-1450 | train: 1451-2072


Training (MLM 0.15): 0it [00:00, ?it/s]

Evaluating: 0it [00:00, ?it/s]

2025-02-16 17:27:03,221 - __main__ - INFO - Evaluation loss at step 216: 8.113239261839125
2025-02-16 17:27:05,304 - __main__ - INFO - Evaluating at the end of chunk 2...


Evaluating: 0it [00:00, ?it/s]

2025-02-16 17:27:05,668 - __main__ - INFO - Chunk 2 evaluation loss: 8.423120816548666
2025-02-16 17:27:05,669 - __main__ - INFO - Epoch 1/3 | MLM Probability: 0.2
2025-02-16 17:27:05,669 - __main__ - INFO - Splitting | chunk: 2073-2763 | eval: 2073-2141 | train: 2142-2763


Training (MLM 0.2): 0it [00:00, ?it/s]

2025-02-16 17:27:11,748 - __main__ - INFO - Training interrupted by user. Saving checkpoint...
