In [1]:
import math
import os
import random
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import KFold
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    logging
)

logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Configuration: paths, column names, and reproducibility settings.

# Location of the base checkpoint passed to from_pretrained. The default is a
# machine-specific absolute path; set the BERTIMBAU_CHECKPOINT environment
# variable to point somewhere else without editing the notebook.
MODEL_CHECKPOINT = os.environ.get(
    "BERTIMBAU_CHECKPOINT",
    "C:\\Users\\Franco\\Desktop\\projetos\\bert\\bert-large-portuguese-cased",
)
DATASET_PATH = "./data/train_mlm.csv"  # CSV of unique review texts used for MLM
TEXT_COLUMN = "review"  # name of the column that holds the raw text

OUTPUT_BASE = "./bertimbau_dapt_runs"  # root folder for search runs and result files
CHUNK_SIZE = 512  # tokenizer max_length and chunk length used when grouping texts
SEED = 42
# Seed both global RNGs: the hyperparameter sampler draws from `random` and `np.random`.
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Load the raw semicolon-separated training file.
df = pd.read_csv("data/train2024.csv", sep=";")

# Keep a single occurrence of each review text, with a fresh 0..n-1 index.
df_mlm = df.loc[:, [TEXT_COLUMN]].drop_duplicates(ignore_index=True)

n_rows_original, n_rows_unique = len(df), len(df_mlm)
print(f"Original rows: {n_rows_original}")
print(f"Unique reviews: {n_rows_unique}")

# Persist the de-duplicated texts (written with to_csv's default "," delimiter).
df_mlm.to_csv(DATASET_PATH, index=False)

Original rows: 4828
Unique reviews: 1322


In [None]:
# Tokenizer that matches the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# DATASET_PATH is produced by DataFrame.to_csv with its default "," delimiter,
# so it must be parsed with "," as well. Reading it with ";" would split (or
# fail on) any review that contains an unquoted semicolon.
dataset_full = load_dataset(
    "csv",
    data_files=DATASET_PATH,
    sep=","
)["train"]

def tokenize_function(examples):
    """Tokenize one batch of raw texts, truncating each to CHUNK_SIZE tokens.

    Args:
        examples: batch mapping handed over by ``Dataset.map(batched=True)``;
            the texts are read from ``examples[TEXT_COLUMN]``.

    Returns:
        The tokenizer output for the batch, with the special-tokens mask
        included (``return_special_tokens_mask=True``).
    """
    texts = examples[TEXT_COLUMN]
    tokenizer_options = dict(
        truncation=True,
        max_length=CHUNK_SIZE,
        return_special_tokens_mask=True,
    )
    return tokenizer(texts, **tokenizer_options)


# Tokenize the whole dataset in batches, dropping the original (raw) columns
# so that only the tokenizer outputs remain.
tokenize_map_options = dict(
    batched=True,
    remove_columns=dataset_full.column_names,
    desc="Tokenizing",
)
tokenized = dataset_full.map(tokenize_function, **tokenize_map_options)

# I ran the experiment both with and without grouping, in order to make full
# use of the model's capacity (I saw an article recommending this).
# As a result, everything is duplicated so that both variants can be tested.
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Every feature in ``examples`` is flattened into one long sequence, the
    tail that does not fill a whole chunk is dropped, and the rest is cut
    into consecutive chunks of ``chunk_size`` items.

    Args:
        examples: mapping from feature name to a list of per-example lists.
            Must contain ``"input_ids"``, whose flattened length decides how
            many chunks are produced for every feature.
        chunk_size: length of each output chunk. ``None`` (the default) uses
            the module-level ``CHUNK_SIZE``.

    Returns:
        A dict with the same keys as ``examples``, each holding a list of
        chunks, plus a ``"labels"`` entry that is a shallow copy of the
        ``"input_ids"`` chunk list. If fewer than ``chunk_size`` items are
        available, every list is empty.

    Raises:
        ValueError: if the effective chunk size is not positive.
    """
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    if size <= 0:
        raise ValueError(f"chunk_size must be positive, got {size}")

    # Flatten in a single pass. sum(list_of_lists, []) builds a new list on
    # every addition, which is quadratic in the number of tokens.
    concatenated = {
        key: [item for sequence in examples[key] for item in sequence]
        for key in examples.keys()
    }

    # Round down to a whole number of chunks; the remainder is discarded.
    total_length = (len(concatenated["input_ids"]) // size) * size

    result = {
        key: [values[start : start + size] for start in range(0, total_length, size)]
        for key, values in concatenated.items()
    }

    result["labels"] = result["input_ids"].copy()
    return result

def sample_config():
    """Draw one random hyperparameter configuration for the search.

    Returns:
        dict with keys ``"lr"`` (10 raised to a uniform exponent in
        [-5.1, -4.3)), ``"mlm_prob"`` (0.15 or 0.20, with 0.15 listed twice so
        it is picked more often), ``"warmup"`` (uniform in [0.05, 0.10)) and
        ``"wd"`` (0.01 or 0.1).
    """
    # Draws happen in the same order as the dict keys, so seeded runs stay reproducible.
    lr_exponent = np.random.uniform(-5.1, -4.3)
    mlm_prob = random.choice([0.15, 0.15, 0.20])  # higher weight for 0.15
    warmup = np.random.uniform(0.05, 0.10)
    weight_decay = random.choice([0.01, 0.1])
    return dict(
        lr=10 ** lr_exponent,
        mlm_prob=mlm_prob,
        warmup=warmup,
        wd=weight_decay,
    )


N_RUNS = 50  # number of sampled configurations
N_SPLITS = 5  # number of cross-validation folds

# Pre-sample every configuration up front so both search loops see the same list.
configs = [sample_config() for _run in range(N_RUNS)]

Generating train split: 1322 examples [00:00, 87914.73 examples/s]
Tokenizing: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1322/1322 [00:00<00:00, 8811.51 examples/s] 


In [5]:
import torch

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [None]:
grouped_results = []

# Random search with k-fold cross-validation on grouped data
# (examples concatenated and re-split into CHUNK_SIZE chunks).

# Loop-invariant splitter: with a fixed random_state every split() call yields
# the same folds, so all configurations are scored on identical splits.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Request fp16 mixed precision only when a CUDA device was selected; `device`
# may be "cpu", where fp16 training is not available.
use_fp16 = device == "cuda"

for config_idx, cfg in enumerate(configs):
    print(f"\n==================================================")
    print(f"Config {config_idx+1}/{len(configs)}: LR={cfg['lr']} | MLM={cfg['mlm_prob']}")
    print(f"==================================================")
    fold_ppls = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(tokenized)):
        output_dir = Path(OUTPUT_BASE) / f"cfg_{config_idx}" / f"fold_{fold}"
        output_dir.mkdir(parents=True, exist_ok=True)

        train_sub = tokenized.select(train_idx)
        val_sub = tokenized.select(val_idx)

        # Group each side separately, after splitting, so that no chunk mixes
        # training and validation text.
        train_grouped = train_sub.map(group_texts, batched=True, desc=f"Grouping Train F{fold+1}")
        val_grouped = val_sub.map(group_texts, batched=True, desc=f"Grouping Val F{fold+1}")

        # Start every fold from the original checkpoint.
        model = AutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT).to(device)

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=True,
            mlm_probability=cfg["mlm_prob"]
        )

        training_args = TrainingArguments(
            output_dir=str(output_dir),
            overwrite_output_dir=True,
            eval_strategy="epoch",
            save_strategy="no",
            logging_steps=100,
            learning_rate=cfg["lr"],
            num_train_epochs=2,
            per_device_train_batch_size=8,
            gradient_accumulation_steps=4,
            weight_decay=cfg["wd"],
            warmup_ratio=cfg["warmup"],
            fp16=use_fp16,
            seed=SEED,
            report_to="none"
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_grouped,
            eval_dataset=val_grouped,
            data_collator=data_collator,
        )

        trainer.train()

        # Perplexity is exp(mean eval loss) on this fold's validation chunks.
        eval_metrics = trainer.evaluate()
        ppl = math.exp(eval_metrics["eval_loss"])
        fold_ppls.append(ppl)

        print(f"   ---> Fold {fold+1}/{N_SPLITS} PPL: {ppl:.2f}")

        # Release the model and datasets before the next fold to free GPU memory.
        del model, trainer, train_grouped, val_grouped
        torch.cuda.empty_cache()

    # Mean (and spread) of the perplexity across folds
    avg_ppl = np.mean(fold_ppls)
    std_ppl = np.std(fold_ppls)

    print(f"Config Result: Mean PPL = {avg_ppl:.2f} (+/- {std_ppl:.2f})")

    grouped_results.append({
        "learning_rate": cfg["lr"],
        "mlm_probability": cfg["mlm_prob"],
        "warmup": cfg["warmup"],
        "weight_decay": cfg["wd"],
        "grouped": True,
        "perplexity_mean": avg_ppl,
        "perplexity_std": std_ppl,
        "folds_ppl": fold_ppls
    })

print("\nFINAL RESULTS (Sorted by Mean Perplexity)")
# Sort by the lowest mean error (mean perplexity)
for r in sorted(grouped_results, key=lambda x: x["perplexity_mean"]):
    print(f"PPL: {r['perplexity_mean']:.2f} | LR: {r['learning_rate']} | MLM: {r['mlm_probability']}")

In [None]:
non_grouped_results = []

# Random search with k-fold cross-validation on the non-grouped data
# (one tokenized review per example).

# Build the splitter in this cell instead of relying on a `kf` left behind by
# another cell. The fixed random_state makes every split() call yield the same folds.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Request fp16 mixed precision only when a CUDA device was selected; `device`
# may be "cpu", where fp16 training is not available.
use_fp16 = device == "cuda"

for config_idx, cfg in enumerate(configs):
    print(f"\n==================================================")
    print(f"Config {config_idx+1}/{len(configs)}: LR={cfg['lr']} | MLM={cfg['mlm_prob']}")
    print(f"==================================================")

    fold_ppls = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(tokenized)):

        output_dir = Path(OUTPUT_BASE) / f"cfg_{config_idx}" / f"fold_{fold}"
        output_dir.mkdir(parents=True, exist_ok=True)

        print(f"\n🚀 Running Fold {fold+1}/{N_SPLITS}")

        train_sub = tokenized.select(train_idx)
        val_sub = tokenized.select(val_idx)

        # Start every fold from the original checkpoint.
        model = AutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT).to(device)

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=True,
            mlm_probability=cfg["mlm_prob"]
        )

        training_args = TrainingArguments(
            output_dir=str(output_dir),
            overwrite_output_dir=True,
            eval_strategy="epoch",
            logging_steps=100,
            save_strategy="no",
            learning_rate=cfg["lr"],
            num_train_epochs=2,  # short training to keep the search fast
            per_device_train_batch_size=8,
            gradient_accumulation_steps=4,
            weight_decay=cfg["wd"],
            warmup_ratio=cfg["warmup"],
            fp16=use_fp16,
            seed=SEED,
            report_to="none"
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_sub,
            eval_dataset=val_sub,
            data_collator=data_collator,
        )

        trainer.train()

        # Perplexity is exp(mean eval loss) on this fold's validation examples.
        eval_results = trainer.evaluate()
        ppl = math.exp(eval_results["eval_loss"])
        fold_ppls.append(ppl)

        print(f"Fold {fold+1} Perplexity: {ppl:.2f}")

        # Release the model and datasets before the next fold to free GPU memory.
        del model, trainer, train_sub, val_sub
        torch.cuda.empty_cache()

    # Mean (and spread) of the perplexity across folds
    avg_ppl = np.mean(fold_ppls)
    std_ppl = np.std(fold_ppls)

    print(f"📊 Config Result: Mean PPL = {avg_ppl:.2f} (+/- {std_ppl:.2f})")

    non_grouped_results.append({
        "learning_rate": cfg["lr"],
        "mlm_probability": cfg["mlm_prob"],
        "warmup": cfg["warmup"],
        "weight_decay": cfg["wd"],
        "grouped": False,
        "perplexity_mean": avg_ppl,
        "perplexity_std": std_ppl,
        "folds_ppl": fold_ppls
    })


print("\nFINAL RESULTS (Sorted by Mean Perplexity)")
for r in sorted(non_grouped_results, key=lambda x: x["perplexity_mean"]):
    print(f"PPL: {r['perplexity_mean']:.2f} | LR: {r['learning_rate']} | MLM: {r['mlm_probability']}")

In [None]:
# Write the grouped search results to a text report, best (lowest mean perplexity) first.
results_file = Path(OUTPUT_BASE) / "dapt_results_grouped.txt"

# open(..., "w") does not create missing folders, so make sure OUTPUT_BASE exists.
results_file.parent.mkdir(parents=True, exist_ok=True)

with open(results_file, "w", encoding="utf-8") as f:
    # Report header
    f.write("Domain-Adaptive Pretraining Results (MLM)\n")
    f.write(f"Date: {datetime.now()}\n")
    f.write("=" * 50 + "\n\n")

    # One tab-separated line per configuration
    for r in sorted(grouped_results, key=lambda x: x["perplexity_mean"]):
        f.write(
            f"PPL: {r['perplexity_mean']:.4f}\t"
            f"LR: {r['learning_rate']}\t"
            f"MLM_PROB: {r['mlm_probability']}\t"
            f"WARMUP: {r['warmup']}\t"
            f"WD: {r['weight_decay']}\n"
        )

print(f"\nResults saved to: {results_file}")


In [None]:
# Write the non-grouped search results to a text report, best (lowest mean perplexity) first.
results_file = Path(OUTPUT_BASE) / "dapt_results_non_grouped.txt"

# open(..., "w") does not create missing folders, so make sure OUTPUT_BASE exists.
results_file.parent.mkdir(parents=True, exist_ok=True)

with open(results_file, "w", encoding="utf-8") as f:
    # Report header
    f.write("Domain-Adaptive Pretraining Results (MLM)\n")
    f.write(f"Date: {datetime.now()}\n")
    f.write("=" * 50 + "\n\n")

    # One tab-separated line per configuration
    for r in sorted(non_grouped_results, key=lambda x: x["perplexity_mean"]):
        f.write(
            f"PPL: {r['perplexity_mean']:.4f}\t"
            f"LR: {r['learning_rate']}\t"
            f"MLM_PROB: {r['mlm_probability']}\t"
            f"WARMUP: {r['warmup']}\t"
            f"WD: {r['weight_decay']}\n"
        )

print(f"\nResults saved to: {results_file}")

In [10]:
# Hyperparameters used by the two final training runs below.
final_config = dict(
    learning_rate=4.704415004453045e-05,
    mlm_probability=0.15,
    warmup=0.08,
    weight_decay=0.1,
)

In [None]:
# Final training on the non-grouped data
FINAL_OUTPUT = "./bertimbau_dapt_final_nongrouped"

# Start from the original checkpoint.
model = AutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT)

# Masking collator using the selected masking probability.
collator_options = dict(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=final_config["mlm_probability"],
)
data_collator = DataCollatorForLanguageModeling(**collator_options)

# Learning rate, weight decay and warmup come from final_config; the rest is fixed.
final_args_options = dict(
    output_dir=FINAL_OUTPUT,
    overwrite_output_dir=True,
    num_train_epochs=4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=final_config["learning_rate"],
    weight_decay=final_config["weight_decay"],
    warmup_ratio=final_config["warmup"],
    fp16=False,
    save_strategy="epoch",
    logging_steps=250,
    report_to="none",
)
training_args = TrainingArguments(**final_args_options)

# No eval_dataset: the whole tokenized set is used for training
# (train+val can be combined here).
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized,
)

trainer.train()

# Save the adapted model together with its tokenizer in the same folder.
trainer.save_model(FINAL_OUTPUT)
tokenizer.save_pretrained(FINAL_OUTPUT)

# Drop references and clear the CUDA cache before the next run.
del model, trainer
torch.cuda.empty_cache()

{'loss': 1.335, 'grad_norm': 5.9935526847839355, 'learning_rate': 1.2802178536708286e-05, 'epoch': 3.012084592145015}
{'train_runtime': 345.6173, 'train_samples_per_second': 15.3, 'train_steps_per_second': 0.961, 'train_loss': 1.2970994696559677, 'epoch': 4.0}


In [None]:
# Final training on the grouped data
FINAL_OUTPUT = "./bertimbau_dapt_final_grouped"

# Start from the original checkpoint.
model = AutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT)

# Concatenate the full tokenized dataset and re-split it into fixed-size chunks.
grouping_options = dict(batched=True, desc="Grouping Full Dataset")
grouped_dataset = tokenized.map(group_texts, **grouping_options)

# Masking collator using the selected masking probability.
collator_options = dict(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=final_config["mlm_probability"],
)
data_collator = DataCollatorForLanguageModeling(**collator_options)

# Learning rate, weight decay and warmup come from final_config; the rest is fixed.
final_args_options = dict(
    output_dir=FINAL_OUTPUT,
    overwrite_output_dir=True,
    num_train_epochs=4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=final_config["learning_rate"],
    weight_decay=final_config["weight_decay"],
    warmup_ratio=final_config["warmup"],
    fp16=False,
    save_strategy="epoch",
    logging_steps=250,
    report_to="none",
)
training_args = TrainingArguments(**final_args_options)

# No eval_dataset: the whole grouped set is used for training
# (train+val can be combined here).
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=grouped_dataset,
)

trainer.train()

# Save the adapted model together with its tokenizer in the same folder.
trainer.save_model(FINAL_OUTPUT)
tokenizer.save_pretrained(FINAL_OUTPUT)

# Drop references and clear the CUDA cache.
del model, trainer
torch.cuda.empty_cache()

Grouping Full Dataset: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1322/1322 [00:00<00:00, 2221.12 examples/s]


{'train_runtime': 245.3238, 'train_samples_per_second': 4.207, 'train_steps_per_second': 0.277, 'train_loss': 1.5627224866081686, 'epoch': 4.0}
