# Introdução

Este notebook apresenta um guia inicial de uso da biblioteca PEFT para realizar o fine-tuning do modelo M2M100. Neste exemplo, demonstro como partir de um modelo de tradução genérico e adicionar camadas de pesos treináveis (LoRA) para obter um modelo adaptado ao domínio de software.

## Instalando bibliotecas

Instale as dependências listadas no arquivo requirements.txt do [repositório no GitHub](https://github.com/danielhsf/peft)

In [None]:
# Install the dependencies listed in requirements.txt (shell command run from the notebook).
!pip install -r requirements.txt

In [None]:
import os
import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

## Logando no Hugging Face

In [None]:
# Authenticate against the Hugging Face Hub from the CLI; the training
# arguments further down set push_to_hub=True.
!huggingface-cli login

## Importando bibliotecas

In [None]:
import os
import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

## Configuração base

In [None]:
# Pretrained checkpoint to adapt and the parallel corpus used for fine-tuning.
MODEL_NAME = "facebook/m2m100_418M"
DATASET_NAME = "kde4"

# Each language is named twice: once with the code used to select the dataset
# pair, once with the code handed to the M2M100 tokenizer.
SOURCE_LANG, SOURCE_LANG_M2M = "en", "en"     # English
TARGET_LANG, TARGET_LANG_M2M = "pt_BR", "pt"  # Portuguese

## Configurando o LoRA

In [None]:
# --- LoRA hyperparameters (consumed by LoraConfig later in the notebook) ---
LORA_R = 16          # rank
LORA_ALPHA = 32      # alpha
LORA_DROPOUT = 0.05

# Names of the sub-modules that receive LoRA adapters. The attention blocks
# expose `q_proj`, `k_proj`, `v_proj` and `out_proj` projections (check
# model.named_modules() to confirm for your checkpoint). Only the query and
# value projections are adapted here.
# Candidates to experiment with if resources allow: "k_proj", "out_proj",
# or the feed-forward layers "fc1" / "fc2" (less common for LoRA in NMT).
LORA_TARGET_MODULES = ["q_proj", "v_proj"]

## Argumentos para treinamento

In [None]:
# --- Training hyperparameters ---
# Directory that receives checkpoints, logs and the final adapter.
OUTPUT_DIR = f"./results_m2m100_lora_kde4_{SOURCE_LANG}_{TARGET_LANG}"

# Optimisation settings.
BATCH_SIZE = 4          # per device; lower it if GPU memory is tight
NUM_TRAIN_EPOCHS = 1    # raise for longer training
LEARNING_RATE = 2e-4    # LoRA is commonly trained with a higher learning rate
WEIGHT_DECAY = 0.01

# Step intervals for logging, evaluation and checkpointing.
LOGGING_STEPS = 50
EVAL_STEPS = SAVE_STEPS = 200

# Token limits applied to source and target sentences during tokenization.
MAX_SOURCE_LENGTH = MAX_TARGET_LENGTH = 128

# Half precision is enabled whenever a CUDA device is available.
FP16 = torch.cuda.is_available()

## Argumento de reprodutibilidade

In [None]:
# Fix the random seed so that repeated runs of the notebook are comparable.
SEED = 42

torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Seed every visible CUDA device as well.
    torch.cuda.manual_seed_all(SEED)

## Carregando o dataset

In [None]:
# --- 1. Load Dataset ---
# Caps on the number of rows kept per split, so the demo runs quickly.
# Set either value to None to use the complete split.
MAX_TRAIN_SAMPLES = 2000
MAX_EVAL_SAMPLES = 200

print("Loading dataset...")
try:
    # KDE4 is selected by language pair through lang1/lang2.
    dataset = load_dataset(DATASET_NAME, lang1=SOURCE_LANG, lang2=TARGET_LANG)
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Please ensure you have an internet connection and the dataset name/parameters are correct.")
    # Re-raise rather than exit(): in a notebook exit() shuts the kernel down
    # and discards the traceback needed to diagnose the failure.
    raise

# The rest of the notebook derives both splits from 'train'.
if 'train' not in dataset:
    raise ValueError(
        f"'train' split not found in dataset. Available splits: {list(dataset.keys())}"
    )

# Hold out 10% of 'train' for validation (seeded so the split is repeatable).
train_test_split = dataset['train'].train_test_split(test_size=0.1, seed=SEED)


def _take_first(split, limit):
    """Return the first ``limit`` rows of ``split``, or all of it when ``limit`` is None."""
    if limit is None:
        return split
    return split.select(range(min(limit, len(split))))


processed_dataset = DatasetDict({
    'train': _take_first(train_test_split['train'], MAX_TRAIN_SAMPLES),
    'validation': _take_first(train_test_split['test'], MAX_EVAL_SAMPLES),
})

print(f"Dataset loaded and split. Train size: {len(processed_dataset['train'])}, Validation size: {len(processed_dataset['validation'])}")


## Carregando Tokenizer e Modelo

In [None]:
# --- 2. Load Tokenizer and Model ---
print("Loading tokenizer and model...")

# The tokenizer is created with the M2M100 source/target language codes.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    src_lang=SOURCE_LANG_M2M,
    tgt_lang=TARGET_LANG_M2M,
)

# Unmodified pretrained checkpoint; the LoRA adapters are attached to it later.
base_model_for_lora = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

## Pré-processamento

In [None]:
# --- 3. Preprocessing Function ---
def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: Mapping for one batch. It is expected to hold a
            ``"translation"`` entry whose items are dicts keyed by
            ``SOURCE_LANG`` and ``TARGET_LANG``.

    Returns:
        The tokenizer output for the source sentences with an added
        ``"labels"`` entry holding the target token ids, where padding
        positions are replaced by -100. When the batch has no (or an empty)
        ``"translation"`` entry, a dict with empty ``input_ids``,
        ``attention_mask`` and ``labels`` lists is returned instead.
    """
    translations = examples.get('translation')
    if not translations:
        print("Warning: 'translation' key not found in examples. Skipping this batch.")
        return {'input_ids': [], 'attention_mask': [], 'labels': []}

    inputs = [pair[SOURCE_LANG] for pair in translations]
    targets = [pair[TARGET_LANG] for pair in translations]

    # Source side: tokenized with the source language set on the tokenizer.
    tokenizer.src_lang = SOURCE_LANG_M2M
    model_inputs = tokenizer(inputs, max_length=MAX_SOURCE_LENGTH, truncation=True, padding="max_length")

    # Target side: `text_target=` is the supported replacement for the
    # deprecated `as_target_tokenizer()` context manager.
    tokenizer.tgt_lang = TARGET_LANG_M2M
    labels = tokenizer(text_target=targets, max_length=MAX_TARGET_LENGTH, truncation=True, padding="max_length")

    # Padding positions become -100 so the loss function ignores them.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

In [None]:
print("Preprocessing dataset...")
# Tokenize both splits batch-wise and drop the original columns so only the
# fields returned by preprocess_function remain.
tokenized_datasets = processed_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=processed_dataset["train"].column_names,
)
print("Dataset preprocessed.")

## Configurando LoRA

In [None]:
# --- 4. Configure LoRA ---
print("Configuring LoRA...")

# Adapter settings for a sequence-to-sequence LM; bias terms are left out.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=LORA_TARGET_MODULES,
    bias="none",
)

# For k-bit training, wrap the base model first:
#   model = prepare_model_for_kbit_training(base_model_for_lora)
#   model = get_peft_model(model, lora_config)
# Here LoRA is applied directly to the base model.
model = get_peft_model(base_model_for_lora, lora_config)
model.print_trainable_parameters()

## Definindo argumentos de treino

In [None]:
# --- 5. Training Arguments and Trainer ---
print("Setting up training arguments and trainer...")

# M2M100 selects the output language through the first generated token, so
# the target language id is forced as BOS during generation.
forced_bos_token_id = tokenizer.lang_code_to_id[TARGET_LANG_M2M]

# Reuse the model's own GenerationConfig object (not a dict) and record the
# forced BOS token on it. Other generation defaults (e.g. max_length,
# num_beams) could be set here too; the notebook sets them through
# Seq2SeqTrainingArguments below instead.
generation_config = model.generation_config
generation_config.forced_bos_token_id = forced_bos_token_id

training_args = Seq2SeqTrainingArguments(
    # Output location and Hub upload.
    output_dir=OUTPUT_DIR,
    push_to_hub=True,
    hub_model_id=f"danhsf/m2m100-lora-kde4-1epoch-{SOURCE_LANG}-{TARGET_LANG}",
    report_to="tensorboard",
    # Optimisation.
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    fp16=FP16,
    # Generation during evaluation.
    predict_with_generate=True,
    generation_config=generation_config,
    generation_max_length=MAX_TARGET_LENGTH,
    generation_num_beams=4,
    # Logging, evaluation and checkpoint cadence.
    logging_steps=LOGGING_STEPS,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    # Keep the checkpoint with the lowest evaluation loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Collator for seq2seq batches; label padding uses -100, and batches are
# padded to a multiple of 8 only when FP16 is on.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8 if FP16 else None,
)

# Optional: Compute Metrics (e.g., BLEU)
# You would need to install sacrebleu: pip install sacrebleu
import numpy as np
import sacrebleu
def compute_metrics(eval_preds):
    """Compute corpus-level BLEU between generated and reference translations.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. The
            predictions may arrive as a tuple, in which case its first
            element is used.

    Returns:
        ``{"bleu": score}`` where ``score`` is the sacreBLEU corpus score.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and cannot be decoded; map it back to the
    # pad token. NOTE(review): predictions are treated the same way in case
    # the trainer pads them with -100 — a no-op when it does not.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # sacrebleu.corpus_bleu takes a list of reference *streams*, each aligned
    # sentence-by-sentence with the hypotheses. With one reference per
    # sentence that is a single stream, i.e. [decoded_labels] — not one
    # single-item list per sentence.
    result = sacrebleu.corpus_bleu(decoded_preds, [decoded_labels])
    return {"bleu": result.score}

# Bundle the model, data, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # BLEU reported at every evaluation
)

## Treino

In [None]:
# --- 6. Train ---
print("Starting training...")
try:
    trainer.train()
except Exception as e:
    print(f"Error during training: {e}")
    if "CUDA out of memory" in str(e):
        print("CUDA out of memory. Try reducing BATCH_SIZE, MAX_SOURCE_LENGTH, MAX_TARGET_LENGTH, or LORA_R.")
        print("Using gradient_accumulation_steps in TrainingArguments can also help.")
    # Re-raise rather than exit(): in a notebook exit() shuts the kernel down,
    # losing the traceback and every object already loaded in memory.
    raise

print("Training finished.")

# --- 7. Save LoRA Adapter ---
print("Saving LoRA adapter...")
lora_adapter_path = os.path.join(OUTPUT_DIR, "final_lora_adapter")
model.save_pretrained(lora_adapter_path)
# The tokenizer is stored next to the adapter so both load from one path.
tokenizer.save_pretrained(lora_adapter_path)
print(f"LoRA adapter saved to {lora_adapter_path}")

# --- 8. Inference Example (Optional) ---
print("\n--- Inference Example ---")
from peft import PeftModel


def _generate_translation(model, tok, encoded, **generate_kwargs):
    """Generate from ``encoded`` inputs and return the first decoded string.

    The target language is selected by forcing the TARGET_LANG_M2M language
    id as the first generated token; output is capped at MAX_TARGET_LENGTH.
    Extra keyword arguments are forwarded to ``model.generate``.
    """
    generated = model.generate(
        **encoded,
        forced_bos_token_id=tok.lang_code_to_id[TARGET_LANG_M2M],
        max_length=MAX_TARGET_LENGTH,
        **generate_kwargs,
    )
    return tok.batch_decode(generated, skip_special_tokens=True)[0]


# Load the base model
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# Load the LoRA adapter on top of it
loaded_model = PeftModel.from_pretrained(base_model, lora_adapter_path)
loaded_tokenizer = AutoTokenizer.from_pretrained(lora_adapter_path)  # tokenizer saved with the adapter

# Ensure the model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model = loaded_model.to(device)
loaded_model.eval()  # evaluation mode

test_sentence_en = "The user can click the button to save the file."
print(f"Source ({SOURCE_LANG}): {test_sentence_en}")

# Tokenize for M2M100 with the source language set explicitly
loaded_tokenizer.src_lang = SOURCE_LANG_M2M
inputs = loaded_tokenizer(test_sentence_en, return_tensors="pt", truncation=True, max_length=MAX_SOURCE_LENGTH).to(device)

# Fine-tuned model: beam search with 5 beams and early stopping
translation_pt = _generate_translation(
    loaded_model, loaded_tokenizer, inputs, num_beams=5, early_stopping=True
)
print(f"Translated ({TARGET_LANG}): {translation_pt}")

print("\n--- Example with original M2M100 (for comparison) ---")
original_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
original_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, src_lang=SOURCE_LANG_M2M, tgt_lang=TARGET_LANG_M2M)

original_inputs = original_tokenizer(test_sentence_en, return_tensors="pt").to(device)
# Baseline: no extra generation arguments
original_translation_pt = _generate_translation(original_model, original_tokenizer, original_inputs)
print(f"Original M2M100 Translated ({TARGET_LANG}): {original_translation_pt}")

print("\nScript finished successfully!")

In [None]:
# --- 8. Inference Example (Optional) ---
print("\n--- Inference Example ---")
from peft import PeftModel

# Fine-tuned model: a fresh base checkpoint with the saved LoRA adapter on top,
# plus the tokenizer that was stored alongside the adapter.
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
loaded_model = PeftModel.from_pretrained(base_model, lora_adapter_path)
loaded_tokenizer = AutoTokenizer.from_pretrained(lora_adapter_path)

# Run on the GPU when there is one, in evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model = loaded_model.to(device)
loaded_model.eval()

# Single-word input this time.
test_sentence_en = "home"
print(f"Source ({SOURCE_LANG}): {test_sentence_en}")

# Encode the sentence with the source language set on the tokenizer.
loaded_tokenizer.src_lang = SOURCE_LANG_M2M
inputs = loaded_tokenizer(
    test_sentence_en,
    return_tensors="pt",
    truncation=True,
    max_length=MAX_SOURCE_LENGTH,
).to(device)

# The target language is chosen by forcing its language id as first token.
generated_tokens = loaded_model.generate(
    **inputs,
    forced_bos_token_id=loaded_tokenizer.lang_code_to_id[TARGET_LANG_M2M],
    max_length=MAX_TARGET_LENGTH,
    num_beams=5,
    early_stopping=True,
)

# Turn the generated ids back into text.
translation_fr = loaded_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
print(f"Translated ({TARGET_LANG}): {translation_fr}")

print("\n--- Example with original M2M100 (for comparison) ---")
# Baseline: the untouched pretrained checkpoint and its own tokenizer.
original_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
original_tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    src_lang=SOURCE_LANG_M2M,
    tgt_lang=TARGET_LANG_M2M,
)

original_inputs = original_tokenizer(test_sentence_en, return_tensors="pt").to(device)
original_generated_tokens = original_model.generate(
    **original_inputs,
    forced_bos_token_id=original_tokenizer.lang_code_to_id[TARGET_LANG_M2M],
    max_length=MAX_TARGET_LENGTH,
)
original_translation_fr = original_tokenizer.batch_decode(original_generated_tokens, skip_special_tokens=True)[0]
print(f"Original M2M100 Translated ({TARGET_LANG}): {original_translation_fr}")

print("\nScript finished successfully!")