# Fine-Tuning Practice

In [2]:
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM

logger = logging.getLogger(__name__)
global_config = None

  from .autonotebook import tqdm as notebook_tqdm


## **Set up the model, training config, and tokenizer**

### Tokenize Data

In [23]:
def load_tokenize_and_split_data(training_config, tokenizer, test_size=0.2, random_state=42):
    dataset_path = training_config["dataset"]["path"]
    max_length = training_config["model"]["max_input_length"]
    model_input_key = training_config["dataset"]["example_input_key"]     # ej. "question"
    model_output_key = training_config["dataset"]["example_output_key"]   # ej. "answer"

    data = []
    with jsonlines.open(dataset_path) as reader:
        for obj in reader:
            input_text = obj[model_input_key]
            output_text = obj[model_output_key]
            data.append({model_input_key: input_text, model_output_key: output_text})

    train_data, test_data = train_test_split(data, test_size=test_size, random_state=random_state)

    tensor_format = training_config["dataset"]["tensor_format"]
    
    def tokenize_pair(example):
        full_text = example[model_input_key] + tokenizer.eos_token + example[model_output_key] + tokenizer.eos_token
        encoding = tokenizer(full_text, truncation=True, max_length=max_length, padding="max_length", return_tensors=tensor_format)
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        prompt_encoding = tokenizer(example[model_input_key] + tokenizer.eos_token, truncation=True, max_length=max_length, padding=False, return_tensors=tensor_format)
        prompt_length = prompt_encoding["input_ids"].size(1)

        labels = input_ids.clone()
        labels[:prompt_length] = -100

        return {
            model_input_key: example[model_input_key],
            model_output_key: example[model_output_key],
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

    train_dataset = [tokenize_pair(ex) for ex in train_data]
    test_dataset = [tokenize_pair(ex) for ex in test_data]

    return train_dataset, test_dataset, data


In [24]:
dataset_name = "TriviaQA.jsonl"
dataset_path = f"./data/{dataset_name}"

In [102]:
model_name ="EleutherAI/pythia-70m"

training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_input_length" : 50,
        "max_output_length" : 50,
    },
    "dataset": {
        "path": dataset_path,
        "example_input_key": "question",
        "example_output_key": "answer",
        "tensor_format": "pt"  # "pt" for PyTorch tensors, "tf" for TensorFlow tensors
    },
    "training": {
        "learning_rate":               5e-5,
        "num_train_epochs":            100,
        "per_device_train_batch_size": 10,
        "max_steps":                   -1,
        "gradient_accumulation_steps": 2,
        "eval_steps":                  10,
        "save_steps":                  20,
        "checkpoints_output_dir":      None,  # Will be set later based on max_steps
        "output_dir":                  None  # Will be set later based on max_steps"
    },
    "verbose": True
}

trained_model_name = f"finetuned_model_{training_config["training"]["max_steps"]}_steps"
training_config["training"]["output_dir"] = trained_model_name
training_config["training"]["checkpoints_output_dir"] = os.path.join(trained_model_name, "checkpoints")


tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
train_dataset, test_dataset, raw_dataset = load_tokenize_and_split_data(training_config, tokenizer)

## **Load the base model**

In [103]:
base_model = AutoModelForCausalLM.from_pretrained(model_name)

In [104]:
## Select device
logger.debug("Checking available devices for training...")
device_count = torch.cuda.device_count()
if device_count > 0:
    logger.debug("Select GPU device")
    device = torch.device("cuda")
else:
    logger.debug("Select CPU device")
    device = torch.device("cpu")

In [105]:
# Move model to the selected device
print(f"Moving model to device: {device}")
base_model.to(device)

Moving model to device: cuda


GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise

### Function for inference

In [106]:
def inference(text, model, tokenizer, training_config):
  # Tokenize
  encoding = tokenizer(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=training_config["model"]["max_input_length"],
          padding="max_length"
  )
  input_ids = encoding["input_ids"]
  attention_mask = encoding["attention_mask"]
  
  # Move to same device as model
  input_ids = input_ids.to(model.device)
  attention_mask = attention_mask.to(model.device)

  # Generate
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=training_config["model"]["max_input_length"] + training_config["model"]["max_output_length"],
    pad_token_id=model.config.pad_token_id
  )

  # Decode
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt
  generated_text_answer = generated_text_with_prompt[0][len(text):]

  return generated_text_answer

## **Try the base model**

In [107]:
test_index = 1

test_text = test_dataset[test_index]['question']
print("Question input (test):", test_text, "\n")

print("Correct answer from Dataset: ")
print(test_dataset[test_index]['answer'], "\n")

print("Model's answer:")
print(inference(test_text, base_model, tokenizer, training_config))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): What is the boiling point of water in Celsius? 

Correct answer from Dataset: 
100 degrees Celsius 

Model's answer:
Q.

A:

The answer is that the boiling point of water in Celsius is not the same as the boiling point of water in Celsius.  The boiling point of water in Celsius is the same as


## **Setup training**

### Calculate the approximate number of FLOPs
Calculates the approximate number of floating-point operations per second (FLOPs) required for a single forward pass of the model, adjusted for gradient accumulation.

The FLOPs are multiplied by the number of gradient accumulation steps. This accounts for the fact that during training, gradients are accumulated over multiple steps before performing a weight update, effectively increasing the computational cost.

This calculation provides an estimate of the computational cost of training the model, considering both the model's architecture and the gradient accumulation strategy. It helps in understanding the resource requirements for training.

In [108]:
model_flops = (
  base_model.floating_point_ops(
    {
       "input_ids": torch.zeros((1, training_config["model"]["max_input_length"]))
    }
  )
  * training_config["training"]["gradient_accumulation_steps"]
)

print(base_model)
# Memoria que ocupa el modelo en tiempo de inferencia.
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
# FLOPS que realiza el modelo para procesar una muestra (batch) de entrada, teniendo en cuenta también la acumulación de gradientes.
print("Flops", model_flops / 1e9, "GFLOPs")

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise

In [109]:
"""
En reemplazo de Trainer, se puede usar un bucle de entrenamiento personalizado con torch.
Se opta por esta última opción ya que no se logró importar el Trainer de transformers con la versión de PyTorch y Accelerate utilizadas.
"""
import os
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import Adafactor, AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import torch

def train_loop(
    model,
    tokenizer,
    train_dataset,
    eval_dataset,
    training_config,
    device=None,
    collate_fn=None
):
    # Device setup
    device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if training_config["verbose"]:
        print(f"Using device: {device}")
    model = model.to(device)
    model.train()

    # Extract config
    tconf = training_config["training"]
    lr = tconf["learning_rate"]
    epochs = tconf["num_train_epochs"]
    max_steps = tconf["max_steps"]
    batch_size = tconf["per_device_train_batch_size"]
    grad_acc_steps = tconf["gradient_accumulation_steps"]
    eval_steps = tconf["eval_steps"]
    save_steps = tconf["save_steps"]
    ckpt_output_dir = tconf["checkpoints_output_dir"]

    # DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    eval_loader = DataLoader(eval_dataset, batch_size=batch_size, collate_fn=collate_fn)

    # Optimizer
    optimizer = Adafactor(
        model.parameters(),
        lr=lr,
        scale_parameter=False,
        relative_step=False,
        warmup_init=False,
    )

    # Training loop
    step = 0
    best_eval_loss = float("inf")
    os.makedirs(ckpt_output_dir, exist_ok=True)

    for epoch in range(1, epochs+1):
        epoch_bar = tqdm(train_loader, desc=f"Epoch {epoch}", leave=False)
        for batch in epoch_bar:
            # Usar los tensores ya tokenizados
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward + backward
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss / grad_acc_steps
            loss.backward()

            if (step + 1) % grad_acc_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                step += 1
                epoch_bar.set_postfix({"step": step, "loss": loss.item()})

                # Evaluation
                if step % eval_steps == 0:
                    model.eval()
                    total_eval_loss = 0.0
                    n_eval = 0
                    with torch.no_grad():
                        for eval_batch in eval_loader:
                            eval_input_ids = eval_batch["input_ids"].to(device)
                            eval_attention_mask = eval_batch["attention_mask"].to(device)
                            eval_labels = eval_batch["labels"].to(device)

                            out = model(
                                input_ids=eval_input_ids,
                                attention_mask=eval_attention_mask,
                                labels=eval_labels
                            )
                            total_eval_loss += out.loss.item()
                            n_eval += 1
                    avg_eval_loss = total_eval_loss / max(1, n_eval)
                    model.train()

                    # Save best checkpoint
                    if avg_eval_loss < best_eval_loss:
                        best_eval_loss = avg_eval_loss
                        model.save_pretrained(os.path.join(ckpt_output_dir, "best"))
                        tokenizer.save_pretrained(os.path.join(ckpt_output_dir, "best"))

                # Save periodic checkpoint
                if step % save_steps == 0:
                    ckpt_dir = os.path.join(ckpt_output_dir, f"step_{step}")
                    model.save_pretrained(ckpt_dir)
                    tokenizer.save_pretrained(ckpt_dir)

                if step >= max_steps:
                    break
        if step >= max_steps:
            break

    return model, tokenizer


def custom_collate_fn(batch):
    """
    Used in PyTorch data loading to combine a list of samples (a batch) into a single batch dictionary.
    """
    batch_out = {}
    for key in batch[0]:
        if isinstance(batch[0][key], torch.Tensor):
            # Asegurar que todos los tensores tengan la misma forma
            batch_out[key] = torch.stack([item[key] for item in batch])
        else:
            batch_out[key] = [item[key] for item in batch]
    return batch_out

## **TRAIN THE MODEL !!**

In [110]:
model, tokenizer = train_loop(
    model=base_model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    training_config=training_config,
    device=device,
    collate_fn=custom_collate_fn
)

model.save_pretrained(os.path.join(training_config["training"]["output_dir"], "final"))
tokenizer.save_pretrained(os.path.join(training_config["training"]["output_dir"], "final"))

Using device: cuda


                                                        

('finetuned_model_-1_steps\\final\\tokenizer_config.json',
 'finetuned_model_-1_steps\\final\\special_tokens_map.json',
 'finetuned_model_-1_steps\\final\\tokenizer.json')

### Save model locally

In [99]:
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(os.path.join(training_config["training"]["output_dir"], "final"), local_files_only=True)
finetuned_slightly_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise

### Run trained model

In [100]:
index = 1

test_question = test_dataset[index][training_config["dataset"]["example_input_key"]]
print("Question input (test):", test_question)

print("Correct answer from Dataset: ")
print(test_dataset[index][training_config["dataset"]["example_output_key"]], "\n")

print("Finetuned slightly model's answer: ")
print(inference(test_question, finetuned_slightly_model, tokenizer, training_config))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): What is the boiling point of water in Celsius?
Correct answer from Dataset: 
100 degrees Celsius 

Finetuned slightly model's answer: 
Q.

A:

The answer is that the boiling point of water in Celsius is not the same as the boiling point of water in Celsius.  The boiling point of water in Celsius is the same as


In [101]:
index = 1

test_question = test_dataset[index][training_config["dataset"]["example_input_key"]]
test_question = "What is the capital of France?"
print("Question input (test):", test_question)

print("Correct answer from Dataset: ")
print(test_dataset[index][training_config["dataset"]["example_output_key"]], "\n")

print("Finetuned model's answer: ")
print(inference(test_question, finetuned_slightly_model, tokenizer, training_config))

print("Base model's answer: ")
print(inference(test_question, base_model, tokenizer, training_config))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): What is the capital of France?
Correct answer from Dataset: 
100 degrees Celsius 

Finetuned model's answer: 


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Q: What is the capital of the world?Q: What is the capital of the world?Q: What is the capital of the world?Q: What is the capital of the world?Q: What is the capital of the world?
Base model's answer: 
Q: What is the capital of the world?Q: What is the capital of the world?Q: What is the capital of the world?Q: What is the capital of the world?Q: What is the capital of the world?
