In [2]:
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from trl import SFTTrainer, SFTConfig
from evaluate import load
import time
# from utils import 
from utils import load_datasets

In [3]:
# Hyperparameters for the fine-tuning run, grouped by concern.
# NOTE(review): several values below are not read by the cells visible in this
# notebook chunk (see the per-section notes); confirm which ones are still needed.

# 1. Quantization configuration hyperparameters
# NOTE(review): the model-loading cell passes load_in_4bit=True and does not
# read `load_in_8bit` or the `llm_int8_*` settings — confirm which is intended.
load_in_8bit = True
llm_int8_threshold = 6.0
llm_int8_skip_modules = None
quant_type = "nf4"

# 2. LoRA configuration hyperparameters
# `r`, `scaling_factor`, `lora_dropout` and `bias` are passed to LoraConfig
# (as r, lora_alpha, lora_dropout and bias respectively).
r = 16               # Try 8
scaling_factor = 16  # Try 32
lora_dropout = 0.05  # Try 0.1
bias = "none"
task_type = "CAUSAL_LM"

# 3. Training hyperparameters
# NOTE(review): the TrainingArguments cell visible in this chunk hard-codes its
# own values (e.g. learning_rate=1e-4, max_steps=100) and does not reference
# these names — verify whether they are used elsewhere.
train_batch_size = 8
eval_batch_size = 8
num_train_epochs = 1
logging_steps = 1000
save_steps = 1000
save_total_limit = 1
output_dir = "output"
overwrite_output_dir = True
per_device_train_batch_size = 4 # Try 8
per_device_eval_batch_size = 8
warmup_steps = 0
weight_decay = 0.01
learning_rate = 5e-5
adam_epsilon = 1e-8
max_grad_norm = 1.0
seed = 42

# 4. Evaluation hyperparameters
# NOTE(review): not referenced by the cells visible in this chunk.
eval_steps = 1000
eval_logging_steps = 1000
eval_output_dir = "eval_output"
eval_overwrite_output_dir = True
eval_per_device_eval_batch_size = 8

# 5. Model configuration hyperparameters
# Checkpoint name used to load both the tokenizer and the model.
model_name = "mistralai/Mistral-7B-v0.3"

# 6. Device configuration
# Use CUDA when torch reports it as available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#######################################################################################


In [4]:
# Load the tokenizer by the same checkpoint name (`model_name`) that is used
# for the model, so both refer to the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="right",
    use_fast=True,
    add_eos_token=True,
)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Quantization settings for loading the base model in 4-bit precision.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Load the weights in 4-bit precision
    bnb_4bit_quant_type=quant_type,         # Quantization data type from the config cell ("nf4")
    bnb_4bit_compute_dtype=torch.bfloat16,  # Dtype used for computation
    bnb_4bit_use_double_quant=True,         # Nested quantization of the quantization constants (saves memory)
)

# Load the quantized base model and let `device_map="auto"` decide placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# Prepare the quantized model for k-bit training before adapters are attached.
model = prepare_model_for_kbit_training(model)

# Keep the model's padding id in sync with the tokenizer's pad token.
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Step 3: Load datasets
# `load_datasets` comes from the project-local `utils` module. The saved cell
# output reports LIMA and OASST1 being downloaded and saved under ./data/lima
# and ./data/oasst1.
# NOTE(review): the order of the six returned splits is taken on trust from
# this unpacking — verify against `utils.load_datasets`.
data_path = "./data"  # Path where datasets are stored or will be downloaded
lima_train, lima_val, lima_test, oasst1_train, oasst1_val, oasst1_test = load_datasets(data_path)

# Tokenize datasets
def tokenize_function(example, max_length=256):
    """Tokenize prompt/response pairs to a fixed length.

    Args:
        example: Mapping with "prompt" and "response" entries; both are
            forwarded unchanged to the module-level ``tokenizer``.
        max_length: Length used for truncation and ``"max_length"`` padding.
            Defaults to 256, the value that was previously hard-coded.

    Returns:
        The object returned by ``tokenizer(...)`` for these arguments.
    """
    # NOTE(review): `text_target` tokenizes the response separately from the
    # prompt. For a decoder-only model the labels usually need to line up with
    # `input_ids` — confirm this is the intended training format.
    return tokenizer(
        example["prompt"],
        text_target=example["response"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )

# Tokenize the OASST1 splits. The original columns are removed so that only
# the tokenizer outputs remain in the tokenized datasets.
tokenized_oasst1_train = oasst1_train.map(
    tokenize_function,
    batched=True,
    remove_columns=oasst1_train.column_names,
)
tokenized_oasst1_val = oasst1_val.map(
    tokenize_function,
    batched=True,
    remove_columns=oasst1_val.column_names,
)

# Return PyTorch tensors from the *tokenized* splits. Previously the format was
# set on the raw `oasst1_train` / `oasst1_val` datasets, leaving the tokenized
# splits created above in the default Python format.
tokenized_oasst1_train.set_format(type="torch")
tokenized_oasst1_val.set_format(type="torch")

Saving the dataset (0/1 shards):   0%|          | 0/824 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/103 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/103 [00:00<?, ? examples/s]

LIMA dataset downloaded and saved in the data path ./data/lima.


Filter:   0%|          | 0/84437 [00:00<?, ? examples/s]

Map:   0%|          | 0/52912 [00:00<?, ? examples/s]

Filter:   0%|          | 0/52912 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/47620 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5292 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4401 [00:00<?, ? examples/s]

Map:   0%|          | 0/2756 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2756 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2756 [00:00<?, ? examples/s]

OASST1 dataset downloaded and saved in the data path ./data/oasst1.


Map:   0%|          | 0/47620 [00:00<?, ? examples/s]

Map:   0%|          | 0/5292 [00:00<?, ? examples/s]

In [7]:
# LoRA adapter configuration. Every value comes from the hyperparameter cell so
# the adapter setup can be tuned in one place (`task_type` was previously
# repeated here as a literal).
lora_config = LoraConfig(
    r=r,                        # Rank of the LoRA decomposition
    lora_alpha=scaling_factor,  # Scaling factor for LoRA updates
    lora_dropout=lora_dropout,  # Dropout rate applied to LoRA layers
    bias=bias,                  # Bias handling ("none" in the config cell)
    task_type=task_type,        # Task type ("CAUSAL_LM" in the config cell)
    target_modules=[            # Modules to apply LoRA to
        'k_proj', 'q_proj', 'v_proj', 'o_proj',
        'gate_proj', 'down_proj', 'up_proj',
    ],
)

In [8]:
# Training configuration for the OASST1 run, grouped by concern.
# NOTE(review): these values are hard-coded here rather than taken from the
# hyperparameter cell (e.g. no seed is passed) — confirm this is intended.
oasst1_training_args = TrainingArguments(
    # Output and checkpointing
    output_dir="./output_oasst1",
    save_steps=25,
    # Optimizer and learning-rate schedule
    optim="paged_adamw_8bit",
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_steps=25,
    max_steps=100,  # the saved log notes this overrides num_train_epochs
    # Batching: 4 per device x 2 accumulation steps (total train batch size 8
    # in the saved training log)
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=2,
    # Evaluation schedule
    do_eval=True,
    eval_strategy="steps",
    eval_steps=25,
    # Logging
    log_level="debug",
    logging_steps=10,
)

In [None]:
# Supervised fine-tuning trainer for OASST1; the LoRA adapters are attached via
# `peft_config`.
# NOTE(review): the raw (untokenized) splits are passed and only the "response"
# column is used as training text, so the "prompt" column is not seen by the
# trainer here — confirm this is intended.
# NOTE(review): the saved output warns that these SFTTrainer arguments are
# deprecated in favour of SFTConfig.
oasst1_trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=lora_config,
    args=oasst1_training_args,
    train_dataset=oasst1_train,
    eval_dataset=oasst1_val,
    dataset_text_field="response",
    max_seq_length=256,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
PyTorch: setting up devices
PyTorch: setting up devices


Map:   0%|          | 0/47620 [00:00<?, ? examples/s]

Map:   0%|          | 0/5292 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [20]:
# Start fine-tuning. The call is left as the cell's last expression so the
# notebook displays its return value.
# NOTE(review): the saved output for this cell ends in KeyboardInterrupt while
# an evaluation pass was running, so no loss values were recorded.
oasst1_trainer.train()

Currently training with a batch size of: 4
***** Running training *****
  Num examples = 47,620
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 100
  Number of trainable parameters = 41,943,040
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss



***** Running Evaluation *****
  Num examples = 5292
  Batch size = 2


KeyboardInterrupt: 