In [None]:
#Vamos utilizar o modelo llama-2-7b em quantitização de 4bits

#instalando dependencias
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install git+https://github.com/huggingface/accelerate.git
!pip install transformers datasets peft torch

In [None]:
# Mount Google Drive so that files stored there (DATA_PATH points into it)
# are reachable from the Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
#Declarando variáveis de ambiente e importando as dependências necessárias


import sys
import torch
import random
import json

from datasets import load_dataset
import transformers
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
)


# --- Fine-tuning parameters -------------------------------------------------
NAME = "llama-sumarizer"
EPOCHS = 1
LEARNING_RATE = 2e-4
CUTOFF_LEN = 2048  # token budget used when tokenizing each example
BATCH_SIZE = 4
MICRO_BATCH_SIZE = 4  # passed to the Trainer as the per-device batch size
# Number of micro-batches needed to make up one full batch.
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
# Any value > 0 turns validation on in the cells below.
VAL_SET_SIZE = 2000

# --- LoRA adapter parameters ------------------------------------------------
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
# Names of the projection layers that receive a LoRA adapter.
TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "down_proj", "gate_proj", "up_proj",
]

# --- Paths ------------------------------------------------------------------
DATA_PATH = (
    "/content/drive/MyDrive/Colab Notebooks/fine-tuning-fiap/"
    "Aula 01 - Preparando dados de treinamento para fine-tuning/"
    "news_dataset_chat_data.json"
)
OUTPUT_DIR = f"checkpoints/{NAME}"


In [None]:
# Load the JSON dataset for processing. The records are accessed through the
# "train" split of the returned object in the cells below.
data = load_dataset("json", data_files=DATA_PATH)


Generating train split: 0 examples [00:00, ? examples/s]

In [None]:

# Load the base model in 4-bit precision and prepare it for LoRA training.
device_map = torch.device("cuda:0")

# 4-bit NF4 quantization with double quantization; float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to be executed; confirm it is really needed for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    "daryl149/llama-2-7b-hf",
    device_map=device_map,
    quantization_config=bnb_config,
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    "daryl149/llama-2-7b-hf", add_eos_token=True
)
# Token id 0 is used for padding.
tokenizer.pad_token_id = 0

model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
# Store the adapter configuration next to the checkpoints.
config.save_pretrained(OUTPUT_DIR)

model = get_peft_model(model, config)

# Count the LoRA parameters (`total_params`) against every parameter of the
# wrapped model (`params`).
total_params, params = 0, 0
for name, param in model.model.named_parameters():
    n_elements = param.numel()
    params += n_elements
    if "lora" in name:
        total_params += n_elements

print(
    "Total number of parameters: {}M, rate: {}%".format(
        total_params // 1000 / 1000, round(total_params / params * 100, 2)
    )
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/745 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Total number of parameters: 19.988M, rate: 0.57%


In [None]:
# Preprocessamento dos dados
def generate_prompt(data_point):
    """Return the prompt text of one dataset record.

    The record's ``"input"`` field is returned unchanged.
    """
    prompt_text = data_point["input"]
    return prompt_text


def tokenize(prompt):
    """Tokenize ``prompt`` and return its ``input_ids`` and ``attention_mask``.

    The tokenizer truncates/pads to ``CUTOFF_LEN + 1`` positions; the last
    position of each returned sequence is then dropped.
    """
    encoded = tokenizer(
        prompt,
        max_length=CUTOFF_LEN + 1,
        truncation=True,
        padding="max_length",
    )
    # Keep only the two fields used downstream, each without its final position.
    return {field: encoded[field][:-1] for field in ("input_ids", "attention_mask")}


def generate_and_tokenize_prompt(data_point):
    """Build the prompt for one dataset record and tokenize it."""
    return tokenize(generate_prompt(data_point))

# Split part of the dataset into training and validation sets.
# NOTE: VAL_SET_SIZE only acts as an on/off switch here; the actual split
# sizes are the literals passed to train_test_split below.
SPLIT_SEED = 42

if VAL_SET_SIZE > 0:
    train_val = data["train"].train_test_split(
        train_size=1000, test_size=200, shuffle=True, seed=SPLIT_SEED
    )
    # The additional shuffles are seeded as well; unseeded they would give a
    # different ordering on every run despite the seeded split above.
    train_data = (
        train_val["train"].shuffle(seed=SPLIT_SEED).map(generate_and_tokenize_prompt)
    )
    val_data = (
        train_val["test"].shuffle(seed=SPLIT_SEED).map(generate_and_tokenize_prompt)
    )
else:
    train_data = data["train"].shuffle(seed=SPLIT_SEED).map(generate_and_tokenize_prompt)
    val_data = None


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Run the training.

# Validation (periodic evaluation + reloading the best checkpoint at the end)
# is only enabled when a validation set was requested.
has_validation = VAL_SET_SIZE > 0

# Disable the generation cache for training.
model.config.use_cache = False

trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        per_device_eval_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        gradient_checkpointing=True,
        warmup_steps=50,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=1,
        evaluation_strategy="steps" if has_validation else "no",
        save_strategy="steps",
        eval_steps=100 if has_validation else None,
        save_steps=100,
        output_dir=OUTPUT_DIR,
        save_total_limit=5,
        load_best_model_at_end=has_validation,
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# NOTE: `model.state_dict` is intentionally NOT overridden with
# `get_peft_model_state_dict` any more. That override appears to be what
# produced unreadable adapter checkpoints, which made reloading the best
# checkpoint at the end of training fail with
# "SafetensorError: Error while deserializing header". The PEFT model's own
# save logic is relied upon instead.
trainer.train()

# Save the final adapter to OUTPUT_DIR.
model.save_pretrained(OUTPUT_DIR)




Step,Training Loss,Validation Loss
100,1.7049,1.577119




Step,Training Loss,Validation Loss
100,1.7049,1.577119
200,1.4359,1.571378




SafetensorError: Error while deserializing header: InvalidHeaderDeserialization

In [None]:
# Load the quantized base model together with the trained LoRA adapter and
# run inference.
import torch
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    set_seed,
)


def gen(model, prompt, max_new_tokens=100, do_sample=True):
    """Generate a continuation of ``prompt`` and return the decoded text(s).

    ``gen`` was called by this cell without being defined anywhere in the
    notebook; this is a minimal helper matching that call site.

    Args:
        model: model exposing ``generate`` (here the PEFT-wrapped model).
        prompt: text to continue.
        max_new_tokens: maximum number of tokens to generate.
        do_sample: whether to sample instead of decoding greedily.

    Returns:
        List of decoded strings (prompt followed by the generated text),
        special tokens removed.
    """
    inputs = eval_tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            pad_token_id=eval_tokenizer.pad_token_id,
        )
    return eval_tokenizer.batch_decode(output_ids, skip_special_tokens=True)


compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

base_model_id = "daryl149/llama-2-7b-hf"
# No auth token is requested: the training cell above loads this same model
# without one, and requiring a token fails when none is configured.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map='auto',
    quantization_config=bnb_config,
    trust_remote_code=True,
)
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id, add_bos_token=True, trust_remote_code=True, use_fast=False
)
eval_tokenizer.pad_token = eval_tokenizer.eos_token

# Load the adapter from the directory this notebook's training run saves to
# (previously an unrelated hard-coded path was used).
ft_model = PeftModel.from_pretrained(
    base_model, OUTPUT_DIR, torch_dtype=torch.float16, is_trainable=False
)

seed = 42
set_seed(seed)

SUMMARY_MARKER = "[|summary|]"
prompt = "SUMMARIZE THIS NEWS.\n[|News|] Today something happened [|eNews|]\n\n[|summary|]"
peft_model_res = gen(ft_model, prompt, 512)
# The decoded text contains the prompt; keep only what follows the summary
# marker. `[-1]` falls back to the whole text if the marker is missing.
peft_model_output = peft_model_res[0].split(SUMMARY_MARKER, 1)[-1].strip()