<a href="https://colab.research.google.com/github/edumarcelino/br.ufpe.cin.mlp.fss.pso/blob/master/FineTunning_PHI3_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Instala as bibliotecas necessárias para o projeto
# pip é o gerenciador de pacotes do Python, usado para instalar e atualizar bibliotecas.
# datasets: biblioteca para acesso a conjuntos de dados em vários formatos.
# peft: biblioteca para treinamento de modelos.
# torch: biblioteca de aprendizado de máquina (ML) desenvolvida pela NVIDIA.
# transformers: biblioteca para processamento de texto usando modelos pré- treinados.
# trl: biblioteca de treinamento de recompensas.
# bitsandbytes: biblioteca de manipulação de dados binários.

!pip install datasets
!pip install peft
!pip install torch
!pip install transformers
!pip install trl
!pip install bitsandbytes



In [9]:
# Importa a biblioteca sys para acesso a métodos de sistema.
import sys

# Importa a biblioteca logging para gerenciamento de mensagens de erro e informação.
import logging

# Importa a classe Dataset da biblioteca datasets, que fornece uma interface para manipular conjuntos de dados.
from datasets import Dataset

# Carrega um conjunto de dados específico usando a função load_dataset da biblioteca datasets.
from datasets import load_dataset

# Importa a configuração do Lora (Low-Rank Adaptation) da biblioteca peft.
from peft import LoraConfig

# Importa a biblioteca PyTorch, que fornece uma plataforma para aprendizado de máquina.
import torch

# Importa a biblioteca Transformers, que fornece modelos pré-treinados para processamento de texto.
import transformers

# Importa a classe SFTTrainer da biblioteca trl, que é um treinador de recompensas.
from trl import SFTTrainer

# Carrega os modelos pré-treinados da biblioteca Transformers.
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments

# Importa a biblioteca Google Colab para acesso ao drive do Google Cloud.
from google.colab import drive

# Carrega os dados do drive do Google Colab para o ambiente local.
import json

import pandas as pd

# Configura a variável de ambiente WANDB_MODE para desativar o modo de monitoramento do WANDB.
import os
os.environ["WANDB_MODE"] = "disabled"


In [5]:
###################
# Hyper-parameters
###################

# Cria um objeto de logador (logger) para gerenciar mensagens de erro e informação.
logger = logging.getLogger(__name__)

###################
# Hyper-parameters
###################
# Define as configurações hiperparâmetros do treinamento do modelo.


# Configuração geral do treinamento:
training_config = {
    # Utilizar armazenamento de ponto flutuante 16-bit (bf16) durante o treinamento?
    "bf16": True,

    # Avaliar o desempenho durante o treinamento?
    "do_eval": False,

    # Taxa de aprendizado do modelo?
    "learning_rate": 5.0e-06,

    # Nível de log (info, debug, warning, error)?
    "log_level": "info",

    # Número de passos para cada logs?
    "logging_steps": 20,

    # Estratégia de logging (steps ou epoch)?
    "logging_strategy": "steps",

    # Tipo de scheduler de taxas de aprendizado?
    "lr_scheduler_type": "cosine",

    # Número de épocas do treinamento?
    "num_train_epochs": 1,

    # Número máximo de passos durante o treinamento?
    "max_steps": -1,

    # Diretório para armazenar os checkpoints do treinamento?
    "output_dir": "./checkpoint_dir",

    # Atualizar o diretório de saída se existir?
    "overwrite_output_dir": True,

    # Tamanho de cada batch durante a avaliação em dispositivo?
    "per_device_eval_batch_size": 4,

    # Tamanho de cada batch durante o treinamento no dispositivo?
    "per_device_train_batch_size": 4,

    # Remover colunas desnecessárias do dataset?
    "remove_unused_columns": True,

    # Número de passos entre os logs?
    "save_steps": 100,

    # Limite total para o número de checkpoints armazenados?
    "save_total_limit": 1,

    # Semente para o pseudo-aleatório do treinamento?
    "seed": 0,

    # Desative ou não a cópia de gradientes?
    "gradient_checkpointing": True,

    # Configurações para a cópia de gradientes (kwargs)?
    "gradient_checkpointing_kwargs":{"use_reentrant": False},

    # Número de passos de gradientes acumulados durante o treinamento?
    "gradient_accumulation_steps": 1,

    # Porcentagem do tempo de warmup para a taxa de aprendizado?
    "warmup_ratio": 0.2,
}

# Configuração específica para o Lora (Low-Rank Adaptation).
peft_config = {
    # Número máximo de camadas para o Lora?
    "r": 16,

    # Valor do parâmetro alpha do Lora?
    "lora_alpha": 32,

    # Taxa de desaproximação do dropout do Lora?
    "lora_dropout": 0.05,

    # Tipo de bias (none, identity, or learned)?
    "bias": "none",

    # Tipo de tarefa para o modelo (CAUSAL_LM, etc.)?
    "task_type": "CAUSAL_LM",

    # Números das camadas que devem ser salvas?
    "target_modules": "all-linear",
}

# Cria os objetos de configuração para o treinamento e Lora.
train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)





In [6]:
###############
# Setup logging
###############
# Configura o sistema de log para capturar e exibir informações do treinamento.

# Define a formatagem padrão para as mensagens de log, incluindo data e hora, nível de log, nome da log ou mensagem.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",

    # Especifica a forma como a data e hora devem ser exibidas (ano, mês, dia, hora, minutos, segundos).
    datefmt="%Y-%m-%d %H:%M:%S",

    # Adiciona um manipulador de log para o output padrão (stdout), permitindo que as mensagens sejam exibidas na console.
    handlers=[logging.StreamHandler(sys.stdout)],
)

# Obtém o nível de log específico para a processagem do treinamento.
log_level = train_conf.get_process_log_level()

# Estabelece o nível de log da variável logger com o valor obtido acima.
logger.setLevel(log_level)

# Atualiza a configuração geral de logging dos transformers, ajustando a verificação de log e formatagem.
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()

# Ativa a forma explicita para as mensagens de log, permitindo que os logadores sejam personalizados.
transformers.utils.logging.enable_explicit_format()

# Loga uma mensagem de warning com informações específicas sobre o treinamento, incluindo rank do processo, dispositivo e número de GPUs.
logger.warning(
    f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
    + f" distributed training: {bool(train_conf.local_rank != -1)}, 16-bits training: {train_conf.fp16}"
)

# Loga uma mensagem de informação com os parâmetros do treinamento e PEFT, permitindo que o progresso seja monitorado.
logger.info(f"Training/evaluation parameters {train_conf}")
logger.info(f"PEFT parameters {peft_conf}")


INFO:__main__:Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None

In [7]:
##################
# Model Loading
##################

# Define a variável para o caminho do checkpoint (arquivo de estado) da modelo pré-preparada.
checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
# # Optionally, pode ser alterado para outro caminho de checkpoint disponível.

# Cria uma dicionário com as configurações para o carregamento da modelo pré-preparada.
model_kwargs = dict(
    use_cache=False,  # Desabilita a cache do modelo durante o carregamento.

    trust_remote_code=True,  # Permite que o código remoto seja executado automaticamente durante o carregamento.

    attn_implementation="eager",  # Implementação da atenção (FlashAttention) desabilitada.

    torch_dtype=torch.bfloat16,  # Utiliza a representação de ponto flutuante bfloat16 do PyTorch.

    device_map=None  # Mapeamento de dispositivos não é necessário nesse caso.
)

# Carrega e inicializa a modelo pré-preparada (tabela de pesos pré-definida) com as configurações especificadas.
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
# Utiliza o método `from_pretrained` da biblioteca Hugging Face para carregar e inicializar a modelo.

# Carrega e inicializa o tokenizador da modelo pré-preparada.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

# Define a configuração do modelo máximo de comprimento (tamanho do input).
tokenizer.model_max_length = 2048

# Define a configuração da tokenização, especificando que o token de pad (token de alinhamento) seja o mesmo que o token de desconhecido (unk token).
tokenizer.pad_token = tokenizer.unk_token
# Isso ajuda a evitar gerar endless generation quando o modelo não consegue processar um input.

# Define a configuração do token de id do token de pad.
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# Converte o token de pad para seu código de ID correspondente.

# Define a configuração da side padding, especificando que a padded (pad) seja sempre na direita (right).
tokenizer.padding_side = 'right'





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

[INFO|configuration_utils.py:679] 2024-12-17 18:21:57,888 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json


configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
[INFO|configuration_utils.py:679] 2024-12-17 18:21:58,398 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json
[INFO|configuration_utils.py:746] 2024-12-17 18:21:58,400 >> Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_rang

modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

[INFO|modeling_utils.py:3937] 2024-12-17 18:22:00,734 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/model.safetensors.index.json


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

[INFO|modeling_utils.py:1670] 2024-12-17 18:25:03,684 >> Instantiating Phi3ForCausalLM model under default dtype torch.bfloat16.
[INFO|configuration_utils.py:1096] 2024-12-17 18:25:03,687 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 32000,
  "pad_token_id": 32000,
  "use_cache": false
}



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO|modeling_utils.py:4800] 2024-12-17 18:25:04,420 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.

[INFO|modeling_utils.py:4808] 2024-12-17 18:25:04,421 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-mini-4k-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.


generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

[INFO|configuration_utils.py:1051] 2024-12-17 18:25:04,913 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/generation_config.json
[INFO|configuration_utils.py:1096] 2024-12-17 18:25:04,915 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}



tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

[INFO|tokenization_utils_base.py:2211] 2024-12-17 18:25:08,228 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/tokenizer.model
[INFO|tokenization_utils_base.py:2211] 2024-12-17 18:25:08,229 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/tokenizer.json
[INFO|tokenization_utils_base.py:2211] 2024-12-17 18:25:08,229 >> loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/added_tokens.json
[INFO|tokenization_utils_base.py:2211] 2024-12-17 18:25:08,230 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/special_tokens_map

In [38]:
########################
# CREATE FUNCTION ANSWER
########################


# Função para gerar resposta com o modelo, com temperatura ajustada
def generate_answer(question, model, tokenizer, temperature=0.3):
    # Encode a pergunta como texto de entrada para o modelo
    inputs = tokenizer.encode(question, return_tensors="pt")

    # Gerar resposta com o modelo
    outputs = model.generate(
        inputs,
        max_length=80,
        num_return_sequences=1,
        temperature=temperature  # Adicionando o parâmetro de temperatura
    )

    # Decodificar a resposta gerada pelo modelo em texto
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return answer

def clean_answer(answer, question):
    # Remove a pergunta se ela estiver no início da resposta
    if answer.startswith(question):
        return answer[len(question):].strip()
    return answer

In [33]:
# Função para retornar um DataFrame com as respostas geradas pelo modelo
def get_response_dataframe(dataset, model, tokenizer, temperature=0.3, n_rows=None):
    responses = []

    for i, row in enumerate(dataset.take(n_rows)):
        question = row['question']
        answer = generate_answer(question, model, tokenizer, temperature)
        response = clean_answer(answer, question)

        responses.append({"Pergunta": question, "Resposta": response})

    df = pd.DataFrame(responses)

    return df

In [11]:

##################
# Dataset Loading
##################

# Carrega o token de login do Google Drive para permitir acesso aos arquivos.
drive.mount('/content/drive')

# Define a variável para o caminho do arquivo JSON que será lido.
caminho_arquivo = '/content/drive/MyDrive/Colab Notebooks/desafio3_dados/output_finetune.json'

# Abre o arquivo JSON em modo de leitura e carrega seus conteúdos em uma variável chamada `data`.
with open(caminho_arquivo, 'r') as f:
    data = json.load(f)  # 'data' é uma lista de dicionários

# Verifica se a carga do arquivo foi bem-sucedida e se os dados estão na forma esperada.
if isinstance(data, list):
    # Converte a lista de dicionários em um dicionário de listas (formato tabelar) para facilitar as operações seguintes.
    dict_data = {key: [item[key] for item in data] for key in data[0].keys()}
else:
    raise ValueError("O JSON deve ser uma lista de objetos para este processamento.")

# Cria um objeto Dataset do Hugging Face, utilizando a lista de dicionários carregada anteriormente.
dataset = Dataset.from_list(data)

# Imprime as informações do dataset para verificar se o processo foi bem-sucedido.
print(dataset)


Mounted at /content/drive
Dataset({
    features: ['question', 'answer'],
    num_rows: 1498718
})


In [34]:
############################
# TESTE COM 10.000 REGISTROS
############################
df_sem_tunning = pd.DataFrame()

df_sem_tunning = get_response_dataframe(dataset, model, tokenizer, temperature=0.3, n_rows=10)

pd.set_option('display.max_colwidth', 300) # Largura máxima da coluna
pd.set_option('display.max_rows', None) # Mostrar todas as linhas do DataFrame

print(df_sem_tunning)





                                                                                           Pergunta  \
0                            What is the description for the product 'Girls Ballet Tutu Neon Pink'?   
1                                          What is the description for the product 'Mog's Kittens'?   
2                            What is the description for the product 'Girls Ballet Tutu Neon Blue'?   
3                                            What is the description for the product 'The Prophet'?   
4                              What is the description for the product 'Rightly Dividing the Word'?   
5                            What is the description for the product 'Worship with Don Moen [VHS]'?   
6                             What is the description for the product 'Autumn Story Brambly Hedge'?   
7  What is the description for the product 'Spirit Led-Moving By Grace In The Holy Spirit's Gifts'?   
8                   What is the description for the product 'The Very Bad

In [35]:
# Carregar o tokenizer para o modelo phi3
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

# Função para processar os dados
def apply_chat_template(example, tokenizer):
    # Criação das mensagens no formato adequado para o fine-tuning
    messages = [
        {"role": "user", "content": example["question"]},
        {"role": "assistant", "content": example["answer"]}
    ]
    # Gerar o texto usando o tokenizer
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return example

# Carregar o seu dataset (substitua com o seu dataset real)
# Supondo que 'dataset' seja o seu dataset original com as colunas "question" e "answer"
raw_dataset = dataset  # dataset já deve ser carregado

# Selecionar um subset de 1000 linhas
subset_dataset = raw_dataset.select(range(10000))  # Seleciona as 1000 primeiras linhas

# Obter os nomes das colunas do dataset original
column_names = list(subset_dataset.features)

# Processar o subset de 1000 linhas
processed_dataset = subset_dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,  # Processamento paralelo (ajuste conforme necessário)
    remove_columns=column_names,
    desc="Aplicando o template de chat ao subset de 1000 linhas",
)

# Verificando os primeiros exemplos processados
print(processed_dataset[0])

[INFO|tokenization_utils_base.py:2211] 2024-12-17 19:12:50,961 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/tokenizer.model
[INFO|tokenization_utils_base.py:2211] 2024-12-17 19:12:50,962 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/tokenizer.json
[INFO|tokenization_utils_base.py:2211] 2024-12-17 19:12:50,963 >> loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/added_tokens.json
[INFO|tokenization_utils_base.py:2211] 2024-12-17 19:12:50,963 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/special_tokens_map

Aplicando o template de chat ao subset de 1000 linhas (num_proc=10):   0%|          | 0/10000 [00:00<?, ? exam…

{'text': "<|user|>\nWhat is the description for the product 'Girls Ballet Tutu Neon Pink'?<|end|>\n<|assistant|>\nHigh quality 3 layer ballet tutu. 12 inches in length<|end|>\n<|endoftext|>"}


In [36]:
###########
# Training
###########
trainer = SFTTrainer(
    model=model,
    args=train_conf,
    peft_config=peft_conf,
    train_dataset=processed_dataset,
    eval_dataset=processed_dataset,
    tokenizer=tokenizer,
)
train_result = trainer.train()
metrics = train_result.metrics
trainer.save_state()


  trainer = SFTTrainer(
[INFO|training_args.py:2169] 2024-12-17 19:14:49,071 >> PyTorch: setting up devices


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

[INFO|trainer.py:699] 2024-12-17 19:15:05,449 >> Using auto half precision backend
[INFO|trainer.py:2314] 2024-12-17 19:15:05,989 >> ***** Running training *****
[INFO|trainer.py:2315] 2024-12-17 19:15:05,991 >>   Num examples = 10,000
[INFO|trainer.py:2316] 2024-12-17 19:15:05,992 >>   Num Epochs = 1
[INFO|trainer.py:2317] 2024-12-17 19:15:05,994 >>   Instantaneous batch size per device = 4
[INFO|trainer.py:2320] 2024-12-17 19:15:05,994 >>   Total train batch size (w. parallel, distributed & accumulation) = 4
[INFO|trainer.py:2321] 2024-12-17 19:15:05,995 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2322] 2024-12-17 19:15:05,996 >>   Total optimization steps = 2,500
[INFO|trainer.py:2323] 2024-12-17 19:15:06,001 >>   Number of trainable parameters = 25,165,824
[INFO|integration_utils.py:812] 2024-12-17 19:15:06,011 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
20,2.5754
40,2.5856
60,2.4324
80,2.5594
100,2.428
120,2.4542
140,2.4891
160,2.5577
180,2.4586
200,2.4453


[INFO|trainer.py:3812] 2024-12-17 19:16:09,100 >> Saving model checkpoint to ./checkpoint_dir/checkpoint-100
[INFO|configuration_utils.py:679] 2024-12-17 19:16:09,587 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json
[INFO|configuration_utils.py:746] 2024-12-17 19:16:09,589 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
 

In [37]:
# Defina o caminho para salvar o modelo no Google Drive
output_dir = '/content/drive/MyDrive/Colab Notebooks/desafio3_dados/phii3_model_finetunning'

# Salvar o modelo treinado no Google Drive
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Modelo salvo em: {output_dir}")

[INFO|trainer.py:3812] 2024-12-17 19:41:31,263 >> Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/desafio3_dados/phii3_model_finetunning
[INFO|configuration_utils.py:679] 2024-12-17 19:41:32,537 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json
[INFO|configuration_utils.py:746] 2024-12-17 19:41:32,539 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_siz

Modelo salvo em: /content/drive/MyDrive/Colab Notebooks/desafio3_dados/phii3_model_finetunning


In [41]:
#############
# Evaluation
#############
tokenizer.padding_side = 'left'
#metrics = trainer.evaluate(processed_dataset)
#metrics["eval_samples"] = len(proc)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** eval metrics *****
  epoch                    =         1.0
  total_flos               = 100090939GF
  train_loss               =      2.1341
  train_runtime            =  0:26:25.23
  train_samples_per_second =       6.308
  train_steps_per_second   =       1.577


In [42]:
# Carregar o modelo e o tokenizer salvos
model = AutoModelForCausalLM.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)


# Função para gerar resposta com o modelo, com temperatura ajustada
def generate_answer(question, model, tokenizer, temperature=0.3):
    inputs = tokenizer.encode(question, return_tensors="pt")
    outputs = model.generate(
        inputs,
        max_length=80,
        num_return_sequences=1,
        temperature=temperature  # Adicionando o parâmetro de temperatura
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

def clean_answer(answer, question):
    # Remove a pergunta se ela estiver no início da resposta
    if answer.startswith(question):
        return answer[len(question):].strip()
    return answer





print("RODADA 01")
# Testar com uma pergunta
question = "What is Girls Ballet Tutu Neon Pink?"
answer = generate_answer(question, model, tokenizer, temperature=0)
answer = clean_answer(answer, question)
print(f"Pergunta: {question}")
print(f"Resposta: {answer}")

print("RODADA 02")
# Testar com uma pergunta
question = "What is Girls Ballet Tutu Neon Pink?"
answer = generate_answer(question, model, tokenizer, temperature=0)
answer = clean_answer(answer, question)
print(f"Pergunta: {question}")
print(f"Resposta: {answer}")

print("RODADA 03")
# Testar com uma pergunta
question = "What is Girls Ballet Tutu Neon Pink?"
answer = generate_answer(question, model, tokenizer, temperature=0)
answer = clean_answer(answer, question)
print(f"Pergunta: {question}")
print(f"Resposta: {answer}")









[INFO|configuration_utils.py:679] 2024-12-17 19:47:38,912 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json
[INFO|configuration_utils.py:746] 2024-12-17 19:47:38,915 >> Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_hea

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO|modeling_utils.py:4800] 2024-12-17 19:47:42,144 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.

[INFO|modeling_utils.py:4808] 2024-12-17 19:47:42,146 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-mini-4k-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.
[INFO|configuration_utils.py:1051] 2024-12-17 19:47:42,396 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/generation_config.json
[INFO|configuration_utils.py:1096] 2024-12-17 19:47:42,399 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}

[INFO|tokenization_utils_base.py:2209] 2024-12-17 19:47:42,990 >>

RODADA 01
Pergunta: What is Girls Ballet Tutu Neon Pink?
Resposta: Girls Ballet Tutu Neon Pink is a type of ballet costume that is designed specifically for young girls. It is typically made from a lightweight, stretchy fabric that allows for ease of movement while dancing. The tutu is usually a bright, neon pink color, which adds a fun and playful touch to the costume.

The ballet tutu is a
RODADA 02
Pergunta: What is Girls Ballet Tutu Neon Pink?
Resposta: Girls Ballet Tutu Neon Pink is a type of ballet costume that is designed specifically for young girls. It is typically made from a lightweight, stretchy fabric that allows for ease of movement while dancing. The tutu is usually a bright, neon pink color, which adds a fun and playful touch to the costume.

The ballet tutu is a
RODADA 03
Pergunta: What is Girls Ballet Tutu Neon Pink?
Resposta: Girls Ballet Tutu Neon Pink is a type of ballet costume that is designed specifically for young girls. It is typically made from a lightweight,

In [46]:

# Exibindo os nomes das colunas
print(df_sem_tunning.columns)

print(df_sem_tunning["Resposta"])

print(df_sem_tunning["Pergunta"])

dataframe_completo = pd.DataFrame()







Index(['Pergunta', 'Resposta'], dtype='object')
0                                                                                                 Input:\n\nThe 'Girls Ballet Tutu Neon Pink' is a vibrant and eye-catching tutu designed for young ballet dancers. Made from high-quality, stretchy fabric, it allows for a full range of motion
1                                                                                           Input:\n\nMog's Kittens\n\nOutput:\n\nMog's Kittens is a delightful collection of adorable, plush kittens designed to bring joy and comfort to your home. Each kitten is handcrafted with soft, high-
2                                                                                          Input:\n\nProduct: Girls Ballet Tutu Neon Blue\n\nOutput:\n\nThe Girls Ballet Tutu Neon Blue is a vibrant and eye-catching dance costume designed for young ballet enthusiasts. Made from high-quality
3            The Prophet is a novel written by the renowned author Khalil Gibran. 