In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from kaggle_secrets import UserSecretsClient

os.environ["HUGGINGFACE_TOKEN"] = UserSecretsClient().get_secret("HUGGINGFACE-TOKEN-TO-UPLOAD")

ModuleNotFoundError: No module named 'kaggle_secrets'

In [4]:
def install_lib(libname):
    print(f">>> {libname}")
    get_ipython().system(f"pip install -qqq {libname}")

libs = [
    "bitsandbytes",
    "peft",
    "transformers",
    "trl",
    "accelerate",
    "datasets",
]

for lib in libs:
    install_lib(lib)

>>> bitsandbytes
>>> peft
>>> transformers
>>> trl
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
kfp 2.5.0 requires google-cloud-storage<3,>=2.2.1, but you have google-cloud-storage 1.44.0 which is incompatible.[0m[31m
[0m>>> accelerate
>>> datasets


In [5]:
import logging
import warnings

# Suprimir avisos específicos de FutureWarning e UserWarning
warnings.filterwarnings("ignore", category=FutureWarning, message=".*TRANSFORMERS_CACHE.*")
warnings.filterwarnings("ignore", message=".*resume_download.*deprecated.*", category=FutureWarning)
warnings.filterwarnings("ignore", message=".*use_cache=True.*", category=UserWarning)
warnings.filterwarnings("ignore", message=".*use_reentrant parameter should be passed explicitly.*", category=UserWarning)
warnings.filterwarnings("ignore", message="torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly.")


# Configurar o nível de log para a biblioteca transformers
logging.getLogger("transformers.trainer").setLevel(logging.WARNING)
logging.getLogger("transformers.trainer_utils").setLevel(logging.WARNING)
logging.getLogger("transformers.training_args").setLevel(logging.WARNING)




In [6]:
import torch
import bitsandbytes
import peft
import accelerate
import datasets
import trl
import warnings
import transformers

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("torch version:", torch.__version__)
print("bitsandbytes version:", bitsandbytes.__version__)
print("peft version:", peft.__version__)
print("accelerate version:", accelerate.__version__)
print("datasets version:", datasets.__version__)
print("trl version:", trl.__version__)
print("transformers version:", transformers.__version__)
print(f"Device name: '{torch.cuda.get_device_name()}'")
print("Device:", device)
print(f"Device properties: '{torch.cuda.get_device_properties(torch.cuda.current_device())}'")
print("Suporta bfloat16." if torch.cuda.is_bf16_supported() else "Não suporta bfloat16.")

torch version: 2.1.2
bitsandbytes version: 0.43.3
peft version: 0.12.0
accelerate version: 0.32.1
datasets version: 2.20.0
trl version: 0.9.6
transformers version: 4.42.3
Device name: 'Tesla T4'
Device: cuda
Device properties: '_CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15095MB, multi_processor_count=40)'
Não suporta bfloat16.


In [7]:
import os
from random import randrange

import torch
import numpy as np
import pandas as pd
from huggingface_hub import login
from datasets import load_dataset, Dataset


from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, prepare_model_for_kbit_training, TaskType, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    TrainerCallback,
    set_seed,
    pipeline,
    TrainerCallback,
    TrainerControl,
    TrainerState,
)

2024-08-19 14:02:27.011796: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-19 14:02:27.011907: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-19 14:02:27.276582: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [8]:
class SaveCheckpointCallback(TrainerCallback):
    def on_save(self, args, state, control, **kwargs):
        print(f"Saving checkpoint at step {state.global_step}")

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, early_stopping_patience=3, early_stopping_threshold=0.02):
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold
        self.best_metric = None
        self.counter = 0

    def on_evaluate(self, args, state: TrainerState, control: TrainerControl, metrics=None, **kwargs):
        current_metric = metrics.get("eval_loss")  # Use the relevant metric for your task

        if current_metric is None:
            return

        if self.best_metric is None or current_metric < self.best_metric - self.early_stopping_threshold:
            self.best_metric = current_metric
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.early_stopping_patience:
                control.should_training_stop = True
                print(f"Early stopping at step {state.global_step} with best eval_loss = {self.best_metric}")


class SaveMetricsCallback(TrainerCallback):
    def __init__(self, output_dir):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        self.output_dir = output_dir
        self.output_path = os.path.join(output_dir, "metrics.json")
        self.state_path = os.path.join(output_dir, "state.json")
        print(f"Output directory initialized at {output_dir}")
        self.metrics = self.load_existing_metrics()

    def load_existing_metrics(self):
        if os.path.isfile(self.output_path):
            return pd.read_json(self.output_path, lines=True).to_dict('records')
        return []

    def on_step_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        state.save_to_json(self.state_path)

    def on_evaluate(self, args, state: TrainerState, control: TrainerControl, metrics=None, **kwargs):
        if metrics:
            step = state.global_step
            _metrics = {"Step": step, **metrics}
            self.metrics.append(_metrics)
            metrics_df = pd.DataFrame(self.metrics).drop_duplicates(subset=['Step'], keep='last')
            metrics_df.to_json(self.output_path, orient="records", lines=True)



In [9]:
def get_latest_checkpoint(checkpoint_dir):
    try:
        checkpoints = [os.path.join(checkpoint_dir, d) for d in os.listdir(checkpoint_dir) if d.startswith("checkpoint-")]
        if not checkpoints:
            return None
        latest_checkpoint = max(checkpoints, key=os.path.getctime)
        return latest_checkpoint

    except FileNotFoundError:
        return None

In [10]:
def set_tokenizer(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.padding_side = 'right'
    return tokenizer

In [12]:
dataset = load_dataset("nunorc/squad_v1_pt")
dataset = dataset.map(lambda row: {"response": row['answers']['text'][0] })


columns = ['id', 'context', 'question','response']
train_dataset = dataset["train"].shuffle(42).select(range(5000)).select_columns(columns)
test_dataset = dataset["validation"].shuffle(42).select(range(100)).select_columns(columns)

In [13]:
if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
    # attn_implementation = 'flash_attention_2'
    attn_implementation = 'eager'
else:
    compute_dtype = torch.float16
    attn_implementation = 'eager'

print(attn_implementation)
print(compute_dtype)

eager
torch.float16


In [14]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
os.environ['TORCH_USE_CUDA_DSA'] = "1"

model_id = "emdemor/question-generator"
LOCAL_MODELPATH = "models/question-generator/" + model_id.lower().replace("/","-").replace(".","_")
login(token=os.environ.get("HUGGINGFACE_TOKEN"))
set_seed(1234)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [15]:
# A quantização é uma técnica para reduzir o tamanho do modelo e aumentar a eficiência computacional.
# Utilizamos a classe BitsAndBytesConfig para configurar a quantização em 4 bits, o que reduz o uso de memória e acelera o treinamento.
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="bfloat16",
        bnb_4bit_use_double_quant=True,
)

# Usamos a classe AutoModelForCausalLM para carregar um modelo pré-treinado adequado para modelagem de linguagem causal.
# Parâmetros importantes incluem:
#  - torch_dtype=compute_dtype: Define o tipo de dado para o modelo.
#  - quantization_config=bnb_config: Aplica a configuração de quantização.
#  - device_map="auto": Distribui automaticamente o modelo nos dispositivos disponíveis.
#  - attn_implementation=attn_implementation: Define a implementação da atenção.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
)

# adapta o modelo para o treinamento em k-bits, otimizando ainda mais o desempenho.
model = prepare_model_for_kbit_training(model)

tokenizer = set_tokenizer(model_id)

adapter_config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/35.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
class LanguageModel:

    def __init__(self, tokenizer, model, device):
        self.tokenizer = tokenizer
        self.model = model
        self.device = device
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

    def tokenize(self, messages):
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        model_inputs = tokenizer([text], return_tensors="pt").to(self.device)
        return model_inputs

    def generate(self, messages):
        model_inputs = self.tokenize(messages)
        model_inputs['attention_mask'] = model_inputs['attention_mask'].to(model_inputs['input_ids'].device)
        generated_ids = model.generate(
            model_inputs.input_ids,
            max_new_tokens=512,
            do_sample=True,
            attention_mask=model_inputs['attention_mask'],
            pad_token_id=self.tokenizer.pad_token_id
        )
        generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
        return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [17]:
%%time

llm = LanguageModel(tokenizer, model, device="cuda")

prompt = "Qual a capital do Brasil?"

messages = [
    # {"role": "user", "content": "Olá. Você é um expert em geografia e vai me ajudar a responder algumas questões."},
    # {"role": "assistent", "content": "Tudo bem! Como posso ajudar?"},
    {"role": "user", "content": prompt},
]

llm.generate(messages)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


CPU times: user 1min 19s, sys: 73.5 ms, total: 1min 19s
Wall time: 1min 20s


'O Rio de Janeiro é a capital do Brasil, onde está localizado o governo federal localizado no Palácio do Planalto. No entanto, o Rio de Janeiro é apenas a capital do Centro-Sul, enquanto o Brasil tem várias cidades capitais entre diferentes unidades federativas: Brasília é a capital do centro do país e sede do governo federal brasileiro, separada pelos planaltos de Goiás e Minas Gerais; Recife é a capital do norte do Brasil, com sede no estado de Pernambuco; Salvador, no sul da Bahia, é a capital do estado da Bahia e sede do governo estadual; São Luís, no nordeste do Maranhão, é a capital e a capital regional do Norte Regional em São Luis - no Norte da Região Norte; Campo Grande é a capital do interior do Mato Grosso do Sul (antigamente o Mato Grosso) e sede do órgão legislativo, com a finalidade de garantir a representação de seus quatros milhões de habitantes; Curitiba é a capital regional do Oeste Paulista do estado do Paraná, sede do governo estadual do estado de mesmo nome; Floria

In [18]:
def format_dataset_chatml(row):
    messages = [
        {
            "content": "Você é um assistente especializado em interpretação de texto",
            "role": "system"
        },
        {
            "content": f"Gere uma pergunta para o seguinte contexto:\n```\n{row['context']}\n```\nPergunta:",
            "role": "user"
        },
        {
            "content": f"{row['question']}",
            "role": "assistant"
        }
    ]

    return {"text": tokenizer.apply_chat_template(messages, add_generation_prompt=False, tokenize=False)}

train_dataset_chatml = train_dataset.map(format_dataset_chatml)
test_dataset_chatml = test_dataset.map(format_dataset_chatml)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [19]:
%%time

sft_config = SFTConfig(
    seed=42,
    output_dir=LOCAL_MODELPATH,
    dataset_text_field="text",
    max_seq_length=512,

    # Training Hyperparameters
    learning_rate=5e-4,
    num_train_epochs=3,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",

    # Validation
    do_eval=True,
    eval_strategy="steps",
    eval_steps=50,

    # Chackpoints
    save_strategy="steps",  # Salvando a cada 1000 passos
    save_steps=50,         # Salvando a cada 1000 passos
    # save_strategy="epoch",  # Salvando ao final de cada época

    # Loggings
    log_level="warning",
    logging_steps=50,

    # Weights yype
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),

    # Report to Weights And Bias? TensorBoard?
    report_to="none",
    
    push_to_hub=True,
    hub_model_id="emdemor/question-generator"
)

peft_config = LoraConfig(
        r=16,
        lora_alpha=1,
        lora_dropout=0.05,
        task_type=TaskType.CAUSAL_LM,
        target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
)

CPU times: user 3.36 ms, sys: 0 ns, total: 3.36 ms
Wall time: 29 ms


In [20]:
%%time
trainer = SFTTrainer(
    model,
    train_dataset=train_dataset_chatml,
    eval_dataset=test_dataset_chatml,
    args=sft_config,
    peft_config=peft_config,
    tokenizer=tokenizer,
    callbacks=[SaveCheckpointCallback(), EarlyStoppingCallback( early_stopping_threshold=0.0005), SaveMetricsCallback(LOCAL_MODELPATH)],
)

trainer.train()

trainer.save_model()

Output directory initialized at models/question-generator/emdemor-question-generator


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
50,1.8852,1.70717
100,1.6036,1.485588
150,1.5186,1.466216
200,1.4822,1.460276
250,1.5035,1.457846
300,1.4813,1.45578
350,1.4878,1.453382
400,1.4765,1.452285
450,1.4803,1.448461
500,1.4925,1.447762


Saving checkpoint at step 50
Saving checkpoint at step 100
Saving checkpoint at step 150
Saving checkpoint at step 200
Saving checkpoint at step 250
Saving checkpoint at step 300
Saving checkpoint at step 350
Saving checkpoint at step 400
Saving checkpoint at step 450
Saving checkpoint at step 500
Saving checkpoint at step 550
Saving checkpoint at step 600
Saving checkpoint at step 650
Saving checkpoint at step 700
Early stopping at step 750 with best eval_loss = 1.4460580348968506
Saving checkpoint at step 750
CPU times: user 1h 41min 24s, sys: 6.91 s, total: 1h 41min 31s
Wall time: 1h 41min 32s


In [21]:
trainer.model.push_to_hub(repo_id="emdemor/question-generator", commit_description="Using more data")

README.md:   0%|          | 0.00/2.15k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/emdemor/question-generator/commit/e44fd6a0c1a58a5d2864a090439ddb22a62d22a4', commit_message='Upload model', commit_description='Using more data', oid='e44fd6a0c1a58a5d2864a090439ddb22a62d22a4', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
1

1

In [None]:

ft_model = AutoModelForCausalLM.from_pretrained(
    LOCAL_MODELPATH,
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    #quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
)

In [None]:
context = "Em nota a família informou que o desejo de Silvio Santos em relação a sua morte era de que quando partisse, fosse levado diretamente ao cemitério particular onde fosse realizada uma cerimônia judaica. Silvio Santos também havia desejado que sua morte não fosse explorada, pois gostava de ser celebrado em vida. Gostaria de ser lembrado com a alegria que viveu e que seu desejo fosse respeitado."


messages = [
    {
        "content": "Você é um assistente especializado em interpretação de texto",
        "role": "system"
    },
    {
        "content": f"Gere uma pergunta para o seguinte contexto:\n```\n{context}\n```\nPergunta:",
        "role": "user"
    },
]

In [None]:
%%time
with torch.cuda.amp.autocast():
    # model.generate(**tokenizer("test", return_tensors="pt").to("cuda"))
    result = llm.generate(messages)

result

In [None]:
1