<a href="https://colab.research.google.com/github/eisbetterthanpi/LLM/blob/main/peft_hf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# https://github.com/huggingface/peft/blob/main/examples/conditional_generation/peft_lora_seq2seq.ipynb
# https://colab.research.google.com/github/huggingface/peft/blob/main/examples/conditional_generation/peft_lora_seq2seq.ipynb


# https://huggingface.co/docs/peft/en/quicktour
# https://huggingface.co/docs/peft/main/en/task_guides/image_classification_lora



In [None]:
# huggingface parameter efficient fine tuning
# https://huggingface.co/blog/peft
# https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o


#### LoRA fine-tuning of mt0-large for financial sentiment (seq2seq)

In [None]:
# https://colab.research.google.com/github/huggingface/peft/blob/main/examples/conditional_generation/peft_lora_seq2seq.ipynb#scrollTo=5eh-cQIoboOQ
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType
import torch
from datasets import load_dataset
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from tqdm import tqdm
from datasets import load_dataset

# Run on the GPU when one is available, otherwise fall back to the CPU so the
# notebook still executes (slowly) on CPU-only runtimes.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name_or_path = "bigscience/mt0-large"
tokenizer_name_or_path = "bigscience/mt0-large"

# Training hyper-parameters and dataset column names used by the cells below.
checkpoint_name = "financial_sentiment_analysis_lora_v1.pt"
text_column = "sentence"      # input text column of the dataset
label_column = "text_label"   # string label column added after loading
max_length = 128              # max input length (in tokens) passed to the tokenizer
lr = 1e-3
num_epochs = 3
batch_size = 8


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link
CUDA SETUP: CUDA runtime path found: /home/sourab/miniconda3/envs/ml/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/sourab/miniconda3/envs/ml/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


In [None]:
# creating model
# LoRA adapter configuration for a seq2seq model (rank 8, alpha 32, dropout 0.1).
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
# Alternative adapter: IA3. It previously overwrote the LoRA config unconditionally
# and raised NameError because IA3Config was never imported. To use it, add
# `from peft import IA3Config` to the imports and uncomment the next line.
# peft_config = IA3Config(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, feedforward_modules=[])

# Load the base model, wrap it with the adapter, and report how many
# parameters remain trainable.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model

In [None]:
# loading dataset
dataset = load_dataset("financial_phrasebank", "sentences_allagree")
dataset = dataset["train"].train_test_split(test_size=0.1)
dataset["validation"] = dataset["test"]
del dataset["test"]

classes = dataset["train"].features["label"].names
dataset = dataset.map(
    lambda x: {"text_label": [classes[label] for label in x["label"]]},
    batched=True, num_proc=1,)

print(dataset["train"][0])

Found cached dataset financial_phrasebank (/home/sourab/.cache/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

{'sentence': "The 10,000-odd square metre plot that Stockmann has bought for the Nevsky Center shopping center is located on Nevsky Prospect , St Petersburg 's high street , next to the Vosstaniya Square underground station , in the immediate vicinity of Moscow Station .",
 'label': 1,
 'text_label': 'neutral'}

In [None]:
# data preprocessing
# Use the dedicated tokenizer setting from the config cell (it was defined
# there but never used; the tokenizer was loaded from the model path instead).
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs with `labels`.

    Args:
        examples: batch mapping that contains the `text_column` (input text)
            and `label_column` (target text) entries.

    Returns:
        The tokenizer output for the inputs (padded/truncated to `max_length`)
        with an added "labels" entry holding the tokenized targets.
    """
    inputs = examples[text_column]
    targets = examples[label_column]
    model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    # Targets are padded/truncated to 3 tokens.
    labels = tokenizer(targets, max_length=3, padding="max_length", truncation=True, return_tensors="pt")
    labels = labels["input_ids"]
    # Replace padding ids with -100 (presumably the loss ignore index - confirm against the model's loss).
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    return model_inputs

processed_datasets = dataset.map(preprocess_function, batched=True, num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False, desc="Running tokenizer on dataset",)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]

train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

Running tokenizer on dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [None]:
# training and evaluation
# Move the adapter-wrapped model to the configured device before training.
model = model.to(device)

# One pass over the training set followed by one pass over the evaluation set
# per epoch; losses are averaged per batch and reported with their exponentials.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        # Move every tensor of the batch onto the training device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # Accumulate a detached copy so the running sum keeps no autograd graph.
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    # Re-created every epoch, so after the loop only the last epoch's
    # predictions remain in `eval_preds`.
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        # Predictions are the argmax over the last logits axis of this forward
        # pass (not `generate()`), decoded back to text.
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    # Mean per-batch loss and its exponential (reported as perplexity).
    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|████████████████████████████████████████████████████████████████████████████████████████| 255/255 [02:21<00:00,  1.81it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:07<00:00,  4.13it/s]


epoch=0: train_ppl=tensor(14.6341, device='cuda:0') train_epoch_loss=tensor(2.6834, device='cuda:0') eval_ppl=tensor(1.0057, device='cuda:0') eval_epoch_loss=tensor(0.0057, device='cuda:0')


100%|████████████████████████████████████████████████████████████████████████████████████████| 255/255 [02:00<00:00,  2.11it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:05<00:00,  5.66it/s]


epoch=1: train_ppl=tensor(1.7576, device='cuda:0') train_epoch_loss=tensor(0.5640, device='cuda:0') eval_ppl=tensor(1.0052, device='cuda:0') eval_epoch_loss=tensor(0.0052, device='cuda:0')


100%|████████████████████████████████████████████████████████████████████████████████████████| 255/255 [01:33<00:00,  2.74it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:04<00:00,  6.23it/s]

epoch=2: train_ppl=tensor(1.3830, device='cuda:0') train_epoch_loss=tensor(0.3243, device='cuda:0') eval_ppl=tensor(1.0035, device='cuda:0') eval_epoch_loss=tensor(0.0035, device='cuda:0')





In [None]:
# print accuracy
# Exact-match accuracy: a prediction counts as correct when it equals the
# reference label after stripping surrounding whitespace on both sides.
reference_labels = dataset["validation"]["text_label"]
matches = [pred.strip() == true.strip() for pred, true in zip(eval_preds, reference_labels)]
correct = sum(matches)
total = len(matches)
accuracy = correct / total * 100
print(f"{accuracy=} % on the evaluation dataset")
print(f"{eval_preds[:10]=}")
print(f"{dataset['validation']['text_label'][:10]=}")

accuracy=97.3568281938326 % on the evaluation dataset
eval_preds[:10]=['neutral', 'neutral', 'neutral', 'positive', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral']
dataset['validation']['text_label'][:10]=['neutral', 'neutral', 'neutral', 'positive', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral']


In [None]:
# saving model
peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"
model.save_pretrained(peft_model_id)

In [None]:
ckpt = f"{peft_model_id}/adapter_model.bin"
!du -h $ckpt

9,2M	bigscience/mt0-large_LORA_SEQ_2_SEQ_LM/adapter_model.bin


In [None]:
from peft import PeftModel, PeftConfig

peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"

config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
model.eval()
i = 13
inputs = tokenizer(dataset["validation"][text_column][i], return_tensors="pt")
print(dataset["validation"][text_column][i])
print(inputs)

with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
    print(outputs)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

- Demand for fireplace products was lower than expected , especially in Germany .
{'input_ids': tensor([[  259,   264,   259, 82903,   332,  1090, 10040, 10371,   639,   259,
         19540,  2421,   259, 25505,   259,   261,   259, 21230,   281, 17052,
           259,   260,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[    0,   259, 32588,     1]])
['negative']


#### PEFT integrations in transformers and assorted fine-tuning snippets

In [None]:
# https://huggingface.co/docs/peft/en/tutorial/peft_integrations
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")

from peft import LoraConfig

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM"
)
model.add_adapter(peft_config)

from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("peft-internal-testing/opt-350m-lora")




In [None]:
from google.colab import userdata
HF_TOKEN = userdata.get('access_token_read')
!huggingface-cli login --token $HF_TOKEN --add-to-git-credential
# HF_TOKEN = userdata.get('access_token_write')
# !huggingface-cli login --token $HF_TOKEN --add-to-git-credential

In [None]:
# @title from huggingface
# https://huggingface.co/docs/peft/en/developer_guides/lora

from peft import LoraConfig, get_peft_model
from peft.optimizers import create_loraplus_optimizer
from transformers import Trainer
import bitsandbytes as bnb


from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

# `LoraConfig(...)` was a copied documentation placeholder: it passes the
# literal Ellipsis object as a positional argument. Use a concrete
# configuration for the causal LM loaded above; tune r / lora_alpha /
# target_modules as needed.
config = LoraConfig(task_type="CAUSAL_LM")
# peft_config = IA3Config(task_type=TaskType.SEQ_CLS, target_modules=["k_proj", "v_proj", "down_proj"], feedforward_modules=["down_proj"]) # https://huggingface.co/docs/peft/en/conceptual_guides/ia3
# config = LoraConfig(target_modules="all-linear", ...)

model = get_peft_model(base_model, config)
optimizer = create_loraplus_optimizer(model=model, optimizer_cls=bnb.optim.Adam8bit, lr=5e-5, loraplus_lr_ratio=16,)
scheduler = None

model.unmerge_adapter()




# https://huggingface.co/learn/nlp-course/en/chapter3/3
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
# ds = load_dataset("codeparrot/github-code", streaming=True, split="train")
# print(next(iter(ds)))





checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize a batch of GLUE/MRPC sentence pairs.

    Padding is left to the `DataCollatorWithPadding` created below, so only
    truncation is applied here.
    """
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# A second `tokenize_function` reading a "text" column used to shadow the one
# above; MRPC has no such column, so it was removed. Map over the MRPC data
# loaded into `raw_datasets` rather than an unrelated `dataset` variable.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")
training_args = TrainingArguments(output_dir="./peft-finetuned-model", num_train_epochs=3, per_device_train_batch_size=2, save_steps=500,)

# Neither `Adafactor` nor the `transformers` module itself was imported in this cell.
from transformers import Adafactor, get_cosine_schedule_with_warmup

# A manual `lr` cannot be combined with Adafactor's default relative_step=True
# (it raises ValueError), so relative-step scaling is switched off; this
# follows the manual-lr recipe in the transformers Adafactor documentation.
optimizer = Adafactor(model.parameters(), lr=1e-3, eps=(1e-30, 1e-3), decay_rate=-0.8, relative_step=False, scale_parameter=False,)
# Keep the scheduler: it was previously created and discarded, leaving
# `scheduler` as None for the Trainer below.
# NOTE(review): num_training_steps=100 is a placeholder - confirm it matches the real number of optimizer steps.
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=10, num_training_steps=100)

# trainer = Trainer(model=peft_model, args=training_args, train_dataset=train_data)
trainer = Trainer(model, training_args,
    optimizers=(optimizer, scheduler),
    train_dataset=tokenized_datasets["train"], eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Train once, then inspect the prediction shapes on the validation split.
trainer.train()
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)


# `peft_model` was never defined in this cell; `model` is the object returned
# by get_peft_model above. A second, accidental `trainer.train()` call was
# also dropped.
model.save_pretrained("./peft-finetuned-model")



from transformers import AutoModelForCausalLM
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
peft_model_id = "alignment-handbook/zephyr-7b-sft-lora"
model = PeftModel.from_pretrained(base_model, peft_model_id)
model.merge_and_unload()

model.unmerge_adapter()




In [None]:
# @title from ichigo
import torch
import os
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    get_cosine_schedule_with_warmup,
)

from trl import SFTTrainer
import multiprocessing
from datasets import load_dataset
from transformers import AutoConfig

def print_once(message):
    """Print `message` only when the LOCAL_RANK environment variable is 0 or unset."""
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    if local_rank != 0:
        return
    print(message)

num_cores = multiprocessing.cpu_count()
print_once(f"Number of CPU cores: {num_cores}")
print_once("___________________________________")

# Model loading
print_once("--- Load Model ---")
model_path = "jan-hq/Jan-Llama3-0708"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    use_cache=False,
)

# Tokenizer loading
print_once("--- Load Tokenizer ---")
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=True,
    padding_side="right",
)

# change pad token to reserve space for special tokens
# set 128023 as pad token
tokenizer.pad_token_id = 128023
tokenizer.pad_token = tokenizer.convert_ids_to_tokens(128023)
print_once(tokenizer.pad_token_id)
print_once(len(tokenizer.get_vocab()))
print_once("--- Initialization complete ---")

# Setting up data train
dataset_train = load_dataset(
    "jan-hq/instruction-speech-conversation-v1.5-phase-2-sound-convo",
    num_proc=num_cores,
    split="train",
)

print_once("--- Dataset loading ---")
print_once("___________________________________")
print_once(dataset_train)
print_once("-----------------------------------")
print_once(dataset_train[0]["text"][:100])
print_once("-----------------------------------")
print_once(dataset_train[200]["text"][:100])
print_once("___________________________________")

# Training args
per_device_train_batch_size = 4
num_train_epochs = 1
gradient_accumulation_steps = 4

print_once("___________________________________")
print_once(f"{'Per Device Train Batch Size:':30} {per_device_train_batch_size}")
print_once(f"{'Number of Training Epochs:':30} {num_train_epochs}")
print_once(f"{'Gradient Accumulation Steps:':30} {gradient_accumulation_steps}")

config = AutoConfig.from_pretrained(model_path)
gpu_count = torch.cuda.device_count()

def training_step_calc(
    dataset, batch_size, num_gpus, num_epochs, gradient_accumulation_steps
):
    """Estimate the total number of optimizer steps for a training run.

    Args:
        dataset: any sized collection of training samples (`len()` is used).
        batch_size: per-device batch size.
        num_gpus: number of devices; values below 1 (e.g. the 0 returned by
            `torch.cuda.device_count()` on a CPU-only machine) are treated
            as a single device instead of collapsing the result to zero.
        num_epochs: number of passes over the dataset.
        gradient_accumulation_steps: micro-batches accumulated per optimizer step.

    Returns:
        Full optimizer steps per epoch (partial final batch dropped by the
        floor division) multiplied by `num_epochs`.
    """
    num_devices = max(1, num_gpus)
    effective_batch_size = batch_size * num_devices * gradient_accumulation_steps
    steps_per_epoch = len(dataset) // effective_batch_size
    return steps_per_epoch * num_epochs

training_steps = training_step_calc(
    dataset=dataset_train,
    batch_size=per_device_train_batch_size,
    num_gpus=gpu_count,
    num_epochs=num_train_epochs,
    gradient_accumulation_steps=gradient_accumulation_steps,
)

# Checkpoint roughly 80 times over the run, but never use an interval of 0
# (which happens when there are fewer than 80 training steps).
save_steps = max(1, int(training_steps // 80))
# Warm up over the first 5% of the training steps.
warmup_steps = int(training_steps*0.05)

print_once(f"{'Training steps':30} {training_steps}")
print_once(f"{'Saving steps':30} {save_steps}")
print_once(f"{'Warming steps':30} {warmup_steps}")
print_once("___________________________________")

# Create the custom trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_train,
    dataset_text_field="text",
    max_seq_length=4096,
    dataset_num_proc=num_cores,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=num_train_epochs,
        bf16=True,
        logging_steps=1,
        save_strategy="steps",
        save_steps=save_steps,
        save_total_limit=5,
        warmup_steps=warmup_steps,
        learning_rate=3e-4,
        weight_decay=0.01,
        seed=3407,
        output_dir="outputs",
        report_to="tensorboard",
        max_grad_norm=1,
        optim="adamw_torch_fused",
        lr_scheduler_type="cosine",
        adam_beta1=0.9,
        adam_beta2=0.98,
        adam_epsilon=1e-6,
        hub_model_id="jan-hq/Jan-Llama3-0719",
        push_to_hub=True,
    ),
)

trainer_stats = trainer.train(resume_from_checkpoint=False)




In [1]:
# @title philschmid
# https://www.philschmid.de/fine-tune-llms-in-2024-with-trl
# !pip install -qU datasets trl bitsandbytes peft optimum unsloth transformers

# from google.colab import userdata
# HF_TOKEN = userdata.get('access_token_read')
# from huggingface_hub import login
# login(token=HF_TOKEN, add_to_git_credential=True)

# conversational format: {"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
# instruction format: {"prompt": "<prompt text>", "completion": "<ideal generated text>"}


from datasets import load_dataset
# System prompt template; `{schema}` is filled in per sample.
system_message = (
    "You are an text to SQL query translator. Users will ask you questions in English "
    "and you will generate a SQL query based on the provided SCHEMA.\n"
    "SCHEMA:\n"
    "{schema}"
)

def create_conversation(sample):
  """Turn one sample (keys: context, question, answer) into a system/user/assistant message list."""
  system_turn = {"role": "system", "content": system_message.format(schema=sample["context"])}
  user_turn = {"role": "user", "content": sample["question"]}
  assistant_turn = {"role": "assistant", "content": sample["answer"]}
  return {"messages": [system_turn, user_turn, assistant_turn]}

# Fixed seed so the subset and the train/test split (and therefore the JSON
# files written below) are identical on every run; without it, a re-run
# between training and evaluation can move training samples into the test file.
split_seed = 42

dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset = dataset.shuffle(seed=split_seed).select(range(12500))
# Drop the original columns by name so only the "messages" column remains.
dataset = dataset.map(create_conversation, remove_columns=dataset.column_names, batched=False)
dataset = dataset.train_test_split(test_size=2500/12500, seed=split_seed) # 10,000 train 2,500 test

print(dataset["train"][345]["messages"])

# save datasets to disk
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

# Reload the training split from disk; from here on `dataset` is the train set only.
from datasets import load_dataset
dataset = load_dataset("json", data_files="train_dataset.json", split="train")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

[{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE table_name_76 (event VARCHAR, notes VARCHAR)', 'role': 'system'}, {'content': 'What Event goes with Notes 53.76?', 'role': 'user'}, {'content': 'SELECT event FROM table_name_76 WHERE notes = "53.76"', 'role': 'assistant'}]


Creating json from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# @title philschmid
# https://www.philschmid.de/fine-tune-llms-in-2024-with-trl

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import setup_chat_format, SFTTrainer, SFTConfig
from peft import LoraConfig, IA3Config

model_id = "meta-llama/Llama-3.2-1B"
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto", torch_dtype=torch.bfloat16) # attn_implementation="flash_attention_2",
tokenizer = AutoTokenizer.from_pretrained(model_id)

# from unsloth import FastLanguageModel
# model, tokenizer = FastLanguageModel.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto", torch_dtype=torch.bfloat16)
# model, tokenizer = FastLanguageModel.from_pretrained(model_name = "unsloth/Llama-3.2-1B-Instruct",
#     max_seq_length = 512, dtype = torch.float16, load_in_4bit = True, # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
# )

tokenizer.padding_side = 'right' # to prevent warnings
model, tokenizer = setup_chat_format(model, tokenizer) # OAI chatML chat template
# peft_config = LoraConfig(lora_alpha=128, lora_dropout=0.05, r=4, bias="none", target_modules="all-linear", task_type="CAUSAL_LM",) # r=256
# peft_config = LoraConfig(task_type="CAUSAL_LM")
# peft_config = IA3Config(task_type="CAUSAL_LM", target_modules=["k_proj", "v_proj", "down_proj"], feedforward_modules=["down_proj"]) # https://huggingface.co/docs/peft/en/package_reference/ia3
peft_config = IA3Config(task_type="CAUSAL_LM")
# peft_config = IA3Config(peft_type="IA3", task_type="CAUSAL_LM", target_modules=["k", "v", "w0"], feedforward_modules=["w0"],)

# args = TrainingArguments( # https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments
args = SFTConfig( # https://huggingface.co/docs/trl/main/en/sft_trainer#enhance-the-models-performances-using-neftune
    neftune_noise_alpha=5, # SFTConfig
    output_dir="llama1b-text2sql", # directory to save and repository id
    learning_rate=1e-1, optim="adamw_torch_fused", # Lora 2e-4 ; IA3 1e-2?
    lr_scheduler_type="constant", warmup_ratio=0.03,
    num_train_epochs=1,
    per_device_train_batch_size=4, # Lora 4 ; IA3 4?
    # gradient_accumulation_steps=2, gradient_checkpointing=True,
    logging_steps=1, save_strategy="epoch",
    bf16=True,
    max_grad_norm=1.,
    # report_to="tensorboard",                # report metrics to tensorboard
    max_seq_length=512, # 3072 max sequence length for model and packing of the dataset
    packing=True, dataset_kwargs={"add_special_tokens": False, "append_concat_token": False,}
) # 487a2109e55dce4e13fc70681781de9f50f27be7

# args = SFTConfig(neftune_noise_alpha=5,) # https://huggingface.co/docs/trl/main/en/sft_trainer#enhance-the-models-performances-using-neftune

# https://huggingface.co/docs/trl/main/en/trainer
trainer = SFTTrainer(model=model, tokenizer=tokenizer, args=args, train_dataset=dataset, peft_config=peft_config,)
# print(tokenizer.model_max_length) # 131072


# with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): # TFF https://huggingface.co/docs/trl/main/en/sft_trainer#using-flash-attention-1
trainer.train()
trainer.save_model()


# from peft import AutoPeftModelForCausalLM
# model = AutoPeftModelForCausalLM.from_pretrained(args.output_dir, torch_dtype=torch.float16, low_cpu_mem_usage=True,)
# merged_model = model.merge_and_unload()
# merged_model.save_pretrained(args.output_dir,safe_serialization=True, max_shard_size="2GB")

# PPO, DPO, rlhf


Step,Training Loss
1,2.425
2,1.5737
3,1.7022
4,1.2845
5,1.0819
6,1.0032
7,0.9607
8,0.9192
9,0.918
10,0.8797


In [None]:
# @title unsloth
# https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth
# https://github.com/unslothai/unsloth
# https://colab.research.google.com/drive/1T5-zKWM_5OD21QHwXHiV9ixTRR7k3iB9?usp=sharing

!pip install unsloth
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"


from unsloth import FastLanguageModel

# Load the 4-bit Llama 3.2 1B Instruct checkpoint through unsloth.
# An earlier load via `model_id`/`bnb_config` was removed: those names come
# from a different cell, and its result was immediately overwritten by this
# call, so it only cost an extra download and extra memory.
model, tokenizer = FastLanguageModel.from_pretrained(model_name = "unsloth/Llama-3.2-1B-Instruct",
    max_seq_length = 512, dtype = torch.bfloat16, load_in_4bit = True,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)


model = FastLanguageModel.get_peft_model(
    model,
    r = 4,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)




In [1]:
from google.colab import userdata
HF_TOKEN = userdata.get('access_token_read')
!huggingface-cli login --token $HF_TOKEN --add-to-git-credential
HF_TOKEN = userdata.get('access_token_write')
!huggingface-cli login --token $HF_TOKEN --add-to-git-credential


Token is valid (permission: read).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more 

In [None]:
model.push_to_hub("peft")


model.safetensors:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/bobbobobo/peft/commit/f50674cf3f493b32f6db6e5dddac8ac779c18965', commit_message='Upload LlamaForCausalLM', commit_description='', oid='f50674cf3f493b32f6db6e5dddac8ac779c18965', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline

# peft_model_id = "./code-llama-7b-text-to-sql"
# # peft_model_id = args.output_dir

# # Load Model with PEFT adapter
# model = AutoPeftModelForCausalLM.from_pretrained(
#   peft_model_id,
#   device_map="auto",
#   torch_dtype=torch.float16
# )
# tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.bfloat16)


from datasets import load_dataset
from random import randint


# Load our test dataset
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
# randint is inclusive on both ends, so the upper bound must be len - 1;
# otherwise the index can fall one past the end of the dataset.
rand_idx = randint(0, len(eval_dataset) - 1)

# Test on sample: build the prompt from the system and user turns only.
prompt = pipe.tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2], tokenize=False, add_generation_prompt=True)
# Token ids must stay integer; they were previously cast to bfloat16.
input_ids = pipe.tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

# Greedy decoding: temperature/top_k/top_p only apply when do_sample=True, so they are not passed here.
outputs = pipe(prompt, max_new_tokens=256, do_sample=False, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
# outputs = pipe(prompt, input_ids=input_ids, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)

# outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id, return_tensors=True)
# outputs = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)


print(f"Query:\n{eval_dataset[rand_idx]['messages'][1]['content']}")
print(f"Original Answer:\n{eval_dataset[rand_idx]['messages'][2]['content']}")
print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")


RuntimeError: expected mat1 and mat2 to have the same dtype, but got: float != c10::BFloat16

In [None]:
from tqdm import tqdm


def evaluate(sample):
    """Return 1 when the sampled generation for `sample` equals its reference answer, else 0.

    The prompt is built from the first two messages of `sample["messages"]`;
    the third message's content is the reference. The generated text (with the
    prompt prefix cut off and whitespace stripped) must match it exactly.
    """
    messages = sample["messages"]
    prompt = pipe.tokenizer.apply_chat_template(messages[:2], tokenize=False, add_generation_prompt=True)
    generation_kwargs = dict(
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        eos_token_id=pipe.tokenizer.eos_token_id,
        pad_token_id=pipe.tokenizer.pad_token_id,
    )
    outputs = pipe(prompt, **generation_kwargs)
    predicted_answer = outputs[0]['generated_text'][len(prompt):].strip()
    return 1 if predicted_answer == messages[2]["content"] else 0

success_rate = []
number_of_eval_samples = 1000
# iterate over eval dataset and predict
for s in tqdm(eval_dataset.shuffle().select(range(number_of_eval_samples))):
    success_rate.append(evaluate(s))

# compute accuracy
accuracy = sum(success_rate)/len(success_rate)

print(f"Accuracy: {accuracy*100:.2f}%")
