<a href="https://colab.research.google.com/github/dhnanjay/HuggingFace/blob/main/Fine_tuned_Llama_PEFT_QLora.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Install necessary libraries

Transformers - Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models

Datasets - Datasets is a library for easily accessing and sharing datasets for Audio, Computer Vision, and Natural Language Processing (NLP) tasks

PEFT - Parameter-Efficient Fine-Tuning (PEFT) methods enable efficient adaptation of pre-trained language models (PLMs) to various downstream applications without fine-tuning all the model's parameters

trl - A set of tools for training transformer language models; here it is used for the supervised fine-tuning (SFT) step

accelerate - Accelerate is a library that enables the same PyTorch code to be run across any distributed configuration by adding just four lines of code

bitsandbytes - The library used to quantize the LLM

In [None]:
!pip install -q transformers
!pip install xformers
!pip install -q datasets
!pip install -q trl
!pip install git+https://github.com/huggingface/peft.git
!pip install -q bitsandbytes==0.37.2
!pip install -q -U accelerate

# Import the following libraries

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, PeftConfig
from datasets import load_dataset
from transformers import TrainingArguments, pipeline
from trl import SFTTrainer

# Load a model and tokenizer

We're going to use the Llama 2 7B chat model

In [None]:
# Authenticate this session with the Hugging Face Hub.
# NOTE(review): presumably needed because the Llama 2 repo loaded below is
# gated — confirm against the model card.
from huggingface_hub import login
login()

In [None]:
repo_id = "meta-llama/Llama-2-7b-chat-hf" # Modify to whatever model you want to use

# Load the base weights quantized to 8-bit. The quantization settings are
# passed through BitsAndBytesConfig (already imported above) rather than the
# bare `load_in_8bit` keyword, which is the supported way to configure
# bitsandbytes quantization in transformers.
base_model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map='auto',
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(repo_id)
# Reuse the EOS token for padding. Do NOT add a brand-new '[PAD]' token here:
# that would grow the tokenizer's vocabulary without resizing the model's
# embedding matrix, and the extra token would be carried into every tokenizer
# saved from this notebook.
tokenizer.pad_token = tokenizer.eos_token

# The KV cache is only useful for generation; keep it off while training.
base_model.config.use_cache = False

In [None]:
# Dump the module tree; the layer names shown here are what LoraConfig's
# `target_modules` must refer to.
print(base_model) # use it to check what target module should be

In [None]:
# Report the memory footprint of the loaded (8-bit) base model.
base_model.get_memory_footprint() # Check the memory

Helper function to see how many parameters the model has:

In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where ``param`` provides ``numel()``
            and a ``requires_grad`` flag.

    The summary line reports the trainable count, the total count and the
    trainable share as a percentage. A model without any parameters is
    reported as 0.00% instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the ratio: an empty model has nothing to divide by.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}"
    )

# Test the base model

In [None]:
# NOTE(review): `device` is not referenced by any code visible in this
# notebook; the model is placed via device_map='auto' at load time.
device = "cuda:0"

def user_prompt(human_prompt):
    """Wrap *human_prompt* in the HUMAN/RESPONSE prompt template.

    The template has to mirror the layout of the fine-tuning dataset, so
    change it here if your data is formatted differently.
    """
    return f"### HUMAN:\n{human_prompt}\n\n### RESPONSE:\n"

# Build a text-generation pipeline around the (not yet fine-tuned) base model
# so its answer can be compared with the fine-tuned model later on. The
# generation settings below are forwarded to the pipeline.
pipe = pipeline(
    task="text-generation",
    model=base_model,
    tokenizer=tokenizer,
    max_length=150,
    repetition_penalty=1.15,
    top_p=0.95
    )
# Format the question with the same template as the training data, then
# print the first returned generation.
result = pipe(user_prompt("You are an expert youtuber. Give me some ideas for a youtube title for a video about fine tuning LLM"))
print(result[0]['generated_text'])

# Prepare and preprocess the model for training

In [None]:
# The base model was loaded quantized, so prepare it for k-bit training before
# attaching the adapter. prepare_model_for_kbit_training is PEFT's helper for
# making a quantized model safe to fine-tune; it was imported above but never
# applied.
base_model = prepare_model_for_kbit_training(base_model)

# LoRA hyper-parameters: rank-8 adapters on the attention projections.
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], # you have to know the target modules, it varies from model to model
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)


model = get_peft_model(base_model, config) # Wrap the base model with get_peft_model() to get a trainable PeftModel
print_trainable_parameters(model)

# Load a dataset from datasets library

In [None]:
# Load the training data from a local CSV file.
# NOTE(review): the trainer cell reads dataset["train"] with
# dataset_text_field="text", so the CSV presumably needs a `text` column —
# confirm against your file.
dataset = load_dataset("csv", data_files = "you_data_here.csv") # substitute with whatever file name you have
print("Dataset loaded")

# Training step

In [None]:
adam_bits = 8

# NOTE(review): `run_name` is only a label. It mentions "deb-v2-xl" and 8-bit
# Adam, but no `optim` is set below, so the Trainer's default optimizer is
# used — rename the run or configure an 8-bit optimizer as appropriate.
training_arguments = TrainingArguments(
    output_dir="Trainer_output",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4 per device
    run_name=f"deb-v2-xl-{adam_bits}bitAdam",
    logging_steps=20,
    learning_rate=2e-4,
    fp16=True,
    max_grad_norm=0.3,
    max_steps=300,
    warmup_ratio=0.03,
    group_by_length=True,
    # The plain "constant" schedule ignores warmup_ratio entirely; use the
    # warmup-aware variant so the 3% warmup configured above takes effect.
    lr_scheduler_type="constant_with_warmup",
)

In [None]:
# Supervised fine-tuning of the LoRA-wrapped model on the "text" column of
# the training split, with sequences limited to 512 tokens.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset["train"],
    dataset_text_field="text",
    max_seq_length=512,
)

# Run the training loop configured by `training_arguments`.
trainer.train()

# Save the adapter

In [None]:
# Write the trained model to the "Finetuned_adapter" directory (relative to
# the current working directory); the merge step loads it from there.
trainer.save_model("Finetuned_adapter")
# Alias for the trained PEFT model.
# NOTE(review): `adapter_model` is not referenced by any later visible code.
adapter_model = model

print("Lora Adapter saved")

# Merge the base model and the adapter

In [None]:
# Can't merge the 8 bit/4 bit model with Lora so reload it unquantized.
# Load it in half precision: without an explicit dtype the weights are
# materialized in float32, which doubles the memory needed for a 7B model.

repo_id = "meta-llama/Llama-2-7b-chat-hf"
use_ram_optimized_load=False  # NOTE(review): not referenced by any visible code

base_model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map='auto',
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

# This copy is used for merging and inference, and its config is saved with
# the merged model. Keep the KV cache enabled; disabling it was only needed
# during training and would slow down generation for every later user.
base_model.config.use_cache = True

In [None]:
# Report the memory footprint of the reloaded, unquantized base model.
base_model.get_memory_footprint()

In [None]:
# Load Lora adapter
# Use the same relative path the adapter was saved to ("Finetuned_adapter")
# rather than a hard-coded "/content/..." path, so loading works from any
# working directory and not only when the notebook runs in /content.
model = PeftModel.from_pretrained(
    base_model,
    "Finetuned_adapter",
)
# Fold the LoRA weights into the base weights and drop the PEFT wrappers,
# leaving a plain model that can be saved and served on its own.
merged_model = model.merge_and_unload()

# Save the merged weights together with the tokenizer so the output directory
# is self-contained.
merged_model.save_pretrained("/content/Merged_model")
tokenizer.save_pretrained("/content/Merged_model")

# Testing out Fine Tuned model

In [None]:
# NOTE(review): `device` is not referenced by any code visible in this
# notebook; the model is placed via device_map='auto' at load time.
device = "cuda:0"

def user_prompt(human_prompt):
    """Wrap *human_prompt* in the HUMAN/RESPONSE prompt template.

    The layout matches the one used when prompting the base model, so both
    models are queried in the same format.
    """
    return f"### HUMAN:\n{human_prompt}\n\n### RESPONSE:\n"

# Build a text-generation pipeline around the merged (fine-tuned) model,
# using the same generation settings as the base-model test so the two
# outputs are directly comparable.
pipe = pipeline(
    task="text-generation",
    model=merged_model,
    tokenizer=tokenizer,
    max_length=150,
    repetition_penalty=1.15,
    top_p=0.95
    )
# Ask the same question as in the base-model test and print the first
# returned generation.
result = pipe(user_prompt("You are an expert youtuber. Give me some ideas for a youtube title for a video about fine tuning LLM"))
print(result[0]['generated_text'])

In [None]:
# Publish the merged weights AND the tokenizer to the same Hub repository.
# Pushing only the model leaves the repo without tokenizer files, so it could
# not be loaded end-to-end from the Hub.
hub_repo_id = "your_hg_id/name_fine_tuned_model"  # substitute with your own repo id
merged_model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)