<a href="https://colab.research.google.com/github/VishanOberoi/FineTuningForTheGPUPoor/blob/main/Finetuning_using_QLORA_and_PEFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# About
This notebook fine-tunes a causal language model using Parameter-Efficient Fine-Tuning (PEFT) with QLoRA.
The trained adapters are then merged into the base model, and the merged model is pushed to the Hugging Face Hub.

Ensure you have a Hugging Face token with write access, as well as access to any gated models you intend to use (if required).

# Imports

In [1]:
# Install the libraries this notebook relies on. transformers is pinned to 4.31,
# while peft and accelerate are installed unpinned from their GitHub main branches.
!pip install -q -U bitsandbytes
# !pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install transformers==4.31 #temporary fix required owing to breaking changes on Aug 9th 2023
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.31
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
  Attempting uninstall: transformers
    Found existing installation: transformers 4.37.2
    Uninstalling transformers-4.37.2:
      Successfully

In [2]:
# Install the Hub client and log in from the notebook; later cells push to the
# Hub with use_auth_token=True, so the token needs write access.
!pip install huggingface_hub
from huggingface_hub import notebook_login

notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "meta-llama/Llama-2-7b-chat-hf" #Using meta's Llama 7b, change to your model of choice.
# 4-bit quantization settings handed to from_pretrained below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True, #Nested quantization after the first one
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 #The matrix multiplication and training will be faster if one uses a 16-bit compute dtype
)

# Load the tokenizer and the quantized model; device_map={"":0} maps the whole model to device 0.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

A rule of thumb: use double quantization if you have memory problems, use NF4 for higher precision, and use a 16-bit compute dtype for faster fine-tuning.

# Training Setup

PEFT Explained on Huggingface : https://huggingface.co/docs/peft/en/index

We will only fine-tune a small number of (extra) model parameters — significantly decreasing computational and storage costs — while yielding performance comparable to a fully fine-tuned model.

In [4]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then let PEFT prepare the quantized model for training.
# `model` is rebound to the prepared model, so this cell expects the freshly loaded model as input.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Iterates over ``model.named_parameters()`` and sums ``numel()`` for every
    parameter (total) and for those with ``requires_grad`` set (trainable),
    then prints both counts and the trainable percentage.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, e.g. a ``torch.nn.Module``.

    Returns:
        None. The summary is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    # NOTE(review): for 4-bit quantized weights numel() may count packed storage
    # elements rather than logical weights, so "all params" can undercount — confirm.
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [6]:
#Only about 0.24% of the parameters are trainable (see the printed output), reducing training costs as compared to training all of them.
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank 8, alpha 32, applied to the attention projection layers.
config = LoraConfig(
    r=8,
    lora_alpha=32,
    # target_modules=["query_key_value"],
    target_modules=["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj"], #specific to Llama models.
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the model with the adapters and report how many parameters will be trained.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 8388608 || all params: 3508801536 || trainable%: 0.23907331075678143


# Data Setup

In [7]:
# Mount Google Drive (Colab only) so the training file under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
from datasets import load_dataset

# Load the JSONL training file from Drive, then tokenize each record's "question" field in batches.
# NOTE(review): only the "question" field is tokenized; any other field in the JSONL
# (e.g. an answer) is not passed to the tokenizer — confirm this is intended.
data = load_dataset('json', data_files = "/content/drive/MyDrive/Data/TrainingData.jsonl")
data = data.map(lambda samples: tokenizer(samples["question"]), batched=True)

# Training

In [9]:
import transformers

# needed for Llama tokenizer: reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token # </s>

# Short demonstration run: max_steps=10 stops training after 10 steps, logging the loss at every step.
# NOTE(review): fp16=True is set here while the quantization config in the model-loading
# cell uses a bfloat16 compute dtype — confirm the mix is intended.
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    # mlm=False selects causal (next-token) language modelling rather than masked LM.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.9008
2,3.2178
3,3.3253
4,2.9229
5,3.5585
6,3.381
7,3.1326
8,3.4436
9,2.8067
10,2.9397


TrainOutput(global_step=10, training_loss=3.1628900527954102, metrics={'train_runtime': 54.041, 'train_samples_per_second': 0.74, 'train_steps_per_second': 0.185, 'total_flos': 17489883537408.0, 'train_loss': 3.1628900527954102, 'epoch': 0.04})

# Inference

In [10]:
from transformers import TextStreamer
# Re-enable `use_cache` (it was turned off before training) and put the model in evaluation mode.
model.config.use_cache = True
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer)

In [11]:
# Define a stream *without* function calling capabilities. This system prompt is specific to Llama2.
def stream(user_prompt):
    """Generate a reply to ``user_prompt`` and print it to stdout as it is produced.

    The prompt is wrapped in the Llama 2 chat markers ([INST] / <<SYS>>) together
    with a fixed system prompt, tokenized with the notebook-level ``tokenizer``
    and passed to the notebook-level ``model``.

    Args:
        user_prompt: The user's question as a string; surrounding whitespace is stripped.

    Returns:
        The decoded full sequence (the prompt followed by the generated reply)
        with special tokens removed.
    """
    system_prompt = '''As a chatbot for Maitri Lab Grown Diamonds, your primary focus is to
     provide accurate and helpful information about lab grown diamonds and the diamond industry.
      Engage in discussions related to diamonds, including sourcing, types, and care, as well as
      information specific to Maitri Lab Grown Diamonds. When asked about unrelated topics,
      politely redirect the conversation to your areas of expertise or inform the user that
      the topic falls outside your scope. Maintain a professional and neutral tone, respecting
      all viewpoints and ensuring accuracy in your responses.'''

    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    prompt = f"{B_INST} {B_SYS}{system_prompt.strip()}{E_SYS}{user_prompt.strip()} {E_INST}\n\n"

    # Send the inputs to whatever device the model currently lives on instead of a
    # hard-coded "cuda:0", so the function keeps working if `model` is moved or reloaded.
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    # skip_prompt is a boolean flag; the string 'True' only worked because it is truthy.
    streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Despite returning the usual output, the streamer will also print the generated text
    # NOTE(review): top_k / top_p / temperature only take effect when sampling is
    # enabled (e.g. through the model's generation config) — confirm for your model.
    outputs = model.generate(
        **inputs,
        streamer=streamer,
        max_new_tokens=500,
        top_k=40,
        top_p=0.6,
        temperature=0.2,
        length_penalty=1,
        return_dict_in_generate=True,
    )

    generated_text = tokenizer.decode(outputs['sequences'][0], skip_special_tokens=True)
    return generated_text




# Streaming examples

In [12]:
# Example query: the reply is printed while it is generated, and the returned full text is shown as the cell output.
stream('What are Maitri Lab grown diamonds?')

Hello! As a chatbot for Maitri Lab Grown Diamonds, I'm delighted to provide you with information about our unique and sustainable diamond products. Maitri Lab Grown Diamonds are created through a revolutionary process that replicates the natural process of diamond formation, but in a controlled and sustainable manner.

Our lab-grown diamonds are made using advanced technology that recreates the high-pressure and high-temperature conditions found deep within the Earth's crust, where diamonds naturally form. This process allows us to produce diamonds that are chemically, optically, and metaphysically identical to natural diamonds, but with a much lower environmental impact.

At Maitri Lab Grown Diamonds, we are committed to providing our customers with the highest quality diamonds that are ethically sourced and sustainably produced. Our diamonds are certified by leading laboratories and come with a lifetime guarantee, giving you peace of mind and a sense of sustainability.

Whether you'r

"[INST] <<SYS>>\nAs a chatbot for Maitri Lab Grown Diamonds, your primary focus is to\n     provide accurate and helpful information about lab grown diamonds and the diamond industry.\n      Engage in discussions related to diamonds, including sourcing, types, and care, as well as\n      information specific to Maitri Lab Grown Diamonds. When asked about unrelated topics,\n      politely redirect the conversation to your areas of expertise or inform the user that\n      the topic falls outside your scope. Maintain a professional and neutral tone, respecting\n      all viewpoints and ensuring accuracy in your responses.\n<</SYS>>\n\nWhat are Maitri Lab grown diamonds? [/INST]\n\nHello! As a chatbot for Maitri Lab Grown Diamonds, I'm delighted to provide you with information about our unique and sustainable diamond products. Maitri Lab Grown Diamonds are created through a revolutionary process that replicates the natural process of diamond formation, but in a controlled and sustainable m

# Push Model to Hub

In [13]:
# Hugging Face user or organisation used as the prefix of the repository ids below; change to your own.
HF_USER = 'vishanoberoi'

In [14]:
# Extract the last portion of the base_model
base_model_name = model_id.split("/")[-1]

# Define the save and push paths
adapter_model = f"{HF_USER}/{base_model_name}-fine-tuned-adapters"  #adjust 'HF_USER to your HuggingFace organisation
new_model = f"{HF_USER}/{base_model_name}-finetuned" #adjust 'HF_USER to your HuggingFace organisation

In [15]:
# Save the model
model.save_pretrained(adapter_model, push_to_hub=True, use_auth_token=True)

# Push the model to the hub
# model.push_to_hub(adapter_model, use_auth_token=True)

In [16]:
# Reload the full, non-quantized base model in float16 on the CPU so the adapters can be merged into it.
# This needs a high-RAM environment; larger models (e.g. 13B) may require a Colab Pro subscription.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='cpu', trust_remote_code=True, torch_dtype=torch.float16)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
from peft import PeftModel

# Attach the saved adapters (identified by `adapter_model`) to the reloaded base model
model = PeftModel.from_pretrained(
    model,
    adapter_model,
)

In [18]:
# After this call `model` is the merged model returned by merge_and_unload(), replacing the PEFT wrapper.
model = model.merge_and_unload() # merge adapters with the base model. Takes about 40s for 7B.

In [None]:
# Upload the merged model to the `new_model` repository in shards of at most 5GB.
model.push_to_hub(new_model, use_auth_token=True, safe_serialization=True, max_shard_size="5GB") #Click on the link to see the new commit. Set safe_serialization = True to commit as safetensors

In [None]:
from transformers import AutoTokenizer

# Reload the base model's tokenizer and push it to the same repository as the merged model.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.push_to_hub(new_model, use_auth_token=True) #Click on the link to see the new commit

In [None]:
new_model #Repository id of our final fine-tuned model (displayed as the cell output)