In [1]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Using PEFT (parameter efficient fine-tuning technique) with huggingface to fine-tune Gemma model
 
<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/google-gemini/gemma-cookbook/blob/main/Gemma/%5BGemma_2%5DFinetune_with_LORA.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  
  <td style="text-align: center">
    <a href="https://github.com/google-gemini/gemma-cookbook/blob/main/Gemma/%5BGemma_2%5DFinetune_with_LORA.ipynb">
      <img width="32px" src="https://raw.githubusercontent.com/primer/octicons/refs/heads/main/icons/mark-github-24.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

| Author(s) |
| --- |
| [Shivam Ghuge](https://github.com/Shiv-am-04) |

## Overview

This notebook demonstrates how to fine-tune LLM and SLM using the PEFT (parameter efficient finetuning technique) which is LORA (Low Rank Adaptation) in our case.

### Objective

The Goal is to use fine-tune the model in the environment where we have less compute resources like smaller GPUs, less RAM and less storage. We are fine-tuning google's open source gemma2 model using LORA technique.

**We will cover the following steps:**

1. ***Loading Model*** : We are using huggingface to load the model in the notebook using 4-bit quantization, which leads to a smaller  model size, lower memory usage, faster inference speed, and reduced energy consumption.

2. ***Configure BitsAnsBytes*** : Using bitsandbytes config to load the model from huggingface in 4-bit.

3. ***Prepare the Dataset*** : Download the SQl dataset from huggingface and convert it to Huggingface Dataset.

4. ***Perform fine-tuning*** : Using LORA to do the fine-tuning of the model on the dataset

5. ***Deploy*** : Push the model to the huggingface hub from where we can use it.


#### ***Install PEFT (parameter efficient fine tuning), bitsandbytes and other required packages***


In [None]:
%pip install peft bitsandbytes transformers accelerate datasets trl google

In [None]:
# import tensorflow
import torch
from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig,TrainingArguments,logging
from trl import SFTTrainer
from peft import LoraConfig

#### ***BitsAndBytes Configuration***


In [3]:
### bitsandbytes parameters ###

# The bitsandbytes library is a lightweight Python wrapper around CUDA custom functions, particularly designed for 8-bit optimizers,matrix multiplication (LLM.int8()), and 8-bit and 4-bit quantization functions

bnb4bit_compute_dtype = 'float16'

# Quantization type (fp4 or nf4)
# fp4 : A standard, 4-bit floating-point format that uses a 1-bit sign, a 2-bit exponent, and a 1-bit mantissa.
# nf4 : Same as fp4 but it is normalized 4-bit and optimized for normally distributed data like the weights in large language model.
#       This makes it more efficient for training and inference of LLM models.
bnb4bit_quant_type = 'nf4'

use_nested_quant = False

In [4]:
# fetch the value of bnb4bit_compute_dtype from the torch module.

compute_dtype = getattr(torch,bnb4bit_compute_dtype)

# getattr is a built-in Python function that retrieves an attribute from an object.

In [5]:
bitsAndbytes_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=compute_dtype,
                                         bnb_4bit_quant_type=bnb4bit_quant_type,
                                         bnb_4bit_use_double_quant=False,
                                        )

#### ***Loading gemma-2-2b model from huggingface***

In [6]:
from google.colab import userdata

access_token = userdata.get('HF_TOKEN')

In [None]:
model_name = 'google/gemma-2-2b'

tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

In [None]:
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             quantization_config=bitsAndbytes_config,
                                             device_map='auto',                                   # device_map is where to load the entire model (0:gpu,'auto':whichever available)
                                             attn_implementation = 'eager',                       # type of self-attention technique
                                             token=access_token)


# Disables the use of caching during model inference.
model.config.use_cache = False
# Caching stores intermediate results to speed up future computations. Turning it off might be necessary if caching leads to high memory consumption
# or isn't beneficial for our task.

# Sets the degree of tensor parallelism for pretraining.
model.config.pretraining_tp = 1
# Tensor parallelism splits the model tensors across multiple devices (e.g., GPUs) to speed up training. A value of 1 means no tensor splitting

In [9]:
print(f"{model.get_memory_footprint()/1e9:,.1f} GB")

2.2 GB


In [None]:
# help(AutoModelForCausalLM)

***Generating before fine-tuning***

In [None]:
question = 'there is a table name Employee containing two columns employee_id and salary. Give me only sql query to fetch the highest and lowest salary along with employee id'
device = 'cuda'
input_ = tokenizer.encode(question,return_tensors='pt').to(device)
response = model.generate(input_).to('cuda')

In [None]:
response = tokenizer.decode(response[0],skip_special_tokens=True)
print(response)

there is a table name Employee containing two columns employee_id and salary. Give me only sql query to fetch the highest and lowest salary along with employee id.

<code>SELECT MAX(salary) AS max_salary, MIN(salary) AS min_


***PEFT***

***Parameter-Efficient Fine-Tuning, is a technique used to adapt pre-trained language models (LLMs) for specific tasks by only training a small subset of the model's parameters. This is a much more efficient and less resource-intensive alternative to traditional fine-tuning, which would update every parameter in a large model.***

***By freezing most of the original model's weights and training a small number of new or existing parameters, PEFT methods achieve comparable performance while saving significant computational power and memory.***


#### ***Tuning Phase***

In [None]:
model

In [11]:
Target_modules = ['q_proj','k_proj','v_proj','o_proj']

In [12]:
### QLORA hyperparameters ###

lora_learning_rate = 1e-4
lora_rank = 8
lora_dropout = 0.2
lora_alpha = 16               # double of lora rank

# even using QLORA lora config is required because LORA low rank optimization is applied after quantization and alpha should be double the rank

In [None]:
peft_config = LoraConfig(r=lora_rank,
              lora_alpha=lora_alpha,
              lora_dropout=lora_dropout,                       # A regularization technique used during training to prevent overfitting of the small, trainable LoRA matrices.
              bias='none',
              task_type='CAUSAL_LM',                           # CAUSAL_LM are those model that generates text by predicting the next word (or token) in a sequence based only on the words that have come before it
              target_modules=Target_modules)

***Data Preparation***

In [14]:
import pandas as pd

splits = {'train': 'data/train-00000-of-00001-36a24700f19484dc.parquet', 'validation': 'data/validation-00000-of-00001-fa01d04c056ac579.parquet'}
df_train = pd.read_parquet("hf://datasets/lamini/spider_text_to_sql/" + splits["train"])
df_test = pd.read_parquet("hf://datasets/lamini/spider_text_to_sql/" + splits["validation"])

In [None]:
df = pd.merge(df_train,df_test,how ='outer')

In [16]:
def remove(row):
  return row.split('\n\n')[-1].replace('[/INST]','')

In [17]:
df['input'] = df['input'].apply(remove)

In [18]:
data = []
for txt,query in zip(df['input'],df['output']):
  template = f"<question> {txt.split(':')[-1]} , <code> {query}"
  data.append(template)

In [19]:
df.shape

(8034, 2)

In [20]:
len(data)

8034

In [21]:
# we are only training on 2000 for quick training

data_for_training = data[:2000]

In [None]:
data_for_training

In [23]:
from datasets import Dataset
import pandas as pd

pd_data = pd.DataFrame(data_for_training,columns=['text'])
hf_dataset = Dataset.from_pandas(pd_data)

In [24]:
hf_dataset

Dataset({
    features: ['text'],
    num_rows: 2000
})

***Training Phase***

In [26]:
### training configuration ###

output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Batch size per GPU for training
train_batch_size_perGPU = 1

# Batch size per GPU for evaluation
eval_batch_size_perGPU = 1

# Number of update steps to accumulate the gradients for if our setup can manage it, keeping it simple with 1 works fine
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Optimizer to use
optimizer_ = "paged_adamw_32bit"

# learning rate (AdamW optimizer), lower learning rates tend to provide more stable and gradual learning.
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate) (optional)
warmup_ratio = 0.03

In [27]:
training_args = TrainingArguments(output_dir=output_dir,
                                  num_train_epochs=num_train_epochs,
                                  per_device_train_batch_size=train_batch_size_perGPU,
                                  per_device_eval_batch_size=eval_batch_size_perGPU,
                                  gradient_accumulation_steps=gradient_accumulation_steps,
                                  optim=optimizer_,
                                  save_steps=0,
                                  logging_steps=25,
                                  learning_rate=learning_rate,
                                  weight_decay=weight_decay,
                                  fp16=False,
                                  bf16=True,
                                  max_grad_norm=max_grad_norm,
                                  max_steps=max_steps,
                                  # warmup_ratio=warmup_ratio,
                                  group_by_length=True,                     # Group sequences into batches with same length
                                  lr_scheduler_type=lr_scheduler_type,
                                  report_to="tensorboard"
                                  )

In [None]:
trainer = SFTTrainer(model=model,
                     args=training_args,
                     peft_config=peft_config,
                     train_dataset=hf_dataset,
                     processing_class=tokenizer,
                    )

In [30]:
torch.cuda.empty_cache()

In [31]:
import gc

gc.collect()

148

In [32]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 1}.


Step,Training Loss
25,1.5819
50,1.2245
75,0.9672
100,1.1548
125,0.9415
150,1.0781
175,0.9815
200,1.0258
225,0.897
250,1.017


TrainOutput(global_step=2000, training_loss=0.8115992393493653, metrics={'train_runtime': 1502.325, 'train_samples_per_second': 1.331, 'train_steps_per_second': 1.331, 'total_flos': 1577734916802048.0, 'train_loss': 0.8115992393493653, 'epoch': 1.0})

In [33]:
%load_ext tensorboard
%tensorboard --logdir results/runs

<IPython.core.display.Javascript object>

In [34]:
# save model to the local folder

trainer.model.save_pretrained('finetuned_model')

In [35]:
del model
del trainer
gc.collect()
gc.collect()

0

#### ***Merging Weights of Lora Config with Base model and Pushing to huggingfacehub models***

In [None]:
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map='auto',
)

In [37]:
model = PeftModel.from_pretrained(base_model,r'/content/finetuned_model')           # This path is only for google colab
model = model.merge_and_unload()

# reloading tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

In [None]:
import locale

locale.preferred_encoding = lambda: "UTF-8"

In [None]:
name = "shiv-am-04/gemma2-2b-SQL"

! huggingface-cli login

model.push_to_hub(name, check_pr=True)

tokenizer.push_to_hub(name,check_pr=True)