<a href="https://colab.research.google.com/github/goelnikhils-lgtm/languagemodels/blob/main/Finetuning_using_Unsloth_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#SOURCE CODE CREDIT - TRELIS RESEARCH - https://www.youtube.com/watch?v=Ik6nbAjxLk4
#This notebook also uses an LLM-as-a-Judge.
#The workflow: evaluate the model with the LLM-as-a-Judge before fine-tuning, then evaluate it again after fine-tuning.
#Fine-tune on responses only - this is key: the instruction tokens are masked so they do not contribute to the loss.

In [None]:
#vLLM Evalaution + UnSloth Training MS PHI4-Instruct Model on Q&A for Virtual Assitant
#latest version
!pip install uv
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers}
!uv pip install datasets tensorboard openai hf_transfer accelerate pillow -qU --system
!uv pip install scikit-learn pymupdf -qU --system
!uv pip install google.generativeai -qU --system
!uv pip install flashinfer-python --system -qU
!uv pip install vllm
!pip install numpy
!pip install scipy


In [None]:
# Log in to the Hugging Face Hub only when no token is already available.
# get_token() is the supported top-level accessor; the HfFolder class used
# previously is a deprecated way to read the stored token.
from huggingface_hub import get_token, login
if get_token() is None:
  login()

In [None]:
# Model and dataset identifiers used by the evaluation cells.
model_slug ="microsoft/Phi-4-mini-instruct"
dataset_name = "Trelis/touch-rugby-comprehensive-qa"
train_split_name = "train"
eval_split_name = "eval"

# Dataset column names: question, judge criteria, reference answer.
q_column ="question"
c_column = "evaluation_criteria"
c_coumn = c_column  # backward-compatible alias for the original misspelled name
a_column = "answer"

In [None]:
# LLM-as-a-Judge settings and process-wide environment switches.
import os

# Judge model name plus the generation settings shared by later cells.
judge_model = "gemini-2.0-flash"
max_seq_length = 8192  # long context length, needed for reasoning-style outputs
temperature = 0.01

# Environment variables read by the judge provider, Unsloth and the HF Hub.
os.environ.update(
    {
        "PROVIDER": "gemini",
        "UNSLOTH_COMPILE_DISABLE": "1",
        "HF_HUB_ENABLE_HF_TRANSFER": "1",
        "HF_HOME": "./",
    }
)

#setup the client for the judge
from __future__ import annotations
import getpass
from pathlib import Path
from typing import List,Dict,Any
from openai import OpenAI
from google.colab import userdata # Import userdata to access Colab secrets

# Read the Gemini API key from Colab secrets and fail early if it is missing.
# NOTE(review): if userdata.get raises for a missing secret instead of
# returning None, the check below never fires - confirm against the Colab API.
api_key = userdata.get("GEMINI_API_KEY")
if api_key is None:
  raise ValueError("GEMINI_API_KEY not found in Colab secrets. Please add it to the secrets manager.")

# NOTE(review): model_name is read from the environment here, but the judge
# calls in this notebook pass judge_model instead - confirm which is intended.
model_name = os.getenv("GEMINI_MODEL_NAME","gemini-2.0-flash")

# Endpoint for the judge; can be overridden through GEMINI_BASE_URL.
base_url = os.getenv(
    "GEMINI_BASE_URL",
    "https://generativelanguage.googleapis.com/v1beta/openai",
)
client = OpenAI(api_key=api_key,base_url=base_url) # OpenAI client pointed at base_url, so it talks to that endpoint

#unified chat wrapper
def chat(messages: List[Dict[str,str]],**kwargs:Any) ->str:
  """Send one chat-completion request to the judge model and return its text.

  Args:
    messages: chat messages, each a dict with string keys and values.
    **kwargs: extra keyword arguments forwarded unchanged to
      ``client.chat.completions.create``.

  Returns:
    The message content of the first (and only requested) choice.
  """
  completion = client.chat.completions.create(model=judge_model, n=1, messages=messages, **kwargs)
  first_choice = completion.choices[0]
  return first_choice.message.content

In [None]:
# Prepare the dataset
import json
from datasets import load_dataset

# Load every split of the dataset.
ds_dict = load_dataset(dataset_name)
# list(...) call, not list[...]: subscripting the type printed a generic alias
# instead of the split names.
print("Splits found",list(ds_dict.keys()))

train_data = ds_dict[train_split_name]
eval_data = ds_dict[eval_split_name]
print("Train data size",len(train_data))
print("Eval data size",len(eval_data))
print(train_data)
print(eval_data)

In [None]:
#!uv pip install transformers -qU --system
#!pip install --upgrade transformers


In [None]:
# Load the base (not yet fine-tuned) model into vLLM for evaluation.
#!uv pip install vllm
import json
import re
from vllm import LLM, SamplingParams
import torch

# Sampling settings used when generating answers for the judge.
sampling_params = SamplingParams(
    temperature = temperature, # min_p sampling mainly matters at high temperature, where it balances coherence and diversity
    top_k = 40,
    top_p= 0.95, # tokens in the low-probability tail are not sampled
    min_p=0.1,  # balances text coherence and creativity; mostly relevant at high temperature
    max_tokens = 6000 # maximum number of generated tokens
)
# Cap the engine context at the configured max_seq_length. It was defined for
# this purpose but never passed, so vLLM fell back to the model's own maximum
# context length when sizing the KV cache.
model = LLM(
    model=model_slug,
    gpu_memory_utilization=0.9,
    dtype="bfloat16",
    max_model_len=max_seq_length,
)

In [None]:
#evaluate loaded model . this model is not yet fine tuned
import re
from pydantic import BaseModel
# System prompt for the judge: explain how the answer fares against the
# criteria, then finish with a "Score:" line carrying 1 or 0.
# Surrounding blank lines are removed by .strip().
SYSTEM_MESSAGE ="""
You are an expert evaluator tasked with determining if an answer satisfies specified evaluation criteria.

You will receive
1.A question
2.The evaluation criteria
3.The model's answer to evaluate

First, explain how well the answer meets each of the specified criteria.
Then provide your scores.
-Score 1 - if the answer fully satisfies ALL the specified criteria
-Score 0 - if it fails to meet ANY of the criteria

Format of your response as:
[Your detailed explanation of how the answer meets of fails the criteria]
Score:[Just the number 1 or 0 , with no additional text]
""".strip()

# User prompt for the judge; filled via str.format with the keys
# question, evaluation_criteria and model_answer.
PROMPT_TEMPLATE = """
Question:{question}
Evaluation Criteria:{evaluation_criteria}
Model Answer:{model_answer}
""".strip()

#helper function to call the judge parse it reply
class EvalautionResult(BaseModel):
  """Structured form of a judge verdict: an explanation plus a pass/fail flag.

  NOTE(review): nothing in the visible notebook instantiates this model, and
  the class name is misspelled ("Evalaution"); it is kept as-is so existing
  references keep working.
  """
  reason:str # the judge's explanation for its verdict
  is_correct:bool # True when the answer was judged correct, False otherwise
# Extracts the judge's verdict digit from a "Score:" line. Optional square
# brackets are accepted because the prompt itself shows the format "Score:[...]"
# and a judge may echo them back.
SCORE_RE = re.compile(r"Score:\s*\[?\s*([01])\s*\]?",re.IGNORECASE | re.MULTILINE)
# Matches a complete <think> ... </think> block (any case, spanning lines) so
# reasoning can be stripped before the answer is judged. The earlier pattern
# only matched the literal text "<think(...)>" and never a real think block.
THINK_RE = re.compile(r"<\s*think\s*>.*?<\s*/\s*think\s*>",re.IGNORECASE | re.DOTALL)

#evaluate function takes a generated answer from the model , ground truth and evaluatues the generated answer vs ground truth
def evaluate_answer(question:str,evaluation_criteria:str,ground_truth:str | None , generated_answer:str , *,strip_reasoning: bool =True)-> tuple[bool,str]:
  """Ask the judge model whether ``generated_answer`` meets ``evaluation_criteria``.

  Args:
    question: the question that was posed to the model under evaluation.
    evaluation_criteria: the criteria the judge must check the answer against.
    ground_truth: reference answer; accepted for interface compatibility but
      not sent to the judge (the prompt template has no slot for it).
    generated_answer: the answer produced by the model under evaluation.
    strip_reasoning: when True, everything matched by THINK_RE is removed from
      ``generated_answer`` before it is sent to the judge.

  Returns:
    ``(is_correct, reason)``. ``reason`` is the judge's text preceding the
    score line. If no score can be parsed, the answer counts as incorrect and
    ``reason`` carries the raw judge reply, so the return value is always a
    2-tuple that callers can unpack.
  """
  if strip_reasoning:
    generated_answer = THINK_RE.sub("",generated_answer).strip()
  judge_prompt = PROMPT_TEMPLATE.format(question=question,evaluation_criteria=evaluation_criteria,model_answer=generated_answer)

  # Call the judge LLM deterministically (temperature 0, no nucleus cut-off).
  completion = client.chat.completions.create(
      model=judge_model,
      messages=[
      {"role":"system", "content":SYSTEM_MESSAGE},
      {"role":"user", "content":judge_prompt}
      ],
      temperature=0.0,
      top_p=1.0,
      max_tokens=1024,
      )
  # Fall back to an empty string so a reply without text content cannot crash
  # the regex search below.
  judge_reply = completion.choices[0].message.content or ""

  # Parse the judge's reply.
  m = SCORE_RE.search(judge_reply)
  if not m:
    # Previously a bare False was returned here, which broke tuple unpacking.
    return False, f"Judge reply contained no parsable score: {judge_reply!r}"
  score = int(m.group(1))
  reason = judge_reply[:m.start()].strip()
  return bool(score),reason

  #To evaluate a model end to end: send the question as a user-role message, generate an answer, and pass that answer to evaluate_answer - that is the full loop.
  #That loop is not coded here.

In [None]:
#We want to evaluate a batch of questions using vLLM, so we use the ThreadPoolExecutor class and run parallel threads.
#SET UP EVALUATION WITH BATCHING OF ANSWER GENERATION AND THREADING FOR JUDGING
#Make parallel calls to the Gemini API - LLM-As-a-Judge

In [None]:
# Log in to the Hugging Face Hub only when no token is already available.
# get_token() is the supported top-level accessor; the HfFolder class used
# previously is a deprecated way to read the stored token.
from huggingface_hub import get_token, login
if get_token() is None:
  login()

# Model-loading settings for fine-tuning.
model_slug ="microsoft/Phi-4-mini-instruct"
max_seq_length = 8192
dtype = None
load_in_4bit = False
load_in_8bit = False

# Dataset identifiers.
dataset_name = "Trelis/touch-rugby-comprehensive-qa"
train_split_name = "train"
eval_split_name = "eval"

# Dataset column names: question, judge criteria, reference answer.
q_column ="question"
c_column = "evaluation_criteria"
c_coumn = c_column  # backward-compatible alias for the original misspelled name
a_column = "answer"

In [None]:
# Fine-tuning: load the base model and tokenizer through Unsloth using the
# settings from the configuration cell (model_slug, max_seq_length, dtype,
# load_in_4bit, load_in_8bit).
from unsloth import FastLanguageModel
import torch
import gc, inspect , sys

# NOTE: `model` is rebound here; an earlier cell assigns the vLLM engine to the
# same name.
model , tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_slug,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    load_in_8bit = load_in_8bit,
)
print(tokenizer.padding_side) # shows which side the tokenizer pads on (the model's default)

In [None]:
# LoRA hyperparameters: adapter rank and alpha scaling numerator.
rank = 32
lora_alpha = 50
# Wrap the loaded model with LoRA adapters; `model` is rebound to the result.
# NOTE(review): finetune_attention_modules and full_finetuning may not be
# parameters FastLanguageModel.get_peft_model acts on (target_modules is the
# usual way to pick layers there) - confirm against the installed Unsloth.
model = FastLanguageModel.get_peft_model(
    model,
    r = rank,
    finetune_attention_modules = True,
    lora_alpha = lora_alpha,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    full_finetuning = False,
    random_state = 3407,
    use_rslora= True, # presumably rank-stabilised scaling (alpha / sqrt(r)) - TODO confirm
    )

In [None]:
# Sanity check after attaching the adapters: print the trainable-parameter summary.
model.print_trainable_parameters()

In [None]:
# Load the fine-tuning data. Split and column names come from the
# configuration variables rather than repeated string literals, so changing
# the config is enough to switch datasets.
from datasets import load_dataset
ft_data = load_dataset(dataset_name)
ft_train_data = ft_data[train_split_name]
ft_eval_data = ft_data[eval_split_name]
print("Train data size",len(ft_train_data))
print("Eval data size",len(ft_eval_data))
print(ft_train_data)
print(ft_eval_data)
# Index the row first so only one example is read, not the whole column.
print(ft_train_data[0][q_column])
print(ft_eval_data[0][q_column])

In [None]:
# Formatting the dataset
def formatting_func(batch):
  """Render a batch of question/answer rows into chat-template strings.

  Args:
    batch: mapping of column name to a list of values; the ``q_column`` and
      ``a_column`` entries are read pairwise.

  Returns:
    A list with one formatted conversation string per row (empty for an
    empty batch).
  """
  texts = []
  for user_content, assistant_content in zip(batch[q_column], batch[a_column]):
    messages =[
        {"role":"user", "content":user_content},
        {"role":"assistant", "content":assistant_content}
    ]
    text = tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=False)
    # Drop a leading literal "<bos>" marker if the template emitted one.
    if text.startswith("<bos>"):
      text = text[len("<bos>"):]
    texts.append(text)
  # The per-row debug prints were removed: they flooded the output for every
  # example, and the trailing print raised NameError on an empty batch.
  return texts

In [None]:
# Preview one formatted example. Slicing keeps the column -> list layout that
# formatting_func expects while formatting a single row, not the whole split.
print(formatting_func(ft_eval_data[:1])[0])

In [None]:
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments , DataCollatorForSeq2Seq
from transformers import get_scheduler
from datetime import datetime

import math

per_device_train_batch_size = 4
gradient_accumlation_steps = 8 # gradient accumulation steps
# A smaller model can fit a larger batch size in memory.
# Effective (virtual) batch size is 4 * 8 = 32.
epochs = 2 # one epoch at a constant rate, then one epoch of linear decay (annealing)
learning_rate = 1e-4

# Schedule lengths, in optimizer steps.
# Round the steps per epoch UP and keep at least one: floor division dropped
# the final partial batch, so the custom schedule could hit LR 0 before
# training finished (and was 0 throughout for a dataset smaller than one
# effective batch).
# NOTE(review): assumes a single device - confirm if run multi-GPU.
effective_batch_size = per_device_train_batch_size * gradient_accumlation_steps
steps_per_epoch = max(1, math.ceil(len(ft_train_data) / effective_batch_size))
total_steps = steps_per_epoch * epochs
warm_up_steps = int(0.01*total_steps) # warm up over the first 1% of steps
anneal_start_step = int(0.5*total_steps) # annealing starts at 50% of total steps


# Initialise the optimizer.
optimizer = torch.optim.AdamW(model.parameters(),lr=learning_rate, weight_decay = 0.01)

from torch.optim.lr_scheduler import LambdaLR
def custom_lr_scheduler(optimizer, num_warmup_steps, num_training_steps, anneal_start_step):
  """Build a LambdaLR with linear warm-up, a constant plateau, then linear decay.

  The learning-rate multiplier is:
    * ``step / max(1, num_warmup_steps)`` while ``step < num_warmup_steps``;
    * ``1.0`` while ``step < anneal_start_step``;
    * afterwards a linear ramp from 1.0 at ``anneal_start_step`` down to 0.0 at
      ``num_training_steps``, clamped so it never goes negative.

  Args:
    optimizer: optimizer whose learning rate is scheduled.
    num_warmup_steps: number of warm-up steps.
    num_training_steps: step at which the multiplier reaches 0.
    anneal_start_step: step at which the decay begins.

  Returns:
    A ``LambdaLR`` wrapping ``optimizer``.
  """
  # Denominators are floored at 1 to avoid division by zero.
  warmup_span = float(max(1, num_warmup_steps))
  decay_span = float(max(1, num_training_steps - anneal_start_step))

  def _multiplier(step):
    if step < num_warmup_steps:
      return float(step) / warmup_span
    if step < anneal_start_step:
      return 1.0
    remaining = float(num_training_steps - step)
    return max(0.0, remaining / decay_span)

  return LambdaLR(optimizer, _multiplier)
# Create the learning-rate scheduler from the step counts computed above
# (warm_up_steps, total_steps, anneal_start_step).
scheduler = custom_lr_scheduler(optimizer,warm_up_steps,total_steps,anneal_start_step)

In [None]:
# Keyword arguments later unpacked into SFTConfig.
from unsloth import is_bfloat16_supported

common_args = dict(
    # batch sizes and training length
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumlation_steps,
    num_train_epochs=epochs,
    # log and evaluate every half of the run, clamped to the range [1, 10] steps
    logging_strategy="steps",
    eval_strategy="steps",
    logging_steps=min(10, max(1, int(0.5 * total_steps))),
    eval_steps=min(10, max(1, int(0.5 * total_steps))),
    # use bf16 where supported, otherwise fall back to fp16
    bf16=is_bfloat16_supported(),
    fp16=not is_bfloat16_supported(),
    report_to="tensorboard",
    seed=3407,
    output_dir="outputs",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs=dict(use_reentrant=True),
    remove_unused_columns=True,
    # dataset preprocessing settings
    dataset_num_proc=1,
    max_seq_length=max_seq_length,
)

In [None]:
#fine tuning
#run_name = f"{run_name}-ft"
#print(f"Setting up for run:{run_name}")

#common_args["run_name"] = "run_name" #model name
#common_args["logging_dir"]*f"./logs/{run_name}" #logging directory

# Build the supervised fine-tuning trainer from the LoRA-wrapped model, the
# train/eval splits, the chat formatting function and the shared arguments.
trainer = SFTTrainer(
    model = model,
    tokenizer= tokenizer,
    train_dataset = ft_train_data,
    eval_dataset = ft_eval_data,
    formatting_func = formatting_func,
    args = SFTConfig(**common_args),
)
# Attach the custom optimizer and scheduler after construction
# (must be done outside the constructor because of Unsloth).
trainer.optimizer = optimizer
trainer.lr_scheduler = scheduler

# Replace the internal builders with functions that hand back the objects set
# above, so the trainer does not create its own and override them.
trainer.create_optimizer = lambda *a , **k:trainer.optimizer
trainer.create_scheduler = lambda *a , **k:trainer.lr_scheduler
# Inspect the processed training set and the chat template; the template shows
# which role tags the response-only masking must match.
print(trainer.train_dataset)
print(tokenizer.chat_template)

In [None]:
#training on completions / responses only and don't use Instructions . This will help model to fine tune better on response and perform better
#applicable for Conversational use cases

from unsloth.chat_templates import train_on_responses_only

# (instruction_tag, response_tag) pairs per chat-template family.
# Each value is a TUPLE: the previous brace-delimited values were set literals,
# whose iteration order is not defined, so unpacking them could swap the two tags.
TEMPLATES = {
    "llama": (
        "<|start_header_id|>user<|end_header_id|>\n\n",
        "<|start_header_id|>assistant<|end_header_id|>\n\n",
    ),
    "gemma": (
        "<start_of_turn>user\n",
        "<start_of_turn>model\n",
    ),
    "qwen": (
        "<|im_start|>user\n",
        "<|im_start|>assistant\n",  # was misspelled "assitant"
    ),
    "mistral": (
        "[INST]",
        "[/INST]",
    ),
    # The response tag was malformed ("<assistant|>", missing the leading "|").
    # Bare role tags are used here; NOTE(review): the Phi-4-mini template
    # appears to emit no newline after a role tag - confirm against the printed
    # tokenizer.chat_template before training.
    "phi": (
        "<|user|>",
        "<|assistant|>",
    ),
}
# Pick the tag pair for the Phi chat format (changed from "mistral" to "phi").
# Relies on each TEMPLATES entry being an ordered (instruction, response) pair.
instruction_tag , response_tag = TEMPLATES["phi"]
# Wrap the trainer so the span from instruction_part up to response_part is
# masked and only the response tokens are trained on.
trainer = train_on_responses_only(
    trainer,
    instruction_part= instruction_tag,
    response_part = response_tag,
)

In [None]:
# Decoding check: print the full first training example, then only the tokens
# that remain unmasked in its labels. Both are printed explicitly because a
# cell displays only its last expression, so the first decode was discarded.
print(tokenizer.decode(trainer.train_dataset[0]["input_ids"]))
# Masked labels are -100; swap them for the pad token, then strip its text so
# only the trained-on (response) part remains.
print(tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[0]["labels"]]).replace(tokenizer.pad_token,""))

In [None]:
# Run fine-tuning; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()