<a href="https://colab.research.google.com/github/goelnikhils-lgtm/languagemodels/blob/main/Code_Generation_Fine_Tuning_LLama3_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Fine Tuning LLama3.2 on Code Generation
# Install the libraries used by the rest of the notebook: `datasets` for data
# loading, `peft`/`trl` for LoRA + supervised fine-tuning, and `bitsandbytes`
# for 4-bit quantization.
# NOTE(review): versions are unpinned, so a fresh run may pick up newer,
# incompatible releases — consider pinning and using `%pip` so the install
# targets the running kernel's environment.
!pip install  datasets
!pip install --upgrade peft trl
!pip install -U bitsandbytes

In [None]:
from huggingface_hub import notebook_login
# Opens the interactive Hugging Face login widget. `notebook_login` has no
# token parameter; the token is typed into the widget, so nothing is passed.
notebook_login()

In [None]:
#import data set - DATA
from datasets import load_dataset
from datasets.arrow_dataset import Dataset

def format_sample(sample):
    """Render one Alpaca-style record as a single Llama-3 chat training string.

    Args:
        sample: mapping with the keys 'instruction', 'input' and 'output'.

    Returns:
        str: a user turn holding the instruction (plus the input when it is
        neither None nor empty), followed by an assistant turn holding the
        expected output and a closing <|eot_id|>.
    """
    instruction = sample['instruction']
    user_input = sample['input']
    answer = sample['output']

    # Pieces shared by both template variants.
    user_header = "<|start_header_id|>user<|end_header_id|>\n\n"
    # <|eot_id|> closes the user turn; the assistant header opens the answer.
    response_tail = (
        "### Response:\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
        f"{answer}<|eot_id|>"
    )

    has_input = not (user_input is None or user_input == "")
    if has_input:
        # NOTE: this variant's whitespace differs from the no-input variant
        # ("task. Write", "\n\n " and " ### Input"); kept exactly as-is.
        task_body = (
            "Below is an instruction that describes a task. Write a response that appropriately completes the task.\n\n "
            f"### Instruction:\n{instruction}\n\n ### Input:\n{user_input}\n\n"
        )
    else:
        task_body = (
            "Below is an instruction that describes a task.Write a response that appropriately completes the task.\n\n"
            f"### Instruction:\n{instruction}\n\n"
        )

    return user_header + task_body + response_tail

# Generator that produces the training data for the model
def gen_train_input():
    """Stream the dataset and yield the training portion in prompt format.

    The first 16,800 rows of the streamed "train" split are used for training
    (the source dataset has about 18.6k rows; the tail is left for validation).

    Yields:
        dict: {"text": <formatted prompt>} for each training row.
    """
    train_limit = 16800
    stream = load_dataset("iamtarun/python_code_instructions_18k_alpaca",streaming=True, split="train")
    for position, row in enumerate(stream):
        # Stop as soon as the training quota is reached.
        if position >= train_limit:
            break
        yield {"text": format_sample(row)}

#function for generating validation data for model
def gen_val_input():
    """Stream the dataset and yield the validation portion in prompt format.

    Skips the first 16,800 rows (the training portion) and yields at most the
    next 1,800 rows.

    Yields:
        dict: {"text": <formatted prompt>} for each validation row.
    """
    ds = load_dataset("iamtarun/python_code_instructions_18k_alpaca",streaming=True, split="train")
    num_train = 16800  # rows reserved for training; skipped here
    num_val = 1800     # upper bound on validation rows
    # enumerate keeps the row index advancing during the yield phase as well,
    # so the upper bound below is actually reachable.
    for index, sample in enumerate(ds):
        if index < num_train:
            continue
        if index >= num_train + num_val:
            break
        yield {"text": format_sample(sample)}

# Build the training dataset from the training generator.
dataset_train = Dataset.from_generator(gen_train_input)
# Build the validation dataset from the validation generator.
dataset_val = Dataset.from_generator(gen_val_input)



In [None]:
# Sanity check: report both split sizes and show the first formatted training record.
print(f"Train dataset size: {len(dataset_train)}")
print(f"Validation dataset size: {len(dataset_val)}")
print(f"Sample train : \n{dataset_train[0]}")


In [None]:
#Model and Tokenizer
import torch
from peft import LoraConfig,AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM , AutoTokenizer , TrainingArguments , BitsAndBytesConfig
from trl import SFTTrainer

# Base checkpoint to fine-tune.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
from google.colab import userdata
# Read the Hugging Face token from the Colab secret store.
access_token = userdata.get('HF_TOKEN')
# Never print the token itself: notebook outputs are saved and shared with the
# file. Only report whether a value was found.
print("access_token loaded:", bool(access_token))

def create_and_prepare_model():
  """Load the 4-bit quantized base model, the LoRA config and the tokenizer.

  Reads the module-level ``model_name`` and ``access_token``.

  Returns:
      tuple: ``(model, peft_config, tokenizer)`` ready to hand to the trainer.
  """
  # QLoRA: load the base model with 4-bit NF4 quantization (with double
  # quantization) and run the compute in bfloat16.
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16)

  model = AutoModelForCausalLM.from_pretrained(
      model_name,
      quantization_config=bnb_config,
      device_map="auto",
      token= access_token
  )
  # LoRA adapters on the attention query/key/value projections only.
  peft_config = LoraConfig(
      lora_alpha=16,
      lora_dropout=0.05, # 5% dropout on the LoRA path for regularization
      r=8,
      bias="none",
      task_type="CAUSAL_LM",
      target_modules=["q_proj","k_proj","v_proj"]
  )

  # Pass the same token as for the model so the tokenizer download does not
  # depend on a previously cached login.
  tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
  # Use a dedicated pad token and pad on the right.
  tokenizer.pad_token = "<|finetune_right_pad_id|>"
  tokenizer.padding_side = "right"
  return model , peft_config , tokenizer
model , peft_config , tokenizer = create_and_prepare_model()

In [None]:
#fine tuning
from trl import SFTConfig , SFTTrainer
# Supervised fine-tuning configuration, grouped by concern.
args = SFTConfig(
    # --- output, logging, checkpoints ---
    output_dir = "./llama32_finetuned_code_generation-python",
    report_to = "tensorboard",
    logging_steps=50,
    save_strategy = "epoch",
    # --- data ---
    dataset_text_field="text",  # column produced by the dataset generators
    # --- batch sizes (16 x 8 accumulation steps = 128 samples per update per device) ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=8,
    # --- memory ---
    gradient_checkpointing=True,  # trade compute for memory
    gradient_checkpointing_kwargs={"use_reentrant": False},
    bf16=True,  # bfloat16 mixed precision (author's note: more stable than fp16)
    # --- optimization schedule ---
    num_train_epochs=3,
    optim = "adamw_torch_fused",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    # --- evaluation ---
    eval_strategy = "steps",
    eval_steps = 50,
)

# Wire the model, LoRA config, tokenizer and both datasets into the trainer.
trainer = SFTTrainer(
    model=model,
    args=args,
    peft_config=peft_config,
    processing_class=tokenizer,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
)
trainer.train()

In [None]:
# Save the model weights (as a PyTorch state dict) to a local file.
model_file_name = "llama32_finetuned_code_generation-python.pth"
# `state_dict()` is a method on the module; it returns the name -> tensor mapping.
torch.save(model.state_dict(),model_file_name)
print(f"Model saved to {model_file_name}")

# Free GPU memory: drop the references, then release PyTorch's cached blocks.
del model
del trainer
torch.cuda.empty_cache()

In [None]:
#load fine tuned model and run inference
from peft import PeftModel , LoraConfig
from transformers import AutoModelForCausalLM , AutoTokenizer , BitsAndBytesConfig
import torch

def load_fine_tune_model(base_model_id,saved_weights,token=None):
  """Rebuild the LoRA-wrapped model and load the fine-tuned adapter weights.

  Args:
      base_model_id: Hugging Face id of the base checkpoint.
      saved_weights: path to a state dict written with ``torch.save``. Keys may
          be those of the PEFT wrapper (``base_model.model.…``) or of the
          wrapped model (no ``base_model.`` prefix); both are accepted.
      token: optional Hugging Face access token for gated checkpoints.

  Returns:
      tuple: ``(lora_model, tokenizer)`` with the model in eval mode.

  Raises:
      ValueError: if the file contains no LoRA tensors.
      RuntimeError: if LoRA tensors from the file do not match the model.
  """
  tokenizer = AutoTokenizer.from_pretrained(base_model_id, token=token)

  # Same 4-bit settings as used for training, so the adapter sits on top of
  # the same quantized base weights.
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16)
  # device_map="auto" already places the model; no extra `.to(device)` needed.
  base_model = AutoModelForCausalLM.from_pretrained(
      base_model_id,
      quantization_config=bnb_config,
      device_map="auto",
      torch_dtype=torch.bfloat16,
      token=token
  )

  #create LoRA Config - make sure these parameters match your training configuration
  peft_config = LoraConfig(
      lora_alpha=16,
      lora_dropout=0.05,
      r=8,
      bias="none",
      task_type="CAUSAL_LM",
      target_modules=["q_proj","k_proj","v_proj"]
  )
  lora_model = PeftModel(base_model,peft_config)

  # NOTE: torch.load unpickles the file; only load checkpoints you created.
  # Load to CPU; load_state_dict copies each tensor onto the parameter's device.
  state_dict = torch.load(saved_weights , map_location="cpu")

  # Keep only the adapter tensors (the base weights come from the hub) and
  # normalise their names to the wrapper's "base_model.model." namespace.
  lora_state_dict = {}
  for key, value in state_dict.items():
    if "lora_" not in key:
      continue
    new_key = key if key.startswith("base_model.") else f"base_model.model.{key}"
    lora_state_dict[new_key] = value
  if not lora_state_dict:
    raise ValueError(f"No LoRA weights found in {saved_weights!r}")

  # strict=False because only the adapter subset is provided; verify that every
  # adapter tensor found a home and that no adapter parameter was left unset.
  result = lora_model.load_state_dict(lora_state_dict,strict=False)
  missing_lora = [k for k in result.missing_keys if "lora_" in k]
  if result.unexpected_keys or missing_lora:
    raise RuntimeError(
        "LoRA weights do not match the model: "
        f"{len(result.unexpected_keys)} unexpected, {len(missing_lora)} missing"
    )

  lora_model.eval()
  return lora_model , tokenizer

# Base checkpoint id and the file holding the fine-tuned weights.
base_model_id = "meta-llama/Llama-3.2-1B-Instruct"
lora_weights = "llama32_finetuned_code_generation-python.pth"

# Rebuild the fine-tuned model and its tokenizer.
print("Loading model")
model_ft , tokenizer = load_fine_tune_model(base_model_id,lora_weights)

# Count all parameters and the subset that still requires gradients in one pass.
total_params = 0
trainable_params = 0
for param in model_ft.parameters():
  size = param.numel()
  total_params += size
  if param.requires_grad:
    trainable_params += size

print(f"Total parameters in the model: {total_params}")
print(f"Trainable parameters in the model: {trainable_params}")

In [None]:
def generate_prompt(model,prompt,tokenizer,max_new_tokens, context_size=512,temperature = 0.0 , top_k=1, eos_id=(128001,128009)):
  """Generate a completion for `prompt` token by token.

  The raw prompt is wrapped in the no-input instruction template, encoded, and
  extended one token at a time until an end-of-sequence id is produced or
  `max_new_tokens` tokens have been generated.

  Args:
      model: causal LM; called as ``model(input_ids=..., use_cache=False)`` and
          expected to return an object with a ``.logits`` attribute.
      prompt: raw user instruction text.
      tokenizer: object providing ``encode(str)`` and ``decode(ids)``.
      max_new_tokens: maximum number of tokens to generate.
      context_size: only the last `context_size` tokens are fed to the model.
      temperature: 0.0 selects the arg-max token; values > 0 sample from the
          temperature-scaled distribution.
      top_k: keep only the `top_k` highest logits before choosing a token;
          ``None`` or 0 disables the filter.
      eos_id: token ids that end generation (defaults are the Llama 3 ids for
          <|end_of_text|> and <|eot_id|>).

  Returns:
      str: the decoded newly generated tokens (the prompt is excluded).
  """
  model_device = next(model.parameters()).device

  # Raw user text is wrapped in the instruction template before encoding.
  formatted_prompt = (
      f"<|start_header_id|>user<|end_header_id|>\n\n"
      f"Below is an instruction that describes a task.Write a response that appropriately completes the task.\n\n"
      f"### Instruction:\n{prompt}\n\n"
      f"### Response:\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" # model generates the response
  )
  # Encode and add a batch dimension: shape (1, num_tokens).
  idx = tokenizer.encode(formatted_prompt)
  idx = torch.tensor(idx,dtype=torch.long, device = model_device).unsqueeze(0)
  num_tokens = idx.shape[1]

  for _ in range(max_new_tokens):
    # Crop to the most recent `context_size` tokens.
    idx_cond = idx if idx.size(1) <= context_size else idx[:,-context_size:]
    with torch.no_grad():
      outputs = model(input_ids=idx_cond,use_cache = False)
    # Logits of the last position only; float32 keeps softmax/sampling stable.
    logits = outputs.logits[:, -1, :].float()

    # Top-k filtering: everything below the k-th largest logit becomes -inf.
    if top_k is not None and top_k > 0:
      k = min(top_k, logits.size(-1))
      top_logits, _ = torch.topk(logits,k)
      min_val = top_logits[:,[-1]]
      neg_inf = torch.tensor(float("-inf"), device = logits.device , dtype = logits.dtype)
      logits = torch.where(logits < min_val, neg_inf , logits)

    if temperature > 0.0:
      # Temperature < 1 sharpens, > 1 flattens the distribution before sampling.
      probs = torch.softmax(logits / temperature, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)
    else:
      idx_next = torch.argmax(logits, dim=-1, keepdim=True)

    # `.item()` assumes a batch of exactly one sequence.
    if idx_next.item() in eos_id:
      break
    idx = torch.cat((idx,idx_next), dim=1)

  # Drop the prompt tokens and decode only what was generated.
  generated_ids = idx.squeeze(0)[num_tokens:]
  generated_text = tokenizer.decode(generated_ids)
  return generated_text

In [None]:
# Smoke test: ask the fine-tuned model for a small coding task using the
# default decoding settings (temperature 0.0, i.e. arg-max selection).
prompt = ("Write a function to compute preorder traversal of a tree")
print(generate_prompt(model_ft,prompt,tokenizer,max_new_tokens=512))