### LLaMA Supervised Fine-Tuning on Ground Truth Answers after distillation

This document takes the ground-truth answers of the Kabatubare Medical Dataset and fine-tunes the LLaMA model (previously distilled/fine-tuned on GPT-4o-mini answers) on those answers.

The purpose of this exercise is to test whether further fine-tuning the distilled LLaMA model on ground-truth answers improves its performance on open-ended healthcare question answering.

In [None]:
import os

In [None]:
# Expose only GPU 0 to this process. This cell runs before torch is imported
# further down, so the restriction is already in place at that point.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [None]:
import pandas as pd
import json
import torch
import pickle
from unsloth import FastLanguageModel, is_bfloat16_supported, train_on_responses_only
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
from trl import SFTTrainer

#### Reading the Question and Answer Pairs from Training Dataset Phase 2

In [None]:
ques_list = []
ans_list = []

# Only the training split is read here. Each non-empty line of the JSONL file is
# parsed as one JSON object that must provide a 'question' and an 'answer' key.
with open('phase2_data_kabatubare/train_kabatubare.jsonl', 'rb') as file:
    for line in file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at the end of the
            # file) instead of letting json.loads raise on them.
            continue
        json_object = json.loads(line)
        ques_list.append(json_object['question'])
        # These are the dataset's ground-truth answers used for this phase,
        # not GPT-generated responses.
        ans_list.append(json_object['answer'])

In [None]:
# Assemble the two parallel lists into a table with one row per
# question/answer pair.
groundtruth_data = pd.DataFrame(
    data={
        'question': ques_list,
        'answer': ans_list,
    }
)
groundtruth_data  # last expression of the cell: shown as notebook output

Create the HuggingFace Dataset from Pandas Dataframe

In [None]:
# Convert the pandas frame into a HuggingFace Dataset, then hold out 10% of the
# training data for validation (train_test_split exposes it under the "test" key).
#
# The split is seeded so it is reproducible: the Inference section reads
# dataset['test'] again, and with an unseeded shuffle a kernel restart would
# produce a different held-out set that may contain examples seen in training.
dataset = Dataset.from_pandas(groundtruth_data)
dataset = dataset.train_test_split(test_size=0.1, seed=42)
dataset

### Fine-Tuning Code

#### Loading the model and tokenizer

In [None]:
# Directory of the LoRA adapter saved by the earlier distillation phase;
# this phase continues training from it.
distilled_peft_model_path = "./llama32-sft-peft-kabatubare-phase2-distill"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=distilled_peft_model_path,
    max_seq_length=2048,
    # None = let the library auto-detect the dtype (torch.bfloat16 or torch.float16).
    dtype=None,
    # Quantisation: 8-bit loading is enabled, 4-bit loading is disabled.
    load_in_8bit=True,
    load_in_4bit=False,
    # Adapter-based (LoRA) training rather than full fine-tuning.
    full_finetuning=False,
    device_map="auto",
)

#### Setting up the PEFT settings for the model

https://huggingface.co/blog/damjan-k/rslora\
https://medium.com/@fartypantsham/what-rank-r-and-alpha-to-use-in-lora-in-llm-1b4f025fd133

Since we are already loading the lora_adapter, the following steps are not needed:

In [None]:
# model = FastLanguageModel.get_peft_model(
#     model,
#     r = 64, #max_full_rank=64 by default in FastLanguageModel
#     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
#                       "gate_proj", "up_proj", "down_proj",],
#     lora_alpha = 64, #scaling_factor = lora_alpha/r. If we select lora_alpha = 2 * r then it will multiply the adapter weights by 2 which can be un-ncessary
#     lora_dropout = 0.1,
#     bias = "none",
#     use_gradient_checkpointing = "unsloth",
#     use_rslora = True,
#     loftq_config = None,
# )

#### Forming the chat template

In [None]:
# Define a function to apply the chat template
def format_chat_template(example):
    """Render one question/answer record as a single chat-formatted string.

    Args:
        example: Mapping with a 'question' and an 'answer' entry.

    Returns:
        dict: ``{"text": <rendered conversation>}`` where the conversation is a
        fixed system prompt, the question as the user turn and the answer as
        the assistant turn, rendered (not tokenized) by the module-level
        ``tokenizer``'s chat template.
    """
    system_turn = {
        "role": "system",
        "content": "You are a medical knowledge assistant trained to provide information and guidance on various health-related topics.",
    }
    user_turn = {"role": "user", "content": example['question']}
    assistant_turn = {"role": "assistant", "content": example['answer']}

    conversation = [system_turn, user_turn, assistant_turn]
    return {"text": tokenizer.apply_chat_template(conversation, tokenize=False)}

In [None]:
# Run format_chat_template over every record of the train and test splits;
# its returned {"text": ...} value is added to each record as a "text" column.
dataset_formatted = dataset.map(format_chat_template)

In [None]:
# Show the first chat-formatted training example. The row is indexed first so
# only that one record is read, rather than loading the whole "text" column
# just to display a single entry.
print(dataset_formatted['train'][0]['text'])

#### Initializing the TRL SFTTrainer and related Arguments

In [None]:
# Output directory for this phase's checkpoints (LoRA-based fine-tuning).
peft_model_path = "./llama32-sft-peft-kabatubare-phase3-distill-groundtruth"

training_args = TrainingArguments(
    output_dir=peft_model_path,
    seed=42,
    report_to="none",
    # --- batching ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # gradient_accumulation_steps=4,
    # --- schedule / optimisation ---
    num_train_epochs=3,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_8bit",
    # --- precision: bf16 when the hardware supports it, fp16 otherwise ---
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    # --- evaluation / logging every 50 steps, checkpoint every 400 steps ---
    eval_strategy="steps",
    eval_steps=50,
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="steps",
    save_steps=400,
    # --- reload the checkpoint with the lowest eval_loss once training ends ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    # The 90/10 split produced earlier: "train" for training, "test" for evaluation.
    train_dataset=dataset_formatted["train"],
    eval_dataset=dataset_formatted["test"],
    # Column holding the chat-formatted conversation.
    dataset_text_field="text",
    max_seq_length=2048,
    dataset_num_proc=2,
    # Sequence packing is turned off.
    packing=False,
    # Collator used together with train_on_responses_only() further down.
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
)

In [None]:
# Inspect the training split as held by the trainer, before
# train_on_responses_only is applied in a later cell.
trainer.train_dataset

In [None]:
# Decode the first tokenized training example back to text as a sanity check.
# The row is indexed first so only one record is read instead of the entire
# 'input_ids' column.
print(tokenizer.decode(trainer.train_dataset[0]['input_ids']))

#### Train only on the `Response Part` (the assistant's answer), not on the prompt

In [None]:
# Wrap the trainer so that training targets only the assistant's reply.
# The two marker strings are the chat-template headers that open a user turn
# and an assistant turn in the formatted "text".
trainer = train_on_responses_only(
    trainer,
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
)

In [None]:
# Inspect the training split again now that train_on_responses_only has
# returned the updated trainer.
trainer.train_dataset

In [None]:
# `labels` for the first training example: positions belonging to the assistant
# response carry their token ids, while the remaining (prompt) positions are set
# to -100 so they are excluded from the loss.
# NOTE(review): rows appear to be unpadded at this point; padding is presumably
# added per batch by the data collator (also labelled -100) - confirm.
# The row is indexed first so only one record is read, not the whole column.
trainer.train_dataset[0]['labels']

#### Train the model

In [None]:
# Run the fine-tuning loop with the arguments configured above and keep the
# value returned by train() for later inspection.
trainer_stats = trainer.train()

#### Saving the model and tokenizer

Save only the LoRA adapters, without merging them into the base model.

In [None]:
# Destination directory for the fine-tuned LoRA adapter and its tokenizer.
peft_model_path = "./llama32-sft-peft-kabatubare-phase3-distill-groundtruth"

# Persist the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(peft_model_path)

### Inference

In [None]:
# Directory of the adapter saved by the training section; set again here so the
# inference cells can be run without re-running the training cells.
peft_model_path = "./llama32-sft-peft-kabatubare-phase3-distill-groundtruth" #use for LoRA based fine-tuning

In [None]:
# Reload the fine-tuned adapter for inference. Unlike the training load, no
# quantisation is requested here (both 4-bit and 8-bit loading are disabled).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=peft_model_path,
    max_seq_length=2048,
    # None = let the library auto-detect the dtype (torch.bfloat16 or torch.float16).
    dtype=None,
    load_in_8bit=False,
    load_in_4bit=False,
    full_finetuning=False,
    device_map="auto",
)

In [None]:
# Inspect the held-out split created by train_test_split; the inference cell
# below draws its question from it.
dataset['test']

In [None]:
# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# for idx in range(1,50):

idx = 1

# Fetch the row first so a single example is read once, instead of
# materialising the whole 'question' column twice.
question = dataset['test'][idx]['question']
print(question)

# Same system prompt as used when formatting the training data; the assistant
# turn is omitted and requested via add_generation_prompt instead.
messages = [{"role": "system", "content": "You are a medical knowledge assistant trained to provide information and guidance on various health-related topics."},
            {"role": "user", "content": question}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# If the rendered template already starts with the BOS token, tokenizing it with
# the default add_special_tokens=True would prepend a second BOS. Only let the
# tokenizer add special tokens when the template did not.
prompt_has_bos = tokenizer.bos_token is not None and prompt.startswith(tokenizer.bos_token)
inputs = tokenizer(
    prompt,
    return_tensors='pt',
    padding=True,
    truncation=True,
    add_special_tokens=not prompt_has_bos,
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=2048, num_return_sequences=1)

# Keep only the newly generated tokens. Slicing at token level is more reliable
# than decoding prompt and output separately and cutting the string by length,
# because the two decoded strings need not align character for character.
prompt_token_count = inputs['input_ids'].shape[1]
resp = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)

print(resp)
print('---------------------------------------------------')