![image](../images/kdd24-logo-small.jpeg)

### Hands-on Tutorial
## Domain-Driven LLM Development: Insights into RAG and Fine-Tuning Practices
### Lab 2.3 (optional) : LLM Fine-Tuning through DPO    
#### Summary: 
This lab focuses on fine-tuning with preference alignment — Direct Preference Optimization (DPO) — applied to the Meta-Llama-3-8B-Instruct SFT model.

- The training dataset is derived from the CUAD contract BONTONSTORESINC_04_20_2018-EX-99.3-AGENCY AGREEMENT.PDF and the evaluation metric computed for the Meta-Llama-3-8B-Instruct SFT model


In [None]:
#!pip install "trl<0.9.0"

In [None]:
import random
import pandas as pd
from datasets import load_dataset

from operator import itemgetter
import warnings
warnings.filterwarnings('ignore')

from datasets import Dataset, load_dataset

In [None]:
import torch
from torch.utils.data import Dataset, random_split

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import AutoModelForSequenceClassification,AutoTokenizer,TrainingArguments
from transformers import TextDataset, DataCollatorForLanguageModeling
import bitsandbytes as bnb

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM

In [None]:
from trl import DPOTrainer  
from trl import create_reference_model
from trl.core import LengthSampler

### Construct training data

In this step, we construct the training dataset from the LLM responses and their feedback scores:

[question, answer, feedback_score]   

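The feedback score can come from human evaluation or AI evaluation. For preference optimization, we need to categorize the answers into "chosen" and "rejected" based on the feedback score. For example, an answer whose feedback score is at or above a threshold (such as 4 out of 5) is "chosen"; otherwise it is "rejected". In this notebook, for each question the highest-scoring answer is used as the chosen response and the lowest-scoring answer as the rejected response, and the pair is kept only if the former is at or above the threshold and the latter is below it. The processed data format is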

[question, chosen_response, rejected_response] 

In [None]:
from datasets import Dataset, load_dataset

def construct_trining_data(df, threshold):
    """Build DPO preference pairs from scored model responses.

    For every distinct prompt, the response with the highest ``eval_score``
    becomes the "chosen" answer and the response with the lowest score
    becomes the "rejected" answer. A prompt contributes a pair only when its
    best score is at or above ``threshold`` and its worst score is below it.

    Args:
        df: DataFrame with ``prompt``, ``response`` and ``eval_score``
            columns. It is not modified.
        threshold: Score cut-off separating chosen from rejected responses.

    Returns:
        A ``(processed_df, processed_dataset)`` tuple: the same rows as a
        pandas DataFrame and as a ``datasets.Dataset``, each with the fields
        ``instruction``, ``chosen_response`` and ``rejected_response``.
    """
    # Imported here so the name cannot be shadowed by torch.utils.data.Dataset,
    # which this notebook also imports at module level.
    from datasets import Dataset

    # Rows without a prompt, response or score cannot be ranked. dropna returns
    # a new frame, so the caller's DataFrame is left untouched, and NaN scores
    # never reach the sort below.
    scored = df.dropna(subset=['prompt', 'response', 'eval_score'])

    rows = []
    for prompt, answers in scored.groupby('prompt'):
        # Stable sort: ties keep their original order, so the first of the
        # lowest-scored rows and the last of the highest-scored rows are used.
        ranked = answers.sort_values('eval_score', kind='mergesort')
        worst = ranked.iloc[0]
        best = ranked.iloc[-1]

        # Keep the pair only when the threshold actually separates the two.
        if best['eval_score'] >= threshold and worst['eval_score'] < threshold:
            rows.append({
                "instruction": prompt,
                "chosen_response": best['response'],
                "rejected_response": worst['response'],
            })

    # Return the pairs both as a datasets.Dataset and as a pandas DataFrame.
    processed_dataset = Dataset.from_list(rows)
    processed_df = processed_dataset.to_pandas()

    return processed_df, processed_dataset

Load the SFT data file generated in the Lab 2.2 notebook

In [None]:
# CSV with the SFT model's responses and their evaluation metrics
SFT_FILE = '../lab-data/sft_trn_result.csv'

# Load the results and discard the two recall metrics that are not used in this lab
df = pd.read_csv(SFT_FILE).drop(
    columns=['token_overlap_recall', 'rouge_l_recall'],
)

In [None]:
# Positional rename: the four names are assigned to the remaining columns in file order.
# NOTE(review): presumes the CSV columns left after the drop above are ordered
# prompt, reference, response, score — confirm against the Lab 2.2 output.
df.columns = ['prompt','reference','response','eval_score']  # rename the columns
df

Set the threshold to categorize chosen and rejected responses, then generate the training dataset/dataframe 

In [None]:
# Score cut-off passed to construct_trining_data: a prompt is kept only when its
# best response scores >= Threshold and its worst response scores < Threshold.
Threshold = 0.6
prepared_df, prepared_dataset = construct_trining_data(df, Threshold)

In [None]:
# Display the instruction / chosen_response / rejected_response rows that were built
prepared_df

In [None]:
# Store the training data in a csv file.
# NOTE: this export is not optional in this notebook — the DPO dataset is
# reloaded from OUTPUT_FILE further down via load_dataset("csv", ...).
OUTPUT_FILE = '../lab-data/dpo_trn_data.csv' 
prepared_df.to_csv(OUTPUT_FILE, index=False)

### Load the SFT model

In [None]:
from sentence_transformers import SentenceTransformer, util

In [None]:
# Hugging Face Hub id of the base checkpoint; used below to load both the model and the tokenizer
model_name = 'meta-llama/Meta-Llama-3-8B-Instruct'

In [None]:
# Directory the PEFT adapter is loaded from (PeftModel.from_pretrained below);
# presumably the SFT adapter saved by the Lab 2.2 notebook — confirm.
output_dir = "./ft_model_llama3-8b_instruct_cuad"

In [None]:
# Static strings for the prompt template. The test-inference cell at the end of
# the notebook assembles a prompt as: INTRO_BLURB, INSTRUCTION_KEY, then
# INPUT_KEY + question + RESPONSE_KEY.
# NOTE(review): the misspellings ("reviwing", "premables") and the stray trailing
# quote inside INSTRUCTION_KEY are part of the runtime prompt. Presumably they
# must stay identical to the template used when the SFT adapter was trained —
# confirm before correcting them.
INTRO_BLURB = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. \n'

INSTRUCTION_KEY = """
[Instruction]: You are a legal AI assistant reviwing commercial contracts. 
Please provide answer to the question listed below about the important contract clauses. 
The questions are provided after the [Question] tag, present your answer after the [Response] tag. 
DO NOT put any premables in the response. If you don't know the answer, just say I don't know, DO NOT make up the answers' 
"""

# Tags that delimit the question and the model's answer inside the prompt
INPUT_KEY = '[Question]: '
RESPONSE_KEY = '[Response]: '
# End-of-answer marker; Llama_Infer trims the generated text at the literal '[End]'
END_KEY = "[End]"

Load the foundation model (FM), load the PEFT adapter on top of it, then merge the adapter into the model

In [None]:
# Quantization settings; each one is forwarded under the same name to BitsAndBytesConfig below
load_in_4bit = True                      # load the weights in 4-bit precision
bnb_4bit_use_double_quant = True         # enable nested ("double") quantization
bnb_4bit_quant_type = "nf4"              # 4-bit data type to use
bnb_4bit_compute_dtype = torch.bfloat16  # dtype used for computation

In [None]:
# Bundle the quantization settings into the config object handed to from_pretrained
bnb_config = BitsAndBytesConfig(
    load_in_4bit=load_in_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
)

In [None]:
import os

# Read the Hugging Face access token from the environment instead of hard-coding
# it in the notebook: a token committed to source control is exposed to everyone
# with access to the file. Any token that was previously committed here should
# be treated as leaked and revoked. HF_TOKEN is the variable the Hugging Face
# libraries look up themselves; TOKEN is None when it is not set.
TOKEN = os.environ.get("HF_TOKEN")

In [None]:
from peft import PeftModel, PeftConfig

# Load the base checkpoint with the 4-bit quantization config defined above
model_ft = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    low_cpu_mem_usage=True,
    return_dict=True,
    quantization_config=bnb_config,
)

In [None]:
# Wrap the quantized base model with the adapter stored in output_dir
model_ft = PeftModel.from_pretrained(
    model_ft,
    output_dir,
    device_map="auto",
    torch_dtype=torch.float16,
)

In [None]:
# Merge the adapter weights into the base model and drop the PEFT wrapper;
# the merged model is what the DPO trainer below starts from.
model_ft = model_ft.merge_and_unload()

Load tokenizer

In [None]:
# The tokenizer is loaded from the base checkpoint (model_name), not from the adapter directory
tokenizer_ft = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding and pad on the right-hand side
tokenizer_ft.pad_token = tokenizer_ft.eos_token
tokenizer_ft.padding_side = "right"

In [None]:
# Shell escape: run nvidia-smi to inspect GPU usage after loading the model
!nvidia-smi

### Prepare the DPO training data in Datasets format

In [None]:
def return_prompt_and_responses(samples):
    """Re-key a batch from the CSV column names to prompt / chosen / rejected.

    Args:
        samples: Mapping that provides the ``instruction``,
            ``chosen_response`` and ``rejected_response`` columns.

    Returns:
        A dict with the same values under the keys ``prompt``, ``chosen``
        and ``rejected``.
    """
    # (output key, input column) pairs, in output order
    field_map = (
        ("prompt", "instruction"),
        ("chosen", "chosen_response"),
        ("rejected", "rejected_response"),
    )
    return {target: samples[source] for target, source in field_map}

In [None]:
# Reload the preference pairs written to OUTPUT_FILE as a Hugging Face dataset
dataset = load_dataset("csv", split="train", data_files=OUTPUT_FILE)

# Remember the CSV column names so they can be removed once the re-keyed fields exist
original_columns = dataset.column_names

# Re-key every batch to prompt / chosen / rejected and drop the original columns
dataset = dataset.map(
    return_prompt_and_responses,
    remove_columns=original_columns,
    batched=True,
)
dataset

### Set up PEFT/LoRA parameters

In [None]:
# Output directory for the DPO run: passed to TrainingArguments and reused when
# saving the trained model and the tokenizer.
OUTPUT_DIR = "./dpo_model_llama3-8b_instruct_cuad"

In [None]:
from peft import LoraConfig

# LoRA adapter settings handed to the DPO trainer through peft_config
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

Set up DPO parameters

In [None]:
import os
# Switch off Weights & Biases logging for this run
os.environ["WANDB_DISABLED"] = "true"

# Trainer hyper-parameters for the DPO stage
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=2,
    per_device_train_batch_size=1,
    learning_rate=1e-6,
    logging_steps=10,
    save_steps=10000,
    remove_unused_columns=False,  # for using DPODataCollatorWithPadding
)

In [None]:
# Build the DPO trainer on the merged SFT model. No separate reference model is
# passed (ref_model=None); a LoRA config is supplied through peft_config instead.
dpo_trainer = DPOTrainer(
    model=model_ft,
    ref_model=None,
    args=training_args,
    peft_config=lora_config,
    train_dataset=dataset,
    tokenizer=tokenizer_ft,
    max_length=2048,
    max_prompt_length=1024,
)

### Launch training and save model

In [None]:
# Run DPO training with the trainer configured above
dpo_trainer.train()

In [None]:
# Save the trained model through the trainer into OUTPUT_DIR
dpo_trainer.save_model(OUTPUT_DIR)

# Save the trainer's underlying model and the tokenizer into the same directory.
# NOTE(review): the two model saves target the same path and look redundant — confirm both are needed.
dpo_trainer.model.save_pretrained(OUTPUT_DIR)   
tokenizer_ft.save_pretrained(OUTPUT_DIR)

### Test Inference

In [None]:
# Shell escape: run nvidia-smi to inspect GPU usage after training
!nvidia-smi

In [None]:
import time

def _extract_answer(response):
    """Return the text between the first '[Response]:' tag and '[End]'.

    When the response tag is missing, the text is returned unchanged.
    """
    if '[Response]:' not in response:
        return response

    # Text after the first response tag, up to a repeated tag if there is one
    answer = response.split('[Response]:')[1].strip()
    # Cut at the end marker only if it occurs in the answer itself
    if '[End]' in answer:
        answer = answer.split('[End]')[0].strip()
    return answer


def Llama_Infer(prompt):
    """Generate an answer for ``prompt`` with the module-level model and tokenizer.

    Uses the globals ``model_ft`` and ``tokenizer_ft``. The prompt is expected
    to end with the '[Response]:' tag so that the answer can be cut out of the
    decoded text, which contains the prompt followed by the generated tokens.

    Args:
        prompt: Full prompt text to send to the model.

    Returns:
        A ``(answer, elapsed_time)`` tuple: the extracted answer string and
        the wall-clock seconds spent on tokenizing, generating and decoding.
    """
    start = time.perf_counter()

    # Move the inputs to wherever the model lives instead of assuming CUDA;
    # the model is loaded with device_map="auto".
    batch = tokenizer_ft(prompt, return_tensors="pt").to(model_ft.device)

    with torch.no_grad():
        output = model_ft.generate(
            input_ids=batch["input_ids"],
            # The pad token is the EOS token here, so the mask cannot be
            # inferred reliably from the ids and is passed explicitly.
            attention_mask=batch["attention_mask"],
            max_new_tokens=256,
            do_sample=True,
            temperature=0.01,
            pad_token_id=tokenizer_ft.eos_token_id,
        )[0]

        # Strip special tokens so begin/end-of-text markers never end up in the answer
        response = tokenizer_ft.decode(output, skip_special_tokens=True)

    elapsed_time = time.perf_counter() - start

    return _extract_answer(response), elapsed_time


In [None]:
# Question/answer CSV used to spot-check the tuned model in the next cell.
# NOTE: despite the TRN_ prefix, this file is read into df_test_data and is only
# used for test inference in the visible cells.
TRN_FILE = '../lab-data/ENERGOUSCORP_qa.csv'
df_test_data = pd.read_csv(TRN_FILE)

In [None]:
# Row of the test file to try
IDX = 1

# Question and ground-truth answer for that row
query = df_test_data.loc[IDX, 'question']
gt = df_test_data.loc[IDX, 'answer']

# Assemble the prompt: intro, instruction block, then the question followed by the response tag
blurb = INTRO_BLURB
instruction = INSTRUCTION_KEY
input_context = INPUT_KEY + query + '\n\n' + RESPONSE_KEY

prompt = '\n'.join([blurb, instruction, input_context])

# Run generation and show the answer next to the ground truth
answer, elapse_time = Llama_Infer(prompt)
print("Question = ", query, "\nAnswer = ", answer, "\nGT = ", gt, "\nElapse time = ", elapse_time)