![image](../images/kdd24-logo-small.jpeg)

## Hands-on Tutorial
## Domain-Driven LLM Development: Insights into RAG and Fine-Tuning Practices
### Lab 2.2 : LLM Fine-Tuning through QLoRA.    
#### Summary: 
This lab focuses on instruction fine-tuning a Meta-Llama-3-8B-Instruct model using CUAD data.

- The training dataset is from CUAD - BONTONSTORESINC_04_20_2018-EX-99.3-AGENCY AGREEMENT.PDF  
- Training uses the Hugging Face `transformers` `Trainer` with QLoRA (a 4-bit quantized base model plus LoRA adapters)


### Initialization

In [None]:
#!pip install transformers
#!pip install datasets
#!pip install accelerate
#!pip install bitsandbytes
#!pip install peft
#!pip install sentence_transformers

#!pip install continuous_eval
#!pip install tiktoken

#!pip install --upgrade pandas

In [None]:
!nvidia-smi

In [None]:
import torch
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForLanguageModeling,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM

In [None]:
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np

import json

### Load training data

In [None]:
# Path to the question/answer CSV that is turned into the fine-tuning dataset below.
TRN_FILE = '../lab-data/ENERGOUSCORP_qa.csv'
df_train_data = pd.read_csv(TRN_FILE)

# Wrap the DataFrame in a Hugging Face `Dataset`; later cells call `.map`/`.filter` on it.
sample = Dataset.from_pandas(df_train_data)
sample  # last expression of the cell: display the dataset summary

### Prepare for training

In [None]:
# Initialize static strings for the prompt template
# These pieces are concatenated by create_prompt_formats (training) and by the inference cells.
# NOTE(review): the instruction text contains typos ("reviwing", "premables") and a stray
# trailing quote. They are part of the runtime prompt, and an identical copy of these constants
# exists in the post-restart section of this notebook; change them only in both places at once.
INTRO_BLURB = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. \n'

INSTRUCTION_KEY = """
[Instruction]: You are a legal AI assistant reviwing commercial contracts. 
Please provide answer to the question listed below about the important contract clauses. 
The questions are provided after the [Question] tag, present your answer after the [Response] tag. 
DO NOT put any premables in the response. If you don't know the answer, just say I don't know, DO NOT make up the answers' 
"""

# Tags that delimit the question, the answer, and the end of the answer.
INPUT_KEY = '[Question]: '
RESPONSE_KEY = '[Response]: '
END_KEY = "[End]"

In [None]:
def create_prompt_formats(sample):
    """Render one record into the training prompt and store it under ``sample["text"]``.

    The prompt is the intro blurb, the instruction, the tagged question (omitted when the
    question is falsy), the tagged answer and the end marker, joined with newlines.

    :param sample: mapping with ``'question'`` and ``'answer'`` entries; it is modified in place.
    :return: the same ``sample`` object with the added ``"text"`` entry.
    """
    question = sample['question']

    segments = [
        f"{INTRO_BLURB}",
        f"{INSTRUCTION_KEY}",
        f"{INPUT_KEY}{question}\n" if question else None,
        f"{RESPONSE_KEY}{sample['answer']}\n",
        f"{END_KEY}",
    ]

    # Drop empty/None segments before joining, so a missing question leaves no blank slot.
    sample["text"] = "\n".join(filter(None, segments))

    return sample

In [None]:
from random import randrange

# Preview the rendered prompt for one randomly chosen training record.
# The index is drawn from the actual dataset size rather than a hard-coded 9, so the cell
# works for datasets with fewer than 9 rows and can pick any row of larger ones.
sample_p = create_prompt_formats(sample[randrange(len(sample))])
print(sample_p['text'])

In [None]:
len((sample_p['text']))

In [None]:
def get_max_length(model, default_max_length = 8192):
    """Return the maximum sequence length advertised by ``model.config``.

    The config attributes ``n_positions``, ``max_position_embeddings`` and ``seq_length``
    are probed in that order; the first truthy value wins.

    :param model: object exposing a ``config`` attribute.
    :param default_max_length: value returned when none of the attributes is set (or all
        are falsy). Defaults to 8192, the previously hard-coded fallback.
    :return: the detected or default maximum length.
    """

    conf = model.config

    max_length = None

    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(conf, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break

    if not max_length:
        max_length = default_max_length
        print(f"Using default max length: {max_length}")

    return max_length

In [None]:
def preprocess_batch(batch, tokenizer, max_length):
    """Tokenize ``batch["text"]`` by calling ``tokenizer`` with truncation enabled.

    :param batch: mapping with a ``"text"`` entry.
    :param tokenizer: callable accepting the text plus ``max_length``/``truncation`` keywords.
    :param max_length: value forwarded as the tokenizer's ``max_length``.
    :return: whatever the tokenizer call returns.
    """
    texts = batch["text"]
    tokenize_options = {"max_length": max_length, "truncation": True}

    return tokenizer(texts, **tokenize_options)

In [None]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: "Dataset"):
    """Format, tokenize, filter and shuffle the dataset for training.

    :param tokenizer: tokenizer forwarded to ``preprocess_batch``.
    :param max_length: truncation length; records whose token count is not strictly below
        this value are dropped afterwards.
    :param seed: seed passed to ``dataset.shuffle``.
    :param dataset: object providing ``map``/``filter``/``shuffle`` (a ``datasets.Dataset``
        in this notebook), with the columns listed in ``remove_columns`` below.
    :return: the processed dataset.
    """
    # Imported locally so the function does not depend on a later notebook cell having run.
    from functools import partial

    print("Preprocessing dataset...")
    # Add the rendered prompt as a "text" column.
    dataset = dataset.map(create_prompt_formats)

    _preprocessing_function = partial(preprocess_batch, max_length = max_length, tokenizer = tokenizer)
    # Tokenize in batches and drop the raw columns so only tokenizer outputs remain.
    dataset = dataset.map(
        _preprocessing_function,
        batched = True,
        remove_columns = ['index', 'question', 'input', 'answer', 'qa_id', 'text'],
    )

    # Tokenization truncates at max_length, so this also removes records that hit the limit.
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    dataset = dataset.shuffle(seed = seed)

    return dataset

### Setup model and tokenizer

In [None]:
model_name = 'meta-llama/Meta-Llama-3-8B-Instruct'

Configure model quantization

In [None]:
# Quantization settings; each is forwarded to the BitsAndBytesConfig keyword of the same name.
load_in_4bit = True                      # request 4-bit loading of the base model
bnb_4bit_use_double_quant = True         # enable the "double quant" option
bnb_4bit_quant_type = "nf4"              # name of the 4-bit quantization type
bnb_4bit_compute_dtype = torch.bfloat16  # compute dtype handed to bitsandbytes

In [None]:
# Bundle the quantization settings; this object is passed as `quantization_config`
# when the base model is loaded.
bnb_config = BitsAndBytesConfig(
        load_in_4bit = load_in_4bit,
        bnb_4bit_use_double_quant = bnb_4bit_use_double_quant,
        bnb_4bit_quant_type = bnb_4bit_quant_type,
        bnb_4bit_compute_dtype = bnb_4bit_compute_dtype,
)

In [None]:
import os

# Hugging Face access token, read from the environment so no secret is stored in the notebook.
# Set HF_TOKEN before starting the kernel. When it is unset, TOKEN is None.
# NOTE(review): the token that used to be hard-coded here has been exposed and should be revoked.
TOKEN = os.environ.get("HF_TOKEN")

Load model and tokenizer

In [None]:
# Load the base model with the 4-bit quantization config defined earlier.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = "auto", 
    token = TOKEN,
    # NOTE(review): trust_remote_code=True allows code shipped with the model repository to run;
    # presumably not needed for this model — confirm and remove if so.
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          token = TOKEN,
                                         )
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Preprocess the data

In [None]:
# `partial` is used inside preprocess_dataset, so this import must run before the call below.
from functools import partial

# Seed forwarded to the dataset shuffle.
seed = 0

# Derive the truncation length from the model config, then build the tokenized training set.
max_length = get_max_length(model)
preprocessed_dataset = preprocess_dataset(tokenizer, max_length, seed, sample)

In [None]:
print(preprocessed_dataset)

### Setup training parameters

Setup LoRA parameters

In [None]:
# LoRA adapter configuration, applied to the model via get_peft_model in a later cell.
lora_config = LoraConfig(
    r = 64,    
    lora_alpha = 256,    
    # Names of the modules that receive adapters.
    target_modules = ['gate_proj', 'up_proj', 'q_proj', 'v_proj', 'down_proj', 'k_proj', 'o_proj'],
    lora_dropout = 0.05,
    bias = "none",
    task_type = "CAUSAL_LM",
)

In [None]:
def print_trainable_parameters(model, use_4bit = False):
    """
    Prints the total and trainable parameter counts of the model and their ratio.

    :param model: PEFT model (any object exposing ``named_parameters()``)
    :param use_4bit: when True, the trainable count is halved before printing
        (NOTE(review): presumably compensates for how 4-bit parameters are counted — confirm)
    """

    trainable_params = 0
    all_param = 0

    for _, param in model.named_parameters():
        num_params = param.numel()
        # Some parameters report zero elements but carry the real count in `ds_numel`.
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        # Integer division keeps the value an int; a float would break the `:,d` format below.
        trainable_params //= 2

    # Avoid ZeroDivisionError for a model without parameters.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0

    print(
        f"All Parameters: {all_param:,d} || Trainable Parameters: {trainable_params:,d} || Trainable Parameters %: {trainable_pct}"
    )

In [None]:
model.gradient_checkpointing_enable()    #reduce memory usage during fine-tuning
# Prepare the quantized model for training, then attach the LoRA adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Report how many parameters will be trained.
print_trainable_parameters(model)

Setup training parameters

In [None]:
# Values forwarded to TrainingArguments in the next cell; output_dir is also where the
# adapter is saved after training.
output_dir = "./ft_model_llama3-8b_instruct_cuad"
per_device_train_batch_size = 1
learning_rate = 1e-5   
warmup_steps = 2  # Linear warmup steps from 0 to learning_rate 
# NOTE(review): training uses fp16 while the quantization compute dtype set earlier is
# torch.bfloat16 — confirm the mismatch is intended.
fp16 = True  
epoch = 2             
logging_steps = epoch*2  # evaluates to 4 with epoch = 2
# NOTE(review): presumably larger than the total number of training steps, in which case no
# intermediate checkpoint is written — confirm.
save_steps = 20000

In [None]:
# Build the Trainer from the PEFT-wrapped model, the tokenized dataset and the
# hyperparameters defined in the previous cell.
trainer = Trainer(
    model = model,
    train_dataset = preprocessed_dataset,
    args = TrainingArguments(
        per_device_train_batch_size = per_device_train_batch_size,
        warmup_steps = warmup_steps,
        num_train_epochs = epoch,
        learning_rate = learning_rate,
        fp16 = fp16,
        logging_steps = logging_steps,
        output_dir = output_dir,
        save_strategy="steps",
        save_steps = save_steps,
    ),
    # mlm = False: collate for causal (non-masked) language modeling.
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False)
)

In [None]:
!nvidia-smi

### Launch training and save model

In [None]:
# Turn off the generation cache on the model config before training.
model.config.use_cache = False

# Set to False to skip training when re-running the notebook.
do_train = True

# Note: this message is printed even when do_train is False.
print("Training...")

if do_train:
    train_result = trainer.train()
    # Log and persist the training metrics and the trainer state.
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)

In [None]:
!nvidia-smi

In [None]:
# Save the trained model to output_dir; the post-restart section loads it from there.
# Only the model is saved here; the tokenizer is not.
# NOTE(review): `token` and `trust_remote_code` are presumably unnecessary for a local
# save — confirm against the PEFT `save_pretrained` documentation.
print("Saving last checkpoint of the model...")
trainer.model.save_pretrained(output_dir,
                              token = TOKEN,
                              trust_remote_code=True,
                             )

### Test inference 

In [None]:
model_ft = model
tokenizer_ft = tokenizer

In [None]:
import time

def Llama_Infer(prompt):
    """Generate an answer for ``prompt`` with ``model_ft`` and time the call.

    The decoded output is reduced to the text between the first ``[Response]:`` tag and the
    following ``[End]`` marker. If ``[Response]:`` is missing, the whole decoded text is
    returned instead of raising IndexError, matching the later definition in this notebook.

    :param prompt: full prompt string, expected to end with the response tag.
    :return: tuple ``(answer, elapsed_time)`` with the elapsed time in seconds.
    """

    st = time.time()

    batch = tokenizer_ft(prompt, return_tensors="pt")
    input_ids = batch["input_ids"].cuda()

    with torch.no_grad():

        output = model_ft.generate(input_ids,
                                    max_new_tokens=256,
                                    do_sample=True,
                                    temperature = 0.01,
                                    pad_token_id=tokenizer_ft.eos_token_id,
                                    )[0]

        # The decoded text contains the prompt followed by the generated continuation.
        response = tokenizer_ft.decode(output)

    et = time.time()
    elapsed_time = et - st

    if '[Response]:' in response:
        # split('[End]')[0] returns the whole string when the marker is absent.
        answer = response.split('[Response]:')[1].split('[End]')[0].strip()
    else:
        answer = response

    return answer, elapsed_time


In [None]:
# Row of the training data used for a quick sanity check of the fine-tuned model.
IDX = 7

query = df_train_data['question'][IDX]
gt = df_train_data['answer'][IDX]  # reference ("ground truth") answer

In [None]:
query, gt

In [None]:
# Build the inference prompt from the template pieces; it ends with the response tag so the
# model's continuation is the answer.
blurb = f"{INTRO_BLURB}"
instruction = f"{INSTRUCTION_KEY}"
input_context = f'{INPUT_KEY}{query}\n\n{RESPONSE_KEY}'

prompt = blurb+'\n'+instruction+'\n'+input_context
print(prompt)

In [None]:
# Run one generation and print it next to the reference answer.
answer, elapse_time = Llama_Infer(prompt)
print("Question = ", query, "\nAnswer = ", answer, "\nGT = ", gt, "\nElapse time = ", elapse_time)

### Optional: restart the kernel and run batch inference on the fine-tuned model

In [None]:
# --- restart kernel ---
from IPython.display import display_html
def restartkernel():
    """Display a script tag that calls ``Jupyter.notebook.kernel.restart()`` in the front end.

    NOTE(review): relies on the front end exposing a ``Jupyter`` JavaScript object;
    presumably only works in the classic Notebook UI — confirm.
    """
    restart_script = "<script>Jupyter.notebook.kernel.restart()</script>"
    display_html(restart_script, raw=True)
restartkernel()

In [None]:
import torch
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForLanguageModeling,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM

from sentence_transformers import SentenceTransformer, util

In [None]:
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np

import json

Preparation

In [None]:
model_name = 'meta-llama/Meta-Llama-3-8B-Instruct'

In [None]:
output_dir = "./ft_model_llama3-8b_instruct_cuad"

In [None]:
# Initialize static strings for the prompt template
# Post-restart copy of the constants defined in the training section; the inference cells
# below build their prompts from these pieces.
# NOTE(review): the typos ("reviwing", "premables") and the stray trailing quote are part of
# the runtime prompt and match the training-section copy; change both copies together or not at all.
INTRO_BLURB = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. \n'

INSTRUCTION_KEY = """
[Instruction]: You are a legal AI assistant reviwing commercial contracts. 
Please provide answer to the question listed below about the important contract clauses. 
The questions are provided after the [Question] tag, present your answer after the [Response] tag. 
DO NOT put any premables in the response. If you don't know the answer, just say I don't know, DO NOT make up the answers' 
"""

# Tags that delimit the question, the answer, and the end of the answer.
INPUT_KEY = '[Question]: '
RESPONSE_KEY = '[Response]: '
END_KEY = "[End]"

Load the foundation model (FM), attach the PEFT adapter, then merge the adapter into the model

In [None]:
# Quantization settings (same values as in the training section); each is forwarded to the
# BitsAndBytesConfig keyword of the same name.
load_in_4bit = True                      # request 4-bit loading of the base model
bnb_4bit_use_double_quant = True         # enable the "double quant" option
bnb_4bit_quant_type = "nf4"              # name of the 4-bit quantization type
bnb_4bit_compute_dtype = torch.bfloat16  # compute dtype handed to bitsandbytes

In [None]:
# Bundle the quantization settings; passed as `quantization_config` when the base model
# is reloaded below.
bnb_config = BitsAndBytesConfig(
        load_in_4bit = load_in_4bit,
        bnb_4bit_use_double_quant = bnb_4bit_use_double_quant,
        bnb_4bit_quant_type = bnb_4bit_quant_type,
        bnb_4bit_compute_dtype = bnb_4bit_compute_dtype,
)

In [None]:
import os

# Hugging Face access token, read from the environment so no secret is stored in the notebook.
# Set HF_TOKEN before starting the kernel. When it is unset, TOKEN is None.
# NOTE(review): the token that used to be hard-coded here has been exposed and should be revoked.
TOKEN = os.environ.get("HF_TOKEN")

In [None]:
from peft import PeftModel, PeftConfig

# Reload the quantized base model; the fine-tuned adapter is attached in the next cell.
# The access token is passed explicitly, consistent with the model load in the training section.
model_ft = AutoModelForCausalLM.from_pretrained(  
    model_name,
    quantization_config = bnb_config,
    return_dict=True,
    low_cpu_mem_usage=True,
    device_map="auto",
    token = TOKEN,
)

In [None]:
# Attach the adapter stored in output_dir (written by the training section) to the base model.
model_ft = PeftModel.from_pretrained(
    model_ft, 
    output_dir, 
    torch_dtype = torch.float16,
    device_map="auto",
)

In [None]:
# Fold the adapter weights into the base model and drop the PEFT wrapper.
# NOTE(review): the base model is loaded in 4-bit; merging into quantized weights may
# change outputs slightly — confirm against the PEFT documentation.
model_ft = model_ft.merge_and_unload()

Load tokenizer

In [None]:
# Load the tokenizer of the base model. The access token is passed explicitly, consistent
# with the tokenizer load in the training section.
tokenizer_ft = AutoTokenizer.from_pretrained(model_name, token = TOKEN)
# Use the end-of-sequence token for padding, and pad on the right.
tokenizer_ft.pad_token = tokenizer_ft.eos_token
tokenizer_ft.padding_side = "right"

In [None]:
!nvidia-smi

Prepare for model inference and evaluation

In [None]:
import time

def Llama_Infer(prompt):
    """Generate an answer for ``prompt`` with ``model_ft`` and time the call.

    The decoded output is cut down to the text after the first ``[Response]:`` tag and, when
    an ``[End]`` marker is present, to the part before it. Without a ``[Response]:`` tag the
    decoded text is returned unchanged.

    :param prompt: full prompt string.
    :return: tuple ``(answer, elapsed_time)`` with the elapsed time in seconds.
    """
    start = time.time()

    encoded = tokenizer_ft(prompt, return_tensors="pt")
    prompt_ids = encoded["input_ids"].cuda()

    with torch.no_grad():
        generated = model_ft.generate(
            prompt_ids,
            max_new_tokens=256,
            do_sample=True,
            temperature = 0.01,
            pad_token_id=tokenizer_ft.eos_token_id,
        )
        # Decode the first (only) returned sequence.
        response = tokenizer_ft.decode(generated[0])

    elapsed_time = time.time() - start

    # Guard clause: no response tag means there is nothing to cut out.
    if '[Response]:' not in response:
        return response, elapsed_time

    answer = response.split('[Response]:')[1].strip()
    if '[End]' in response:
        answer = answer.split('[End]')[0].strip()

    return answer, elapsed_time


In [None]:
import boto3
import json

# Amazon Bedrock clients in us-west-2. The runtime client is used by get_titan_embedding;
# `boto3_bedrock` is not referenced by the cells visible in this notebook.
boto3_bedrock = boto3.client(service_name="bedrock", region_name="us-west-2")
boto3_bedrock_runtime = boto3.client(service_name="bedrock-runtime", region_name="us-west-2")

def get_titan_embedding(text):
    """Request an embedding for ``text`` from the ``amazon.titan-embed-text-v2:0`` model.

    :param text: input sent as ``inputText`` in the JSON request body.
    :return: the ``embedding`` entry of the JSON response (None if the key is absent).
    """
    request = {
        "body": json.dumps({"inputText": text}),
        "modelId": 'amazon.titan-embed-text-v2:0',
        "accept": 'application/json',
        "contentType": 'application/json',
    }

    raw_response = boto3_bedrock_runtime.invoke_model(**request)
    # The response body is a stream; read it and parse the JSON payload.
    payload = json.loads(raw_response.get('body').read())

    return payload.get('embedding')
    
def calculate_semantic_sim_titan(pred_list,ref_list):
    """Compute the cosine similarity between Titan embeddings of each prediction/reference pair.

    One pair is scored per entry of ``ref_list``; ``pred_list`` is indexed at the same
    positions and must therefore be at least as long.

    :param pred_list: predicted answers.
    :param ref_list: reference answers.
    :return: list of float similarity scores, one per reference.
    """

    sem_score_titan = []

    for i, reference in enumerate(ref_list):
        print(i,end = '|')  # progress indicator
        ref_embedding = get_titan_embedding(reference)
        pred_embedding = get_titan_embedding(pred_list[i])
        cos_sim = util.cos_sim(ref_embedding, pred_embedding)

        # cos_sim is indexed as a 2-D result; take its single entry as a Python float.
        sem_score_titan.append(cos_sim[0][0].item())

    return sem_score_titan

In [None]:
from continuous_eval.metrics.generation.text import DeterministicAnswerCorrectness

def calculate_answer_correctness(pred_list,ref_list):
    """Score each prediction against its reference with ``DeterministicAnswerCorrectness``.

    One pair is scored per entry of ``ref_list``; ``pred_list`` is indexed at the same positions.

    :param pred_list: predicted answers.
    :param ref_list: reference answers.
    :return: tuple ``(token_overlap_recall, rouge_l_recall)`` of per-pair score lists.
    """
    metric = DeterministicAnswerCorrectness()

    token_overlap_recall, rouge_l_recall = [], []

    for idx, reference in enumerate(ref_list):
        print(idx,end = '|')  # progress indicator

        scores = metric(answer=pred_list[idx], ground_truth_answers=[reference])

        token_overlap_recall.append(scores['token_overlap_recall'])
        rouge_l_recall.append(scores['rouge_l_recall'])

    return token_overlap_recall, rouge_l_recall

Test single inference

In [None]:
# The single-inference check below reads from the training CSV.
TRN_FILE = '../lab-data/ENERGOUSCORP_qa.csv'
df_test_data = pd.read_csv(TRN_FILE)

In [None]:
# Row used for a quick sanity check of the merged model.
IDX = 10

query = df_test_data['question'][IDX]
gt = df_test_data['answer'][IDX]  # reference ("ground truth") answer

# Build the prompt from the template pieces; it ends with the response tag so the model's
# continuation is the answer.
blurb = f"{INTRO_BLURB}"
instruction = f"{INSTRUCTION_KEY}"
input_context = f'{INPUT_KEY}{query}\n\n{RESPONSE_KEY}'

prompt = blurb+'\n'+instruction+'\n'+input_context

answer, elapse_time = Llama_Infer(prompt)
print("Question = ", query, "\nAnswer = ", answer, "\nGT = ", gt, "\nElapse time = ", elapse_time)

Batch inference (training data)

In [None]:
# Batch inference in this section runs over the training CSV (stored in df_test_data).
TRN_FILE = '../lab-data/ENERGOUSCORP_qa.csv'
df_test_data = pd.read_csv(TRN_FILE)

In [None]:
# Run the model over every row of df_test_data, collecting questions, generated answers
# and reference answers in parallel lists.
test_question_list, test_answer_list, test_ref_answer_list = [], [], []

st = time.time()

qa_pairs = zip(df_test_data['question'], df_test_data['answer'])
for i, (raw_question, raw_answer) in enumerate(qa_pairs):
    print(i,end='|')

    query = raw_question.strip()
    ref_answer = raw_answer.strip()

    # The prompt ends right after the response tag so the model continues with the answer.
    blurb = f"{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}"
    input_context = f'{INPUT_KEY}{query}\n\n{RESPONSE_KEY}'
    prompt = '\n'.join([blurb, instruction, input_context])

    response_text,response_time = Llama_Infer(prompt)
    print(response_text)

    test_question_list.append(query)
    test_answer_list.append(response_text)
    test_ref_answer_list.append(ref_answer)

et = time.time()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

In [None]:
# Score the generated answers against the references: embedding cosine similarity, then
# token-overlap recall and ROUGE-L recall.
test_ss_list = calculate_semantic_sim_titan(test_answer_list,test_ref_answer_list)
test_tor_list, test_rlr_list_list = calculate_answer_correctness(test_answer_list,test_ref_answer_list)

# Mean semantic similarity over all rows; displayed as the cell output.
average_sem_sim_titan = np.average(test_ss_list)   
average_sem_sim_titan

In [None]:
# Collect inputs, outputs and per-row metrics into one result table.
df_response = pd.DataFrame({
    "question": test_question_list,
    "ref_answer": test_ref_answer_list,
    "response": test_answer_list,
    "semantic_similarity": test_ss_list,
    "token_overlap_recall": test_tor_list,
    "rouge_l_recall": test_rlr_list_list,
})

In [None]:
df_response

In [None]:
# Write the result table for the training-data run to CSV, without the index column.
TEST_OUTPUT_FILE = '../lab-data/sft_trn_q4b_result.csv'
df_response.to_csv(TEST_OUTPUT_FILE, index=False)

Batch inference (testing data)

In [None]:
# Test-set CSV; replaces the training data previously held in df_test_data.
TEST_FILE = '../lab-data/ENERGOUSCORP_qa_test.csv'
df_test_data = pd.read_csv(TEST_FILE)

In [None]:
# Run the model over every row of df_test_data, collecting questions, generated answers
# and reference answers in parallel lists.
test_question_list, test_answer_list, test_ref_answer_list = [], [], []

st = time.time()

qa_pairs = zip(df_test_data['question'], df_test_data['answer'])
for i, (raw_question, raw_answer) in enumerate(qa_pairs):
    print(i,end='|')

    query = raw_question.strip()
    ref_answer = raw_answer.strip()

    # The prompt ends right after the response tag so the model continues with the answer.
    blurb = f"{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}"
    input_context = f'{INPUT_KEY}{query}\n\n{RESPONSE_KEY}'
    prompt = '\n'.join([blurb, instruction, input_context])

    response_text,response_time = Llama_Infer(prompt)
    print(response_text)

    test_question_list.append(query)
    test_answer_list.append(response_text)
    test_ref_answer_list.append(ref_answer)

et = time.time()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

In [None]:
# Score the generated answers against the references: embedding cosine similarity, then
# token-overlap recall and ROUGE-L recall.
test_ss_list = calculate_semantic_sim_titan(test_answer_list,test_ref_answer_list)
test_tor_list, test_rlr_list_list = calculate_answer_correctness(test_answer_list,test_ref_answer_list)

# Mean semantic similarity over all rows; displayed as the cell output.
average_sem_sim_titan = np.average(test_ss_list)   
average_sem_sim_titan

In [None]:
# Collect inputs, outputs and per-row metrics into one result table.
df_response = pd.DataFrame({
    "question": test_question_list,
    "ref_answer": test_ref_answer_list,
    "response": test_answer_list,
    "semantic_similarity": test_ss_list,
    "token_overlap_recall": test_tor_list,
    "rouge_l_recall": test_rlr_list_list,
})

In [None]:
df_response

In [None]:
# Write the result table for the test-data run to CSV, without the index column.
TEST_OUTPUT_FILE = '../lab-data/sft_test_q4b_result.csv'
df_response.to_csv(TEST_OUTPUT_FILE, index=False)