## PEFT/LoRA fine tuning with  dataset

Instruction fine-tuning: meta.llama3-8b-instruct-v1:0 (meta-llama/Meta-Llama-3-8B-Instruct) with LoRA, no quantization
     
Training: transformer trainer   
Data: CUAD - BONTONSTORESINC_04_20_2018-EX-99.3-AGENCY AGREEMENT.PDF

2024/07/31: first version
2024/08/15: second version (using csv file)

In [None]:
#!pip install transformers
#!pip install datasets
#!pip install accelerate
#!pip install bitsandbytes
#!pip install peft
#!pip install sentence_transformers

#!pip install continuous_eval
#!pip install tiktoken

#!pip install --upgrade pandas

In [None]:
!nvidia-smi

## SFT

In [None]:
import torch
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForLanguageModeling,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM
#from trl import SFTTrainer

In [None]:
torch.__version__       

In [None]:
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np

import json

### Fine tuning

In [None]:
TRN_FILE = '../lab-data/ENERGOUSCORP_qa.csv'
df_train_data = pd.read_csv(TRN_FILE)

sample = Dataset.from_pandas(df_train_data)
sample

Training parameters 

In [None]:
# Initialize static strings for the prompt template
INTRO_BLURB = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. \n'

INSTRUCTION_KEY = """
[Instruction]: You are a legal AI assistant reviwing commercial contracts. 
Please provide answer to the question listed below about the important contract clauses. 
The questions are provided after the [Question] tag, present your answer after the [Response] tag. 
DO NOT put any premables in the response. If you don't know the answer, just say I don't know, DO NOT make up the answers' 
"""

INPUT_KEY = '[Question]: '
RESPONSE_KEY = '[Response]: '
END_KEY = "[End]"

In [None]:
def create_prompt_formats(sample):
    # Combine a prompt with the static strings
    blurb = f"{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}"
    input_context = f"{INPUT_KEY}{sample['question']}\n" if sample['question'] else None
    response = f"{RESPONSE_KEY}{sample['answer']}\n"
    #response = f"{RESPONSE_KEY}{[[sample['routing_label']]]}\n"
    end = f"{END_KEY}"

    # Create a list of prompt template elements
    parts = [part for part in [blurb, instruction, input_context, response, end] if part]

    # Join prompt template elements into a single string to create the prompt template
    formatted_prompt = "\n".join(parts)

    # Store the formatted prompt template in a new key "text"
    sample["text"] = formatted_prompt
    
    return sample

In [None]:
from random import randrange

sample_p = create_prompt_formats(sample[randrange(9)])
print(sample_p['text'])

In [None]:
len((sample_p['text']))

In [None]:
def get_max_length(model):
    """
    Extracts maximum token length from the model configuration

    :param model: Hugging Face model
    """

    # Pull model configuration
    conf = model.config
    # Initialize a "max_length" variable to store maximum sequence length as null
    max_length = None
    # Find maximum sequence length in the model configuration and save it in "max_length" if found
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    # Set "max_length" to 1024 (default value) if maximum sequence length is not found in the model configuration
    if not max_length:
        max_length = 8192
        print(f"Using default max length: {max_length}")
    return max_length

In [None]:
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizes dataset batch

    :param batch: Dataset batch
    :param tokenizer: Model tokenizer
    :param max_length: Maximum number of tokens to emit from the tokenizer
    """

    return tokenizer(
        batch["text"],
        max_length = max_length,
        truncation = True,
    )

In [None]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: str):
    """
    Tokenizes dataset for fine-tuning

    :param tokenizer (AutoTokenizer): Model tokenizer
    :param max_length (int): Maximum number of tokens to emit from the tokenizer
    :param seed: Random seed for reproducibility
    :param dataset (str): Instruction dataset
    """

    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Apply preprocessing to each batch of the dataset & and remove "instruction", "input", "output", and "text" fields
    _preprocessing_function = partial(preprocess_batch, max_length = max_length, tokenizer = tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched = True,
        remove_columns = ['index', 'question', 'input', 'answer', 'qa_id', 'text'],
    )

    
    # Filter out samples that have "input_ids" exceeding "max_length"
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed = seed)

    return dataset

Setup model and tokenizer

In [None]:
model_name = 'meta-llama/Meta-Llama-3-8B-Instruct'

In [None]:
# Activate 4-bit precision base model loading
load_in_4bit = False

# Activate nested quantization for 4-bit base models (double quantization)
bnb_4bit_use_double_quant = True

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Compute data type for 4-bit base models
bnb_4bit_compute_dtype = torch.bfloat16

In [None]:
# use this for qLoRA
bnb_config = BitsAndBytesConfig(
        load_in_4bit = load_in_4bit,
        bnb_4bit_use_double_quant = bnb_4bit_use_double_quant,
        bnb_4bit_quant_type = bnb_4bit_quant_type,
        bnb_4bit_compute_dtype = bnb_4bit_compute_dtype,
)

In [None]:
# Get number of GPU device and set maximum memory
n_gpus = torch.cuda.device_count()
max_memory = f'{40960}MB'

In [None]:
TOKEN = "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"

In [None]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = "auto", # dispatch the model efficiently on the available resources
    #max_memory = {i: max_memory for i in range(n_gpus)},
    token = TOKEN,
    trust_remote_code=True,
)

# Load model tokenizer with the user authentication token
tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          token = TOKEN,
                                         )
#tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Set padding token as EOS token
tokenizer.pad_token = tokenizer.eos_token

In [None]:
!nvidia-smi

Preprocess the data

In [None]:
from functools import partial

seed = 0

max_length = get_max_length(model)
preprocessed_dataset = preprocess_dataset(tokenizer, max_length, seed, sample)

In [None]:
print(preprocessed_dataset)

In [None]:
lora_config = LoraConfig(
    r = 16,    
    lora_alpha = 64,    
    #target_modules = ["q_proj","k_proj","v_proj","o_proj",],
    target_modules = ['gate_proj', 'up_proj', 'q_proj', 'v_proj', 'down_proj', 'k_proj', 'o_proj'],
    lora_dropout = 0.05,
    bias = "none",
    task_type = "CAUSAL_LM",
)

In [None]:
def print_trainable_parameters(model, use_4bit = False):
    """
    Prints the number of trainable parameters in the model.

    :param model: PEFT model
    """

    trainable_params = 0
    all_param = 0

    for _, param in model.named_parameters():
        num_params = param.numel()
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        trainable_params /= 2

    print(
        f"All Parameters: {all_param:,d} || Trainable Parameters: {trainable_params:,d} || Trainable Parameters %: {100 * trainable_params / all_param}"
    )

In [None]:
# Enable gradient checkpointing to reduce memory usage during fine-tuning
model.gradient_checkpointing_enable()

# Prepare the model for training
model = prepare_model_for_kbit_training(model)

In [None]:
# Create PEFT configuration for these modules and wrap the model to PEFT
model = get_peft_model(model, lora_config)

# Print information about the percentage of trainable parameters
print_trainable_parameters(model)

In [None]:
# Output directory where the model predictions and checkpoints will be stored
output_dir = "./ft_model_llama3-8b_instruct_cuad"

# Batch size per GPU for training
per_device_train_batch_size = 1

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1  #4

# Initial learning rate (AdamW optimizer)
learning_rate = 1e-5    # 1e-4 caused ocsillation...

# Optimizer to use
optim = "paged_adamw_32bit"

# Number of training steps (overrides num_train_epochs)
#max_steps = 1000

# Linear warmup steps from 0 to learning_rate
warmup_steps = 2

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True  

# Log every X updates steps
epoch = 2
logging_steps = epoch*2

save_steps = 20000

In [None]:
# Training parameters
trainer = Trainer(
    model = model,
    train_dataset = preprocessed_dataset,
    args = TrainingArguments(
        per_device_train_batch_size = per_device_train_batch_size,
        #gradient_accumulation_steps = gradient_accumulation_steps,
        warmup_steps = warmup_steps,
        #max_steps = max_steps,
        num_train_epochs = epoch,
        learning_rate = learning_rate,
        fp16 = fp16,
        logging_steps = logging_steps,
        output_dir = output_dir,
        #optim = optim,
        save_strategy="steps",
        save_steps = save_steps,
    ),
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False)
)

In [None]:
model.config.use_cache = False

do_train = True

# Launch training and log metrics
print("Training...")

if do_train:
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)

In [None]:
!nvidia-smi

In [None]:
# Save model
print("Saving last checkpoint of the model...")
trainer.model.save_pretrained(output_dir,
                              token = TOKEN,
                              trust_remote_code=True,
                             )

### Test inference 

In [None]:
model_ft = model
tokenizer_ft = tokenizer

In [None]:
import time

def Llama_Infer(prompt):

    st = time.time()
    
    batch = tokenizer_ft(prompt, return_tensors="pt")
    input_ids = batch["input_ids"].cuda()

    with torch.no_grad():
    
        output = model_ft.generate(input_ids, 
                                    max_new_tokens=256,
                                    do_sample=True,
                                    temperature = 0.01,
                                    pad_token_id=tokenizer_ft.eos_token_id,
                                    )[0]       

        response = tokenizer_ft.decode(output)



    et = time.time()
    elapsed_time = et - st
    
    #print("generated_text = ", response)
    full_text = response.split('[Response]:')[1].split('[End]')[0].strip()
    answer = full_text
    
    return answer, elapsed_time


In [None]:
IDX = 3

query = df_train_data['question'][IDX]
gt = df_train_data['answer'][IDX]

In [None]:
query, gt

In [None]:
blurb = f"{INTRO_BLURB}"
instruction = f"{INSTRUCTION_KEY}"
input_context = f'{INPUT_KEY}{query}\n\n{RESPONSE_KEY}'

prompt = blurb+'\n'+instruction+'\n'+input_context
print(prompt)

In [None]:
answer, elapse_time = Llama_Infer(prompt)
print("Question = ", query, "\nAnswer = ", answer, "\nGT = ", gt, "\nElapse time = ", elapse_time)

In [None]:
#model --- restart kernel ---

In [1]:
import torch
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForLanguageModeling,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM
#from trl import SFTTrainer

In [2]:
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np

In [3]:
from sentence_transformers import SentenceTransformer, util

In [4]:
model_name = 'meta-llama/Meta-Llama-3-8B-Instruct'

In [5]:
output_dir = "./ft_model_llama3-8b_instruct_cuad"

In [6]:
# Initialize static strings for the prompt template
INTRO_BLURB = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. \n'

INSTRUCTION_KEY = """
[Instruction]: You are a legal AI assistant reviwing commercial contracts. 
Please provide answer to the question listed below about the important contract clauses. 
The questions are provided after the [Question] tag, present your answer after the [Response] tag. 
DO NOT put any premables in the response. If you don't know the answer, just say I don't know, DO NOT make up the answers' 
"""

INPUT_KEY = '[Question]: '
RESPONSE_KEY = '[Response]: '
END_KEY = "[End]"

Load FM and Peft-load adapter then merge

In [7]:
# Activate 4-bit precision base model loading
load_in_4bit = False

# Activate nested quantization for 4-bit base models (double quantization)
bnb_4bit_use_double_quant = True

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Compute data type for 4-bit base models
bnb_4bit_compute_dtype = torch.bfloat16

In [8]:
# use this for qLoRA
bnb_config = BitsAndBytesConfig(
        load_in_4bit = load_in_4bit,
        bnb_4bit_use_double_quant = bnb_4bit_use_double_quant,
        bnb_4bit_quant_type = bnb_4bit_quant_type,
        bnb_4bit_compute_dtype = bnb_4bit_compute_dtype,
)

In [9]:
# Get number of GPU device and set maximum memory
n_gpus = torch.cuda.device_count()
max_memory = f'{40960}MB'

In [10]:
TOKEN = "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"

In [11]:
from peft import PeftModel, PeftConfig

model_ft = AutoModelForCausalLM.from_pretrained(  
    model_name,
    quantization_config = bnb_config,
    return_dict=True,
    low_cpu_mem_usage=True,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
model_ft = PeftModel.from_pretrained(
    model_ft, 
    output_dir, 
    torch_dtype = torch.float16,
    device_map="auto",
)

In [13]:
model_ft = model_ft.merge_and_unload()



In [14]:
tokenizer_ft = AutoTokenizer.from_pretrained(model_name)
tokenizer_ft.pad_token = tokenizer_ft.eos_token
tokenizer_ft.padding_side = "right"

In [15]:
!nvidia-smi

Thu Aug 15 22:02:42 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       On  | 00000000:00:1E.0 Off |                    0 |
| N/A   39C    P0              33W /  70W |   7087MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

Prepare data

In [16]:
TEST_FILE = '../lab-data/ENERGOUSCORP_qa_test.csv'
df_test_data = pd.read_csv(TEST_FILE)

#sample = Dataset.from_pandas(df_test_data)
#sample

Prepare prompt

In [17]:
IDX = 3

query = df_test_data['question'][IDX]
gt = df_test_data['answer'][IDX]

In [18]:
query, gt

('What date is specified in the contract as the "Effective Date" when the terms become legally binding?',
 'November 6, 2016')

In [19]:
blurb = f"{INTRO_BLURB}"
instruction = f"{INSTRUCTION_KEY}"
input_context = f'{INPUT_KEY}{query}\n\n{RESPONSE_KEY}'

prompt = blurb+'\n'+instruction+'\n'+input_context
print(prompt)

Below is an instruction that describes a task. Write a response that appropriately completes the request. 


[Instruction]: You are a legal AI assistant reviwing commercial contracts. 
Please provide answer to the question listed below about the important contract clauses. 
The questions are provided after the [Question] tag, present your answer after the [Response] tag. 
DO NOT put any premables in the response. If you don't know the answer, just say I don't know, DO NOT make up the answers' 

[Question]: What date is specified in the contract as the "Effective Date" when the terms become legally binding?

[Response]: 


Inference

In [20]:
import time

def Llama_Infer(prompt):

    st = time.time()
    
    batch = tokenizer_ft(prompt, return_tensors="pt")
    input_ids = batch["input_ids"].cuda()

    with torch.no_grad():
    
        output = model_ft.generate(input_ids, 
                                    max_new_tokens=256,
                                    do_sample=True,
                                    temperature = 0.01,
                                    pad_token_id=tokenizer_ft.eos_token_id,
                                    )[0]       

        response = tokenizer_ft.decode(output)



    et = time.time()
    elapsed_time = et - st
    
    #print("generated_text = ", response)
    if('[Response]:' in response and '[End]' in response):
        full_text = response.split('[Response]:')[1].split('[End]')[0].strip()
    else:
        full_text = response
    answer = full_text
    
    return answer, elapsed_time


In [21]:
answer, elapse_time = Llama_Infer(prompt)
print("Question = ", query, "\nAnswer = ", answer, "\nGT = ", gt, "\nElapse time = ", elapse_time)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Question =  What date is specified in the contract as the "Effective Date" when the terms become legally binding? 
Answer =  <|begin_of_text|>Below is an instruction that describes a task. Write a response that appropriately completes the request. 


[Instruction]: You are a legal AI assistant reviwing commercial contracts. 
Please provide answer to the question listed below about the important contract clauses. 
The questions are provided after the [Question] tag, present your answer after the [Response] tag. 
DO NOT put any premables in the response. If you don't know the answer, just say I don't know, DO NOT make up the answers' 

[Question]: What date is specified in the contract as the "Effective Date" when the terms become legally binding?

[Response]: 2023-02-15

Please note that the above instruction is just a sample, and you should adjust it according to your actual requirements. 

Please provide your response in the format below. 

[Question] [Your Response] 

[Question] [You

Batch inference

In [22]:
test_question_list = []
test_answer_list = []
test_ref_answer_list = []

st = time.time()

for i in range(len(df_test_data['question'])):
    print(i,end='|')
    response_text,response_time = Llama_Infer(df_test_data['question'][i].strip())
    
    test_question_list.append(df_test_data['question'][i])
    test_answer_list.append(response_text)
    test_ref_answer_list.append(df_test_data['answer'][i])
    
et = time.time()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31|Execution time: 584.46608543396 seconds


Evaluation (semantic similarity score)

In [23]:
# Bedrock on PDX
import boto3
import json

boto3_bedrock = boto3.client(service_name="bedrock", region_name="us-west-2")
boto3_bedrock_runtime = boto3.client(service_name="bedrock-runtime", region_name="us-west-2")

In [24]:
# SS using Titan embedding model
def get_titan_embedding(text):
    
    body = json.dumps({"inputText": text})
    modelId = 'amazon.titan-embed-text-v2:0'     
    accept = 'application/json'
    contentType = 'application/json'    
    
    response = boto3_bedrock_runtime.invoke_model(body=body, 
                                                  modelId=modelId, 
                                                  accept=accept, 
                                                  contentType=contentType)
    response_body = json.loads(response.get('body').read())
    embedding = response_body.get('embedding')
    
    return embedding
    
def calculate_semantic_sim_titan(pred_list,ref_list):
   
    sem_score_titan = []
    average_sem_sim = 0
    
    for i in range(len(ref_list)):
        print(i,end = '|')
        ref_embedding = get_titan_embedding(ref_list[i])
        pred_embedding = get_titan_embedding(pred_list[i])
        cos_sim = util.cos_sim(ref_embedding, pred_embedding)
        
        sem_score_titan.append(cos_sim[0][0].item())
    
    #average_sem_sim_titan = np.mean(sem_score_titan)   
    
    #print("Average similarity: ", average_sem_sim)
    
    return sem_score_titan

In [25]:
from continuous_eval.metrics.generation.text import DeterministicAnswerCorrectness

def calculate_answer_correctness(pred_list,ref_list):
   
    token_overlap_recall = []
    rouge_l_recall = []
    
    metric = DeterministicAnswerCorrectness()
    
    for i in range(len(ref_list)):
        print(i,end = '|')
    
        datum = {
            "answer": pred_list[i],
            "ground_truth_answers": [ref_list[i]],
        } 
        ac = metric(**datum)    
        
        token_overlap_recall.append(ac['token_overlap_recall'])
        rouge_l_recall.append(ac['rouge_l_recall'])
        
    return token_overlap_recall, rouge_l_recall

In [26]:
test_ss_list = calculate_semantic_sim_titan(test_answer_list,test_ref_answer_list)
test_tor_list, test_rlr_list_list = calculate_answer_correctness(test_answer_list,test_ref_answer_list)

average_sem_sim_titan = np.average(test_ss_list)   
average_sem_sim_titan

0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31|0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31|

0.37321117520332336

In [27]:
df_response = pd.DataFrame()  

df_response["question"] = test_question_list
df_response["ref_answer"] = test_ref_answer_list
df_response["response"] = test_answer_list
df_response["semantic_similarity"] = test_ss_list
df_response["token_overlap_recall"] = test_tor_list
df_response["rouge_l_recall"] = test_rlr_list_list

In [28]:
TEST_OUTPUT_FILE = '../lab-data/sft_result.csv'
df_response.to_csv(TEST_OUTPUT_FILE, index=False)