## PEFT/LoRA fine-tuning with the CUAD dataset

Instruction fine-tuning: meta.llama3-8b-instruct-v1:0 (meta-llama/Meta-Llama-3-8B-Instruct) with LoRA, no quantization
     
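Training: Hugging Face Transformers `Trainer`  
Data: CUAD (`master_clauses.csv`) - the contracts listed in `CONTRACT_FILES` below, starting with BONTONSTORESINC_04_20_2018-EX-99.3-AGENCY AGREEMENT.PDF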

2024/07/31: first version

In [None]:
#!pip install transformers
#!pip install datasets
#!pip install accelerate
#!pip install bitsandbytes
#!pip install peft

In [None]:
#!pip install gradio
#!pip install py7zr
#!pip install --upgrade accelerate

In [None]:
#!pip install --upgrade sagemaker 
#!pip install ipywidgets==7.0.0 
#!pip install langchain==0.0.148 
#!pip install faiss-cpu 

#!pip install pypdf
#!pip install sentence_transformers
#!pip install chromadb

In [None]:
#!pip install einops

In [None]:
!nvidia-smi

## SFT

In [None]:
import torch
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForLanguageModeling,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM
#from trl import SFTTrainer

In [None]:
torch.__version__       

### Prepare data from csv

In [None]:
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np

In [None]:
CONTRACT_FILES = ["BONTONSTORESINC_04_20_2018-EX-99.3-AGENCY AGREEMENT.PDF",
                  "ENERGOUSCORP_03_16_2017-EX-10.24-STRATEGIC ALLIANCE AGREEMENT.PDF",
                  "MRSFIELDSORIGINALCOOKIESINC_01_29_1998-EX-10-FRANCHISE AGREEMENT.PDF",
                  "PlayboyEnterprisesInc_20090220_10-QA_EX-10.2_4091580_EX-10.2_Content License Agreement_ Marketing Agreement_ Sales-Purchase Agreement1.pdf"
                 ]

In [None]:
TRN_FILE = "../lab-data/"+"master_clauses.csv"  

df = pd.read_csv(TRN_FILE)

question_list = []
answer_list = []

In [None]:
def create_qa_data(contract_files=None, clauses_df=None, questions=None, answers=None):
    """
    Builds question/answer pairs from the clause table and appends them to the output lists

    For each contract file name, the matching rows of the clause table are melted into one
    (title, answer) record per column. Records are then consumed in pairs: the "title" of each
    even-positioned record becomes a question and the "answer" of the following odd-positioned
    record becomes its answer.

    NOTE(review): the pairing assumes the table columns alternate "<clause>" / "<clause> answer"
    and that each file name matches a single row - confirm against the CSV layout.

    :param contract_files: Contract file names to process (defaults to the global CONTRACT_FILES)
    :param clauses_df: DataFrame with a "Filename" column (defaults to the global df)
    :param questions: List the questions are appended to (defaults to the global question_list)
    :param answers: List the answers are appended to (defaults to the global answer_list)
    """

    # Fall back to the notebook-level globals so the original zero-argument call keeps working
    contract_files = CONTRACT_FILES if contract_files is None else contract_files
    clauses_df = df if clauses_df is None else clauses_df
    questions = question_list if questions is None else questions
    answers = answer_list if answers is None else answers

    for contract_file in contract_files:
        df_qa = pd.melt(clauses_df[clauses_df["Filename"] == contract_file],
                        id_vars=['Filename'],
                        var_name='title',
                        value_name='answer'
                       )

        # Strip the extension whatever its case: the file list mixes ".PDF" and ".pdf",
        # and a case-sensitive split left ".pdf" inside the generated questions
        if contract_file.lower().endswith('.pdf'):
            contract_name = contract_file[:-len('.pdf')]
        else:
            contract_name = contract_file

        # Even positions carry the clause title, odd positions carry the matching answer
        titles = df_qa['title'].iloc[0::2]
        values = df_qa['answer'].iloc[1::2]

        questions.extend("What is the " + title + " in the contract " + contract_name + " ?"
                         for title in titles)
        answers.extend(values.tolist())

    return

In [None]:
create_qa_data()

In [None]:
# Assemble the question/answer table from the two collected lists
df_seed_data = pd.DataFrame({'question': question_list, 'answer': answer_list})

# Keep only the rows that have an answer, then renumber them
# (reset_index keeps the previous row labels in an "index" column)
df_seed_data = df_seed_data.loc[df_seed_data['answer'].notna()].reset_index()
df_seed_data

In [None]:
df_train_data = df_seed_data

sample = Dataset.from_pandas(df_train_data)
sample

### Prepare data from JSON

In [None]:
# load data from json

In [None]:
# create separate training and testing datasets

### Fine tuning

Prompt template and preprocessing helpers

In [None]:
# Initialize static strings for the prompt template
INTRO_BLURB = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. \n'

INSTRUCTION_KEY = """
[Instruction]: You are a legal AI assistant reviwing commercial contracts. 
Please provide answer to the question listed below about the important contract clauses. 
The questions are provided after the [Question] tag, present your answer after the [Response] tag. 
DO NOT put any premables in the response. If you don't know the answer, just say I don't know, DO NOT make up the answers' 
"""

INPUT_KEY = '[Question]: '
RESPONSE_KEY = '[Response]: '
END_KEY = "[End]"

In [None]:
def create_prompt_formats(sample):
    """
    Builds the training prompt for one sample and stores it under the "text" key

    The prompt is made of the intro blurb, the instruction, the question (skipped when the
    sample has no question), the response and the end marker, joined with newlines.

    :param sample: Mapping with "question" and "answer" entries; it is modified in place
    :return: The same sample, with the formatted prompt added as sample["text"]
    """
    question = sample['question']

    # Ordered prompt segments; empty ones are dropped before joining
    segments = [
        f"{INTRO_BLURB}",
        f"{INSTRUCTION_KEY}",
        f"{INPUT_KEY}{question}\n" if question else None,
        f"{RESPONSE_KEY}{sample['answer']}\n",
        f"{END_KEY}",
    ]

    sample["text"] = "\n".join(filter(None, segments))

    return sample

In [None]:
from random import randrange

# Preview the prompt built for one randomly chosen sample. The upper bound follows the dataset
# size, so the index is always valid and every row can be picked
sample_p = create_prompt_formats(sample[randrange(len(sample))])
print(sample_p['text'])

In [None]:
def get_max_length(model):
    """
    Looks up the maximum token length in the model configuration

    :param model: Hugging Face model (only its "config" attribute is read)
    :return: First truthy value among the "n_positions", "max_position_embeddings" and
             "seq_length" settings, or 1024 when none of them is set
    """

    config = model.config
    length_settings = ("n_positions", "max_position_embeddings", "seq_length")

    # Settings are probed lazily, in order, and the first truthy one wins
    found = next(filter(None, (getattr(config, name, None) for name in length_settings)), None)
    if found:
        print(f"Found max lenth: {found}")
        return found

    # None of the settings is available: fall back to the default value
    fallback = 1024
    print(f"Using default max length: {fallback}")
    return fallback

In [None]:
def preprocess_batch(batch, tokenizer, max_length):
    """
    Runs the tokenizer over the "text" field of a dataset batch

    :param batch: Dataset batch holding the prompts under the "text" key
    :param tokenizer: Model tokenizer (called with the texts, "max_length" and "truncation")
    :param max_length: Maximum number of tokens to emit from the tokenizer
    :return: Whatever the tokenizer returns for the batch
    """

    # Truncation is always requested together with the length limit
    tokenizer_options = {"max_length": max_length, "truncation": True}
    texts = batch["text"]
    return tokenizer(texts, **tokenizer_options)

In [None]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: "Dataset"):
    """
    Tokenizes dataset for fine-tuning

    Adds the prompt text to every sample, tokenizes it, drops the raw columns, removes samples
    whose token count is not strictly below "max_length" and shuffles the result.

    :param tokenizer (AutoTokenizer): Model tokenizer
    :param max_length (int): Maximum number of tokens to emit from the tokenizer
    :param seed: Random seed for reproducibility
    :param dataset (Dataset): Instruction dataset with "question" and "answer" columns
    :return: Tokenized, filtered and shuffled dataset
    """

    # Imported here so the function does not depend on an import made in a later notebook cell
    from functools import partial

    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Tokenize batch by batch and drop the raw columns so that only the tokenizer outputs remain.
    # Only columns actually present are listed, so a dataset without e.g. "index" still works
    _preprocessing_function = partial(preprocess_batch, max_length = max_length, tokenizer = tokenizer)
    raw_columns = [name for name in ['question', 'answer', 'text', 'index'] if name in dataset.column_names]
    dataset = dataset.map(
        _preprocessing_function,
        batched = True,
        remove_columns = raw_columns,
    )

    # Keep only samples strictly shorter than "max_length"; the tokenizer truncates at
    # "max_length", so samples that reached the limit are dropped here as well
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed = seed)

    return dataset

Setup model and tokenizer

In [None]:
model_name = 'meta-llama/Meta-Llama-3-8B-Instruct'

In [None]:
# Activate 4-bit precision base model loading
load_in_4bit = False

# Activate nested quantization for 4-bit base models (double quantization)
bnb_4bit_use_double_quant = True

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Compute data type for 4-bit base models
bnb_4bit_compute_dtype = torch.bfloat16

In [None]:
# use this for qLoRA
bnb_config = BitsAndBytesConfig(
        load_in_4bit = load_in_4bit,
        bnb_4bit_use_double_quant = bnb_4bit_use_double_quant,
        bnb_4bit_quant_type = bnb_4bit_quant_type,
        bnb_4bit_compute_dtype = bnb_4bit_compute_dtype,
)

In [None]:
# Get number of GPU device and set maximum memory
n_gpus = torch.cuda.device_count()
max_memory = f'{40960}MB'

In [None]:
import os

# Hugging Face access token, read from the HF_TOKEN environment variable so that no secret is
# stored in the notebook. TOKEN is None when the variable is not set
# NOTE(review): with None the libraries are expected to fall back to a cached login - confirm
TOKEN = os.environ.get("HF_TOKEN")

In [None]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    #load_in_8bit=True,
    device_map = "auto", # dispatch the model efficiently on the available resources
    #max_memory = {i: max_memory for i in range(n_gpus)},
    token = TOKEN,
    trust_remote_code=True,
)

# Load model tokenizer with the user authentication token
tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          token = TOKEN,
                                         )
#tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Set padding token as EOS token
tokenizer.pad_token = tokenizer.eos_token

In [None]:
!nvidia-smi

Preprocess the data

In [None]:
from functools import partial

seed = 0

max_length = get_max_length(model)
preprocessed_dataset = preprocess_dataset(tokenizer, max_length, seed, sample)

In [None]:
print(preprocessed_dataset)

In [None]:
lora_config = LoraConfig(
    r = 64,    #16,
    lora_alpha = 256,    #64,
    #target_modules = ["q_proj","k_proj","v_proj","o_proj",],
    target_modules = ['gate_proj', 'up_proj', 'q_proj', 'v_proj', 'down_proj', 'k_proj', 'o_proj'],
    lora_dropout = 0.05,
    bias = "none",
    task_type = "CAUSAL_LM",
)

In [None]:
def print_trainable_parameters(model, use_4bit = False):
    """
    Prints the number of trainable parameters in the model.

    :param model: PEFT model (any object exposing "named_parameters()")
    :param use_4bit: When True, the trainable parameter count is halved before it is reported
    """

    trainable_params = 0
    all_param = 0

    for _, param in model.named_parameters():
        num_params = param.numel()
        # A parameter reporting zero elements may expose its size through "ds_numel" instead
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        # Integer division keeps the count an int: "/=" produced a float, which the ",d"
        # format below rejects with a ValueError
        trainable_params //= 2

    # Guard against a model without parameters instead of dividing by zero
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0

    print(
        f"All Parameters: {all_param:,d} || Trainable Parameters: {trainable_params:,d} || Trainable Parameters %: {trainable_pct}"
    )

In [None]:
# Enable gradient checkpointing to reduce memory usage during fine-tuning
#model.gradient_checkpointing_enable()

# Prepare the model for training
#model = prepare_model_for_kbit_training(model)

In [None]:
# Create PEFT configuration for these modules and wrap the model to PEFT
model = get_peft_model(model, lora_config)

# Print information about the percentage of trainable parameters
print_trainable_parameters(model)

In [None]:
# Output directory where the model predictions and checkpoints will be stored
output_dir = "./ft_model_llama3-8b_instruct_cuad"

# Batch size per GPU for training
per_device_train_batch_size = 1

# Number of update steps to accumulate the gradients for
# NOTE: the Trainer cell below has this argument commented out, so the value is currently unused
gradient_accumulation_steps = 1  # previously tried: 4

# Initial learning rate (AdamW optimizer)
learning_rate = 1e-5    # 1e-4 caused oscillation...

# Optimizer to use
# NOTE: the Trainer cell below has this argument commented out, so the value is currently unused
optim = "paged_adamw_32bit"

# Number of training steps (overrides num_train_epochs)
#max_steps = 1000

# Linear warmup steps from 0 to learning_rate
warmup_steps = 2

# Enable fp16 training (on an A100, bf16 can be used instead)
fp16 = True  

# Number of training epochs (passed to the Trainer as num_train_epochs)
epoch = 2  # previously tried: 5
# Log every X update steps; X is derived from the epoch count
logging_steps = epoch*10

# Save a checkpoint every X update steps
save_steps = 20000

In [None]:
# Training parameters
trainer = Trainer(
    model = model,
    train_dataset = preprocessed_dataset,
    args = TrainingArguments(
        per_device_train_batch_size = per_device_train_batch_size,
        #gradient_accumulation_steps = gradient_accumulation_steps,
        warmup_steps = warmup_steps,
        #max_steps = max_steps,
        num_train_epochs = epoch,
        learning_rate = learning_rate,
        fp16 = fp16,
        logging_steps = logging_steps,
        output_dir = output_dir,
        #optim = optim,
        save_strategy="steps",
        save_steps = save_steps,
    ),
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False)
)

In [None]:
model.config.use_cache = False

do_train = True

# Launch training and log metrics
print("Training...")

if do_train:
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)

In [None]:
# Save model
print("Saving last checkpoint of the model...")
trainer.model.save_pretrained(output_dir,
                              token = TOKEN,
                              trust_remote_code=True,
                             )

### Test inference 

In [None]:
model_ft = model
tokenizer_ft = tokenizer

In [None]:
query = df_train_data['question'][25]

# Build the inference prompt with the same layout as the training text, stopping right after
# the response tag so that the model writes the answer. Without this, the Llama_Infer(prompt)
# call below fails because "prompt" is not defined anywhere earlier in this session
prompt = f"{INTRO_BLURB}\n{INSTRUCTION_KEY}\n{INPUT_KEY}{query}\n\n{RESPONSE_KEY}"

In [None]:
import time

def Llama_Infer(prompt):
    """
    Generates an answer for a formatted prompt with the fine-tuned model and times the call

    :param prompt: Full prompt text, expected to end with the "[Response]: " tag
    :return: Tuple (answer, elapsed_time); answer is the generated text up to the first
             "[End]" marker, elapsed_time is the wall-clock duration in seconds
    """

    st = time.time()

    # Keep the attention mask and move the inputs to the model's own device rather than
    # assuming CUDA
    batch = tokenizer_ft(prompt, return_tensors="pt").to(model_ft.device)
    prompt_length = batch["input_ids"].shape[-1]

    with torch.no_grad():

        output = model_ft.generate(**batch,
                                    max_new_tokens=256,
                                    do_sample=True,
                                    temperature = 0.01,
                                    pad_token_id=tokenizer_ft.eos_token_id,
                                    )[0]

        # Decode only the newly generated tokens and skip special tokens: the answer no longer
        # depends on finding "[Response]:" in the decoded text (an IndexError otherwise) and
        # cannot contain begin/end-of-text markers
        generated_text = tokenizer_ft.decode(output[prompt_length:], skip_special_tokens=True)

    et = time.time()
    elapsed_time = et - st

    # Everything after the end marker, if the model produced one, is discarded
    answer = generated_text.split('[End]')[0].strip()

    return answer, elapsed_time


In [None]:
answer, elapse_time = Llama_Infer(prompt)
print("Question = ", query, "Answer = ", answer, "\nElapse time = ", elapse_time)

In [None]:
#model --- restart kernel ---

In [1]:
import torch
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForLanguageModeling,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM
#from trl import SFTTrainer

In [2]:
model_name = 'meta-llama/Meta-Llama-3-8B-Instruct'

In [3]:
output_dir = "./ft_model_llama3-8b_instruct_cuad"

In [4]:
# Initialize static strings for the prompt template
INTRO_BLURB = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. \n'

INSTRUCTION_KEY = """
[Instruction]: You are a legal AI assistant reviwing commercial contracts. 
Please provide answer to the question listed below about the important contract clauses. 
The questions are provided after the [Question] tag, present your answer after the [Response] tag. 
DO NOT put any premables in the response. If you don't know the answer, just say I don't know, DO NOT make up the answers' 
"""

INPUT_KEY = '[Question]: '
RESPONSE_KEY = '[Response]: '
END_KEY = "[End]"

Load the foundation model, attach the fine-tuned PEFT (LoRA) adapter, then merge the adapter into the base weights

In [5]:
# Activate 4-bit precision base model loading
load_in_4bit = False

# Activate nested quantization for 4-bit base models (double quantization)
bnb_4bit_use_double_quant = True

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Compute data type for 4-bit base models
bnb_4bit_compute_dtype = torch.bfloat16

In [6]:
# use this for qLoRA
bnb_config = BitsAndBytesConfig(
        load_in_4bit = load_in_4bit,
        bnb_4bit_use_double_quant = bnb_4bit_use_double_quant,
        bnb_4bit_quant_type = bnb_4bit_quant_type,
        bnb_4bit_compute_dtype = bnb_4bit_compute_dtype,
)

In [7]:
# Get number of GPU device and set maximum memory
n_gpus = torch.cuda.device_count()
max_memory = f'{40960}MB'

In [8]:
import os

# Hugging Face access token, read from the HF_TOKEN environment variable so that no secret is
# stored in the notebook. TOKEN is None when the variable is not set
# SECURITY: a literal token used to be hard-coded on this line; treat it as leaked and revoke it
TOKEN = os.environ.get("HF_TOKEN")

In [9]:
from peft import PeftModel, PeftConfig

model_ft = AutoModelForCausalLM.from_pretrained(  
    model_name,
    quantization_config = bnb_config,
    return_dict=True,
    low_cpu_mem_usage=True,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
model_ft = PeftModel.from_pretrained(
    model_ft, 
    output_dir, 
    torch_dtype = torch.float16,
    device_map="auto",
)

In [11]:
model_ft = model_ft.merge_and_unload()



In [12]:
tokenizer_ft = AutoTokenizer.from_pretrained(model_name)
tokenizer_ft.pad_token = tokenizer_ft.eos_token
tokenizer_ft.padding_side = "right"

In [13]:
!nvidia-smi

Tue Aug  6 05:55:31 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           On  | 00000000:00:1E.0 Off |                    0 |
| N/A   40C    P0              51W / 300W |   8344MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

Prepare data

In [14]:
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np

In [15]:
CONTRACT_FILES = ["BONTONSTORESINC_04_20_2018-EX-99.3-AGENCY AGREEMENT.PDF",
                  "ENERGOUSCORP_03_16_2017-EX-10.24-STRATEGIC ALLIANCE AGREEMENT.PDF",
                  "MRSFIELDSORIGINALCOOKIESINC_01_29_1998-EX-10-FRANCHISE AGREEMENT.PDF",
                  "PlayboyEnterprisesInc_20090220_10-QA_EX-10.2_4091580_EX-10.2_Content License Agreement_ Marketing Agreement_ Sales-Purchase Agreement1.pdf"
                 ]

In [16]:
TRN_FILE = "../lab-data/"+"master_clauses.csv"  

df = pd.read_csv(TRN_FILE)

question_list = []
answer_list = []

In [17]:
def create_qa_data(contract_files=None, clauses_df=None, questions=None, answers=None):
    """
    Builds question/answer pairs from the clause table and appends them to the output lists

    For each contract file name, the matching rows of the clause table are melted into one
    (title, answer) record per column. Records are then consumed in pairs: the "title" of each
    even-positioned record becomes a question and the "answer" of the following odd-positioned
    record becomes its answer.

    NOTE(review): the pairing assumes the table columns alternate "<clause>" / "<clause> answer"
    and that each file name matches a single row - confirm against the CSV layout.

    :param contract_files: Contract file names to process (defaults to the global CONTRACT_FILES)
    :param clauses_df: DataFrame with a "Filename" column (defaults to the global df)
    :param questions: List the questions are appended to (defaults to the global question_list)
    :param answers: List the answers are appended to (defaults to the global answer_list)
    """

    # Fall back to the notebook-level globals so the original zero-argument call keeps working
    contract_files = CONTRACT_FILES if contract_files is None else contract_files
    clauses_df = df if clauses_df is None else clauses_df
    questions = question_list if questions is None else questions
    answers = answer_list if answers is None else answers

    for contract_file in contract_files:
        df_qa = pd.melt(clauses_df[clauses_df["Filename"] == contract_file],
                        id_vars=['Filename'],
                        var_name='title',
                        value_name='answer'
                       )

        # Strip the extension whatever its case: the file list mixes ".PDF" and ".pdf",
        # and a case-sensitive split left ".pdf" inside the generated questions
        if contract_file.lower().endswith('.pdf'):
            contract_name = contract_file[:-len('.pdf')]
        else:
            contract_name = contract_file

        # Even positions carry the clause title, odd positions carry the matching answer
        titles = df_qa['title'].iloc[0::2]
        values = df_qa['answer'].iloc[1::2]

        questions.extend("What is the " + title + " in the contract " + contract_name + " ?"
                         for title in titles)
        answers.extend(values.tolist())

    return

In [18]:
create_qa_data()

In [19]:
# Assemble the question/answer table from the two collected lists
df_seed_data = pd.DataFrame({'question': question_list, 'answer': answer_list})

# Keep only the rows that have an answer, then renumber them
# (reset_index keeps the previous row labels in an "index" column)
df_seed_data = df_seed_data.loc[df_seed_data['answer'].notna()].reset_index()
df_seed_data

Unnamed: 0,index,question,answer
0,0,What is the Document Name in the contract BONT...,AGENCY AGREEMENT
1,1,What is the Parties in the contract BONTONSTOR...,"The Bon-Ton Stores, Inc. and its associated ch..."
2,2,What is the Agreement Date in the contract BON...,4/18/18
3,7,What is the Governing Law in the contract BONT...,Delaware
4,8,What is the Most Favored Nation in the contrac...,No
...,...,...,...
153,159,What is the Liquidated Damages in the contract...,No
154,160,What is the Warranty Duration in the contract ...,No
155,161,What is the Insurance in the contract PlayboyE...,Yes
156,162,What is the Covenant Not To Sue in the contrac...,No


In [20]:
df_train_data = df_seed_data

sample = Dataset.from_pandas(df_train_data)
sample

Dataset({
    features: ['index', 'question', 'answer'],
    num_rows: 158
})

Prepare prompt

In [25]:
query = df_train_data['question'][20]

In [26]:
blurb = f"{INTRO_BLURB}"
instruction = f"{INSTRUCTION_KEY}"
input_context = f'{INPUT_KEY}{query}\n\n{RESPONSE_KEY}'

prompt = blurb+'\n'+instruction+'\n'+input_context
print(prompt)

Below is an instruction that describes a task. Write a response that appropriately completes the request. 


[Instruction]: You are a legal AI assistant reviwing commercial contracts. 
Please provide answer to the question listed below about the important contract clauses. 
The questions are provided after the [Question] tag, present your answer after the [Response] tag. 
DO NOT put any premables in the response. If you don't know the answer, just say I don't know, DO NOT make up the answers' 

[Question]: What is the Joint Ip Ownership in the contract BONTONSTORESINC_04_20_2018-EX-99.3-AGENCY AGREEMENT ?

[Response]: 


Inference

In [27]:
import time

def Llama_Infer(prompt):
    """
    Generates an answer for a formatted prompt with the fine-tuned model and times the call

    :param prompt: Full prompt text, expected to end with the "[Response]: " tag
    :return: Tuple (answer, elapsed_time); answer is the generated text up to the first
             "[End]" marker, elapsed_time is the wall-clock duration in seconds
    """

    st = time.time()

    # Keep the attention mask and move the inputs to the model's own device rather than
    # assuming CUDA
    batch = tokenizer_ft(prompt, return_tensors="pt").to(model_ft.device)
    prompt_length = batch["input_ids"].shape[-1]

    with torch.no_grad():

        output = model_ft.generate(**batch,
                                    max_new_tokens=256,
                                    do_sample=True,
                                    temperature = 0.01,
                                    pad_token_id=tokenizer_ft.eos_token_id,
                                    )[0]

        # Decode only the newly generated tokens and skip special tokens: the answer no longer
        # depends on finding "[Response]:" in the decoded text (an IndexError otherwise) and
        # cannot contain begin/end-of-text markers
        generated_text = tokenizer_ft.decode(output[prompt_length:], skip_special_tokens=True)

    et = time.time()
    elapsed_time = et - st

    # Everything after the end marker, if the model produced one, is discarded
    answer = generated_text.split('[End]')[0].strip()

    return answer, elapsed_time


In [28]:
answer, elapse_time = Llama_Infer(prompt)
print("Question = ", query, "Answer = ", answer, "\nElapse time = ", elapse_time)

Question =  What is the Joint Ip Ownership in the contract BONTONSTORESINC_04_20_2018-EX-99.3-AGENCY AGREEMENT ? Answer =  The Joint IP Ownership clause in the contract BONTONSTORESINC_04_20_2018-EX-99.3-AGENCY AGREEMENT states that the parties agree to jointly own and share the intellectual property rights (IP) created during the term of the agreement. This means that both parties will have equal rights to use, modify, and distribute the IP, and any profits generated from the IP will be shared equally. The clause also specifies that the parties will work together to maintain and defend the IP, and that any disputes arising from the IP will be resolved through arbitration. 

Please note that the above response is based on the provided contract and may not be applicable to other contracts or situations. It is recommended to consult with a legal expert or review the contract in its entirety before making any decisions or taking any actions. 

I hope this response meets your requirements.