# Fine Tuning Large Language Model - Model

In this workshop, you will learn how to fine tune the prompts and the LLMs to enhance and improves its response.

In [1]:
# Import libraries
import torch, time
import pandas as pd
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, GenerationConfig, TrainingArguments

from peft import PeftModel, LoraConfig, get_peft_model, TaskType

In [2]:
# Load and explore the following datasets
# Q: Number of sets? 
# Q: How many records in each of these sets?
# Q: What are the column names?

dataset_name = "knkarthick/dialogsum"
model_name = "google/flan-t5-small"
model_name = "google/flan-t5-base"

dataset = load_dataset(dataset_name)

print(dataset)
print(dataset.shape)

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})
{'train': (12460, 4), 'validation': (500, 4), 'test': (1500, 4)}


In [4]:
# Print a record
idx = 300

for k, v in dataset['train'][idx].items():
   print(f'{k.upper()}\n{v}\n')

ID
train_300

DIALOGUE
#Person1#: Would you like me to show you our new cleaning unit? It's a clever design.
#Person2#: Yes, I'd like to see that. What does it clean exactly?
#Person1#: It washes the solvent off all the metal parts - the blades, trays etc. - and then sends it back into the system.
#Person2#: What does the unit consist of?
#Person1#: Well, it's basically two tanks, one for the dirty solvent and one for the clean solvent, a pump and a washing unit. Oh, and there's a cooling system and a filter. It's all controlled by a PLC system, that stands for Process Logic Control.

SUMMARY
#Person1# introduces a new cleaning unit to #Person2# and explains it.

TOPIC
cleaning unit introduction



## Fine tuning the LLM model

In this workshop we will be turning the <code>google/flan-t5-base</code> model.

In [5]:
# Utility function to dump a model's tunable parameters

def print_trainable_model_params(model):
   trainable_model_params = 0
   all_model_params = 0
   for _, param in model.named_parameters():
      all_model_params += param.numel()
      if param.requires_grad:
         trainable_model_params += param.numel()
   return f"Trainable parameters: {trainable_model_params}\nTotal parameters: {all_model_params}\nPercentable of trainable parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [9]:
# TODO: Load model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [10]:
# TODO: Print number of trainable parameters
print(print_trainable_model_params(model))

Trainable parameters: 247577856
Total parameters: 247577856
Percentable of trainable parameters: 100.00%


In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

### Preprocess the dialogue dataset

We will train the model to summarize dialogue by creating a dialogue-summary pair for the LLM to process. The dialogue is the training data and the summary is the label. This is supervized learning.

The prompt will be as follows

```
Summarize the following dialogue.\n
\n
Fred: ...\n
Barney: ...\n
\n
Summary:\n
Summary of the conversation between Fred and Barney
```

The prompt and the summary will be tokenized for the LLM

In [13]:
# Utitlity function to prepare the data for training 
# Tokenize function
# Need to create a tokenizer before calling this
def tokenize_fn(data):
   start_prompt = 'Summarize the following dialogue.\n\n'
   end_prompt = '\n\nSummary:'
   prompt = [ start_prompt + d + end_prompt for d in data['dialogue'] ]
   summary = data['summary']
   data['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
   data['labels'] = tokenizer(summary, padding="max_length", truncation=True, return_tensors="pt").input_ids
   return data


In [14]:
# TODO: prepare the data for training with the tokenize_fn function
# Tokenize the 3 splits of the dataset: train, validation, test

tokenized_dataset = dataset.map(tokenize_fn, batched=True)

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [20]:
# TODO: Verify prepared data
for _, k in enumerate(tokenized_dataset['train'][idx]):
   print(k, tokenized_dataset['train'][idx][k])

dec_prompt = tokenizer.decode(tokenized_dataset['train'][idx]['input_ids'], skip_special_tokens=True)

print('\n\n')
print(dec_prompt)

id train_300
dialogue #Person1#: Would you like me to show you our new cleaning unit? It's a clever design.
#Person2#: Yes, I'd like to see that. What does it clean exactly?
#Person1#: It washes the solvent off all the metal parts - the blades, trays etc. - and then sends it back into the system.
#Person2#: What does the unit consist of?
#Person1#: Well, it's basically two tanks, one for the dirty solvent and one for the clean solvent, a pump and a washing unit. Oh, and there's a cooling system and a filter. It's all controlled by a PLC system, that stands for Process Logic Control.
summary #Person1# introduces a new cleaning unit to #Person2# and explains it.
topic cleaning unit introduction
input_ids [12198, 1635, 1737, 8, 826, 7478, 5, 1713, 345, 13515, 536, 4663, 10, 5328, 25, 114, 140, 12, 504, 25, 69, 126, 2327, 1745, 58, 94, 31, 7, 3, 9, 13183, 408, 5, 1713, 345, 13515, 357, 4663, 10, 2163, 6, 27, 31, 26, 114, 12, 217, 24, 5, 363, 405, 34, 1349, 1776, 58, 1713, 345, 13515, 536, 

In [21]:
# TODO: Remove id, dialogue, summary and topic columns from dataset. We only want input_ids and labels
drop_cols = [ 'id', 'dialogue', 'summary', 'topic' ]
clean_tokenized_dataset = tokenized_dataset.remove_columns(drop_cols)


In [29]:
# TODO: Verify dataset again

print('input_ids = ', clean_tokenized_dataset['train'][idx]['input_ids'])
print('labels = ', clean_tokenized_dataset['train'][idx]['labels'])


input_ids =  [12198, 1635, 1737, 8, 826, 7478, 5, 1713, 345, 13515, 536, 4663, 10, 5328, 25, 114, 140, 12, 504, 25, 69, 126, 2327, 1745, 58, 94, 31, 7, 3, 9, 13183, 408, 5, 1713, 345, 13515, 357, 4663, 10, 2163, 6, 27, 31, 26, 114, 12, 217, 24, 5, 363, 405, 34, 1349, 1776, 58, 1713, 345, 13515, 536, 4663, 10, 94, 47, 88, 7, 8, 23915, 326, 66, 8, 1946, 1467, 3, 18, 8, 8720, 7, 6, 3, 28501, 672, 5, 3, 18, 11, 258, 1299, 7, 34, 223, 139, 8, 358, 5, 1713, 345, 13515, 357, 4663, 10, 363, 405, 8, 1745, 5608, 13, 58, 1713, 345, 13515, 536, 4663, 10, 1548, 6, 34, 31, 7, 6171, 192, 16007, 6, 80, 21, 8, 13086, 23915, 11, 80, 21, 8, 1349, 23915, 6, 3, 9, 5013, 11, 3, 9, 9834, 1745, 5, 3359, 6, 11, 132, 31, 7, 3, 9, 9243, 358, 11, 3, 9, 4191, 5, 94, 31, 7, 66, 6478, 57, 3, 9, 276, 6480, 358, 6, 24, 5024, 21, 10272, 3, 20641, 4330, 5, 20698, 10, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### Tune model with pre-processed dataset

We will use [<code>Trainer</code>](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#api-reference%20][%20transformers.Trainer) to train the original model. The training result will be written out. The trainer will be configure with [<code>TrainingArgument</code>](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments)

In [30]:
# CUDA information
# We have install the torch CPU version
# pip3 install torch==2.5.1+cpu --index-url https://download.pytorch.org/whl/cpu
# To install CUDA version
# pip3 install torch

print('CUDA available: ', torch.cuda.is_available())
if torch.cuda.is_available():
   print('B16 supported: ', torch.cuda.is_bf16_supported())
   torch.cuda.set_device(0)
   print('Current device: ', torch.cuda.current_device())
   print('CUDA device name: ', torch.cuda.get_device_name(0))

CUDA available:  False


## Fine tuning the LLM Model with Low-Rank Adaptation (LoRA) / Parameter Efficient Fine Tuning (PEFT)

We will add a LoRA adapter to the LLM (flan-t5-base) and train the adapter. The original LLM will be frozen. The adapter can be combined with the original LLM during inferencing. 

In [None]:
# TODO: Configure LoRA
lora_config = LoraConfig(
   r=32,
   lora_alpha=32,
   target_modules=[ 'q', 'v' ],
   lora_dropout=0.05,
   bias='none',
   task_type=TaskType.SEQ_2_SEQ_LM
)

In [38]:
# TODO: Add LoRA to the LLM model to be trained
# load the original model. 
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype = torch.bfloat16)

# create the LoRA model 
lora_model = get_peft_model(original_model, lora_config)


In [35]:
# TODO: Print number of parameters, compare LoRA to the original model
print('LoRA: ', print_trainable_model_params(lora_model))

# LoRA fuses the adaptation to the original model
#print('Original model: ', print_trainable_model_params(original_model))

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
print('Original model: ', print_trainable_model_params(model))

LoRA:  Trainable parameters: 3538944
Total parameters: 251116800
Percentable of trainable parameters: 1.41%
Original model:  Trainable parameters: 247577856
Total parameters: 247577856
Percentable of trainable parameters: 100.00%


In [36]:
# TODO: Train model with LoRA
output_dir = f'peft-dialog-summary-training-{str(int(time.time()))}'

lora_training_args = TrainingArguments(
   output_dir=output_dir, 
   auto_find_batch_size=True,
   learning_rate=1e-3,
   num_train_epochs=1,
   logging_first_step=1,
   max_steps=1
)

In [None]:
# TODO: Create trainer and train model
lora_trainer = Trainer(
   model = lora_model,
   args = lora_training_args,
   train_dataset = clean_tokenized_dataset['train'],
   eval_dataset = clean_tokenized_dataset['validation']
)

# Start the training
lora_trainer.train()

In [None]:
# TODO: Save trained model
lora_model_path = 'lora_dialogue_summary_checkpoint'

# Save the model
lora_trainer.model.save_pretrain(lora_model_path)
# Save the tokenizer
tokenizer.save_pretrained(lora_model_path)

### Use a trained LoRA model

The training will take a few hours and over many iterations.

For the purpose of this workshop we use a save model [intotheverse/peft-dialogue-summary-checkpoint](https://huggingface.co/intotheverse/peft-dialogue-summary-checkpoint).

In [45]:
#TODO: Load the original model and add the pre-trained LoRA adaptation to the model
peft_dialogue_summary_checkpoint = 'intotheverse/peft-dialogue-summary-checkpoint'

# Load the base model
original_model = AutoModelForSeq2SeqLM.from_pretrained(
   model_name, torch_dtype = torch.bfloat16
)

# Load the LoRA model
lora_model = PeftModel.from_pretrained(
               original_model, # the original model
               peft_dialogue_summary_checkpoint, # the LoRA adaptation
               torch_dtype=torch.bfloat16, # quantization
               is_trainable=False) # not trainable 

print(print_trainable_model_params(lora_model))

Trainable parameters: 0
Total parameters: 251116800
Percentable of trainable parameters: 0.00%


## Evaluate LoRA model

In [52]:
# Evaluate LoRA model with a single sample
# Pick a sample from the test dataset, 
# compare the completions between model, lora_model

idx = 500

def create_prompt(data):
   start_prompt = 'Summarize the following dialogue.\n\n'
   end_prompt = '\n\nSummary:'
   return f"{start_prompt}{data['dialogue']}{end_prompt}"

#print(create_prompt(dataset['test'][idx]))

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Create and encode the prompt
prompt = create_prompt(dataset['test'][idx])
enc_prompt = tokenizer(prompt, return_tensors='pt')

# Get completion from the original model
comp_original_model = model.generate(input_ids=enc_prompt['input_ids'])
comp_original_model_resp = tokenizer.decode(comp_original_model[0], skip_special_tokens=True)

print("Calling LoRA model")
comp_lora_model = lora_model.generate(input_ids=enc_prompt['input_ids'])
comp_lora_model_resp = tokenizer.decode(comp_lora_model[0], skip_special_tokens=True)

print(f"Label: ${dataset['test'][idx]['summary']}")

print(f"Original: ${comp_original_model_resp}")

print(f"LoRA: ${comp_lora_model_resp}")

Calling LoRA model
Label: $#Person2# tells David the plan for a tour and #Person2# will celebrate #Person2#'s brother's fortieth birthday when at Salt Lake City.
Original: $David and his brother are going on a four day drive to Salt Lake City this Friday.
LoRA: $David is planning a tour for his vacation. He will start out from Long Island this Friday and


In [54]:
# TODO: Compare LoRA against the original model 

dialogues = []
summaries = []
orig_model_summaries = []
lora_model_summaries = []
config = GenerationConfig(max_new_tokens=512)

for idx in range(300, 305):
   print('idx: ', idx)
   # create and encode prompt
   prompt = create_prompt(dataset['test'][idx])
   enc_prompt = tokenizer(prompt, return_tensors='pt')

   # Get completion from the original model
   comp_original_model = model.generate(input_ids=enc_prompt['input_ids'], generation_config=config)
   comp_original_model_resp = tokenizer.decode(comp_original_model[0], skip_special_tokens=True)

   comp_lora_model = lora_model.generate(input_ids=enc_prompt['input_ids'], generation_config=config)
   comp_lora_model_resp = tokenizer.decode(comp_lora_model[0], skip_special_tokens=True)

   summaries.append(dataset['test'][idx]['summary'])
   orig_model_summaries.append(comp_original_model_resp)
   lora_model_summaries.append(comp_lora_model_resp)



idx:  300
idx:  301
idx:  302
idx:  303
idx:  304


In [67]:
# Create the df for comparison
zip_summaries = list(zip(summaries, orig_model_summaries, lora_model_summaries))

pd.options.display.max_colwidth = 500

cols = [ 'label', 'original_summaries', 'lora_summaries']
df = pd.DataFrame(zip_summaries, columns=cols)

df

Unnamed: 0,label,original_summaries,lora_summaries
0,#Person1# is crazy for Trump and voted for him. #Person2# doesn't agree with #Person1# on Trump and will vote for Biden.,"Person1 is proud of Trump, and is happy if he can be re-elected.",#Person1# and #Person2# are happy if Trump could be our President again. #Person2# believes Trump will make America great again but #Person1# doesn't think he is the right person. #Person2# will vote for Biden instead.
1,#Person1# is a crazy fan of Trump and wants him to be re-elected. #Person2# will vote for Biden.,"Person1 is proud of Trump, and is happy if he can be re-elected.",#Person1# and #Person2# are happy if Trump could be our President again. #Person2# believes Trump will make America great again but #Person1# doesn't think he is the right person. #Person2# will vote for Biden instead.
2,#Person1# is crazy for Trump and voted for him but #Person2# will vote for Biden.,"Person1 is proud of Trump, and is happy if he can be re-elected.",#Person1# and #Person2# are happy if Trump could be our President again. #Person2# believes Trump will make America great again but #Person1# doesn't think he is the right person. #Person2# will vote for Biden instead.
3,#Person1# doesn't know how to use the ATM. #Person2# teaches #Person1# step by step.,#Person1#: I need to use the ATM. #Person2#: OK.,#Person1# needs to use the ATM. #Person2# helps #Person1# figure out how to use it.
4,#Person1# doesn't know how to use an ATM. #Person2# teaches #Person1#.,#Person1#: I need to use the ATM. #Person2#: OK.,#Person1# needs to use the ATM. #Person2# helps #Person1# figure out how to use it.


### Evaluate models with ROUGE/Bleu metrics

Recall-Oriented Understudy for Gisting Evaluate ([ROUGE](https://pub.aimind.so/unveiling-the-power-of-rouge-metrics-in-nlp-b6d3f96d3363)) is a set of metrics used to evaluate the quality of machine-generated text, such as summaries and translations. ROUGE metrics compare the generated text to a human-written reference and measure the overlap between the two. 

The metrics range between 0 and 1, with higher scores indicating higher similarity between the baseline and generated text.

In [None]:
# TODO: create ROUGE
rouge = evaluate.load('rouge')

orig_model_results = rouge.compute(
   references=summaries,
   predictions=orig_model_summaries,
   use_stemmer=True
)

lora_model_results = rouge.compute(
   references=summaries,
   predictions=lora_model_summaries,
   use_stemmer=True
)

print('ROUGE - Original model results')
print(orig_model_results)

print()

print('ROUGE - LoRA model results')
print(lora_model_results)

Original model results
{'rouge1': 0.3918246504453401, 'rouge2': 0.23250770631176104, 'rougeL': 0.39355625217694185, 'rougeLsum': 0.39052594914663874}

LoRA model results
{'rouge1': 0.45118418806045585, 'rouge2': 0.20837735849056602, 'rougeL': 0.3652181039604975, 'rougeLsum': 0.36466080685147617}


In [74]:
# TODO: create Bleu
bleu = evaluate.load('bleu')

orig_model_results = bleu.compute(
   references=summaries,
   predictions=orig_model_summaries,
)

lora_model_results = bleu.compute(
   references=summaries,
   predictions=lora_model_summaries
)

print('BLEU - Original model results')
print(orig_model_results)

print()

print('BLEU - LoRA model results')
print(lora_model_results)

BLEU - Original model results
{'bleu': 0.14406990484132043, 'precisions': [0.5189873417721519, 0.25675675675675674, 0.17391304347826086, 0.109375], 'brevity_penalty': 0.6420828237327826, 'length_ratio': 0.6929824561403509, 'translation_length': 79, 'reference_length': 114}

BLEU - LoRA model results
{'bleu': 0.21926781650570026, 'precisions': [0.43157894736842106, 0.2756756756756757, 0.18888888888888888, 0.10285714285714286], 'brevity_penalty': 1.0, 'length_ratio': 1.6666666666666667, 'translation_length': 190, 'reference_length': 114}
