# Fine Tuning Large Language Model - Model

In this workshop, you will learn how to fine tune the prompts and the LLMs to enhance and improves its response.

In [1]:
# Import libraries
import torch, time
import pandas as pd
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, GenerationConfig, TrainingArguments

from peft import PeftModel, LoraConfig, get_peft_model, TaskType

In [3]:
# TODO: Load and explore the following datasets

dataset_name = "knkarthick/dialogsum"
model_name = "google/flan-t5-small"
model_name = "google/flan-t5-base"

dataset = load_dataset(dataset_name)

# print the shape
print(dataset.shape)

# print dataset keys
print(dataset.keys())

{'train': (12460, 4), 'validation': (500, 4), 'test': (1500, 4)}
dict_keys(['train', 'validation', 'test'])


In [4]:
# TODO: Print a record
idx = 50
for k, v in dataset['train'][idx].items():
   print(f'k = {k}\nv = {v}')


k = id
v = train_50
k = dialogue
v = #Person1#: You have the right to remain silent. Anything you say can and will be used against you in a court of law. You have the right to have an attorney present during questioning. If you cannot afford an attorney, one will be appointed for you. Do you understand?
#Person2#: Yes.
#Person1#: What's your name?
#Person2#: My name is James.
#Person1#: What's your nationality?
#Person2#: American.
#Person1#: What's your relationship with the victim?
#Person2#: I don't know him.
#Person1#: Why did you attack the victim?
#Person2#: Because he beat me first when I tried to stop him from grabbing my bag and running away.
#Person1#: How many times did you stab the victim?
#Person2#: I stabbed his belly three times.
#Person1#: Did you know that your actions might cause serous injuries or death?
#Person2#: I knew, but I couldn't control myself.
#Person1#: Was it your intention to kill the victim?
#Person2#: No. I didn't kill him on purpose, madam. It's him w

## Fine tuning the LLM model

In this workshop we will be turning the <code>google/flan-t5-base</code> model.

In [5]:
# Utility function to dump a model's tunable parameters
def print_trainable_model_params(model):
   trainable_model_params = 0
   all_model_params = 0
   for _, param in model.named_parameters():
      all_model_params += param.numel()
      if param.requires_grad:
         trainable_model_params += param.numel()
   return f"Trainable parameters: {trainable_model_params}\nTotal parameters: {all_model_params}\nPercentable of trainable parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [6]:
# TODO: Load model
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
# TODO: Print number of trainable parameters
print(print_trainable_model_params(base_model))

Trainable parameters: 247577856
Total parameters: 247577856
Percentable of trainable parameters: 100.00%


### Preprocess the dialogue dataset

We will train the model to summarize dialogue by creating a dialogue-summary pair for the LLM to process. The dialogue is the training data and the summary is the label. This is supervized learning.

The prompt will be as follows

```
Summarize the following dialogue.\n
\n
Fred: ...\n
Barney: ...\n
\n
Summary:\n
Summary of the conversation between Fred and Barney
```

The prompt and the summary will be tokenized for the LLM

In [16]:
# Utitlity function to prepare the data for training 
# Tokenize function
def tokenize_fn(data):
   start_prompt = 'Summarize the following dialogue.\n\n'
   end_prompt = '\n\nSummary:'
   prompt = [ start_prompt + d + end_prompt for d in data['dialogue'] ]
   summary = data['summary']
   #data['proc_prompt'] = prompt
   data['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
   data['labels'] = tokenizer(summary, padding="max_length", truncation=True, return_tensors="pt").input_ids
   return data


In [17]:
# TODO: prepare the data for training with the tokenize_fn function
tokenized_dataset = dataset.map(tokenize_fn, batched=True)



Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [18]:
# TODO: Verify prepared data
for k, v in tokenized_dataset['train'][idx].items():
   print(f'k = {k}, v = {v}')


k = id, v = train_50
k = dialogue, v = #Person1#: You have the right to remain silent. Anything you say can and will be used against you in a court of law. You have the right to have an attorney present during questioning. If you cannot afford an attorney, one will be appointed for you. Do you understand?
#Person2#: Yes.
#Person1#: What's your name?
#Person2#: My name is James.
#Person1#: What's your nationality?
#Person2#: American.
#Person1#: What's your relationship with the victim?
#Person2#: I don't know him.
#Person1#: Why did you attack the victim?
#Person2#: Because he beat me first when I tried to stop him from grabbing my bag and running away.
#Person1#: How many times did you stab the victim?
#Person2#: I stabbed his belly three times.
#Person1#: Did you know that your actions might cause serous injuries or death?
#Person2#: I knew, but I couldn't control myself.
#Person1#: Was it your intention to kill the victim?
#Person2#: No. I didn't kill him on purpose, madam. It's him

In [19]:
# TODO: Remove id, dialogue, summary and topic columns from dataset. We only want input_ids and labels
cleaned_dataset = tokenized_dataset.remove_columns(
   ['id', 'dialogue', 'summary', 'topic' ]
)

In [20]:
# TODO: Verify dataset again
for k, v in cleaned_dataset['train'][idx].items():
   print(f'k = {k}, v = {v}')

k = input_ids, v = [12198, 1635, 1737, 8, 826, 7478, 5, 1713, 345, 13515, 536, 4663, 10, 148, 43, 8, 269, 12, 2367, 11237, 5, 21345, 25, 497, 54, 11, 56, 36, 261, 581, 25, 16, 3, 9, 1614, 13, 973, 5, 148, 43, 8, 269, 12, 43, 46, 4917, 915, 383, 822, 53, 5, 156, 25, 1178, 5293, 46, 4917, 6, 80, 56, 36, 7817, 21, 25, 5, 531, 25, 734, 58, 1713, 345, 13515, 357, 4663, 10, 2163, 5, 1713, 345, 13515, 536, 4663, 10, 363, 31, 7, 39, 564, 58, 1713, 345, 13515, 357, 4663, 10, 499, 564, 19, 2549, 5, 1713, 345, 13515, 536, 4663, 10, 363, 31, 7, 39, 1157, 485, 58, 1713, 345, 13515, 357, 4663, 10, 797, 5, 1713, 345, 13515, 536, 4663, 10, 363, 31, 7, 39, 1675, 28, 8, 7584, 58, 1713, 345, 13515, 357, 4663, 10, 27, 278, 31, 17, 214, 376, 5, 1713, 345, 13515, 536, 4663, 10, 1615, 410, 25, 3211, 8, 7584, 58, 1713, 345, 13515, 357, 4663, 10, 2070, 3, 88, 3853, 140, 166, 116, 27, 1971, 12, 1190, 376, 45, 3, 26910, 82, 2182, 11, 1180, 550, 5, 1713, 345, 13515, 536, 4663, 10, 571, 186, 648, 410, 25, 3, 17001

### Tune model with pre-processed dataset

We will use [<code>Trainer</code>](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#api-reference%20][%20transformers.Trainer) to train the original model. The training result will be written out. The trainer will be configure with [<code>TrainingArgument</code>](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments)

In [21]:
# CUDA information

print('CUDA available: ', torch.cuda.is_available())
if torch.cuda.is_available():
   print('B16 supported: ', torch.cuda.is_bf16_supported())
   torch.cuda.set_device(0)
   print('Current device: ', torch.cuda.current_device())
   print('CUDA device name: ', torch.cuda.get_device_name(0))

CUDA available:  False


## Fine tuning the LLM Model with Low-Rank Adaptation (LoRA) / Parameter Efficient Fine Tuning (PEFT)

We will add a LoRA adapter to the LLM (flan-t5-base) and train the adapter. The original LLM will be frozen. The adapter can be combined with the original LLM during inferencing. 

In [22]:
# TODO: Configure LoRA
lora_config = LoraConfig(
   r=32,
   lora_alpha = 32,
   target_modules = [ 'q', 'v' ],
   lora_dropout=0.05, 
   bias = "none",
   task_type = TaskType.SEQ_2_SEQ_LM
)

In [23]:
# TODO: Add LoRA to the LLM model to be trained
lora_model = get_peft_model(base_model, lora_config)

In [26]:
# TODO: Print number of parameters, compare LoRA to the original model
lora_model.print_trainable_parameters()
print()

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
print(print_trainable_model_params(model))

trainable params: 3,538,944 || all params: 251,116,800 || trainable%: 1.4093

Trainable parameters: 247577856
Total parameters: 247577856
Percentable of trainable parameters: 100.00%


In [28]:
# TODO: Train model with LoRA
output_dir = "lora-training-summary"

lora_training_args = TrainingArguments(
   output_dir = output_dir,
   auto_find_batch_size = True,
   learning_rate = 1e-3,
   num_train_epochs = 1,
   logging_first_step = 1,
   max_steps = 1
)

In [29]:
# TODO: Create trainer and train model
trainer = Trainer(
   model = lora_model, 
   args = lora_training_args,
   train_dataset = cleaned_dataset['train'],
   eval_dataset = cleaned_dataset['validation']
)


No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Perform training
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [None]:
# Save your model 
trained_model_name = "my-flan-t5-base"

trainer.model.save_pretrained(trained_model_name)

### Use a trained LoRA model

The training will take a few hours and over many iterations.

For the purpose of this workshop we use a save model [intotheverse/peft-dialogue-summary-checkpoint](https://huggingface.co/intotheverse/peft-dialogue-summary-checkpoint).

In [4]:
#TODO: Load the original model and add the pre-trained LoRA adaptation to the model
peft_dialogue_summary_checkpoint = 'intotheverse/peft-dialogue-summary-checkpoint'

# load the base model
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
lora_model = PeftModel.from_pretrained(
   base_model, 
   peft_dialogue_summary_checkpoint,
   torch_dtype=torch.bfloat16,
   is_trainable=False
)

lora_model.print_trainable_parameters()

adapter_config.json:   0%|          | 0.00/334 [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

trainable params: 0 || all params: 251,116,800 || trainable%: 0.0000


In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Evaluate LoRA model

In [19]:
# TODO: Evaluate LoRA model against the original 

idx = 50
dialogue = dataset['test'][idx]['dialogue']
summary = dataset['test'][idx]['summary']

# start_prompt = 'Summarize the following dialogue.\n\n'
# end_prompt = '\n\nSummary:'

prompt = f""" 
Summarize the following dialogue.

{dialogue}

Summary:
"""

tokenized_prompt = tokenizer(prompt, return_tensors='pt')
#print(tokenized_prompt)

result_enc = lora_model.generate(input_ids=tokenized_prompt['input_ids'])

result = tokenizer.decode(result_enc[0], skip_special_tokens=True)
print(f'Completed: {result}')

print(f'Actual: {summary}')


Completed: #Person1# tells #Person2# to pull on the strip and peel off
Actual: #Person1# is about to make a prank. #Person2# thinks it's cruel at first but then joins.


In [20]:
# Prepare data for evaluation
dialogues = []
summaries = []
orig_model_summaries = []
lora_model_summaries = []
config = GenerationConfig(max_new_tokens=200)

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

for i in range(5):
   print(f'i = {i}')
   d = dataset['test'][i]['dialogue']
   s = dataset['test'][i]['summary']
   prompt = f"Summarize the following conversation.\n\n{d}\n\nSummary:"
   tokenized_prompt = tokenizer(prompt, return_tensors='pt').input_ids
   orig_resp = original_model.generate(input_ids=tokenized_prompt, generation_config=config)
   orig_resp_text = tokenizer.decode(orig_resp[0], skip_special_tokens=True)
   lora_resp = lora_model.generate(input_ids=tokenized_prompt, generation_config=config)
   lora_resp_text = tokenizer.decode(lora_resp[0], skip_special_tokens=True)

   summaries.append(s)
   orig_model_summaries.append(orig_resp_text)
   lora_model_summaries.append(lora_resp_text)

zipped_summaries = list(zip(summaries, orig_model_summaries, lora_model_summaries))
df = pd.DataFrame(zipped_summaries, columns=['label', 'original_model_summary', 'lora_model_summary'])
df

i = 0
i = 1
i = 2
i = 3
i = 4


Unnamed: 0,label,original_model_summary,lora_model_summary
0,Ms. Dawson helps #Person1# to write a memo to ...,The memo is to be distributed to all employees...,#Person1# asks Ms. Dawson to take a dictation ...
1,In order to prevent employees from wasting tim...,The memo is to be distributed to all employees...,#Person1# asks Ms. Dawson to take a dictation ...
2,Ms. Dawson takes a dictation for #Person1# abo...,The memo is to be distributed to all employees...,#Person1# asks Ms. Dawson to take a dictation ...
3,#Person2# arrives late because of traffic jam....,The traffic jam at the Carrefour intersection ...,#Person2# got stuck in traffic and #Person1# s...
4,#Person2# decides to follow #Person1#'s sugges...,The traffic jam at the Carrefour intersection ...,#Person2# got stuck in traffic and #Person1# s...


### Evaluate models with ROUGE/Bleu metrics

Recall-Oriented Understudy for Gisting Evaluate ([ROUGE](https://pub.aimind.so/unveiling-the-power-of-rouge-metrics-in-nlp-b6d3f96d3363)) is a set of metrics used to evaluate the quality of machine-generated text, such as summaries and translations. ROUGE metrics compare the generated text to a human-written reference and measure the overlap between the two. 

The metrics range between 0 and 1, with higher scores indicating higher similarity between the baseline and generated text.

In [None]:
# TODO: create ROUGE metrics


In [None]:
# TODO: Evaluate the original model's result


In [None]:
# TODO: Evaluate with Bleu metrics
