# Fine Tuning Large Language Model - Model

In this workshop, you will learn how to fine tune the prompts and the LLMs to enhance and improves its response.

In [1]:
# Import libraries
import torch, time
import pandas as pd
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, GenerationConfig, TrainingArguments

from peft import PeftModel, LoraConfig, get_peft_model, TaskType

In [3]:
# TODO: Load and explore the following datasets

dataset_name = "knkarthick/dialogsum"
model_name = "google/flan-t5-small"
model_name = "google/flan-t5-base"

dataset = load_dataset(dataset_name)


In [4]:
# TODO: Print a record
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})


In [5]:
# Examine a record from the training set
idx = 5000
for k, v in dataset['train'][idx].items():
   print(f'key: {k}, {v}')

key: id, train_5000
key: dialogue, #Person1#: do you like animals? I really like dogs.
#Person2#: so do i. I don't like cats.
#Person1#: why? I think cats are ok.
#Person2#: I can't bear being near cats. They don't seem to like me either.
#Person1#: I like wild animals. I don't like spiders and snakes. I think spiders and snakes are disgusting.
#Person2#: I'm fond of snakes. I think they're great. I agree with you about spiders though. I think spiders are horrible. I think it's because they have so many legs.
#Person1#: I think bears are wonderful. Pandas are fantastic. I low the people who kill them for their fur.
#Person2#: I agree. I'm carry about mice. I think they're so cute!
#Person1#: really? I don't see the attraction. I'm afraid of mice.
key: summary, #Person1# likes dogs, wild animals but doesn't like spiders and snakes. #Person2# doesn't like cats but likes snakes and mice.
key: topic, animals


## Fine tuning the LLM model

In this workshop we will be turning the <code>google/flan-t5-base</code> model.

In [6]:
# Utility function to dump a model's tunable parameters

def print_trainable_model_params(model):
   trainable_model_params = 0
   all_model_params = 0
   for _, param in model.named_parameters():
      all_model_params += param.numel()
      if param.requires_grad:
         trainable_model_params += param.numel()
   return f"Trainable parameters: {trainable_model_params}\nTotal parameters: {all_model_params}\nPercentable of trainable parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [7]:
# TODO: Load model
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
# TODO: Print number of trainable parameters
print(print_trainable_model_params(original_model))

Trainable parameters: 247577856
Total parameters: 247577856
Percentable of trainable parameters: 100.00%


### Preprocess the dialogue dataset

We will train the model to summarize dialogue by creating a dialogue-summary pair for the LLM to process. The dialogue is the training data and the summary is the label. This is supervized learning.

The prompt will be as follows

```
Summarize the following dialogue.\n
\n
Fred: ...\n
Barney: ...\n
\n
Summary:\n
Summary of the conversation between Fred and Barney
```

The prompt and the summary will be tokenized for the LLM

In [10]:
# Utitlity function to prepare the data for training 
# Tokenize function
def tokenize_fn(data):
   start_prompt = 'Summarize the following dialogue.\n\n'
   end_prompt = '\n\nSummary:'
   prompt = [ start_prompt + d + end_prompt for d in data['dialogue'] ]
   summary = data['summary']
   data['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
   data['labels'] = tokenizer(summary, padding="max_length", truncation=True, return_tensors="pt").input_ids
   return data


In [16]:
# TODO: prepare the data for training with the tokenize_fn function
tokenized_dataset = dataset.map(tokenize_fn, batched=True)



Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

In [17]:
# Examine a record from the training set
idx = 5000
for k, v in tokenized_dataset['train'][idx].items():
   print(f'key: {k}, {v}')

key: id, train_5000
key: dialogue, #Person1#: do you like animals? I really like dogs.
#Person2#: so do i. I don't like cats.
#Person1#: why? I think cats are ok.
#Person2#: I can't bear being near cats. They don't seem to like me either.
#Person1#: I like wild animals. I don't like spiders and snakes. I think spiders and snakes are disgusting.
#Person2#: I'm fond of snakes. I think they're great. I agree with you about spiders though. I think spiders are horrible. I think it's because they have so many legs.
#Person1#: I think bears are wonderful. Pandas are fantastic. I low the people who kill them for their fur.
#Person2#: I agree. I'm carry about mice. I think they're so cute!
#Person1#: really? I don't see the attraction. I'm afraid of mice.
key: summary, #Person1# likes dogs, wild animals but doesn't like spiders and snakes. #Person2# doesn't like cats but likes snakes and mice.
key: topic, animals
key: input_ids, [12198, 1635, 1737, 8, 826, 7478, 5, 1713, 345, 13515, 536, 4663, 

In [21]:
# TODO: Verify prepared data
print(tokenizer.decode(tokenized_dataset['train'][idx]['input_ids'], skip_special_tokens=True))
print(tokenizer.decode(tokenized_dataset['train'][idx]['labels'], skip_special_tokens=True))


Summarize the following dialogue. #Person1#: do you like animals? I really like dogs. #Person2#: so do i. I don't like cats. #Person1#: why? I think cats are ok. #Person2#: I can't bear being near cats. They don't seem to like me either. #Person1#: I like wild animals. I don't like spiders and snakes. I think spiders and snakes are disgusting. #Person2#: I'm fond of snakes. I think they're great. I agree with you about spiders though. I think spiders are horrible. I think it's because they have so many legs. #Person1#: I think bears are wonderful. Pandas are fantastic. I low the people who kill them for their fur. #Person2#: I agree. I'm carry about mice. I think they're so cute! #Person1#: really? I don't see the attraction. I'm afraid of mice. Summary:
#Person1# likes dogs, wild animals but doesn't like spiders and snakes. #Person2# doesn't like cats but likes snakes and mice.


In [22]:
# TODO: Remove id, dialogue, summary and topic columns from dataset. We only want input_ids and labels
tokenized_dataset = tokenized_dataset.remove_columns(
   ['id', 'dialogue', 'summary', 'topic']
)


In [23]:
# TODO: Verify dataset again
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
})


### Tune model with pre-processed dataset

We will use [<code>Trainer</code>](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#api-reference%20][%20transformers.Trainer) to train the original model. The training result will be written out. The trainer will be configure with [<code>TrainingArgument</code>](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments)

In [25]:
# CUDA information

print('CUDA available: ', torch.cuda.is_available())
if torch.cuda.is_available():
   print('B16 supported: ', torch.cuda.is_bf16_supported())
   torch.cuda.set_device(0)
   print('Current device: ', torch.cuda.current_device())
   print('CUDA device name: ', torch.cuda.get_device_name(0))

CUDA available:  False


In [26]:
print(original_model)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

## Fine tuning the LLM Model with Low-Rank Adaptation (LoRA) / Parameter Efficient Fine Tuning (PEFT)

We will add a LoRA adapter to the LLM (flan-t5-base) and train the adapter. The original LLM will be frozen. The adapter can be combined with the original LLM during inferencing. 

In [30]:
# TODO: Configure LoRA
lora_config = LoraConfig(
   r=16,
   lora_alpha=16, 
   lora_dropout=.05, 
   bias="none",
   target_modules=['q', 'v'],
   task_type= TaskType.SEQ_2_SEQ_LM
)

In [31]:
# TODO: Add LoRA to the LLM model to be trained
lora_model = get_peft_model(original_model, lora_config)



In [32]:
# TODO: Print number of parameters, compare LoRA to the original model
print(print_trainable_model_params(lora_model))

Trainable parameters: 1769472
Total parameters: 249347328
Percentable of trainable parameters: 0.71%


In [33]:
# TODO: Train model with LoRA
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [36]:
# TODO: Create trainer and train model
lora_train_args = TrainingArguments(
   output_dir="train_dir",
   auto_find_batch_size=True,
   learning_rate=1e-3,
   num_train_epochs=1
)

In [37]:
lora_trainer = Trainer(
   model = lora_model,
   args = lora_train_args,
   train_dataset=tokenized_dataset['train'],
   eval_dataset=tokenized_dataset['validation']
)

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# start the training cycles
#lora_trainer.train()

my_model = "chukmunnlee/flan-small-summary"
lora_trainer.model.save_pretrained("chukmunnlee/flan-small-summary")
tokenizer.save_pretrained("chukmunnlee/flan-small-summary")

### Use a trained LoRA model

The training will take a few hours and over many iterations.

For the purpose of this workshop we use a save model [intotheverse/peft-dialogue-summary-checkpoint](https://huggingface.co/intotheverse/peft-dialogue-summary-checkpoint).

In [38]:
#TODO: Load the original model and add the pre-trained LoRA adaptation to the model
peft_dialogue_summary_checkpoint = 'intotheverse/peft-dialogue-summary-checkpoint'

# base model 
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

lora_model = PeftModel.from_pretrained(
   base_model,
   peft_dialogue_summary_checkpoint,
   torch_dtype=torch.bfloat16, 
   is_trainable=False)


In [40]:
print(print_trainable_model_params(lora_model))

Trainable parameters: 0
Total parameters: 251116800
Percentable of trainable parameters: 0.00%


In [48]:
idx = 10
dialogue = dataset['test'][idx]['dialogue']
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

print(prompt)
prompt_enc = tokenizer(prompt, return_tensors='pt').input_ids
enc_output = lora_model.generate(input_ids=prompt_enc)


Summarize the following conversation.

#Person1#: Happy Birthday, this is for you, Brian.
#Person2#: I'm so happy you remember, please come in and enjoy the party. Everyone's here, I'm sure you have a good time.
#Person1#: Brian, may I have a pleasure to have a dance with you?
#Person2#: Ok.
#Person1#: This is really wonderful party.
#Person2#: Yes, you are always popular with everyone. and you look very pretty today.
#Person1#: Thanks, that's very kind of you to say. I hope my necklace goes with my dress, and they both make me look good I feel.
#Person2#: You look great, you are absolutely glowing.
#Person1#: Thanks, this is a fine party. We should have a drink together to celebrate your birthday

Summary:



In [51]:
print(tokenizer.decode(enc_output[0], skip_special_tokens=True))

print(dataset['test'][idx]['summary'])

Brian remembers his birthday and invites #Person1# to the party. Brian is popular
#Person1# attends Brian's birthday party. Brian thinks #Person1# looks great and charming.


## Evaluate LoRA model

In [None]:
# TODO: Evaluate LoRA model against the original 



In [None]:
# Prepare data for evaluation
dialogues = []
summaries = []
orig_model_summaries = []
lora_model_summaries = []
config = GenerationConfig(max_new_tokens=200)

for i in range(5):
   print(f'i = {i}')
   d = dataset['test'][i]['dialogue']
   s = dataset['test'][i]['summary']
   prompt = f"Summarize the following conversation.\n\n{d}\n\nSummary:"
   tokenized_prompt = tokenizer(prompt, return_tensors='pt').input_ids
   orig_resp = original_model.generate(input_ids=tokenized_prompt, generation_config=config)
   orig_resp_text = tokenizer.decode(orig_resp[0], skip_special_tokens=True)
   lora_resp = lora_model.generate(input_ids=tokenized_prompt, generation_config=config)
   lora_resp_text = tokenizer.decode(lora_resp[0], skip_special_tokens=True)

   summaries.append(s)
   orig_model_summaries.append(orig_resp_text)
   lora_model_summaries.append(lora_resp_text)

zipped_summaries = list(zip(summaries, orig_model_summaries, lora_model_summaries))
df = pd.DataFrame(zipped_summaries, columns=['label', 'original_model_summary', 'lora_model_summary'])
df

### Evaluate models with ROUGE/Bleu metrics

Recall-Oriented Understudy for Gisting Evaluate ([ROUGE](https://pub.aimind.so/unveiling-the-power-of-rouge-metrics-in-nlp-b6d3f96d3363)) is a set of metrics used to evaluate the quality of machine-generated text, such as summaries and translations. ROUGE metrics compare the generated text to a human-written reference and measure the overlap between the two. 

The metrics range between 0 and 1, with higher scores indicating higher similarity between the baseline and generated text.

In [None]:
# TODO: create ROUGE metrics


In [None]:
# TODO: Evaluate the original model's result


In [None]:
# TODO: Evaluate with Bleu metrics
