# Fine-tune GenAI Model for Summarization

## TODOs:
* Upload model artifacts to github
* Write code to do a short training job then load full training checkpoint
* Upload full summarization results for evaluation to github
* Write code to do a short evaluation job then load full evaluation results
* Pin versions
* Documentation

## Setup

In [4]:
%pip install transformers==4.27.2
%pip install torch
%pip install py7zr
%pip install datasets
%pip install sentencepiece
%pip install evaluate
%pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
[0mNote: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
[0mNote: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
[0mNote: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
[0mNote: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
[0mNote: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
[0mNote: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazon

In [45]:
from transformers import AutoTokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, GenerationConfig
from datasets import load_dataset
import datasets
import torch
import time
import evaluate
import numpy as np

# Load T5 Model

In [10]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

# Load Dataset

In [11]:
dataset = load_dataset("samsum")

Found cached dataset samsum (/root/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)


  0%|          | 0/3 [00:00<?, ?it/s]

# Prompt Before Tuning

In [12]:
ind = 60
diag = dataset['test'][ind]['dialogue']
summary = dataset['test'][ind]['summary']
prompt = f'Summarize the following conversation.\n\nConversation:\n{diag}\n\nSummary:'
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
outputs = model.generate(input_ids)
text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f'Prompt:\n\n{prompt}')
print(f'\nResponse: {text_output}')
print(f'Baseline Summary: {summary}')



Prompt:

Summarize the following conversation.

Conversation:
Salma: <file_video>
Salma: the latest cat meme
Hugh: oh sweet, I can never get enough of those lol
Hugh: <file_photo>
Salma: hahaha same

Summary:

Response: Salma is a cat meme.
Baseline Summary: Salma and Hugh like cat memes.


# Preprocessing

In [13]:
def tokenize_function(example):
    prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    inp = [prompt + i + end_prompt for i in example["dialogue"]]
    example['input_ids'] = tokenizer(inp, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    return example

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'dialogue', 'summary',])

Loading cached processed dataset at /root/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-d314a4bfc90f3eef.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-f2114cfb6c8e86b4.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-e06fd042e47516ad.arrow


In [14]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 818
    })
})

# Training with Prompt

In [9]:
output_dir = f'./summary-long-training-{str(int(time.time()))}'
training_args = TrainingArguments(
    output_dir='./summary-long-training',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=1,
    # num_train_epochs=15, # Use a higher number of epochs when you are not in the lab and have more time to experiment
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss


# Load the Trained Model and Original Model

In [15]:
tuned_model = T5ForConditionalGeneration.from_pretrained("./summary-long-training/checkpoint-55245")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

# Check Example Result

In [16]:
ind = 60
diag = dataset['test'][ind]['dialogue']
summary = dataset['test'][ind]['summary']

prompt = f'Summarize the following conversation.\n\nConversation:\n{diag}\n\nSummary:'
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

original_outputs = model.to('cpu').generate(input_ids, GenerationConfig(max_new_tokens=200))
original_text_output = tokenizer.decode(original_outputs[0], skip_special_tokens=True)

outputs = tuned_model.to('cpu').generate(input_ids, GenerationConfig(max_new_tokens=200))
text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f'Prompt:\n--------------------------\n{prompt}\n--------------------------')
print(f'\nOriginal Response: {original_text_output}')
print(f'Tuned Response   : {text_output}')
print(f'Baseline Summary : {summary}')

Prompt:
--------------------------
Summarize the following conversation.

Conversation:
Salma: <file_video>
Salma: the latest cat meme
Hugh: oh sweet, I can never get enough of those lol
Hugh: <file_photo>
Salma: hahaha same

Summary:
--------------------------

Original Response: Salma is a cat meme.
Tuned Response   : Hugh sent Salma a video of the latest cat meme.
Baseline Summary : Salma and Hugh like cat memes.


# Evaluate the Model with ROGUE Metric

In [17]:
rouge = evaluate.load('rouge')

In [57]:
dialogues = dataset['test'][:]['dialogue']
human_baseline_summaries = dataset['test'][:]['summary']

original_model_summaries = []
tuned_model_summaries = []

for ind, diag in enumerate(dialogues):
    prompt = f'Summarize the following conversation.\n\nConversation:\n{diag}\n\nSummary:'
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    original_outputs = model.generate(input_ids, GenerationConfig(max_new_tokens=200))
    original_text_output = tokenizer.decode(original_outputs[0], skip_special_tokens=True)

    outputs = tuned_model.generate(input_ids, GenerationConfig(max_new_tokens=200))
    text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    original_model_summaries.append(original_text_output)
    tuned_model_summaries.append(text_output)

In [41]:
original_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

In [42]:
tuned_results = rouge.compute(
    predictions=tuned_model_summaries,
    references=human_baseline_summaries[0:len(tuned_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

In [43]:
original_results

{'rouge1': 0.42192639612156546,
 'rouge2': 0.17338854920373217,
 'rougeL': 0.343845149937151,
 'rougeLsum': 0.34375334441513317}

In [44]:
tuned_results

{'rouge1': 0.4787605782107237,
 'rouge2': 0.22490002877634402,
 'rougeL': 0.39044372464176635,
 'rougeLsum': 0.38998958779126414}

In [58]:
improvement = (np.array(list(tuned_results.values())) - np.array(list(original_results.values()))) / np.array(list(original_results.values()))
for key, value in zip(tuned_results.keys(), improvement):
    print(f'{key} different after tuning: {value*100:.2f}%')

rouge1 different after tuning: 13.47%
rouge2 different after tuning: 29.71%
rougeL different after tuning: 13.55%
rougeLsum different after tuning: 13.45%
