# Summarisation: Train T5 model

**Author:** Eva Rombouts  
**Date:** 2024-07-21  
**Version:** 0.2

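### Description

This notebook fine-tunes a Dutch T5 model (`flax-community/t5-base-dutch-demo`) for summarisation using LoRA adapters (PEFT). It loads input/summary pairs from a CSV file, splits them into train, validation and test sets, trains the adapter, and then compares the fine-tuned model with the original model, both qualitatively and with ROUGE scores.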


In [None]:
# Install necessary libraries
%%capture
!pip install datasets
!pip install transformers
!pip install transformers[torch]
!pip install evaluate
!pip install peft
!pip install rouge_score

In [None]:
# Colab environment: mount Google Drive, switch to the project folder, fetch the HF token
import os
from google.colab import drive, userdata

# Make the Drive contents available under /content/drive
drive.mount('/content/drive')

# Work from the project's scripts folder, so the relative '../data' and
# '../models' paths used further down resolve against the project
project_dir = '/content/drive/My Drive/Colab Notebooks/GenCareAI/scripts'
os.chdir(project_dir)

# Hugging Face token, looked up by name through Colab's `userdata`
HF_TOKEN = userdata.get('HF_TOKEN')

In [None]:
# Import necessary libraries
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict, load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    GenerationConfig,
    Trainer,
    TrainingArguments,
)

In [None]:
# Load and preprocess data

# Load the dataset
summaries_df = pd.read_csv('../data/galaxy_summaries.csv')

# Set parameters
random_seed = 6
sample_size = 1000
use_sample = False  # set to True for a quick run on a random subset of `sample_size` rows

# Optionally subsample; by default the full dataset is used
df = summaries_df.sample(sample_size, random_state=random_seed) if use_sample else summaries_df

# Rows without an input text or a reference summary cannot be turned into a
# prompt/target pair and would make tokenization fail later on
df = df.dropna(subset=['input', 'summary'])

# Split data into train (70%), validation (15%), and test (15%) sets
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=random_seed)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=random_seed)

# Convert pandas DataFrames to Hugging Face Dataset objects.
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra '__index_level_0__' column in every split.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
valid_dataset = Dataset.from_pandas(valid_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': valid_dataset,
    'test': test_dataset
})

In [None]:
# Load the tokenizer and the sequence-to-sequence model
model_name = 'flax-community/t5-base-dutch-demo'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights in bfloat16 and place the model on the GPU right away
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to('cuda')

# Show the model configuration as the cell output
model.config

In [None]:
# Analyze token lengths
# Count the tokens (special tokens included) of every reference summary.
# Iterate over df['input'] instead to analyse the source texts.
token_lengths = []
for review in df['summary']:
    token_lengths.append(len(tokenizer.encode(review, add_special_tokens=True)))

# Summary statistics of the token counts
max_length = np.max(token_lengths)
mean_length = np.mean(token_lengths)
median_length = np.median(token_lengths)
percentile_95_length = np.percentile(token_lengths, 95)

for label, value in (
    ("Max length", max_length),
    ("Mean length", mean_length),
    ("Median length", median_length),
    ("95th percentile length", percentile_95_length),
):
    print(f"{label}: {value}")

# Use the 95th percentile (rounded down) as the chosen maximum length
chosen_max_length = int(percentile_95_length)
print(f"Chosen max length: {chosen_max_length}")

In [None]:
# Tokenize and filter the dataset for fine-tuning
# Prompt template that is wrapped around every input text:
# the start prompt is Dutch for "Write a summary:" and is placed before the text,
# the end prompt is Dutch for "Summary:" and is placed after the text.
start_prompt = 'Schrijf een samenvatting:\n'
end_prompt = '\n\nSamenvatting:\n'

def tokenize_function(example, prompt_start, prompt_end,
                      max_input_length=600, max_target_length=200, tok=None):
    """Turn a batch of input/summary pairs into model-ready features.

    Args:
        example: Batch with an "input" and a "summary" entry, each a sequence
            of strings (as passed by `Dataset.map(..., batched=True)`).
        prompt_start: Text placed in front of every input.
        prompt_end: Text placed after every input.
        max_input_length: Prompts are truncated/padded to this many tokens.
        max_target_length: Summaries are truncated/padded to this many tokens.
        tok: Tokenizer to use; defaults to the notebook-level `tokenizer`.

    Returns:
        The same batch, extended with "input_ids", "attention_mask" and "labels".
    """
    tok = tokenizer if tok is None else tok

    prompts = [prompt_start + review + prompt_end for review in example["input"]]

    # Pad to a fixed length: `padding=True` only pads to the longest sequence of
    # the current map batch, so rows from different batches could end up with
    # different lengths and could not be stacked by the default collator.
    model_inputs = tok(prompts, padding="max_length", truncation=True, max_length=max_input_length)
    targets = tok(list(example["summary"]), padding="max_length", truncation=True, max_length=max_target_length)

    example['input_ids'] = model_inputs["input_ids"]
    # Keep the mask so the encoder does not attend to padding positions
    example['attention_mask'] = model_inputs["attention_mask"]

    # Label positions set to -100 are ignored by the loss, so padding in the
    # targets does not contribute to training
    pad_id = tok.pad_token_id
    example['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in targets["input_ids"]
    ]
    return example

# Tokenize every split in batches; the prompt parts are handed to
# tokenize_function as keyword arguments
tokenized_datasets = dataset_dict.map(
    tokenize_function,
    batched=True,
    fn_kwargs={'prompt_start': start_prompt, 'prompt_end': end_prompt},
)

# The raw input text is no longer needed once it has been tokenized
tokenized_datasets = tokenized_datasets.remove_columns(['input'])

In [None]:
# Set up PEFT configuration
# LoRA hyper-parameters, collected in one place
lora_settings = dict(
    r=32,                             # rank of the low-rank update matrices
    lora_alpha=16,
    target_modules=["q", "v"],        # names of the modules that receive an adapter
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,  # sequence-to-sequence language modelling
)
peft_config = LoraConfig(**lora_settings)

# Wrap the base model with the LoRA adapter and keep it on the GPU
# NOTE(review): get_peft_model presumably modifies `model` itself as well - confirm
# before using `model` as an untouched baseline later on.
peft_model = get_peft_model(model, peft_config).to('cuda')

# Report how many parameters are trainable
peft_model.print_trainable_parameters()

In [None]:
# Training setup for PEFT model
output_dir = '../models/peft_model'

# Hyper-parameters of the training run
# NOTE(review): eval_steps is set, but no eval_dataset is passed to the Trainer
# below, so it presumably has no effect - confirm.
training_kwargs = dict(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=1,
    logging_steps=100,
    save_steps=200,
    eval_steps=200,
)
training_args = TrainingArguments(**training_kwargs)

# The trainer works on the LoRA-wrapped model and the tokenized training split
trainer = Trainer(
    args=training_args,
    model=peft_model,
    train_dataset=tokenized_datasets["train"],
)

# Run the fine-tuning
trainer.train()

In [None]:
# Evaluate the PEFT model qualitatively
index = 10
input_text = dataset_dict['test'][index]['input']
baseline_summary = dataset_dict['test'][index]['summary']

prompt = start_prompt + input_text + end_prompt

# Tokenize the prompt once; both generations below use the same input
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda')

# Same generation settings for both models
generation_config = GenerationConfig(max_new_tokens=200, num_beams=1)

# Training leaves the model in train mode; switch to eval mode so dropout
# (including the LoRA dropout) does not randomise the generated text
peft_model.eval()

# get_peft_model() injects the LoRA layers into `model` itself, so calling
# model.generate() would also run the trained adapter. Disabling the adapter
# for the duration of the call gives the output of the original weights.
with peft_model.disable_adapter():
    original_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=generation_config)
original_model_summary = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

# Generate with the adapter active
peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=generation_config)
peft_model_summary = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

# Print results
dash_line = '-' * 100
print(dash_line)
print(f'PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE SUMMARY:\n{baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL SUMMARY:\n{original_model_summary}')
print(dash_line)
print(f'PEFT MODEL SUMMARY: {peft_model_summary}')

In [None]:
# Evaluate the PEFT model quantitatively with ROUGE
# Select a subset of reviews and their summaries
test_inputs = dataset_dict['test'][0:10]['input']
baseline_summaries = dataset_dict['test'][0:10]['summary']

# Initialize lists for generated summaries
original_model_summaries = []
peft_model_summaries = []

# Same generation settings for every example and both models
generation_config = GenerationConfig(max_new_tokens=200)

# Training leaves the model in train mode; switch to eval mode so dropout
# (including the LoRA dropout) does not randomise the generated text
peft_model.eval()

# Generate summaries for the subset of reviews
for review in test_inputs:
    prompt = start_prompt + review + end_prompt
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda')

    # get_peft_model() injects the LoRA layers into `model` itself, so the
    # original-model baseline is produced with the adapter switched off
    with peft_model.disable_adapter():
        original_model_output = peft_model.generate(input_ids=input_ids, generation_config=generation_config)
    original_model_summary = tokenizer.decode(original_model_output[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_summary)

    # Adapter active: output of the fine-tuned model
    peft_model_output = peft_model.generate(input_ids=input_ids, generation_config=generation_config)
    peft_model_summary = tokenizer.decode(peft_model_output[0], skip_special_tokens=True)
    peft_model_summaries.append(peft_model_summary)

# Create a DataFrame with the summaries
summary_df = pd.DataFrame(list(zip(baseline_summaries, original_model_summaries, peft_model_summaries)),
                          columns=['Baseline Summaries', 'Original Model Summaries', 'PEFT Model Summaries'])

# Calculate ROUGE scores for each model
rouge = evaluate.load('rouge')
original_model_rouge = rouge.compute(predictions=original_model_summaries, references=baseline_summaries, use_aggregator=True, use_stemmer=True)
peft_model_rouge = rouge.compute(predictions=peft_model_summaries, references=baseline_summaries, use_aggregator=True, use_stemmer=True)

# Print the ROUGE scores
print('ORIGINAL MODEL ROUGE SCORES:')
print(original_model_rouge)
print('PEFT MODEL ROUGE SCORES:')
print(peft_model_rouge)

# Calculate and print the improvement of the PEFT model over the original model
# (difference of the scores, shown in percentage points)
improvement = (np.array(list(peft_model_rouge.values())) - np.array(list(original_model_rouge.values())))
print("Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL")
for key, value in zip(peft_model_rouge.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')