In [None]:
# Import Necessary Libraries

import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments as TrainingArguments
from transformers import Seq2SeqTrainer as Trainer
from transformers import EarlyStoppingCallback
from transformers import DataCollatorForSeq2Seq as DataCollator
from datasets import Dataset
from evaluate import load

In [None]:
# Load the raw CSV into a pandas DataFrame

df = pd.read_csv(filepath_or_buffer=r'dataset_path')

In [None]:
# Drop rows containing nulls, then drop exact duplicate rows

df = df.dropna().drop_duplicates()

In [None]:
# Load the pretrained seq2seq checkpoint and its matching tokenizer

model_path = 't5-base'
# Both objects are loaded from the same checkpoint identifier.
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Convert to Hugging Face Dataset

# preserve_index=False: once rows have been dropped from the DataFrame its
# index is no longer a plain range, and from_pandas would otherwise store it
# as an extra `__index_level_0__` column in the resulting Dataset.
df = Dataset.from_pandas(df, preserve_index=False)

In [None]:
# Adding 'summarize:' Prefix and Tokenization

def preprocess(text):
    """Tokenize one batch of article/summary pairs.

    Args:
        text: A batch with 'article' and 'highlights' columns (lists of
            strings), as supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prefixed articles (padded/truncated to
        1024 tokens) with an added 'labels' entry holding the tokenized
        highlights (padded/truncated to 128 tokens), where every pad token id
        has been replaced by -100.
    """
    # Prepend the task prefix to every article.
    prefixed_articles = []
    for article in text['article']:
        prefixed_articles.append("summarize: " + article)

    model_inputs = tokenizer(
        prefixed_articles,
        max_length=1024,
        padding='max_length',
        truncation=True,
    )
    summary_encoding = tokenizer(
        text_target=text['highlights'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )

    # Swap pad ids for the -100 sentinel so padding positions do not carry a
    # real token id in the labels.
    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for token_ids in summary_encoding['input_ids']:
        masked_labels.append(
            [-100 if token_id == pad_id else token_id for token_id in token_ids]
        )

    model_inputs['labels'] = masked_labels
    return model_inputs

# Tokenize in batches, then discard the raw text/id columns that are no
# longer needed once the token ids exist.
tokenized_df = (
    df
    .map(preprocess, batched=True)
    .remove_columns(['article', 'highlights', 'id'])
)

In [None]:
# Split Dataset for Training and Testing

split_dataset = tokenized_df.train_test_split(test_size=0.2, seed=42)
train_df = split_dataset['train']
test_df = split_dataset['test']

# Fixed-seed subset of the test split (at most 1000 rows). The size is capped
# at the number of available rows so select() cannot index past the end when
# the test split is smaller than 1000 examples.
eval_subset_size = min(1000, len(test_df))
small_test_df = test_df.shuffle(seed=42).select(range(eval_subset_size))

In [None]:
# Compute rouge metrics (rouge1, rouge2, rougeL, rougeLsum)

rouge = load('rouge')

def compute_metrics(eval_pred):
    """Decode predictions and labels and score them with ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated token ids.

    Returns:
        A dict mapping each ROUGE metric name to its score scaled to 0-100
        and rounded to four decimals.
    """
    predictions, labels = eval_pred
    # Keep only the generated token ids if extra outputs were returned.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore sentinel, not a real token id, so it cannot be
    # decoded. It is present in the labels and can also appear in the
    # predictions when sequences of different lengths are padded together.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {k: round(v * 100, 4) for k, v in result.items()}

In [None]:
# Training Hyperparameters

training_args = TrainingArguments(
    # Output location; evaluate, log and checkpoint once per epoch
    output_dir='checkpoints',
    eval_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=4,
    report_to='none',
    # Per-device batch sizes (kept small to limit memory use)
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Optimisation schedule
    num_train_epochs=6,
    learning_rate=4e-5,
    weight_decay=0.05,
    # Mixed precision (GPU)
    fp16=True,
    # Keep the checkpoint with the highest rougeL and reload it at the end
    load_best_model_at_end=True,
    metric_for_best_model='rougeL',
    greater_is_better=True,
    # Generate full sequences during evaluation so ROUGE can be computed
    predict_with_generate=True,
)

In [None]:
# Adjusting Model Generation Settings

# Generation parameters are set on `model.generation_config`, which is what
# `generate()` reads. Putting them on `model.config` is a deprecated path
# that newer Transformers versions warn about or do not honour.
model.generation_config.min_new_tokens = 30
model.generation_config.max_new_tokens = 128
model.generation_config.num_beams = 6
model.generation_config.no_repeat_ngram_size = 3
model.generation_config.early_stopping = False

In [None]:
# Define Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    # Seq2seq-aware collator: pads labels with -100 alongside the inputs, so
    # batching does not depend on every example being pre-padded to the same
    # label length.
    data_collator=DataCollator(tokenizer, model=model),
    train_dataset=train_df,
    eval_dataset=small_test_df,
    compute_metrics=compute_metrics,
    # Stop when the monitored metric has not improved for 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [None]:
# Training

# Run fine-tuning with the Trainer built above. The returned training output
# is left as the cell's last expression so the notebook displays it.
trainer.train()

In [None]:
# Evaluate on the complete held-out test split and show the metrics

final_results = trainer.evaluate(eval_dataset=test_df)
print(final_results)