In [1]:
pip install rouge-score



In [2]:
pip install tf-keras



In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
import nltk
from rouge_score import rouge_scorer
import random

nltk.download("punkt")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
pip install --upgrade datasets



In [5]:
from datasets import load_dataset
import random

dataset = load_dataset("cnn_dailymail",'3.0.0')

sample_fraction = 0.01

train_data_list = list(dataset["train"])
undersampled_data = random.sample(train_data_list, int(len(train_data_list) * sample_fraction))

print("\n Undersampled Dataset Size:", len(undersampled_data))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



 Undersampled Dataset Size: 2871


In [6]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
# Extract a sample article and summary
sample_article = undersampled_data[0]["article"]
sample_summary = undersampled_data[0]["highlights"]

print("\n Sample Article:\n", sample_article[:500])  # Display first 500 characters
print("\nSample Summary:\n", sample_summary)

# Tokenize article into sentences
def preprocess_text(text):
    sentences = sent_tokenize(text)
    return sentences

sentences = preprocess_text(sample_article)
print("\n Tokenized Sentences:\n", sentences[:5])  # Display first 5 sentences



 Sample Article:
 (CNN) -- Yes, it's great to travel light. Sure, too much tech can make life trickier, not easier. No, that's not going to stop us listing some of the coolest gadgets, gizmos and accessories that could just make you the happiest traveler this side of the Apple store. (If only till you lose them/have them stolen.) Narrative clip-on camera . This is a tiny five-megapixel camera that clips onto your clothes and does the work for you, automatically taking two photos every minute when turned on. The a

Sample Summary:
 Narrative is a hands-free (it clips to your clothes) camera that snaps pics automatically every 30 seconds .
SleepPhones are headphones in a headband -- perfect for falling asleep to tunes without your bulky earpiece falling out .
LV's shower in a trunk is a bit ambitious, not to say impractical, but what an eye-catcher .
Steripen Ultra kills bacteria in water in 48 seconds .

 Tokenized Sentences:
 ["(CNN) -- Yes, it's great to travel light.", 'Sure, too mu

In [8]:
vectorizer = TfidfVectorizer()
sentence_vectors = vectorizer.fit_transform(sentences).toarray()

print("\nTF-IDF Matrix Shape:", sentence_vectors.shape)  # Shape of TF-IDF matrix



TF-IDF Matrix Shape: (104, 647)


In [9]:
class LSTMSummarizer(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super(LSTMSummarizer, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)  # Output single score per sentence

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        scores = self.fc(lstm_out[:, -1, :])  # Use last hidden state
        return scores

# Model parameters
input_dim = sentence_vectors.shape[1]
hidden_dim = 128
num_layers = 1

# Initialize model
lstm_model = LSTMSummarizer(input_dim, hidden_dim, num_layers)
print(lstm_model)


LSTMSummarizer(
  (lstm): LSTM(647, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)


In [10]:
# Convert TF-IDF vectors to PyTorch tensors
X = torch.tensor(sentence_vectors, dtype=torch.float32)
y = torch.tensor([1 if i < 3 else 0 for i in range(len(sentences))], dtype=torch.float32)  # Assume first 3 sentences are important

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.01)

# Training loop
epochs = 10
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = lstm_model(X.unsqueeze(1))
    loss = criterion(outputs.squeeze(), y)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")


Epoch 1/10, Loss: 0.6780
Epoch 2/10, Loss: 0.6307
Epoch 3/10, Loss: 0.5821
Epoch 4/10, Loss: 0.5286
Epoch 5/10, Loss: 0.4694
Epoch 6/10, Loss: 0.4057
Epoch 7/10, Loss: 0.3405
Epoch 8/10, Loss: 0.2776
Epoch 9/10, Loss: 0.2204
Epoch 10/10, Loss: 0.1715


In [11]:
with torch.no_grad():
    scores = lstm_model(X.unsqueeze(1)).squeeze()
    top_indices = torch.argsort(scores, descending=True)[:3]

extractive_summary_lstm = [sentences[i] for i in top_indices]
print("\n🔹 Extractive Summary (LSTM):\n", " ".join(extractive_summary_lstm))



🔹 Extractive Summary (LSTM):
 Sure, too much tech can make life trickier, not easier. No, that's not going to stop us listing some of the coolest gadgets, gizmos and accessories that could just make you the happiest traveler this side of the Apple store. (CNN) -- Yes, it's great to travel light.


In [12]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")


In [13]:
def get_bert_embeddings(sentences):
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :]  # Extract CLS token embeddings

sentence_embeddings = get_bert_embeddings(sentences)
print("\nBERT Embeddings Shape:", sentence_embeddings.shape)



BERT Embeddings Shape: torch.Size([104, 768])


In [14]:
sentence_scores = torch.mean(sentence_embeddings, dim=1)  # Compute sentence importance

top_indices = torch.argsort(sentence_scores, descending=True)[:3]
extractive_summary_bert = [sentences[i] for i in top_indices]

print("\n🔹 Extractive Summary (BERT):\n", " ".join(extractive_summary_bert))



🔹 Extractive Summary (BERT):
 This towel was reportedly developed for military use, made from a woven polyurethane material that sand won't stick to. It's part of the company's Guaranteed on Board program, ensuring acceptance on Delta, Southwest and most other major airlines. This multipurpose suitcase is designed for businessmen making speeches on short notice.


In [15]:
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# Evaluate LSTM summary
rouge_lstm = scorer.score(sample_summary, " ".join(extractive_summary_lstm))
print("\n ROUGE Scores (LSTM):", rouge_lstm)

# Evaluate BERT summary
rouge_bert = scorer.score(sample_summary, " ".join(extractive_summary_bert))
print("\n ROUGE Scores (BERT):", rouge_bert)



 ROUGE Scores (LSTM): {'rouge1': Score(precision=0.12244897959183673, recall=0.09375, fmeasure=0.10619469026548671), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0.08163265306122448, recall=0.0625, fmeasure=0.07079646017699115)}

 ROUGE Scores (BERT): {'rouge1': Score(precision=0.1320754716981132, recall=0.109375, fmeasure=0.11965811965811966), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0.09433962264150944, recall=0.078125, fmeasure=0.08547008547008547)}


In [16]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load T5 model and tokenizer
model_name = "t5-small"  # Change to "t5-base" or "t5-large" for better results
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def generate_abstractive_summary(text, max_input_length=512, max_output_length=150):
    # Prepend "summarize: " to the input text
    input_text = "summarize: " + text

    # Tokenize and truncate input
    inputs = tokenizer(input_text, return_tensors="pt", max_length=max_input_length, truncation=True)

    # Generate summary
    summary_ids = model.generate(inputs.input_ids, max_length=max_output_length, min_length=50, length_penalty=2.0)

    # Decode summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example usage on an article
sample_article = undersampled_data[0]['article']  # Selecting one article from undersampled dataset
generated_summary_t5 = generate_abstractive_summary(sample_article)

print("\n **Generated Abstractive Summary (T5):**")
print(generated_summary_t5)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565



 **Generated Abstractive Summary (T5):**
a five-megapixel camera clips onto your clothes and does the work for you. a washable mark-mat is a great time-waster for kids to draw on. a wireless version ($99.95) syncs with your smart phone or other Bluetooth-enabled device.


In [17]:
def generate_gpt2_summary(text, max_input_length=300, max_new_tokens=100):
    # Tokenize and truncate input
    inputs = tokenizer(text, return_tensors="pt", max_length=max_input_length, truncation=True)

    # Generate summary with corrected parameters
    summary_ids = model.generate(
        inputs.input_ids,
        max_new_tokens=max_new_tokens,  # Controls only the newly generated text
        do_sample=True,
        temperature=0.7,
        top_k=50
    )

    # Decode summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example usage
generated_summary_gpt = generate_gpt2_summary(sample_article)

print("\n **Generated Abstractive Summary (GPT-2):**")
print(generated_summary_gpt)



 **Generated Abstractive Summary (GPT-2):**
travel light. It's great to travel light. Sure, too much tech can make life trickier, not easier. Yes, it's great to travel light. Yes, it's great to travel light. Sure, too much tech can make life trickier, not easier. no, that's not going to stop us listing some of the coolest gadgets, gizmos and accessories that could just make you the happiest traveler this side


In [18]:
print("Type of LSTM Summary:", type(extractive_summary_lstm))
print("Type of BERT Summary:", type(extractive_summary_bert))



Type of LSTM Summary: <class 'list'>
Type of BERT Summary: <class 'list'>


In [19]:
from rouge_score import rouge_scorer
def compute_rouge_scores(reference_summary, generated_summary):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = scorer.score(reference_summary, generated_summary)

    # Extract F1 scores
    rouge1 = scores["rouge1"].fmeasure
    rouge2 = scores["rouge2"].fmeasure
    rougeL = scores["rougeL"].fmeasure

    return {"ROUGE-1": rouge1, "ROUGE-2": rouge2, "ROUGE-L": rougeL}


In [20]:
extractive_summary_lstm = str(extractive_summary_lstm).strip()
extractive_summary_bert = str(extractive_summary_bert).strip()

In [21]:
# Reference summary (Ground truth from dataset)
reference_summary = sample_summary # The actual summary from CNN/DailyMail dataset




# Compute ROUGE scores for each model
rouge_lstm = compute_rouge_scores(reference_summary, extractive_summary_lstm)
rouge_bert = compute_rouge_scores(reference_summary, extractive_summary_bert)
rouge_t5 = compute_rouge_scores(reference_summary, generated_summary_t5)
rouge_gpt = compute_rouge_scores(reference_summary, generated_summary_gpt)


In [22]:
# Print ROUGE scores for each model
print("\nROUGE Scores Comparison:")
print(f" Extractive LSTM: {rouge_lstm}")
print(f" Extractive BERT: {rouge_bert}")
print(f" Abstractive T5: {rouge_t5}")
print(f" Abstractive GPT: {rouge_gpt}")



ROUGE Scores Comparison:
 Extractive LSTM: {'ROUGE-1': 0.10619469026548671, 'ROUGE-2': 0.0, 'ROUGE-L': 0.07079646017699115}
 Extractive BERT: {'ROUGE-1': 0.11965811965811966, 'ROUGE-2': 0.0, 'ROUGE-L': 0.08547008547008547}
 Abstractive T5: {'ROUGE-1': 0.22429906542056074, 'ROUGE-2': 0.03809523809523809, 'ROUGE-L': 0.16822429906542055}
 Abstractive GPT: {'ROUGE-1': 0.10526315789473685, 'ROUGE-2': 0.0, 'ROUGE-L': 0.09022556390977443}


In [23]:
# Print results
print("\nROUGE Scores Comparison:")
print(f"Extractive LSTM: {rouge_lstm}")
print(f"Extractive BERT: {rouge_bert}")
print(f"Abstractive T5: {rouge_t5}")
print(f"Abstractive GPT: {rouge_gpt}")

# Compare models based on ROUGE scores
models_rouge = {
    "Extractive LSTM": rouge_lstm,
    "Extractive BERT": rouge_bert,
    "Abstractive T5": rouge_t5,
    "Abstractive GPT": rouge_gpt
}

# Find the best model for each ROUGE metric
best_rouge1 = max(models_rouge, key=lambda x: models_rouge[x]['ROUGE-1'])
best_rouge2 = max(models_rouge, key=lambda x: models_rouge[x]['ROUGE-2'])
best_rougeL = max(models_rouge, key=lambda x: models_rouge[x]['ROUGE-L'])

print("\nBest Model per ROUGE Score:")
print(f"ROUGE-1: {best_rouge1} with score {models_rouge[best_rouge1]['ROUGE-1']:.4f}")
print(f"ROUGE-2: {best_rouge2} with score {models_rouge[best_rouge2]['ROUGE-2']:.4f}")
print(f"ROUGE-L: {best_rougeL} with score {models_rouge[best_rougeL]['ROUGE-L']:.4f}")

# Determine overall best model (average ROUGE scores)
average_rouge = {model: sum(scores.values()) / len(scores) for model, scores in models_rouge.items()}
best_model_overall = max(average_rouge, key=average_rouge.get)

print(f"\nOverall Best Model: {best_model_overall} with an average ROUGE score of {average_rouge[best_model_overall]:.4f}")

# Compare Extractive vs Abstractive models
avg_extractive = (average_rouge["Extractive LSTM"] + average_rouge["Extractive BERT"]) / 2
avg_abstractive = (average_rouge["Abstractive T5"] + average_rouge["Abstractive GPT"]) / 2

print("\nExtractive vs. Abstractive Comparison:")
print(f"Average ROUGE Score (Extractive Models): {avg_extractive:.4f}")
print(f"Average ROUGE Score (Abstractive Models): {avg_abstractive:.4f}")

if avg_extractive > avg_abstractive:
    print("Extractive models perform better based on ROUGE scores.")
else:
    print("Abstractive models perform better based on ROUGE scores.")



ROUGE Scores Comparison:
Extractive LSTM: {'ROUGE-1': 0.10619469026548671, 'ROUGE-2': 0.0, 'ROUGE-L': 0.07079646017699115}
Extractive BERT: {'ROUGE-1': 0.11965811965811966, 'ROUGE-2': 0.0, 'ROUGE-L': 0.08547008547008547}
Abstractive T5: {'ROUGE-1': 0.22429906542056074, 'ROUGE-2': 0.03809523809523809, 'ROUGE-L': 0.16822429906542055}
Abstractive GPT: {'ROUGE-1': 0.10526315789473685, 'ROUGE-2': 0.0, 'ROUGE-L': 0.09022556390977443}

Best Model per ROUGE Score:
ROUGE-1: Abstractive T5 with score 0.2243
ROUGE-2: Abstractive T5 with score 0.0381
ROUGE-L: Abstractive T5 with score 0.1682

Overall Best Model: Abstractive T5 with an average ROUGE score of 0.1435

Extractive vs. Abstractive Comparison:
Average ROUGE Score (Extractive Models): 0.0637
Average ROUGE Score (Abstractive Models): 0.1044
Abstractive models perform better based on ROUGE scores.


In [25]:
from transformers import pipeline
from rouge_score import rouge_scorer

def compute_rouge_scores(reference, generated):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    return scorer.score(reference, generated)

# Load Summarization Pipelines
bert_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # BART (BERT-based)
t5_summarizer = pipeline("summarization", model="t5-small")  # T5 model

# Define batch sizes
batch_sizes = [8, 16, 32]

def evaluate_batch_sizes(summarizer, reference_summary, batch_sizes):
    results = {}
    for batch_size in batch_sizes:
        try:
            generated_summary_list = summarizer(reference_summary, batch_size=batch_size, truncation=True)
            if not generated_summary_list:  # Check if list is empty
                print(f"Warning: No summary generated for batch size {batch_size}")
                continue

            generated_summary = generated_summary_list[0].get('summary_text', '')  # Safe dictionary access
            if not generated_summary:  # Check for empty summary
                print(f"Warning: Empty summary for batch size {batch_size}")
                continue

            scores = compute_rouge_scores(reference_summary, generated_summary)
            results[batch_size] = scores
        except Exception as e:
            print(f"Error with batch size {batch_size}: {e}")

    return results

# Select a sample article and its summary
sample_article = undersampled_data[0]["article"]
reference_summary = undersampled_data[0]["highlights"]

# Run evaluation
rouge_bert_batches = evaluate_batch_sizes(bert_summarizer, reference_summary, batch_sizes)
rouge_t5_batches = evaluate_batch_sizes(t5_summarizer, reference_summary, batch_sizes)

# Print Results
print("\n🔹 BERT-Based (BART) ROUGE Scores by Batch Size:")
for batch, scores in rouge_bert_batches.items():
    print(f"Batch {batch}: ROUGE-1: {scores['rouge1'].fmeasure:.4f}, ROUGE-2: {scores['rouge2'].fmeasure:.4f}, ROUGE-L: {scores['rougeL'].fmeasure:.4f}")

print("\n🔹 T5 ROUGE Scores by Batch Size:")
for batch, scores in rouge_t5_batches.items():
    print(f"Batch {batch}: ROUGE-1: {scores['rouge1'].fmeasure:.4f}, ROUGE-2: {scores['rouge2'].fmeasure:.4f}, ROUGE-L: {scores['rougeL'].fmeasure:.4f}")

Device set to use cpu
Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Your max_length is set to 142, but your input_length is only 88. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your max_length is set to 142, but your input_length is only 88. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your max_length is set to 142, but your input_length is only 88. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your max_length is set to 200, but your input_length is only 105. Si


🔹 BERT-Based (BART) ROUGE Scores by Batch Size:
Batch 8: ROUGE-1: 0.8364, ROUGE-2: 0.8333, ROUGE-L: 0.8364
Batch 16: ROUGE-1: 0.8364, ROUGE-2: 0.8333, ROUGE-L: 0.8364
Batch 32: ROUGE-1: 0.8364, ROUGE-2: 0.8333, ROUGE-L: 0.8364

🔹 T5 ROUGE Scores by Batch Size:
Batch 8: ROUGE-1: 0.6087, ROUGE-2: 0.6000, ROUGE-L: 0.6087
Batch 16: ROUGE-1: 0.6087, ROUGE-2: 0.6000, ROUGE-L: 0.6087
Batch 32: ROUGE-1: 0.6087, ROUGE-2: 0.6000, ROUGE-L: 0.6087


In [24]:
import torch
from torch.optim import SGD, Adam
from transformers import BartForConditionalGeneration, T5ForConditionalGeneration, BartTokenizer, T5Tokenizer
from rouge_score import rouge_scorer
from datasets import load_dataset

# Load dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")["train"]
sample_article = dataset[0]["article"][:256]  # Reduce max tokens
reference_summary = dataset[0]["highlights"][:256]  # Reduce max tokens

# Load Tokenizers
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Load Models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# ROUGE Score Function
def compute_rouge_scores(reference, generated):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    return scorer.score(reference, generated)

# Training Function
def fine_tune_and_evaluate(model, tokenizer, optimizer_type, num_epochs=1):  # Reduce epochs
    optimizer = optimizer_type(model.parameters(), lr=5e-5)
    model.train()

    for epoch in range(num_epochs):
        torch.cuda.empty_cache()  # Prevent memory overflow

        inputs = tokenizer(sample_article, return_tensors="pt", max_length=256, truncation=True, padding="max_length")
        inputs = {key: value.to(device) for key, value in inputs.items()}
        labels = tokenizer(reference_summary, return_tensors="pt", max_length=150, truncation=True, padding="max_length").input_ids.to(device)

        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")

    # Evaluate model
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(sample_article, return_tensors="pt", max_length=256, truncation=True).to(device)
        summary_ids = model.generate(inputs.input_ids, max_length=150, num_beams=5, early_stopping=True)
        generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return compute_rouge_scores(reference_summary, generated_summary)

# Fine-tune and evaluate with SGD & Adam
optimizers = {"SGD": SGD, "Adam": Adam}
results = {}

for optimizer_name, optimizer in optimizers.items():
    print(f"\n🔹 Fine-tuning BART with {optimizer_name}")
    results[f"BART_{optimizer_name}"] = fine_tune_and_evaluate(bart_model, bart_tokenizer, optimizer)

    print(f"\n🔹 Fine-tuning T5 with {optimizer_name}")
    results[f"T5_{optimizer_name}"] = fine_tune_and_evaluate(t5_model, t5_tokenizer, optimizer)

# Print Results
print("\n🔹 ROUGE Scores for Different Optimizers:")
for model_optimizer, scores in results.items():
    print(f"\n{model_optimizer}:")
    print(f"  ROUGE-1: {scores['rouge1'].fmeasure:.4f}")
    print(f"  ROUGE-2: {scores['rouge2'].fmeasure:.4f}")
    print(f"  ROUGE-L: {scores['rougeL'].fmeasure:.4f}")



🔹 Fine-tuning BART with SGD
Epoch 1, Loss: 9.1117

🔹 Fine-tuning T5 with SGD


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1, Loss: 16.7363

🔹 Fine-tuning BART with Adam
Epoch 1, Loss: 9.0800

🔹 Fine-tuning T5 with Adam
Epoch 1, Loss: 15.9017

🔹 ROUGE Scores for Different Optimizers:

BART_SGD:
  ROUGE-1: 0.3441
  ROUGE-2: 0.1758
  ROUGE-L: 0.2581

T5_SGD:
  ROUGE-1: 0.3611
  ROUGE-2: 0.2286
  ROUGE-L: 0.3333

BART_Adam:
  ROUGE-1: 0.4706
  ROUGE-2: 0.2410
  ROUGE-L: 0.3529

T5_Adam:
  ROUGE-1: 0.3611
  ROUGE-2: 0.2286
  ROUGE-L: 0.3333
