# Import Necessary Libraries

In [None]:
# Imports
from google.colab import drive
import pandas as pd
from datasets import Dataset
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from rouge_score import rouge_scorer
import evaluate
import pickle

# Data Preprocessing

In [None]:
drive.mount('/content/drive')
train_path = "/content/drive/MyDrive/Fine-Tuning/cnn_dailymail/train.csv"
df = pd.read_csv(train_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...
...,...,...,...
287108,fffdfb56fdf1a12d364562cc2b9b1d4de7481dee,By . James Rush . Former first daughter Chelse...,Chelsea Clinton said question of running for o...
287109,fffeecb8690b85de8c3faed80adbc7a978f9ae2a,An apologetic Vanilla Ice has given his first ...,"Vanilla Ice, 47 - real name Robert Van Winkle ..."
287110,ffff5231e4c71544bc6c97015cdb16c60e42b3f4,America's most lethal sniper claimed he wished...,America's most lethal sniper made comment in i...
287111,ffff924b14a8d82058b6c1c5368ff1113c1632af,"By . Sara Malm . PUBLISHED: . 12:19 EST, 8 Mar...",A swarm of more than one million has crossed b...


In [None]:
# Function to count tokens
def count_tokens(text):
    return len(text.split())

# Perform a descriptive analysis of the token counts in both 'article' and 'highlights' columns
article_token_stats = df['article'].apply(count_tokens).describe()
highlights_token_stats = df['highlights'].apply(count_tokens).describe()

In [None]:
article_token_stats

Unnamed: 0,article
count,287113.0
mean,691.869494
std,336.500035
min,8.0
25%,443.0
50%,632.0
75%,877.0
max,2347.0


In [None]:
highlights_token_stats

Unnamed: 0,highlights
count,287113.0
mean,51.574101
std,21.256336
min,4.0
25%,38.0
50%,48.0
75%,60.0
max,1296.0


In [None]:
# Filtering the DataFrame without adding a new column to the filtered DataFrame
filtered_df = df[
    (df['article'].apply(count_tokens) <= 512) &
    (df['highlights'].apply(count_tokens) <= 150)
]

# Display the filtered DataFrame
filtered_df


Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
5,0004306354494f090ee2d7bc5ddbf80b63e80de6,He's been accused of making many a fashion fau...,Prime Minister and his family are enjoying an ...
13,000cd1ee0098c4d510a03ddc97d11764448ebac2,Louis van Gaal said he had no option but to su...,Manchester United beat Southampton 2-1 at St M...
15,001097a19e2c96de11276b3cce11566ccfed0030,"For most people, it has become a travel essent...",Half of Brits admit to checking work e-mails w...
...,...,...,...
287096,fffb01b12e9c495e127eeab0bfe70ac5b2066fe0,A 27-year-old Catholic charity worker has been...,Kristel Padasas was volunteering at a mass off...
287099,fffc255b6446f381c781b6d8b5aa3651db333d91,By . Joshua Gardner . An 89-year-old Washingto...,"Miyo Koba of Moses Lake, Washington was threat..."
287101,fffc526273772fe55ebd9feb2deb5bf18e1dc258,"By . Kerry Mcdermott . PUBLISHED: . 06:15 EST,...",Clockwise M25 closed between junctions 25 and ...
287102,fffc82af3daa218f2e08e69ec325c2e1847a5490,Real Madrid are looking to extend their 21-gam...,Real Madrid beat San Lorenzo in Club World Cup...


In [None]:
## Select a subset of the train dataset if needed
sample_filtered_df = filtered_df.sample(n=5000, random_state=42)
sample_filtered_df

Unnamed: 0,id,article,highlights
203154,930221ca15ad24fa0b9a9d9d4e799f8afa320c98,"ISLAMABAD, Pakistan (CNN) -- Pakistan has inde...",NEW: NATO force expects no impact on ability t...
14124,2813679140cb69776e64ee9d87ffa12c82a7394a,Wonderful moments in life can come at the mos...,Homeless man was filmed playing the public pia...
78669,dee351470f101ae014d080898b60ba56b09ba841,A Michigan man is celebrating the return of hi...,Robert Cortis of Farmington Hills said he was ...
74900,d4583b8aafd1f94e88f76fbe6a63354c51938ada,"By . Damien Gayle . PUBLISHED: . 22:48 EST, 10...",It was Michelle Porter's birthday when her boa...
150331,4e5dc1a12edcc16df973e7c3b92585bd9c000097,(CNN) -- A Florida man fleeing a traffic stop ...,Incident started after deputies pulled over 20...
...,...,...,...
196613,8a738e6dff859aca5ec507ad2ddcedab4b0bb08d,(CNN) -- Actress Emma Watson not only stirred ...,Emma Watson speaks as part of her role as a U....
39363,6f34b5b2108bf55385c90e4fbeb11b5f09a1bd0c,(Mashable) -- Verizon customers interested in ...,Verizon eliminating the one-year contract opti...
275811,f149f0dfd80dd3c604b3c610bd718bc40399e40d,"Makhachkala, Russia (CNN) -- The building, No....",U.S. and Russian authorities interview Tamerla...
31349,592229042f179bb90d84a47f6eb364f6a3c5ea0f,"By . Talal Musa . PUBLISHED: . 12:00 EST, 2 Oc...",Distinct styling and easy to connect via Bluet...


In [None]:
# Load the T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small") #t5-base or t5-large
model = T5ForConditionalGeneration.from_pretrained("t5-small") #t5-base or t5-large

# Load the dataset from the Pandas DataFrame
dataset = Dataset.from_pandas(sample_filtered_df)

def preprocess_function(examples):
    inputs = examples['article']
    targets = examples['highlights']

    # Tokenize the article (input)
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize the highlights (target)
    labels = tokenizer(targets, max_length=150, truncation=True, padding="max_length")

    # Set labels and input_ids as PyTorch tensors
    model_inputs["labels"] = labels["input_ids"]
    model_inputs = {k: torch.tensor(v) for k, v in model_inputs.items()}

    return model_inputs

# Apply the preprocessing function to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
val_path = "/content/drive/MyDrive/Fine-Tuning/cnn_dailymail/validation.csv"
val = pd.read_csv(val_path)
val

Unnamed: 0,id,article,highlights
0,61df4979ac5fcc2b71be46ed6fe5a46ce7f071c3,"Sally Forrest, an actress-dancer who graced th...","Sally Forrest, an actress-dancer who graced th..."
1,21c0bd69b7e7df285c3d1b1cf56d4da925980a68,A middle-school teacher in China has inked hun...,Works include pictures of Presidential Palace ...
2,56f340189cd128194b2e7cb8c26bb900e3a848b4,A man convicted of killing the father and sist...,"Iftekhar Murtaza, 29, was convicted a year ago..."
3,00a665151b89a53e5a08a389df8334f4106494c2,Avid rugby fan Prince Harry could barely watch...,Prince Harry in attendance for England's crunc...
4,9f6fbd3c497c4d28879bebebea220884f03eb41a,A Triple M Radio producer has been inundated w...,Nick Slater's colleagues uploaded a picture to...
...,...,...,...
13363,e93f721ba4949f21f33549c4a21d55ff456af979,All shops will be allowed to offer ‘click and ...,Shops won't have to apply for planning permiss...
13364,8df19a570ad14119a7d00f3bbe864fedf8c1691d,Mo Farah has had his nationality called into q...,Mo Farah broke the European half-marathon reco...
13365,2fdd5f89aa26e91ceea9b0ef264abfcfc3e6fa2e,Wolves kept their promotion hopes alive with a...,Wolves are three points off the play-off place...
13366,530d7b18d7a715b368b0745f9dfebfe353adeda8,A Brown University graduate student has died ...,"Hyoun Ju Sohn, a 25-year-old doctoral student,..."


In [None]:
# Filtering the DataFrame with parentheses around each condition
filtered_val = val[
    (val['article'].apply(count_tokens) <= 512) &
    (val['highlights'].apply(count_tokens) <= 150)
]

# Display the filtered DataFrame
filtered_val

Unnamed: 0,id,article,highlights
0,61df4979ac5fcc2b71be46ed6fe5a46ce7f071c3,"Sally Forrest, an actress-dancer who graced th...","Sally Forrest, an actress-dancer who graced th..."
2,56f340189cd128194b2e7cb8c26bb900e3a848b4,A man convicted of killing the father and sist...,"Iftekhar Murtaza, 29, was convicted a year ago..."
4,9f6fbd3c497c4d28879bebebea220884f03eb41a,A Triple M Radio producer has been inundated w...,Nick Slater's colleagues uploaded a picture to...
11,57f88e1055acb2b857a378a7c64de2bfd4563658,(CNN)After months racing not only to treat Ebo...,"WHO leader: This vaccine could be ""the first p..."
12,927e5b6d027f106cfa8ae4b5d060a6fed1d98c5c,Radamel Falcao has been reduced to tears by hi...,Silvano Espindola has spoken about Radamel Fal...
...,...,...,...
13356,849ea2fbce099989167cf65fd2026f562e49e2c2,"The birth of the forthcoming royal baby, due i...","Birth of fourth-in-line, expected in April, wi..."
13362,a06a40c15f3ee9a5367121410ae621c01a79eafd,Per Mertesacker says that a frank team meeting...,Arsenal face Monaco in the second leg of their...
13363,e93f721ba4949f21f33549c4a21d55ff456af979,All shops will be allowed to offer ‘click and ...,Shops won't have to apply for planning permiss...
13364,8df19a570ad14119a7d00f3bbe864fedf8c1691d,Mo Farah has had his nationality called into q...,Mo Farah broke the European half-marathon reco...


In [None]:
sample_filtered_val = filtered_val.sample(n=2000, random_state=42)
sample_filtered_val

Unnamed: 0,id,article,highlights
2414,d0416d2c82438ea96bd4a46ecd1c8bc46b629bd4,"(CNN)If it were easy, any company or governmen...",Georgia company equips plane to transport Ebol...
13002,b64655d5f0d71af62b026324341177e360e544ae,Louis van Gaal ended his post-match press conf...,Manchester United manager responded angrily to...
6862,19d528be0830bf97fbfb5e9122504850cfa5764b,Forget switchblades it's shears that will be t...,Mongrel Mob and Black Power shared a violent r...
6324,bfe5a5cac92f76dc42e6054417f67c89beecdcc6,"(CNN)Michael Graves, an American architect and...",American architect Michael Graves dies in Prin...
2228,5caabc339827cb0249f63f500ddf897570d208b6,Tim Sherwood insists Aston Villa are ready for...,Aston Villa play Sunderland in the Premier Lea...
...,...,...,...
6916,61499e59f4aed4c43431958a5d901f9a9c83d579,Motorists in Los Angeles were stunned after a ...,The historic aircraft was built by Boeing in 1...
12974,f9ec849ee3aee0c00f9e0ae17123d77efd0faaad,This unusual weeping willow was snapped by an ...,Unusual weeping willow was snapped by amateur ...
5347,58e36da47f0632bccac59797aba6efb4418be2c7,Sunrise weather presenter Edwina Bartholomew w...,Edwina Bartholomew was the only Sunrise presen...
4837,e94421b0bbc1b09a1c9f4c8cb54d9ac5c75483db,Grigor Dimitrov beat 17-time Grand Slam champi...,"Grigor Dimitrov beat Roger Federer 6-2, 1-6, 7..."


In [None]:
# Load the validation dataset
val_dataset = Dataset.from_pandas(sample_filtered_val)

# Preprocess (tokenize) the validation dataset
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
test_path = "/content/drive/MyDrive/Fine-Tuning/cnn_dailymail/test.csv"
test = pd.read_csv(test_path)
test

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."
...,...,...,...
11485,ed8674cc15b29a87d8df8de1efee353d71122272,Our young Earth may have collided with a body ...,Oxford scientists say a Mercury-like body stru...
11486,2f58d1a99e9c47914e4b1c31613e3a041cd9011e,A man facing trial for helping his former love...,Man accused of helping former lover kill woman...
11487,411f6d57825161c3a037b4742baccd6cd227c0c3,A dozen or more metal implements are arranged ...,Marianne Power tried the tuning fork facial at...
11488,b5683ef8342056b17b068e0d59bdbe87e3fe44ea,Brook Lopez dominated twin brother Robin with ...,Brooklyn Nets beat the Portland Trail Blazers ...


In [None]:
# Filtering the DataFrame with parentheses around each condition
filtered_test = test[
    (test['article'].apply(count_tokens) <= 512) &
    (test['highlights'].apply(count_tokens) <= 150)
]

# Display the filtered DataFrame
filtered_test

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
5,5ed5e3fbd235a8046cd3b87f4a1aa51b856c8ec3,This is the moment that a crew of firefighters...,Giant pig fell into the swimming pool at his h...
...,...,...,...
11480,b1f32a5b4dad7cdac9b133cd4b82d0b1c166dc0d,The five-year-old namesake grandson of famed c...,"Jerry Tarkanian, five, was taken to the hospit..."
11482,c8dbe71e3a4713b7973493232feef0f8b61a8c11,Lydia Ko shot a 2-over 74 on Saturday in the A...,Lydia Ko shot her second straight over-par rou...
11484,35d1c0421d62b7f41aa86ec6a8d43cf9c3e35356,Backache is striking us younger than ever – wi...,Some 45 per cent of under 30s surveyed said th...
11488,b5683ef8342056b17b068e0d59bdbe87e3fe44ea,Brook Lopez dominated twin brother Robin with ...,Brooklyn Nets beat the Portland Trail Blazers ...


In [None]:
sample_filtered_test = filtered_test.sample(n=2000, random_state=42)
sample_filtered_test

Unnamed: 0,id,article,highlights
437,ecfbc6b28ce4ed0728827bba178c2ed51c24c21e,John Higgins narrowly defeated Judd Trump 5-4 ...,John Higgins defeated Judd Trump 5-4 at the Ch...
2749,87657880f9f02dc76068df09bac4f864eafa0ac9,Rafa Benitez has admitted he tried to raid for...,Andre Schurrle joined Wolfsburg from Chelsea i...
7791,dbde0a725a404d05660ff806661a24e40a4e4f1f,"Almost 6,000 migrants were rescued in the Medi...","160,000 people made same journey last year oft..."
7247,72148a863d465a776592f58708262fdc8c38fa4f,A Queensland man was shocked when he discovere...,Queensland man shocked delivery from 5km away ...
1578,2fa6e5a2afdf13e4e6b1a0ebd9f0f635cfbe8c85,Police in Idaho are trying to track down the m...,"Crash occurred in Lewiston, Idaho, about 8am W..."
...,...,...,...
8700,d40aa4084543b8105fbe2a70a4ecdf299237f7b7,Bobby Moore's granddaughter celebrated her wed...,"Poppy, 23, married childhood sweetheart, Sam M..."
11349,f834aa0383a40a874e0529871d8505131ec4a3a0,"(CNN)Korea's buddae-jjigae -- or ""army stew"" a...",Anthony Bourdain teaches Anderson Cooper a Kor...
4669,ed7c912590f9e23b9adf99963f7dd96338947658,Police have found the car a Queensland teenage...,18-year-old Billy-Anne Huxham was abducted fro...
3370,6c1f869d76e1d42a230824cee216318876977929,New York State Senator Jeffrey Klein has apolo...,Jeffrey Klein apologizes for tweet with search...


In [None]:
# Load the test dataset
test_dataset = Dataset.from_pandas(sample_filtered_test)

# Preprocess (tokenize) the test dataset
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

# Training

In [None]:
# Training configurations
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
print(torch.cuda.is_available()) # Check if a GPU is available and CUDA is enabled

True


In [None]:
# Start the training
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.8106,0.673534
2,0.7656,0.656743
3,0.7494,0.648514
4,0.7445,0.643477
5,0.7432,0.642181


TrainOutput(global_step=6250, training_loss=0.8353119970703125, metrics={'train_runtime': 1136.508, 'train_samples_per_second': 21.997, 'train_steps_per_second': 5.499, 'total_flos': 3383545036800000.0, 'train_loss': 0.8353119970703125, 'epoch': 5.0})

# Saving the Model

In [None]:
# Directory where the model and tokenizer will be saved
save_directory = "/content/drive/MyDrive/Fine-Tuning/Models/T5"

# Save the trained model
with open(f"{save_directory}/trainer.pkl", "wb") as f:
    pickle.dump(trainer, f)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

('/content/drive/MyDrive/Fine-Tuning/Models/T5/tokenizer_config.json',
 '/content/drive/MyDrive/Fine-Tuning/Models/T5/special_tokens_map.json',
 '/content/drive/MyDrive/Fine-Tuning/Models/T5/spiece.model',
 '/content/drive/MyDrive/Fine-Tuning/Models/T5/added_tokens.json')

# Loading the Model

In [None]:
# Load the trained model
with open(f"{save_directory}/trainer.pkl", "rb") as f:
    trainer = pickle.load(f)

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained(save_directory)

# Hyperparameter Tuning

In [None]:
import optuna

# Define the objective function that Optuna will optimize
def objective(trial):
    # Define the hyperparameters that Optuna will tune
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16])
    num_train_epochs = trial.suggest_int("num_train_epochs", 3, 10)

    # Define the training arguments with the suggested hyperparameters
    training_args = Seq2SeqTrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=learning_rate,  # Tuned by Optuna
        per_device_train_batch_size=per_device_train_batch_size,  # Tuned by Optuna
        per_device_eval_batch_size=4,  # Fixed value
        weight_decay=weight_decay,  # Tuned by Optuna
        save_total_limit=3,
        num_train_epochs=num_train_epochs,  # Tuned by Optuna
        predict_with_generate=True,
        fp16=True,  # Enable FP16 if using a GPU that supports it
        logging_dir='./logs',
        logging_steps=100,
    )

    # Initialize the Seq2SeqTrainer with the training arguments
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer
    )

    # Train the model
    trainer.train()

    # Evaluate the model and return the desired metric (e.g., eval_loss)
    eval_results = trainer.evaluate(eval_dataset=tokenized_val_dataset)

    # Return the metric that you want to optimize (e.g., eval_loss)
    return eval_results["eval_loss"]

# Create an Optuna study and set the number of trials (experiments)
study = optuna.create_study(direction="minimize")  # Minimize the eval_loss
study.optimize(objective, n_trials=10)  # Specify how many trials to run

# Best set of hyperparameters found
best_trial = study.best_trial
print(f"Best hyperparameters: {best_trial.params}")


# Evaluation

In [None]:
# Evaluate the model on the test dataset
val_results = trainer.evaluate(eval_dataset=tokenized_val_dataset)

# Display the results
val_results

{'eval_loss': 0.7792068123817444,
 'eval_runtime': 33.519,
 'eval_samples_per_second': 59.668,
 'eval_steps_per_second': 14.917,
 'epoch': 5.0}

Evaluation Results Summary:

Evaluation Loss: The model achieved an evaluation loss of 0.779, indicating its performance on the validation dataset. This loss suggests that the model has effectively learned from the data, but further tuning may be necessary for improvement.
Evaluation Runtime: The total runtime for the evaluation process was approximately 33.52 seconds, reflecting the evaluation's efficiency.
Samples Processed Per Second: The model processed around 59.67 samples per second, demonstrating its capability to handle multiple samples efficiently during evaluation.
Steps Per Second: The evaluation process executed 14.92 steps per second, providing insight into the model's step execution speed.
Epoch: The results are from the 5th epoch, indicating that the model has undergone several training cycles, contributing to its current performance.

# Prediction

In [None]:
## Select a subset of the test dataset if needed
#subset_test_dataset = tokenized_test_dataset.select(range(2000))

## Make predictions only for the selected records
predictions = trainer.predict(tokenized_test_dataset)
generated_summaries = predictions.predictions
#generated_summaries = tokenized_test_dataset
true_summaries = tokenized_test_dataset["labels"]



In [None]:
# Decode the predictions
decoded_summaries = tokenizer.batch_decode(generated_summaries, skip_special_tokens=True)

# Decode the references
true_summaries_decoded = tokenizer.batch_decode(true_summaries, skip_special_tokens=True)

In [None]:
# Load the ROUGE metric
rouge_metric = evaluate.load("rouge")

# Assuming decoded_summaries and true_summaries_decoded are defined
results = rouge_metric.compute(predictions=decoded_summaries, references=true_summaries_decoded)

# Display the results
results

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.2666155040338209,
 'rouge2': 0.13413487849862746,
 'rougeL': 0.22305003625158723,
 'rougeLsum': 0.2232073828888634}

Prediction Results Summary:

ROUGE-1 Score: The model achieved a ROUGE-1 score of 0.267, indicating that approximately 26.7% of the unigrams in the generated summaries match those in the reference summaries. This score reflects the model's ability to capture individual word matches effectively.

ROUGE-2 Score: The ROUGE-2 score is 0.134, suggesting that about 13.4% of the bigrams in the generated summaries align with those in the reference summaries. This score is lower than the ROUGE-1 score, highlighting the model's challenges in maintaining word pairs or phrases.

ROUGE-L Score: The ROUGE-L score stands at 0.223, indicating that 22.3% of the longest common subsequences in the generated summaries match with the reference summaries. This metric shows the model's ability to maintain the order and flow of information.

ROUGE-L Sum Score: The ROUGE-L sum score is 0.223, which is consistent with the ROUGE-L score. This reinforces the model's overall performance in generating coherent and contextually relevant summaries based on the longest common subsequences.

These ROUGE scores provide valuable insights into the model's performance in generating summaries. While the ROUGE-1 score shows a decent level of unigram matching, the lower ROUGE-2 score indicates room for improvement in capturing bigger phrases. Future work could focus on fine-tuning the model further to enhance its ability to generate summaries that are not only contextually accurate but also richer in phrase structures.

# Deployment

In [None]:
# Example input text
input_text = "As the world grapples with the increasingly severe impacts of climate change, nations are preparing to convene for the 28th United Nations Climate Change Conference, known as COP28, scheduled to take place in Dubai from November 30 to December 12, 2023. This year’s conference comes at a critical juncture, with global temperatures rising and extreme weather events becoming more frequent and intense. In the lead-up to COP28, countries are under pressure to make significant commitments to reduce greenhouse gas emissions and limit global warming to 1.5 degrees Celsius above pre-industrial levels, as outlined in the Paris Agreement. The scientific community has been vocal about the need for urgent action, highlighting that without drastic changes, the consequences for ecosystems, human health, and economies could be catastrophic. The conference will bring together world leaders, negotiators, scientists, and activists, all aiming to forge a consensus on climate action. Key topics on the agenda include transitioning to renewable energy sources, enhancing carbon capture technologies, and addressing the loss and damage faced by vulnerable nations affected by climate change. Many countries have already made pledges to cut emissions, but the gap between commitments and actual reductions remains a significant concern. The Intergovernmental Panel on Climate Change (IPCC) has warned that while some progress has been made, it is not enough to avert the worst impacts of climate change. To close this gap, governments are being urged to enhance their nationally determined contributions (NDCs) and consider more ambitious targets. Civil society organizations are mobilizing ahead of COP28, demanding that governments prioritize climate justice and equity. Activists emphasize the importance of involving indigenous communities and marginalized groups in decision-making processes, as they are often the most affected by climate change. The conference will also feature discussions on climate finance, with developed nations being urged to fulfill their commitment of providing $100 billion annually to support developing countries in their climate efforts. As the date approaches, the stakes have never been higher, and the world will be watching to see if COP28 can deliver the necessary commitments to combat climate change effectively."

In [None]:
# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Move the tensors to GPU if available
if torch.cuda.is_available():
   inputs = {key: value.to('cuda') for key, value in inputs.items()}

# Generate the output
with torch.no_grad():  # Disable gradient calculation
    output_ids = model.generate(**inputs)

# Decode the output
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Generated summary:", output_text)

Generated summary: The 28th United Nations Climate Change Conference is scheduled to take place in Dubai from November 30 to


Comment on Model Output:

The model produced an incomplete response regarding the 28th United Nations Climate Change Conference, scheduled to take place in Dubai from November 30. While it indicates the conference's location and date, the output lacks essential details such as the conference's duration, key themes, expected participants, and the significance of this event in the context of global climate action.

To enhance the quality and informativeness of the generated content, the model could include:

Specific Objectives: What are the main goals for COP28?
Major Issues: Key topics that will be addressed during the conference, such as carbon emissions reduction, renewable energy initiatives, or climate finance.
Significance: The impact of this conference on international climate agreements and national policies.
Providing a more comprehensive overview will not only improve the model's output but also help engage readers with the vital issues at stake during this significant global event.

# Improvement ideas

To improve the model's performance, several strategies can be explored:

1.Increase the Training Data: Instead of using the current 5,000 training examples, a larger dataset can be utilized to enhance the model's ability to generalize better. Expanding the training data could lead to more accurate and robust summaries.

2.Try Larger T5 Variants: The model in this project uses a smaller T5 variant. Experimenting with larger versions like t5-base or t5-large could yield better results, as these models have a higher number of parameters, allowing them to capture more complex patterns.

3.Experiment with LLaMA 3: LLaMA 3 supports a maximum input length of 8,000 tokens, which is significantly more than the 512-token limit of T5. This makes LLaMA 3 a better option for handling longer input sequences, such as large documents or datasets with longer context dependencies.

4.Hyperparameter Tuning: Further improvements can be achieved through hyperparameter tuning. Optimizing parameters like learning rate, batch size, and warm-up steps can help the model converge better and improve performance on the summarization task.