In [5]:
!pip -qq install transformers datasets rouge_score nltk


In [6]:
!python -m spacy download en_core_web_sm
nltk.download('stopwords')


Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


NameError: name 'nltk' is not defined

In [7]:
!pip install nltk
import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
# Imports
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from rouge_score import rouge_scorer
from torch.utils.data import DataLoader
from torch import nn
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import os

# Ensure necessary NLTK data is downloaded
nltk.download('stopwords')
nltk.download('wordnet')

# Device setting
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load and Prepare Model
t5_model_name = 't5-base'
print(f"Loading T5 model: {t5_model_name}")
tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
model = T5ForConditionalGeneration.from_pretrained(t5_model_name).to(device)

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    words = text.split()
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word.lower() not in stop_words]
    return ' '.join(words)

# Function to remove rows with missing data
def remove_missing_data(dataset, columns):
    return dataset.filter(lambda x: all(x[col] is not None for col in columns))

# Function to load and clean dataset
def load_and_clean_data():
    train_data = load_dataset("ragha92/FNS_Summarization", split="train")
    validation_data = load_dataset("ragha92/FNS_Summarization", split="validation")
    test_data = load_dataset("ragha92/FNS_Summarization", split="test")

    columns = ['Annual Reports', 'Gold Summaries']
    train_data = remove_missing_data(train_data, columns)
    validation_data = remove_missing_data(validation_data, columns)
    test_data = remove_missing_data(test_data, columns)

    return train_data, validation_data, test_data

# Summarization function for T5 with updated hyperparameters
def summarize_text_t5(article):
    input_text = f"summarize: {article}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = model.generate(
        input_ids,
        max_length=512,
        min_length=50,
        length_penalty=1.5,
        num_beams=3,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Checkpoint save/load functions
def save_checkpoint(model, optimizer, epoch, path="t5_checkpoint.pt"):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, path)
    print(f"Checkpoint saved at epoch {epoch}")

def load_checkpoint(path="t5_checkpoint.pt"):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    print(f"Checkpoint loaded. Starting from epoch {epoch + 1}")
    return epoch

# Function to fine-tune the model
def fine_tune_model(train_data, epochs=3, learning_rate=5e-5, save_interval=1):
    model.train()
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    dataloader = DataLoader(train_data, batch_size=2, shuffle=True)

    starting_epoch = 0
    if "t5_checkpoint.pt" in os.listdir():
        starting_epoch = load_checkpoint("t5_checkpoint.pt")

    for epoch in range(starting_epoch, epochs):
        total_loss = 0
        for batch_idx, batch in enumerate(dataloader):
            optimizer.zero_grad()
            input_texts = batch['Annual Reports']
            target_summaries = batch['Gold Summaries']

            inputs = tokenizer(input_texts, return_tensors="pt", max_length=512, truncation=True, padding=True).input_ids.to(device)
            targets = tokenizer(target_summaries, return_tensors="pt", max_length=512, truncation=True, padding=True).input_ids.to(device)

            outputs = model(input_ids=inputs, labels=targets)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        if (epoch + 1) % save_interval == 0:
            save_checkpoint(model, optimizer, epoch, path="t5_checkpoint.pt")

        print(f"Epoch {epoch+1}/{epochs} Average Loss: {total_loss / len(dataloader)}")

# Evaluation using ROUGE and BLEU with smoothing
def evaluate_model(dataset):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': [], 'bleu': []}
    smooth = SmoothingFunction().method4

    for item in dataset:
        article = preprocess_text(item['Annual Reports'])
        gold_summary = preprocess_text(item['Gold Summaries'])
        if article:
            generated_summary = summarize_text_t5(article)
            score = scorer.score(gold_summary, generated_summary)
            scores['rouge1'].append(score['rouge1'].fmeasure)
            scores['rouge2'].append(score['rouge2'].fmeasure)
            scores['rougeL'].append(score['rougeL'].fmeasure)
            reference = gold_summary.split()
            candidate = generated_summary.split()
            bleu_score = sentence_bleu([reference], candidate, smoothing_function=smooth)
            scores['bleu'].append(bleu_score)

    avg_rouge1 = sum(scores['rouge1']) / len(scores['rouge1'])
    avg_rouge2 = sum(scores['rouge2']) / len(scores['rouge2'])
    avg_rougeL = sum(scores['rougeL']) / len(scores['rougeL'])
    avg_bleu = sum(scores['bleu']) / len(scores['bleu'])
    return avg_rouge1, avg_rouge2, avg_rougeL, avg_bleu

# Main function
def main():
    print("Loading and cleaning datasets...")
    train_data, validation_data, test_data = load_and_clean_data()

    print("Fine-tuning model...")
    fine_tune_model(train_data, epochs=3)

    print("Evaluating on the validation set...")
    avg_rouge1, avg_rouge2, avg_rougeL, avg_bleu = evaluate_model(validation_data)
    print(f"Validation ROUGE-1: {avg_rouge1}, ROUGE-2: {avg_rouge2}, ROUGE-L: {avg_rougeL}, BLEU: {avg_bleu}")

    print("Evaluating on the test dataset...")
    avg_rouge1, avg_rouge2, avg_rougeL, avg_bleu = evaluate_model(test_data)
    print(f"Test Set ROUGE-1: {avg_rouge1}")
    print(f"Test Set ROUGE-2: {avg_rouge2}")
    print(f"Test Set ROUGE-L: {avg_rougeL}")
    print(f"Test Set BLEU: {avg_bleu}")

# Run the main function
if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


Using device: cuda
Loading T5 model: t5-base




Loading and cleaning datasets...


Filter:   0%|          | 0/2060 [00:00<?, ? examples/s]

Filter:   0%|          | 0/257 [00:00<?, ? examples/s]

Filter:   0%|          | 0/259 [00:00<?, ? examples/s]

Fine-tuning model...




Checkpoint saved at epoch 0
Epoch 1/3 Average Loss: 3.5419204562034423
Checkpoint saved at epoch 1
Epoch 2/3 Average Loss: 3.0776227133222
Checkpoint saved at epoch 2
Epoch 3/3 Average Loss: 2.9360366162310525
Evaluating on the validation set...
Validation ROUGE-1: 0.22833947827617718, ROUGE-2: 0.13165483270876008, ROUGE-L: 0.15838318147975344, BLEU: 0.08291126732736354
Evaluating on the test dataset...
Test Set ROUGE-1: 0.20656176252908742
Test Set ROUGE-2: 0.10917820702142943
Test Set ROUGE-L: 0.13878628444545002
Test Set BLEU: 0.06127653889472084
