***Install and Import Required Libraries, Download NLTK Resources, and Generate Extractive Summaries Using the Word Frequency Algorithm (WFA)***

In [None]:
!pip install rouge-score
# Install the rouge-score module.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
import heapq
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from rouge_score import rouge_scorer # Import the installed rouge_scorer module.

# Download NLTK resources.
# 'punkt' serves older NLTK releases; newer releases load the 'punkt_tab'
# resource for word_tokenize/sent_tokenize instead, so fetch both to keep the
# tokenizers working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def word_frequency(text):
    """Count how often each meaningful word occurs in ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``. English
    stopwords and tokens that contain no letter or digit (pure punctuation
    such as ``,`` or ``.``) are ignored so they cannot inflate sentence scores.

    Args:
        text: The document to analyse. Anything that is not a ``str`` (for
            example the float NaN pandas yields for an empty cell) is treated
            as an empty document.

    Returns:
        A ``defaultdict(int)`` mapping each kept token to its occurrence count.
    """
    word_freq = defaultdict(int)
    if not isinstance(text, str):
        return word_freq

    stopwords_set = set(stopwords.words('english'))
    for word in word_tokenize(text.lower()):
        if word in stopwords_set:
            continue
        # Skip punctuation-only tokens; they are not words.
        if not any(ch.isalnum() for ch in word):
            continue
        word_freq[word] += 1
    return word_freq

def summarize_text(text, num_sentences=2, preserve_order=False):
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    Each sentence is scored by summing the document-level frequency (from
    ``word_frequency``) of every token it contains; the ``num_sentences``
    best-scoring sentences are joined with single spaces.

    Args:
        text: The document to summarise. Non-string or blank input (for
            example the float NaN pandas yields for an empty cell) produces
            an empty summary instead of raising on ``.lower()``.
        num_sentences: Maximum number of sentences to keep.
        preserve_order: When ``False`` (default, the original behaviour) the
            selected sentences are emitted from highest to lowest score. When
            ``True`` they are emitted in the order they appear in ``text``.

    Returns:
        The summary as a single string; ``''`` when nothing can be selected.
    """
    if not isinstance(text, str) or not text.strip():
        return ''

    word_freq = word_frequency(text)
    sentences = sent_tokenize(text)

    sentence_scores = defaultdict(int)
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word in word_freq:
                sentence_scores[sentence] += word_freq[word]

    summary_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    if preserve_order:
        # Scores are keyed by sentence text, so the first occurrence fixes the position.
        summary_sentences.sort(key=sentences.index)
    return ' '.join(summary_sentences)

# Load the dataset and keep only the text column, renamed for the WFA stage.
df = pd.read_excel('/content/dataset.xlsx')
df = df.rename(columns={'Text1': 'source_textwfa'})

# Empty spreadsheet cells are read back as NaN, which has no `.lower()` and
# would crash the summariser, so those rows are dropped. Remaining cells are
# coerced to str so a numeric cell cannot crash it either. `.copy()` makes the
# frame independent of the one it was selected from before a column is added.
df = df[['source_textwfa']].dropna().astype(str).copy()

# Apply extractive summarization to generate initial summaries
text_column = df['source_textwfa']
summaries = [summarize_text(text) for text in text_column]
df['summary'] = summaries

# Save the preprocessed data for the fine-tuning cell
df.to_excel('/content/summarized_data.xlsx', index=False)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


***Load the Pretrained T5 Model for Abstractive Summarization, Fine-Tune It on WFA-Generated Summaries, and Evaluate the Summaries Using ROUGE Metrics***

In [None]:
# Ensure pip, setuptools, and wheel are up-to-date
!pip install --upgrade pip setuptools wheel

# Install transformers and tokenizers
!pip install transformers tokenizers

# Install simplet5
!pip install simplet5

# Install rouge_score
!pip install rouge_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from rouge_score import rouge_scorer
import torch

# Load the WFA summaries written by the extractive-summarisation cell
df = pd.read_excel('/content/summarized_data.xlsx')

# Rename columns to the source/target naming used for fine-tuning
df = df.rename(columns={'summary': 'target_text', 'source_textwfa': 'source_text'})

# Empty cells are read back from Excel as NaN, which the tokenizer cannot
# encode; drop such rows up front instead of failing mid-training.
df = df.dropna(subset=['source_text', 'target_text'])

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Work on explicit copies: assigning a column on the frames returned by the
# split would otherwise trigger pandas' SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()

# Prepend the T5 task prefix "summarize: " to source_text
train_df['source_text'] = "summarize: " + train_df['source_text']
test_df['source_text'] = "summarize: " + test_df['source_text']

# Initialize the T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-base')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define a function to compute ROUGE scores
def compute_rouge(predictions, targets):
    """Score one generated summary against its reference with ROUGE-1/2/L.

    Args:
        predictions: The generated (candidate) summary text.
        targets: The reference summary text.

    Returns:
        The mapping returned by ``RougeScorer.score``, keyed by 'rouge1',
        'rouge2' and 'rougeL'.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    # RougeScorer.score expects (target, prediction). Passing them the other
    # way round swaps precision and recall (the F-measure is symmetric).
    return scorer.score(targets, predictions)

# Imported once here; it previously sat inside the epoch loop and was
# re-executed at the end of every epoch.
from transformers import get_linear_schedule_with_warmup


def train_one_epoch(model, tokenizer, optimizer, train_df, scheduler=None):
    """Run one pass over ``train_df``, one example per optimisation step.

    Args:
        model: The seq2seq model being fine-tuned.
        tokenizer: Tokenizer used to encode the 'source_text' and 'target_text' columns.
        optimizer: Optimizer stepped after every example.
        train_df: DataFrame with 'source_text' and 'target_text' columns.
        scheduler: Optional learning-rate scheduler, stepped after every optimizer step.

    Returns:
        The mean training loss over the rows of ``train_df``.
    """
    # Put the inputs wherever the model's weights live (CPU or GPU).
    device = next(model.parameters()).device
    model.train()
    train_loss = 0

    for _, row in train_df.iterrows():
        # Tokenize inputs and targets
        input_ids = tokenizer.encode(row['source_text'], return_tensors='pt', max_length=512, truncation=True).to(device)
        labels = tokenizer.encode(row['target_text'], return_tensors='pt', max_length=512, truncation=True).to(device)

        # Forward/backward pass and parameter update
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        optimizer.zero_grad()

    return train_loss / len(train_df)


def evaluate_rouge(model, tokenizer, test_df):
    """Generate a summary for every row of ``test_df`` and score it with ROUGE.

    Args:
        model: The model used for generation.
        tokenizer: Tokenizer used to encode inputs and decode generated ids.
        test_df: DataFrame with 'source_text' and 'target_text' columns.

    Returns:
        A ``(rouge_scores, avg_rouge_scores)`` tuple: the per-example results of
        ``compute_rouge`` and a dict with the mean F-measure per metric. The
        dict is empty when ``test_df`` has no rows.
    """
    device = next(model.parameters()).device
    model.eval()
    rouge_scores = []

    with torch.no_grad():
        for _, row in test_df.iterrows():
            # Tokenize input
            input_ids = tokenizer.encode(row['source_text'], return_tensors='pt', max_length=512, truncation=True).to(device)

            # Generate summaries
            generated_ids = model.generate(input_ids=input_ids, max_length=512, num_beams=4, early_stopping=True)
            generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

            # Compute ROUGE scores
            rouge_scores.append(compute_rouge(generated_text, row['target_text']))

    # Guard against an empty test set, where rouge_scores[0] would raise IndexError.
    metrics = rouge_scores[0] if rouge_scores else {}
    avg_rouge_scores = {
        metric: np.mean([score[metric].fmeasure for score in rouge_scores])
        for metric in metrics
    }
    return rouge_scores, avg_rouge_scores


def fine_tune(model, tokenizer, optimizer, train_df, test_df, num_epochs, scheduler=None):
    """Fine-tune for ``num_epochs`` epochs, evaluating with ROUGE after each one.

    Prints the mean training loss and the mean ROUGE F-measures per epoch.

    Returns:
        A list with one dict per epoch containing 'epoch', 'train_loss',
        'rouge' (mean F-measures) and 'rouge_per_example'.
    """
    history = []
    for epoch in range(num_epochs):
        avg_train_loss = train_one_epoch(model, tokenizer, optimizer, train_df, scheduler=scheduler)
        print(f'Epoch {epoch+1}, Training loss: {avg_train_loss}')

        rouge_scores, avg_rouge_scores = evaluate_rouge(model, tokenizer, test_df)
        print(f'Epoch {epoch+1}, ROUGE scores: {avg_rouge_scores}')

        history.append({
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'rouge': avg_rouge_scores,
            'rouge_per_example': rouge_scores,
        })
    return history


# Fine-tuning run 1: constant learning rate
num_epochs = 5  # Increase the number of epochs
history_constant_lr = fine_tune(model, tokenizer, optimizer, train_df, test_df, num_epochs)

# Fine-tuning run 2: fresh optimizer with a linearly decaying learning rate.
# NOTE(review): `model` is not re-initialised here, so this run continues from
# the weights produced by run 1 (the recorded output shows run 2 starting at
# roughly the loss run 1 ended on). Reload the pretrained checkpoint first if
# the two schedules are meant to be compared from the same starting point.
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_df) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
history_linear_decay = fine_tune(model, tokenizer, optimizer, train_df, test_df, num_epochs, scheduler=scheduler)

# Keep the final-epoch results under the names the original loops left behind.
if history_linear_decay:
    avg_train_loss = history_linear_decay[-1]['train_loss']
    avg_rouge_scores = history_linear_decay[-1]['rouge']
    rouge_scores = history_linear_decay[-1]['rouge_per_example']



Collecting setuptools
  Using cached setuptools-71.1.0-py3-none-any.whl.metadata (6.6 kB)
Using cached setuptools-71.1.0-py3-none-any.whl (2.3 MB)
Installing collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 71.0.4
    Uninstalling setuptools-71.0.4:
      Successfully uninstalled setuptools-71.0.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.34.0 requires jedi>=0.16, which is not installed.[0m[31m
[0mSuccessfully installed setuptools-71.1.0


Collecting simplet5
  Downloading simplet5-0.1.4.tar.gz (7.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers==4.16.2 (from simplet5)
  Downloading transformers-4.16.2-py3-none-any.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.8/61.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-lightning==1.5.10 (from simplet5)
  Downloading pytorch_lightning-1.5.10-py3-none-any.whl.metadata (31 kB)
Requested pytorch-lightning==1.5.10 from https://files.pythonhosted.org/packages/18/f1/f59b307f75db1886c96e396eec878501510677394868680b8d2b8b58c47c/pytorch_lightning-1.5.10-py3-none-any.whl (from simplet5) has invalid metadata: .* suffix can only be used with `==` or `!=` operators
    torch (>=1.7.*)
           ~~~~~~^
Please use pip<24.1 if you need to use this version.[0m[33m
[0mINFO: pip is looking at multiple versions of simplet5 to determine which version is compatible with other requirements. This c

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch 1, Training loss: 0.45403863191604615
Epoch 1, ROUGE scores: {'rouge1': 0.4516870596959585, 'rouge2': 0.35231071779744355, 'rougeL': 0.35987490125904786}
Epoch 2, Training loss: 0.20523539334535598
Epoch 2, ROUGE scores: {'rouge1': 0.6532212885154062, 'rouge2': 0.5441277646706701, 'rougeL': 0.5225700982734232}
Epoch 3, Training loss: 0.11978963315486908
Epoch 3, ROUGE scores: {'rouge1': 0.69697933227345, 'rouge2': 0.6281053614819955, 'rougeL': 0.5275201032234281}
Epoch 4, Training loss: 0.12433954365551472
Epoch 4, ROUGE scores: {'rouge1': 0.6736900165471594, 'rouge2': 0.5821003477843737, 'rougeL': 0.44074278359992647}
Epoch 5, Training loss: 0.05973219368606806
Epoch 5, ROUGE scores: {'rouge1': 0.6662100889321537, 'rouge2': 0.5830511222764068, 'rougeL': 0.4577502873951043}
Epoch 1, Training loss: 0.0560492604970932
Epoch 1, ROUGE scores: {'rouge1': 0.717486608169838, 'rouge2': 0.634281245288708, 'rougeL': 0.6125866771829505}
Epoch 2, Training loss: 0.05036195497959852
Epoch 2, R