***Install Required Libraries, Import Libraries, Download NLTK Resources, and Generate Extractive Summaries using the Word Frequency Algorithm (WFA)***

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
!pip install openpyxl
import openpyxl
import nltk
nltk.download('punkt')
# Load the passages from the Excel workbook and show the columns as read
df = pd.read_excel('/content/dataset.xlsx')
print(df.columns)
# Rename 'Text1' to 'source_textwfa' and keep only that column; it is the
# input to the word-frequency (WFA) summarizer further down
df = df.rename(columns={'Text1': 'source_textwfa'})[['source_textwfa']]
print(df.columns)

Index(['Text1'], dtype='object')
Index(['source_textwfa'], dtype='object')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Display the DataFrame, which at this point holds only the 'source_textwfa' column
df

Unnamed: 0,source_textwfa
0,It is important to understand the problem befo...
1,The word candid refers to something spontaneou...
2,Sometimes we find more than one solutions of a...
3,Flowcharts are helpful to know about the steps...
4,An algorithm has a vital role in problem solvi...
5,There can be more than one algorithms to solve...
6,Difference between an algorithm and a flowchar...
7,Examinations and Assessments are undergoing a ...


***Load Pretrained BART Model for Abstractive Summarization, Fine-Tune It on WFA-Generated Summaries and Evaluate Summaries using ROUGE Metrics***

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
import heapq

# Download NLTK resources
# Newer NLTK releases load the 'punkt_tab' tables for word_tokenize/sent_tokenize,
# while older ones load 'punkt'; fetch both so tokenization works on either.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

def word_frequency(text):
    """Count how often each meaningful token occurs in ``text``.

    The text is lower-cased and split with NLTK's ``word_tokenize``. English
    stopwords and tokens that contain no letter or digit (for example ``.`` or
    ``,``) are skipped, so punctuation cannot inflate sentence scores that are
    built from these counts.

    Args:
        text: The passage to analyse, as a string.

    Returns:
        A ``defaultdict(int)`` mapping each remaining token to its count.
    """
    word_freq = defaultdict(int)
    stopwords_set = set(stopwords.words('english'))
    for word in word_tokenize(text.lower()):
        # Ignore stopwords and punctuation-only tokens; keep anything with a letter or digit
        if word in stopwords_set or not any(ch.isalnum() for ch in word):
            continue
        word_freq[word] += 1
    return word_freq

def summarize_text(text, num_sentences=2):
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    Each sentence is scored by summing the frequencies (from ``word_frequency``)
    of the tokens it contains. The ``num_sentences`` best-scoring sentences are
    selected and returned in the order in which they appear in ``text``.

    Args:
        text: The passage to summarize. Anything that is not a string (for
            example the NaN pandas produces for a blank cell) yields ''.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        The selected sentences joined by single spaces, or '' when there is
        nothing to summarize.
    """
    # Blank spreadsheet cells arrive as NaN (a float) and have no .lower()
    if not isinstance(text, str) or not text.strip():
        return ''

    sentence_scores = defaultdict(int)
    first_position = {}
    word_freq = word_frequency(text)

    for position, sentence in enumerate(sent_tokenize(text)):
        # Remember where each distinct sentence first occurs so order can be restored later
        first_position.setdefault(sentence, position)
        for word in word_tokenize(sentence.lower()):
            if word in word_freq:
                sentence_scores[sentence] += word_freq[word]

    summary_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    # nlargest returns sentences ranked by score; put them back in document order
    summary_sentences.sort(key=first_position.get)
    return ' '.join(summary_sentences)

# The passages to summarize live in the 'source_textwfa' column
text_column = df['source_textwfa']

# Build one extractive summary per passage
summaries = [summarize_text(text) for text in text_column]

# Store each summary next to its source passage
df['summary'] = summaries

# Write the passages and their summaries to a new Excel file, without the index
df.to_excel('summarized_data.xlsx', index=False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
!pip install rouge_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BartForConditionalGeneration, BartTokenizer, AdamW
from rouge_score import rouge_scorer

# Load the data
# Read the same relative path the summarization step wrote to, so this does not
# depend on the working directory being /content
df = pd.read_excel('summarized_data.xlsx')

# Rename columns
df = df.rename(columns={'summary': 'target_text', 'source_textwfa': 'source_text'})

# Blank cells are read back as NaN and cannot be tokenized, so drop incomplete rows
df = df.dropna(subset=['source_text', 'target_text'])

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Prepend "summarize: " to source_text
# assign() builds new frames instead of writing into the frames returned by the
# split, which avoids pandas' SettingWithCopyWarning.
# NOTE(review): a task prefix like this is a T5 convention; confirm it is wanted for BART.
train_df = train_df.assign(source_text="summarize: " + train_df['source_text'])
test_df = test_df.assign(source_text="summarize: " + test_df['source_text'])

# Initialize the BART model and tokenizer
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define a function to compute ROUGE scores
def compute_rouge(predictions, targets):
    """Score one generated summary against its reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        predictions: The generated (candidate) summary text.
        targets: The reference summary text.

    Returns:
        The mapping returned by ``RougeScorer.score``, keyed by 'rouge1',
        'rouge2' and 'rougeL'; each value carries precision, recall and fmeasure.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    # RougeScorer.score takes (target, prediction); the reverse order swaps
    # precision and recall (fmeasure is unaffected because it is symmetric).
    return scorer.score(targets, predictions)

# Fine-tune for a fixed number of epochs and report ROUGE on the test rows after each one
for epoch in range(3):  # Number of epochs
    # Training pass: one forward/backward/optimizer step per training row
    model.train()
    for input_text, target_text in zip(train_df['source_text'], train_df['target_text']):
        # Encode the source and the reference summary, each truncated to 512 tokens
        input_ids = tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)['input_ids']
        labels = tokenizer(target_text, return_tensors='pt', max_length=512, truncation=True)['input_ids']

        # Passing labels makes the model return the loss directly
        loss = model(input_ids=input_ids, labels=labels).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Evaluation pass: generate a summary for every test row and score it
    model.eval()
    rouge_scores = []
    for input_text, target_text in zip(test_df['source_text'], test_df['target_text']):
        input_ids = tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)['input_ids']

        # Beam search with 4 beams, stopping early once the beams are finished
        generated_ids = model.generate(input_ids=input_ids, max_length=512, num_beams=4, early_stopping=True)
        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        rouge_scores.append(compute_rouge(generated_text, target_text))

    # Average the F-measure of each ROUGE metric over the test rows
    avg_rouge_scores = {}
    for metric in rouge_scores[0]:
        avg_rouge_scores[metric] = np.mean([score[metric].fmeasure for score in rouge_scores])
    print(f'Epoch {epoch+1}, ROUGE scores: {avg_rouge_scores}')


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=fe062e19f91d020d99b4b1eb0dda49c6f76e6c24e0f17f4d6be44c2427d17e20
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Epoch 1, ROUGE scores: {'rouge1': 0.6974132863021753, 'rouge2': 0.6112686478806535, 'rougeL': 0.44409171075837744}
Epoch 2, ROUGE scores: {'rouge1': 0.7321114536941876, 'rouge2': 0.6372114702650817, 'rougeL': 0.42483727303871194}
