In [None]:
# Use the %pip magic so packages land in the running kernel's environment.
# datasets, rouge-score and nltk are imported in the next cell, so install them too.
%pip install -qq spacy pytextrank contractions datasets rouge-score nltk
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
from datasets import load_dataset
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import spacy
import pytextrank
import re
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import contractions
import nltk

# Fetch the NLTK corpora required for stopword filtering and lemmatization
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Build the spaCy pipeline and attach the pyTextRank component to it
print("Loading spaCy and pyTextRank model...")
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")
# Raise spaCy's input-length cap so larger texts are accepted
nlp.max_length = 1500000

# Shared lemmatizer and English stopword set used during preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Preprocessing function with text cleaning and normalization
def preprocess_text(text, max_length=1000000):
    """Normalizes raw text into a cleaned, lemmatized, stopword-free string.

    Steps: lowercase, expand contractions, drop every character that is not
    an ASCII letter or whitespace, collapse whitespace, remove stopwords,
    lemmatize the remaining words, and finally cut the result to at most
    ``max_length`` characters. ``None`` input yields an empty string.
    """
    # Missing text is represented as an empty string
    if text is None:
        return ""

    expanded = contractions.fix(text.lower())
    letters_only = re.sub(r'[^a-zA-Z\s]', '', expanded)
    collapsed = re.sub(r'\s+', ' ', letters_only).strip()

    kept_tokens = []
    for token in collapsed.split():
        if token in stop_words:
            continue  # stopwords are dropped before lemmatization
        kept_tokens.append(lemmatizer.lemmatize(token))

    # Character-level truncation of the joined result
    return ' '.join(kept_tokens)[:max_length]

# Function to remove rows with missing data in specified columns
def remove_missing_data(dataset, columns):
    """Keeps only the rows whose value is not None in every listed column.

    The filtering is delegated to ``dataset.filter``; its return value is
    passed back unchanged.
    """
    def _has_all_values(row):
        # Reject the row as soon as one required column is None
        for column_name in columns:
            if row[column_name] is None:
                return False
        return True

    return dataset.filter(_has_all_values)

# Loading and cleaning dataset
def load_and_clean_data():
    """Fetches the validation and test splits and drops incomplete rows.

    Returns a ``(validation_data, test_data)`` pair in which every row has a
    non-None value for both 'Annual Reports' and 'Gold Summaries'.
    """
    dataset_id = "ragha92/FNS_Summarization"
    required_columns = ['Annual Reports', 'Gold Summaries']

    # Load both splits first, then filter each one
    raw_splits = [load_dataset(dataset_id, split=split_name)
                  for split_name in ("validation", "test")]
    validation_data, test_data = [remove_missing_data(raw_split, required_columns)
                                  for raw_split in raw_splits]

    return validation_data, test_data

# Summarizing text using TextRank
def summarize_text_textrank(article, top_n=5):
    """Builds a TextRank summary of ``article``.

    The article is run through the module-level ``nlp`` pipeline, up to
    ``top_n`` sentences are requested from the textrank component, and they
    are joined with single spaces into one string.
    """
    parsed = nlp(article)
    ranked_sentences = parsed._.textrank.summary(limit_sentences=top_n)
    return ' '.join(map(str, ranked_sentences))

# Evaluation function using ROUGE and BLEU
def evaluate_model_textrank(dataset, set_name):
    """Evaluates TextRank summaries on a dataset using ROUGE and BLEU metrics.

    For each item, the 'Annual Reports' text is preprocessed and summarized
    with TextRank, and the summary is scored against the preprocessed
    'Gold Summaries' text. Items whose article is empty after preprocessing
    are skipped and do not contribute to the averages.

    Args:
        dataset: Iterable of mappings with 'Annual Reports' and
            'Gold Summaries' keys.
        set_name: Label used in progress messages (e.g. "validation").

    Returns:
        Tuple ``(avg_rouge1, avg_rouge2, avg_rougeL, avg_bleu)``: the mean
        ROUGE F-measures and mean smoothed BLEU over the scored items. All
        four values are 0.0 when no item could be scored.
    """
    print(f"Evaluating {set_name} dataset using TextRank...")
    rouge_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_names, use_stemmer=True)
    scores = {name: [] for name in rouge_names + ['bleu']}
    smooth = SmoothingFunction().method4

    for item in tqdm(dataset):
        # NOTE(review): preprocess_text removes punctuation and stopwords before
        # the article reaches the TextRank pipeline, so the "sentences" being
        # ranked come from unpunctuated text -- confirm this is intended.
        article = preprocess_text(item['Annual Reports'])
        gold_summary = preprocess_text(item['Gold Summaries'])

        if not article:
            continue  # nothing to summarize for this item

        generated_summary = summarize_text_textrank(article)

        # ROUGE: the gold summary is the target, the generated one the prediction
        rouge = scorer.score(gold_summary, generated_summary)
        for name in rouge_names:
            scores[name].append(rouge[name].fmeasure)

        # BLEU score with smoothing
        reference = gold_summary.split()
        candidate = generated_summary.split()
        scores['bleu'].append(
            sentence_bleu([reference], candidate, smoothing_function=smooth)
        )

    # All four lists grow together, so checking one is enough. Without this
    # guard an empty dataset (or one with only empty articles) would raise
    # ZeroDivisionError when averaging.
    if not scores['bleu']:
        print(f"Warning: no items could be scored for the {set_name} dataset; returning 0.0 for all metrics.")
        return 0.0, 0.0, 0.0, 0.0

    def _mean(values):
        """Arithmetic mean of a non-empty list of numbers."""
        return sum(values) / len(values)

    # Calculating average scores
    avg_rouge1 = _mean(scores['rouge1'])
    avg_rouge2 = _mean(scores['rouge2'])
    avg_rougeL = _mean(scores['rougeL'])
    avg_bleu = _mean(scores['bleu'])

    return avg_rouge1, avg_rouge2, avg_rougeL, avg_bleu

# Main function to run the evaluation
def main():
    """Runs the TextRank evaluation on the validation and test splits.

    Loads the cleaned splits, evaluates each one, and prints the averaged
    ROUGE-1, ROUGE-2, ROUGE-L and BLEU scores.
    """
    print("Loading and cleaning datasets...")
    validation_data, test_data = load_and_clean_data()

    # Validation scores are reported on a single line
    print("Evaluating on the validation set...")
    val_r1, val_r2, val_rl, val_bleu = evaluate_model_textrank(validation_data, "validation")
    print(f"Validation ROUGE-1: {val_r1}, ROUGE-2: {val_r2}, ROUGE-L: {val_rl}, BLEU: {val_bleu}")

    # Test scores are reported one metric per line
    print("Evaluating on the test dataset...")
    test_r1, test_r2, test_rl, test_bleu = evaluate_model_textrank(test_data, "test")
    print(
        f"Test Set ROUGE-1: {test_r1}",
        f"Test Set ROUGE-2: {test_r2}",
        f"Test Set ROUGE-L: {test_rl}",
        f"Test Set BLEU: {test_bleu}",
        sep="\n",
    )


# Run the evaluation only when executed as the top-level script/cell
if __name__ == "__main__":
    main()


/usr/local/lib/python3.10/dist-packages
Loading spaCy and pyTextRank model...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading and cleaning datasets...
Evaluating on the validation set...
Evaluating validation dataset using TextRank...


100%|██████████| 257/257 [43:26<00:00, 10.14s/it]


Validation ROUGE-1: 0.09427128906111303, ROUGE-2: 0.06314497963095687, ROUGE-L: 0.0646837550850029, BLEU: 0.034209738463307784
Evaluating on the test dataset...
Evaluating test dataset using TextRank...


100%|██████████| 259/259 [41:04<00:00,  9.52s/it]

Test Set ROUGE-1: 0.09632594059871431
Test Set ROUGE-2: 0.06739978070004861
Test Set ROUGE-L: 0.0701921449531556
Test Set BLEU: 0.03755184845554105



