In [17]:
!pip install rouge_score



In [18]:
!pip install keybert 



## Utilizing Newsroom ( similar to cnn but not clean pay attention ) 

In [15]:
import json
import pandas as pd 
# Path to the Newsroom dataset file
dataset_path = "/kaggle/input/news-summarization/data.csv"

# Load the dataset
df = pd.read_csv(dataset_path)

# Display the first few rows
df.head()
df[df['Dataset']=='Multi-News'].sample(100)

Unnamed: 0.1,Unnamed: 0,ID,Content,Summary,Dataset
447992,447992,,"Negotiations with nurse Kaci Hickox, who refus...",– Maine is seeking a court order to force nurs...,Multi-News
48400,48400,,50 Cent has angered the internet after a video...,– 50 Cent is facing outrage and possible legal...,Multi-News
521616,521616,,A new North Korean propaganda video shows the ...,– Apparently New York in flames was not enough...,Multi-News
84101,84101,,Google’s robot just got its driver’s license. ...,"– Up until recently, Google's self-driving car...",Multi-News
578653,578653,,Tour de France: Mark Cavendish has urine throw...,– British bicyclist Mark Cavendish suffered so...,Multi-News
...,...,...,...,...,...
225265,225265,,When an elite crime squad's lead detective (Mi...,"– The Snowman, a film about a detective on the...",Multi-News
141702,141702,,Rep. Michele Bachmann has been propelled into ...,– A clinic run by Michele Bachmann's therapist...,Multi-News
798856,798856,,"JUPITER, Fla. - UPDATE: The Coast Guard held a...",– The boat used by two boys who vanished off t...,Multi-News
378586,378586,,The creators of a dogfighting phone applicatio...,"– Michael Vick is growling about the new ""Dog ...",Multi-News


In [16]:
dataset = df[df['Dataset']=='Multi-News'].sample(1000)
dataset.to_csv("summarization_dataset.csv")

## Standard fine-tuned model testing ( tested on standard newsroom ) 

In [19]:
import os
import torch
import pandas as pd
import numpy as np
import gc
from typing import List, Dict
from tqdm import tqdm
from rouge_score import rouge_scorer

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

class SummarizationPipeline:
    def __init__(self, 
                 summarization_model_path="sshleifer/distilbart-cnn-12-6",
                 device=None):
        self.device = device or torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        self.tokenizer = AutoTokenizer.from_pretrained(summarization_model_path)
        self.summarization_model = AutoModelForSeq2SeqLM.from_pretrained(summarization_model_path)
        self.summarization_model.to(self.device)

        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def generate_summary(self, text: str, max_length: int = 150) -> str:
        inputs = self.tokenizer(
            text, 
            max_length=512, 
            truncation=True, 
            return_tensors="pt"
        ).to(self.device)

        summary_ids = self.summarization_model.generate(
            inputs.input_ids, 
            max_length=max_length, 
            num_beams=4, 
            early_stopping=True
        )

        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    def process_dataset(self, 
                        file_path: str, 
                        output_path: str, 
                        batch_size: int = 32,
                        summary_max_length: int = 512):
        df = pd.read_csv(file_path)
        texts = df["Content"].tolist()
        reference_summaries = df["Summary"].tolist()

        results = []
        rouge_scores = {
            'rouge1_precision': [],
            'rouge1_recall': [],
            'rouge1_fmeasure': [],
            'rouge2_precision': [],
            'rouge2_recall': [],
            'rouge2_fmeasure': [],
            'rougeL_precision': [],
            'rougeL_recall': [],
            'rougeL_fmeasure': []
        }

        for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
            batch_texts = texts[i:i + batch_size]
            batch_references = reference_summaries[i:i + batch_size]
            
            for text, ref_summary in zip(batch_texts, batch_references):
                try:
                    generated_summary = self.generate_summary(text, summary_max_length)
                    
                    # Calculate and store ROUGE scores
                    score = self.rouge_scorer.score(ref_summary, generated_summary)
                    
                    rouge_dict = {
                        'rouge1_precision': score['rouge1'].precision,
                        'rouge1_recall': score['rouge1'].recall,
                        'rouge1_fmeasure': score['rouge1'].fmeasure,
                        'rouge2_precision': score['rouge2'].precision,
                        'rouge2_recall': score['rouge2'].recall,
                        'rouge2_fmeasure': score['rouge2'].fmeasure,
                        'rougeL_precision': score['rougeL'].precision,
                        'rougeL_recall': score['rougeL'].recall,
                        'rougeL_fmeasure': score['rougeL'].fmeasure
                    }
                    
                    results.append({
                        "original_text": text,
                        "generated_summary": generated_summary,
                        "reference_summary": ref_summary,
                        **rouge_dict
                    })

                    # Accumulate scores for total metrics
                    for metric, value in rouge_dict.items():
                        rouge_scores[metric].append(value)

                except Exception as e:
                    print(f"Error processing text: {e}")

            # Memory management
            if self.device.type == 'cuda':
                torch.cuda.empty_cache()
                gc.collect()

        # Calculate total ROUGE metrics
        total_metrics = {
            metric: np.mean(scores) for metric, scores in rouge_scores.items()
        }

        # Save results to CSV
        results_df = pd.DataFrame(results)
        results_df.to_csv(output_path, index=False)
        
        # Save total metrics to a separate file
        with open(output_path.replace('.csv', '_metrics.txt'), 'w') as f:
            f.write("Total ROUGE Metrics:\n")
            for metric, value in total_metrics.items():
                f.write(f"{metric}: {value:.4f}\n")

        print("Total ROUGE Metrics:")
        for metric, value in total_metrics.items():
            print(f"{metric}: {value:.4f}")

        print(f"Processed {len(results)} documents. Results saved to {output_path}")

def main():
    pipeline = SummarizationPipeline()
    pipeline.process_dataset(
        file_path="summarization_dataset.csv",
        output_path="summarization_results_no_keywords.csv",
        batch_size=32,
        summary_max_length= 1024
    )

if __name__ == '__main__':
    main()


Using device: cuda:0


Processing batches: 100%|██████████| 32/32 [11:36<00:00, 21.75s/it]


Total ROUGE Metrics:
rouge1_precision: 0.6081
rouge1_recall: 0.1735
rouge1_fmeasure: 0.2629
rouge2_precision: 0.2070
rouge2_recall: 0.0581
rouge2_fmeasure: 0.0885
rougeL_precision: 0.3612
rougeL_recall: 0.1025
rougeL_fmeasure: 0.1555
Processed 1000 documents. Results saved to summarization_results_no_keywords.csv


## Top keyword fine-tuned model testing ( tested on enriched newsroom ) 

In [20]:
import os
import torch
import pandas as pd
import numpy as np
import gc
from typing import List, Dict
from tqdm import tqdm
from rouge_score import rouge_scorer

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

class SummarizationPipeline:
    def __init__(self, 
                 keyword_model_path="all-MiniLM-L6-v2", 
                 summarization_model_path="VexPoli/distilbart-summarization-top-o1",
                 device=None):
        self.device = device or torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        self.keyword_model = SentenceTransformer(keyword_model_path)
        if self.device.type == 'cuda':
            self.keyword_model.half()
        self.keyword_model.to(self.device)
        self.kw_model = KeyBERT(model=self.keyword_model)

        self.tokenizer = AutoTokenizer.from_pretrained(summarization_model_path)
        self.summarization_model = AutoModelForSeq2SeqLM.from_pretrained(summarization_model_path)
        self.summarization_model.to(self.device)

        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def extract_keywords(self, text: str, top_n: int = 10) -> str:
        keywords = self.kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 2),
            stop_words='english',
            top_n=top_n,
            use_maxsum=False,
            use_mmr=True,
            diversity=0.5
        )
        return ", ".join([f"<keyword>{kw.upper()}</keyword>" for kw, score in keywords])

    def enrich_text(self, text: str, top_n_keywords: int = 10) -> str:
        formatted_keywords = self.extract_keywords(text, top_n_keywords)
        return f"Keywords: {formatted_keywords}\n\n{text}"

    def generate_summary(self, enriched_text: str, max_length: int = 150) -> str:
        inputs = self.tokenizer(
            enriched_text, 
            max_length=512, 
            truncation=True, 
            return_tensors="pt"
        ).to(self.device)

        summary_ids = self.summarization_model.generate(
            inputs.input_ids, 
            max_length=max_length, 
            num_beams=4, 
            early_stopping=True
        )

        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    def process_dataset(self, 
                        file_path: str, 
                        output_path: str, 
                        batch_size: int = 32, 
                        top_n_keywords: int = 10,
                        summary_max_length: int = 512):
        df = pd.read_csv(file_path)
        texts = df["Content"].tolist()
        reference_summaries = df["Summary"].tolist()

        results = []
        rouge_scores = {
            'rouge1_precision': [],
            'rouge1_recall': [],
            'rouge1_fmeasure': [],
            'rouge2_precision': [],
            'rouge2_recall': [],
            'rouge2_fmeasure': [],
            'rougeL_precision': [],
            'rougeL_recall': [],
            'rougeL_fmeasure': []
        }

        for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
            batch_texts = texts[i:i + batch_size]
            batch_references = reference_summaries[i:i + batch_size]
            
            for text, ref_summary in zip(batch_texts, batch_references):
                try:
                    enriched_text = self.enrich_text(text, top_n_keywords)
                    generated_summary = self.generate_summary(enriched_text, summary_max_length)
                    
                    # Calculate and store ROUGE scores
                    score = self.rouge_scorer.score(ref_summary, generated_summary)
                    
                    rouge_dict = {
                        'rouge1_precision': score['rouge1'].precision,
                        'rouge1_recall': score['rouge1'].recall,
                        'rouge1_fmeasure': score['rouge1'].fmeasure,
                        'rouge2_precision': score['rouge2'].precision,
                        'rouge2_recall': score['rouge2'].recall,
                        'rouge2_fmeasure': score['rouge2'].fmeasure,
                        'rougeL_precision': score['rougeL'].precision,
                        'rougeL_recall': score['rougeL'].recall,
                        'rougeL_fmeasure': score['rougeL'].fmeasure
                    }
                    
                    results.append({
                        "original_text": text,
                        "enriched_text": enriched_text,
                        "generated_summary": generated_summary,
                        "reference_summary": ref_summary,
                        **rouge_dict
                    })

                    # Accumulate scores for total metrics
                    for metric, value in rouge_dict.items():
                        rouge_scores[metric].append(value)

                except Exception as e:
                    print(f"Error processing text: {e}")

            # Memory management
            if self.device.type == 'cuda':
                torch.cuda.empty_cache()
                gc.collect()

        # Calculate total ROUGE metrics
        total_metrics = {
            metric: np.mean(scores) for metric, scores in rouge_scores.items()
        }

        # Save results to CSV
        results_df = pd.DataFrame(results)
        results_df.to_csv(output_path, index=False)
        
        # Save total metrics to a separate file
        with open(output_path.replace('.csv', '_metrics.txt'), 'w') as f:
            f.write("Total ROUGE Metrics:\n")
            for metric, value in total_metrics.items():
                f.write(f"{metric}: {value:.4f}\n")

        print("Total ROUGE Metrics:")
        for metric, value in total_metrics.items():
            print(f"{metric}: {value:.4f}")

        print(f"Processed {len(results)} documents. Results saved to {output_path}")

def main():
    pipeline = SummarizationPipeline()
    pipeline.process_dataset(
        file_path="summarization_dataset.csv",
        output_path="summarization_results.csv",
        batch_size=32,
        top_n_keywords=10,
        summary_max_length=1024
    )

if __name__ == '__main__':
    main()

Using device: cuda:0


Processing batches: 100%|██████████| 32/32 [14:15<00:00, 26.73s/it]


Total ROUGE Metrics:
rouge1_precision: 0.6230
rouge1_recall: 0.1245
rouge1_fmeasure: 0.2017
rouge2_precision: 0.2173
rouge2_recall: 0.0423
rouge2_fmeasure: 0.0688
rougeL_precision: 0.3993
rougeL_recall: 0.0786
rougeL_fmeasure: 0.1276
Processed 1000 documents. Results saved to summarization_results.csv


## Down keywords fine-tuned model  testing  ( tested on enriched newsroom ) 

In [21]:
import os
import torch
import pandas as pd
import numpy as np
import gc
from typing import List, Dict
from tqdm import tqdm
from rouge_score import rouge_scorer

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

class SummarizationPipeline:
    def __init__(self, 
                 keyword_model_path="all-MiniLM-L6-v2", 
                 summarization_model_path="VexPoli/distilbart-summarization-down-o1",
                 device=None):
        self.device = device or torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        self.keyword_model = SentenceTransformer(keyword_model_path)
        if self.device.type == 'cuda':
            self.keyword_model.half()
        self.keyword_model.to(self.device)
        self.kw_model = KeyBERT(model=self.keyword_model)

        self.tokenizer = AutoTokenizer.from_pretrained(summarization_model_path)
        self.summarization_model = AutoModelForSeq2SeqLM.from_pretrained(summarization_model_path)
        self.summarization_model.to(self.device)

        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def extract_keywords(self, text: str, top_n: int = 10) -> str:
        keywords = self.kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 2),
            stop_words='english',
            top_n=top_n,
            use_maxsum=False,
            use_mmr=True,
            diversity=0.5
        )
        return ", ".join([f"<keyword>{kw.upper()}</keyword>" for kw, score in keywords])

    def enrich_text(self, text: str, top_n_keywords: int = 10) -> str:
        formatted_keywords = self.extract_keywords(text, top_n_keywords)
        return f"Keywords: {text}\n\n{formatted_keywords}"

    def generate_summary(self, enriched_text: str, max_length: int = 150) -> str:
        inputs = self.tokenizer(
            enriched_text, 
            max_length=512, 
            truncation=True, 
            return_tensors="pt"
        ).to(self.device)

        summary_ids = self.summarization_model.generate(
            inputs.input_ids, 
            max_length=max_length, 
            num_beams=4, 
            early_stopping=True
        )

        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    def process_dataset(self, 
                        file_path: str, 
                        output_path: str, 
                        batch_size: int = 32, 
                        top_n_keywords: int = 10,
                        summary_max_length: int = 512):
        df = pd.read_csv(file_path)
        texts = df["Content"].tolist()
        reference_summaries = df["Summary"].tolist()

        results = []
        rouge_scores = {
            'rouge1_precision': [],
            'rouge1_recall': [],
            'rouge1_fmeasure': [],
            'rouge2_precision': [],
            'rouge2_recall': [],
            'rouge2_fmeasure': [],
            'rougeL_precision': [],
            'rougeL_recall': [],
            'rougeL_fmeasure': []
        }

        for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
            batch_texts = texts[i:i + batch_size]
            batch_references = reference_summaries[i:i + batch_size]
            
            for text, ref_summary in zip(batch_texts, batch_references):
                try:
                    enriched_text = self.enrich_text(text, top_n_keywords)
                    generated_summary = self.generate_summary(enriched_text, summary_max_length)
                    
                    # Calculate and store ROUGE scores
                    score = self.rouge_scorer.score(ref_summary, generated_summary)
                    
                    rouge_dict = {
                        'rouge1_precision': score['rouge1'].precision,
                        'rouge1_recall': score['rouge1'].recall,
                        'rouge1_fmeasure': score['rouge1'].fmeasure,
                        'rouge2_precision': score['rouge2'].precision,
                        'rouge2_recall': score['rouge2'].recall,
                        'rouge2_fmeasure': score['rouge2'].fmeasure,
                        'rougeL_precision': score['rougeL'].precision,
                        'rougeL_recall': score['rougeL'].recall,
                        'rougeL_fmeasure': score['rougeL'].fmeasure
                    }
                    
                    results.append({
                        "original_text": text,
                        "enriched_text": enriched_text,
                        "generated_summary": generated_summary,
                        "reference_summary": ref_summary,
                        **rouge_dict
                    })

                    # Accumulate scores for total metrics
                    for metric, value in rouge_dict.items():
                        rouge_scores[metric].append(value)

                except Exception as e:
                    print(f"Error processing text: {e}")

            # Memory management
            if self.device.type == 'cuda':
                torch.cuda.empty_cache()
                gc.collect()

        # Calculate total ROUGE metrics
        total_metrics = {
            metric: np.mean(scores) for metric, scores in rouge_scores.items()
        }

        # Save results to CSV
        results_df = pd.DataFrame(results)
        results_df.to_csv(output_path, index=False)
        
        # Save total metrics to a separate file
        with open(output_path.replace('.csv', '_metrics.txt'), 'w') as f:
            f.write("Total ROUGE Metrics:\n")
            for metric, value in total_metrics.items():
                f.write(f"{metric}: {value:.4f}\n")

        print("Total ROUGE Metrics:")
        for metric, value in total_metrics.items():
            print(f"{metric}: {value:.4f}")

        print(f"Processed {len(results)} documents. Results saved to {output_path}")

def main():
    pipeline = SummarizationPipeline()
    pipeline.process_dataset(
        file_path="summarization_dataset.csv",
        output_path="summarization_results.csv",
        batch_size=32,
        top_n_keywords=10,
        summary_max_length=1024
    )

if __name__ == '__main__':
    main()

Using device: cuda:0


Processing batches: 100%|██████████| 32/32 [14:34<00:00, 27.33s/it]


Total ROUGE Metrics:
rouge1_precision: 0.6227
rouge1_recall: 0.1281
rouge1_fmeasure: 0.2073
rouge2_precision: 0.2172
rouge2_recall: 0.0437
rouge2_fmeasure: 0.0710
rougeL_precision: 0.3967
rougeL_recall: 0.0808
rougeL_fmeasure: 0.1310
Processed 1000 documents. Results saved to summarization_results.csv
