In [None]:
!pip install keybert

## Dataset building 

The dataset is a random sub-sample of `train.csv` from the Kaggle dataset `newspaper-text-summarization-cnn-dailymail`.
It contains 50,000 samples (about 200 MB of records).

In [None]:
import pandas as pd

# Source CSV: the train split shipped with the Kaggle CNN/DailyMail input dataset.
file_path = "/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv"
ds = pd.read_csv(file_path)

# Fixed random_state so the same rows are drawn on every run.
sample_size = 50000
sampled_ds = ds.sample(n=sample_size, random_state=42)
print(sampled_ds.head())

# index=False: without it the sampled row labels are written as an unnamed
# first column, which comes back as "Unnamed: 0" when the file is re-read.
output_path = "summarization_dataset.csv"
sampled_ds.to_csv(output_path, index=False)

## Enriching the dataset with keywords at the end of the text

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm
import gc
from typing import List, Dict
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
# Status output, emitted once the imports above have completed.
print("Checking and importing dependencies...")
print(f"PyTorch version: {torch.__version__}")

def process_texts(articles: list, summaries: list, batch_size: int, top_n_keywords: int, model_path: str) -> List[Dict]:
    """
    Extract keywords from each article with KeyBERT and append them to the text.

    Each article is passed to ``KeyBERT.extract_keywords`` (keyphrase n-gram
    range (1, 2), English stop words, MMR with diversity 0.5). The returned
    keywords are upper-cased, wrapped in ``<keyword>...</keyword>`` tags and
    added after the article as a ``Keywords: ...`` line.

    Args:
        articles: Article texts.
        summaries: Summaries; ``summaries[k]`` is paired with ``articles[k]``.
        batch_size: Number of articles per progress-bar step. The CUDA cache
            is released after every fourth batch when running on a GPU.
        top_n_keywords: Number of keywords requested per article.
        model_path: Name or path handed to ``SentenceTransformer``.

    Returns:
        One ``{"text": ..., "highlights": ...}`` dict per input article, in
        input order. When keyword extraction fails for an article, its
        ``"text"`` is the unmodified article.

    Raises:
        ValueError: If ``articles`` and ``summaries`` differ in length, or
            ``batch_size`` is not positive.
        Exception: Any error raised while loading the models is printed and
            re-raised.
    """
    if len(articles) != len(summaries):
        # zip() would silently drop the unpaired tail otherwise.
        raise ValueError(
            f"articles and summaries must have the same length "
            f"(got {len(articles)} and {len(summaries)})"
        )
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if len(articles) == 0:
        # Nothing to process: skip loading the embedding model entirely.
        return []

    # Enable performance optimizations
    torch.backends.cudnn.benchmark = True
    if hasattr(torch.backends.cuda, 'matmul'):
        torch.backends.cuda.matmul.allow_tf32 = True

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Initialize model with optimizations
    try:
        model = SentenceTransformer(model_path)
        if device.type == 'cuda':
            model.half()  # Use FP16 for faster inference
        model.to(device)
        kw_model = KeyBERT(model=model)
    except Exception as e:
        print(f"Error initializing models: {e}")
        raise

    results = []

    # Process in batches
    for i in tqdm(range(0, len(articles), batch_size), desc="Processing batches"):
        batch_articles = articles[i:i + batch_size]
        batch_summaries = summaries[i:i + batch_size]

        for idx, (article, summary) in enumerate(zip(batch_articles, batch_summaries)):
            # Start from the raw article and only swap in the augmented text
            # on success; the record is appended exactly once either way.
            text = article
            try:
                keywords = kw_model.extract_keywords(
                    article,
                    keyphrase_ngram_range=(1, 2),
                    stop_words='english',
                    top_n=top_n_keywords,
                    use_maxsum=False,
                    use_mmr=True,
                    diversity=0.5
                )

                # Format keywords as special tokens
                formatted_keywords = ", ".join(
                    f"<keyword>{kw.upper()}</keyword>" for kw, _score in keywords
                )

                # Append the keywords to the article text
                text = f"{article}\n\nKeywords: {formatted_keywords}"
            except Exception as e:
                print(f"Error processing article {i+idx}: {e}")

            # Append result (no 'id' field included)
            results.append({"text": text, "highlights": summary})

        if i % (batch_size * 4) == 0 and i > 0 and device.type == 'cuda':
            # A cleanup failure must not touch `results`: every record of this
            # batch has already been appended above.
            try:
                torch.cuda.empty_cache()
                gc.collect()
            except Exception as e:
                print(f"Error releasing GPU memory after batch starting at index {i}: {e}")

    return results



def process_dataset(file_path: str, batch_size: int, top_n_keywords: int, model_path: str) -> List[Dict]:
    """
    Read the CSV at ``file_path`` and run ``process_texts`` on its rows.

    The ``article`` and ``highlights`` columns are converted to Python lists
    and forwarded together with the remaining arguments, unchanged.

    Returns:
        Whatever ``process_texts`` returns for those inputs.
    """
    print("Loading dataset...")
    frame = pd.read_csv(file_path)

    article_col, summary_col = (
        frame[column].tolist() for column in ("article", "highlights")
    )
    return process_texts(article_col, summary_col, batch_size, top_n_keywords, model_path)



def save_to_csv(processed_data: List[Dict], output_path: str, chunk_size: int = 5000):
    """
    Write the processed records to a CSV file in fixed-size chunks.

    The first chunk creates (or overwrites) the file and writes the header
    row; every later chunk is appended without a header.

    Args:
        processed_data: Records to write; each dict becomes one row.
        output_path: Destination CSV path.
        chunk_size: Maximum number of records converted to a DataFrame and
            written per step. Defaults to 5000.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    if not processed_data:
        # No chunk would be written, so do not report the file as saved.
        print(f"No processed data to save; {output_path} was not written")
        return

    for start in range(0, len(processed_data), chunk_size):
        is_first = start == 0
        chunk = processed_data[start:start + chunk_size]

        pd.DataFrame(chunk).to_csv(
            output_path,
            index=False,
            mode='w' if is_first else 'a',
            header=is_first
        )

        # Drop the chunk reference and collect before building the next one.
        del chunk
        gc.collect()

    print(f"Saved processed data to {output_path}")


def main():
    """
    Run the keyword-enrichment pipeline end to end.

    Prints CUDA/GPU information, processes ``summarization_dataset.csv`` with
    ``process_dataset`` and writes the result to
    ``processed_dataset_bottom.csv`` via ``save_to_csv``.
    """
    print("\nCUDA Information:")
    cuda_ok = torch.cuda.is_available()
    print(f"CUDA available: {cuda_ok}")
    if cuda_ok:
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        total_bytes = torch.cuda.get_device_properties(0).total_memory
        print(f"GPU Memory: {total_bytes / 1024**3:.2f} GB")

    enriched = process_dataset(
        file_path="summarization_dataset.csv",
        batch_size=64,
        top_n_keywords=10,
        model_path="all-MiniLM-L6-v2",
    )

    # Persist the enriched records
    save_to_csv(enriched, "processed_dataset_bottom.csv")

    print(f"\nProcessing completed. Processed {len(enriched)} documents")


# Run the pipeline when this cell/script is executed directly (not when imported).
if __name__ == '__main__':
    main()

## Enriching the dataset with keywords at the beginning of the text

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm
import gc
from typing import List, Dict
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

def process_texts(articles: list, summaries: list, batch_size: int, top_n_keywords: int, model_path: str) -> List[Dict]:
    """
    Extract keywords from each article with KeyBERT and prepend them to the text.

    Each article is passed to ``KeyBERT.extract_keywords`` (keyphrase n-gram
    range (1, 2), English stop words, MMR with diversity 0.5). The returned
    keywords are upper-cased, wrapped in ``<keyword>...</keyword>`` tags and
    placed before the article as a ``Keywords: ...`` line.

    Args:
        articles: Article texts.
        summaries: Summaries; ``summaries[k]`` is paired with ``articles[k]``.
        batch_size: Number of articles per progress-bar step. The CUDA cache
            is released after every fourth batch when running on a GPU.
        top_n_keywords: Number of keywords requested per article.
        model_path: Name or path handed to ``SentenceTransformer``.

    Returns:
        One ``{"text": ..., "highlights": ...}`` dict per input article, in
        input order. When keyword extraction fails for an article, its
        ``"text"`` is the unmodified article.

    Raises:
        ValueError: If ``articles`` and ``summaries`` differ in length, or
            ``batch_size`` is not positive.
        Exception: Any error raised while loading the models is printed and
            re-raised.
    """
    if len(articles) != len(summaries):
        # zip() would silently drop the unpaired tail otherwise.
        raise ValueError(
            f"articles and summaries must have the same length "
            f"(got {len(articles)} and {len(summaries)})"
        )
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if len(articles) == 0:
        # Nothing to process: skip loading the embedding model entirely.
        return []

    # Enable performance optimizations
    torch.backends.cudnn.benchmark = True
    if hasattr(torch.backends.cuda, 'matmul'):
        torch.backends.cuda.matmul.allow_tf32 = True

    # Setup GPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Initialize model with optimizations
    try:
        model = SentenceTransformer(model_path)
        if device.type == 'cuda':
            model.half()  # Use FP16 for faster inference
        model.to(device)
        kw_model = KeyBERT(model=model)
    except Exception as e:
        print(f"Error initializing models: {e}")
        raise

    results = []

    # Process in batches
    for i in tqdm(range(0, len(articles), batch_size), desc="Processing batches"):
        batch_articles = articles[i:i + batch_size]
        batch_summaries = summaries[i:i + batch_size]

        for idx, (article, summary) in enumerate(zip(batch_articles, batch_summaries)):
            # Start from the raw article and only swap in the augmented text
            # on success; the record is appended exactly once either way.
            text = article
            try:
                keywords = kw_model.extract_keywords(
                    article,
                    keyphrase_ngram_range=(1, 2),
                    stop_words='english',
                    top_n=top_n_keywords,
                    use_maxsum=False,
                    use_mmr=True,
                    diversity=0.5
                )

                # Format keywords as special tokens
                formatted_keywords = ", ".join(
                    f"<keyword>{kw.upper()}</keyword>" for kw, _score in keywords
                )

                # Prepend the keywords to the article text
                text = f"Keywords: {formatted_keywords}\n\n{article}"
            except Exception as e:
                print(f"Error processing article {i+idx}: {e}")

            # Append result (no 'id' field included)
            results.append({"text": text, "highlights": summary})

        if i % (batch_size * 4) == 0 and i > 0 and device.type == 'cuda':
            # A cleanup failure must not touch `results`: every record of this
            # batch has already been appended above.
            try:
                torch.cuda.empty_cache()
                gc.collect()
            except Exception as e:
                print(f"Error releasing GPU memory after batch starting at index {i}: {e}")

    return results



def process_dataset(file_path: str, batch_size: int, top_n_keywords: int, model_path: str) -> List[Dict]:
    """
    Load a CSV of articles and summaries and hand its rows to ``process_texts``.

    Reads ``file_path`` with pandas, turns the ``article`` and ``highlights``
    columns into lists, and returns the result of ``process_texts`` called
    with those lists and the remaining arguments.
    """
    print("Loading dataset...")
    table = pd.read_csv(file_path)

    # Make sure the columns are named correctly
    return process_texts(
        table["article"].tolist(),
        table["highlights"].tolist(),
        batch_size,
        top_n_keywords,
        model_path,
    )



def save_to_csv(processed_data: List[Dict], output_path: str, chunk_size: int = 5000):
    """
    Write the processed records to a CSV file in fixed-size chunks.

    The first chunk creates (or overwrites) the file and writes the header
    row; every later chunk is appended without a header.

    Args:
        processed_data: Records to write; each dict becomes one row.
        output_path: Destination CSV path.
        chunk_size: Maximum number of records converted to a DataFrame and
            written per step. Defaults to 5000.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    if not processed_data:
        # No chunk would be written, so do not report the file as saved.
        print(f"No processed data to save; {output_path} was not written")
        return

    for start in range(0, len(processed_data), chunk_size):
        is_first = start == 0
        chunk = processed_data[start:start + chunk_size]

        pd.DataFrame(chunk).to_csv(
            output_path,
            index=False,
            mode='w' if is_first else 'a',
            header=is_first
        )

        # Drop the chunk reference and collect before building the next one.
        del chunk
        gc.collect()

    print(f"Saved processed data to {output_path}")


def main():
    """
    Entry point for the "keywords first" enrichment run.

    Reports CUDA/GPU details, runs ``process_dataset`` on
    ``summarization_dataset.csv`` and stores the output in
    ``processed_dataset_top.csv`` through ``save_to_csv``.
    """
    # Report what hardware is visible to PyTorch
    print("\nCUDA Information:")
    has_cuda = torch.cuda.is_available()
    print(f"CUDA available: {has_cuda}")
    if has_cuda:
        gpu_name = torch.cuda.get_device_name(0)
        print(f"GPU: {gpu_name}")
        gpu_bytes = torch.cuda.get_device_properties(0).total_memory
        print(f"GPU Memory: {gpu_bytes / 1024**3:.2f} GB")

    # Settings for this run
    settings = {
        "file_path": "summarization_dataset.csv",
        "batch_size": 64,
        "top_n_keywords": 10,
        "model_path": "all-MiniLM-L6-v2",
    }
    processed_data = process_dataset(**settings)

    # Write the enriched records to disk
    save_to_csv(processed_data, "processed_dataset_top.csv")

    print(f"\nProcessing completed. Processed {len(processed_data)} documents")


# Run the pipeline when this cell/script is executed directly (not when imported).
if __name__ == '__main__':
    main()

## Building a dataset with the list of keywords wrapped in a single special token


In [None]:
import re
from datasets import load_dataset
from huggingface_hub import login

def combine_keywords(example):
    """
    Merge the leading per-keyword tags of ``example["text"]`` into a single tag.

    A text that starts with
    ``Keywords: <keyword>A</keyword>, <keyword>B</keyword>`` is rewritten to
    start with ``Keywords: <keyword>A,B</keyword>``. Keywords are stripped,
    de-duplicated and kept in first-seen order. Everything after the keyword
    header, including the whitespace that separates it from the article, is
    left untouched.

    Args:
        example: Mapping with a ``"text"`` entry; it is updated in place.

    Returns:
        The same ``example``. It is returned unchanged when ``"text"`` is not
        a string or does not start with a ``Keywords:`` header containing at
        least one ``<keyword>`` tag.
    """
    text = example["text"]
    if not isinstance(text, str):
        return example

    # Match only the header itself: the tags plus the separators *between*
    # them. Stopping at the last closing tag keeps the blank line before the
    # article intact.
    header = re.match(
        r'Keywords:\s*<keyword>.*?</keyword>(?:\s*,?\s*<keyword>.*?</keyword>)*',
        text,
        flags=re.DOTALL
    )
    if header is None:
        return example

    seen = set()
    ordered_keywords = []
    # Same DOTALL flag as the header match, so every tag that gets replaced
    # is also collected.
    for match in re.findall(r'<keyword>(.*?)</keyword>', header.group(0), flags=re.DOTALL):
        for keyword in match.split(','):
            keyword = keyword.strip()
            if keyword and keyword not in seen:
                seen.add(keyword)
                ordered_keywords.append(keyword)

    combined = ','.join(ordered_keywords)
    # Plain concatenation instead of re.sub: keyword text is never parsed as
    # a regex replacement template (backslashes stay literal).
    example["text"] = f"Keywords: <keyword>{combined}</keyword>{text[header.end():]}"

    return example


import os

ds = load_dataset("VexPoli/cnn_enrich_with_top_keywords")
modified_ds = ds.map(combine_keywords)
# A bare expression is only displayed when it is the last line of a cell,
# so print the dataset summary explicitly.
print(modified_ds)

# Do not hard-code the access token in the notebook: read it from the
# HF_TOKEN environment variable. An unset or empty value is passed as None,
# in which case login() is expected to ask for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)
modified_ds.push_to_hub("cnn_enrich_with_top_keywords_modified")
