In [1]:
import numpy as np
import re
import string
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.stats import entropy
from sklearn.datasets import fetch_20newsgroups
from datasets import load_dataset

In [2]:
def clean_text(text):
    """Cleans text by removing special characters, emails, and phone numbers."""
    text = re.sub(r'\S+@\S+', '', text)  # Remove emails
    text = re.sub(r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b', '', text)  # Remove phone numbers
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with spaces
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text


def get_word_distribution(text):
    """Computes normalized word frequency distribution."""
    words = word_tokenize(text.lower())  # Tokenize and lowercase
    word_counts = Counter(words)  # Count word frequencies
    total_words = sum(word_counts.values())
    word_probs = {word: count / total_words for word, count in word_counts.items()}
    return word_probs


def kl_divergence(p, q):
    """Compute KL divergence between two probability distributions after ensuring equal lengths."""
    p = np.array(p)
    q = np.array(q)

    # Ensure both distributions have the same length
    max_len = max(len(p), len(q))
    
    # Pad shorter array with small values (1e-10)
    p = np.pad(p, (0, max_len - len(p)), 'constant', constant_values=1e-10)
    q = np.pad(q, (0, max_len - len(q)), 'constant', constant_values=1e-10)

    # Normalize both to sum to 1
    p /= p.sum()
    q /= q.sum()

    return entropy(p, q)


def kl_sum_word(sentences, doc_text, summary_length=3):
    """
    Extracts a summary by minimizing KL-divergence between document and summary word distributions.
    
    Args:
        sentences (list): List of sentences.
        doc_text (str): Full document text.
        summary_length (int): Number of sentences in the final summary.
    
    Returns:
        list: Selected summary sentences.
    """
    # Compute word distribution of the full document (PD)
    PD = get_word_distribution(doc_text)

    selected_sentences = []
    PS = {}  # Start with an empty summary distribution

    while len(selected_sentences) < summary_length:
        best_sentence = None
        best_kl_score = float("inf")

        for sentence in sentences:
            if sentence in selected_sentences:
                continue  # Skip already selected sentences

            # Compute PS if we add this sentence
            sentence_word_dist = get_word_distribution(sentence)

            # Merge current PS with new sentence distribution
            merged_PS = Counter(PS)
            for word, prob in sentence_word_dist.items():
                merged_PS[word] += prob  # Add new words to PS

            # Normalize PS
            total_words = sum(merged_PS.values())
            normalized_PS = {word: count / total_words for word, count in merged_PS.items()}

            # Compute KL divergence
            kl_score = kl_divergence(list(PD.values()), list(normalized_PS.values()))

            # Choose sentence that minimizes KL divergence
            if kl_score < best_kl_score:
                best_kl_score = kl_score
                best_sentence = sentence

        if best_sentence:
            selected_sentences.append(best_sentence)
            # Update PS
            PS = get_word_distribution(" ".join(selected_sentences))

    return selected_sentences

In [13]:
# Load 20NG dataset (selected categories)
categories = ['alt.atheism', 'sci.med', 'sci.electronics', 'comp.graphics', 'talk.politics.guns', 'sci.crypt']
newsgroups = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
documents = newsgroups.data  # Limit to first 5 for testing

for i, doc in enumerate(documents):
    clean_doc = clean_text(doc)
    sentences = sent_tokenize(clean_doc)  # Sentence segmentation
    summary = kl_sum_word(sentences, clean_doc, summary_length=2)

    print(f"\nDocument {i+1} Summary:")
    print("\n".join(summary))


Document 1 Summary:
The squarewave has a high of 5 v and low of 0v.
I would like to modulate a 40KHz squarewave over rf.

Document 2 Summary:
I was wondering if anyone knows where I can get more information about the graphics in the WingCommander series, and the RealSpace system they use.
I think its really awesome, and wouldnt mind being able to use similar features in programs.

Document 3 Summary:
Although a bad color quantization effect could result in some visible machbands on a picture that was smooth before it was quantizised.
The term mach banding was not the correct one, it shouldve been color quantization effect.

Document 4 Summary:
Candida albicans can cause severe lifethreatening infections, usually in people who are otherwise quite ill.
Systemic yeast syndrome where the body is allergic to yeast is considered a quack diagnosis by mainstream medicine.

Document 5 Summary:
I require BGI drivers for Super VGA Displays and Super XVGA Displays.
Does anyone know where I could 

KeyboardInterrupt: 

In [7]:
duc_dataset = load_dataset("midas/duc2001", "raw")
duc_documents = duc_dataset['test']['document']

# Clean and summarize
for i, doc in enumerate(duc_documents):  # Testing on 5 documents
    clean_doc = clean_text(" ".join(doc))
    sentences = sent_tokenize(clean_doc)
    summary = kl_sum_word(sentences, clean_doc, summary_length=1)

    print(f"\nDUC-2001 Document {i+1} Summary:")
    print("\n".join(summary))


DUC-2001 Document 1 Summary:
Here , at a glance , are developments today involving the crash of Pan American World Airways Flight 103 Wednesday night in Lockerbie , Scotland , that killed all 259 people aboard and more than 20 people on the ground

DUC-2001 Document 2 Summary:
Rumbling spring thunderstorms have announced the beginning of the unofficial tornado season that runs from April through June across Texas and other Tornado Alley states in the nation s heartland .

DUC-2001 Document 3 Summary:
The condition of those on board is unknown , Ms. Hayes told The Associated Press in a telephone interview from U.S. Air Force European headquarters at Ramstein Air Base .

DUC-2001 Document 4 Summary:
The agencies involved should make sure that fire management plans conform to departmental policies , that employees understand the policies , that everybody is using a common vocabulary and that agencies have agreed beforehand what to do if fires threaten to move across administrative bounda