In [1]:
%pip install rogue

Collecting rogue
  Downloading rogue-0.0.2.tar.gz (5.4 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: rogue
  Building wheel for rogue (pyproject.toml) ... [?25ldone
[?25h  Created wheel for rogue: filename=rogue-0.0.2-py3-none-any.whl size=7297 sha256=6808dac50be5d0d37feb5fa8cb17a30e74128401ce64d23323fa9c4e584b1635
  Stored in directory: /Users/ajeyk/Library/Caches/pip/wheels/88/65/0c/e2d3efe66c4b48cb42ed2a2c5b310b9b5884c42238096f4414
Successfully built rogue
Installing collected packages: rogue
Successfully installed rogue-0.0.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel

In [2]:
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import entropy
# from rogue import Rogue

In [13]:
from sklearn.datasets import fetch_20newsgroups
from collections import defaultdict


nltk.download('punkt_tab')

# Load DUC dataset or 20NG dataset (you can modify as needed)
documents = fetch_20newsgroups(subset="train", remove=('headers', 'footers', 'quotes'))
documents = documents.data


# Preprocess: Sentence segmentation & tokenization
def preprocess_text(text):
    return [word.lower() for word in word_tokenize(text) if word.isalnum]


# Segment documents into sentences
documents_sentences = [sent_tokenize(doc) for doc in documents]
flattened_sentences = [sent for doc in documents_sentences for sent in doc]

# Compute PD (Document Distribution) - Word-based
def compute_word_distribution(doc_sentences):
    word_counts = Counter()
    for sent in doc_sentences:
        word_counts.update(preprocess_text(sent))
    
    total_words = sum(word_counts.values())
    return {word: count / total_words for word, count in word_counts.items()}


# KL - divergence function
def kl_divergence(p_dist, q_dist):
    p_vals = np.array(list(p_dist.values()))
    q_vals = np.array([q_dist.get(k, 1e-10) for k in p_dist.keys()])
    return entropy(p_vals, q_vals)


# KL divergence function
def kl_sum_word_based(doc_sentences, summary_length=3):
    PD = compute_word_distribution(doc_sentences)
    PS = {}  # Growing summary distribution
    summary = []

    while len(summary) < summary_length:
        best_sentence = None
        best_divergence = float('inf')

        for sentence in doc_sentences:
            new_summary = summary + [sentence]
            new_PS = compute_word_distribution(new_summary)
            divergence = kl_divergence(new_PS, PD)

            if divergence < best_divergence:
                best_divergence = divergence
                best_sentence = sentence
        
        if best_sentence:
            summary.append(best_sentence)
            doc_sentences.remove(best_sentence)

    return " ".join(summary)


# KL-Sum Summarization over topics using LDA
def kl_sum_topic_based(doc_sentences, dictionary, lda_model, summary_length=3):
    # Compute PD (Document topic distribution)
    bow_corpus = [dictionary.doc2bow(preprocess_text(sent)) for sent in doc_sentences]
    # PD = np.mean([dict(lda_model[doc]) for doc in bow_corpus], axis=0)
    # Initialize a dictionary to store topic distributions
    topic_sums = defaultdict(float)
    num_docs = len(bow_corpus)

    # Accumulate topic probabilities
    for doc in bow_corpus:
        for topic_id, prob in lda_model[doc]:  # lda_model[doc] returns list of (topic_id, probability)
            topic_sums[topic_id] += prob

    # Compute the mean topic distribution (PD)
    PD = {topic_id: topic_sums[topic_id] / num_docs for topic_id in topic_sums}
    PD_array = np.array(list(PD.values()))

    summary = []
    PS = np.zeros_like(PD_array)

    while len(summary) < summary_length:
        best_sentence = None
        best_divergence = float('inf')

        for sentence in doc_sentences:
            sent_bow = dictionary.doc2bow(preprocess_text(sentence))
            sent_topic_dist = dict(lda_model[sent_bow])

            new_PS = (PS * len(summary) + np.array([sent_topic_dist.get(k, 1e-10) for k in range(len(PD))])) / (len(summary) + 1)
            divergence = entropy(new_PS, PD_array)

            if divergence < best_divergence:
                best_divergence = divergence
                best_sentence = sentence

        if best_sentence:
            summary.append(best_sentence)
            doc_sentences.remove(best_sentence)

    return " ".join(summary)

[nltk_data] Downloading package punkt_tab to /Users/ajeyk/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:
# Run KL-Sum over words
summary_words = kl_sum_word_based(flattened_sentences)
print("\nExtractive Summary (Word-based):")
print(summary_words)


Extractive Summary (Word-based):
Aykut Atalay Atakan -ciao +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=+


In [None]:
# Run KL-summary over topics using LDA
dictionary = Dictionary([preprocess_text(sent) for sent in flattened_sentences])
corpus = [dictionary.doc2bow(preprocess_text(sent)) for sent in flattened_sentences]
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20, passes=20)

In [15]:
import gensim
import string

In [22]:
import re

In [25]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = re.sub(r'\S+@\S+', '', text)  # Remove emails
    text = re.sub(r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b', '', text)  # Remove phone numbers
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)  # Remove special characters
    return text

def preprocess_texts(texts):
    """Tokenize and clean a list of documents."""
    all_sentences = []
    all_tokenized_sentences = []
    
    for text in texts:
        sentences = sent_tokenize(text)  # Split into sentences
        for sentence in sentences:
            words = word_tokenize(sentence.lower())  # Lowercase & tokenize
            words = [w for w in words if w not in stop_words and w not in string.punctuation]  # Remove stopwords & punctuation
            if len(words) >= 5:  # Filter out short sentences
                all_tokenized_sentences.append(words)  
                all_sentences.append(sentence)  

    return all_tokenized_sentences, all_sentences

def train_lda(documents, num_topics=10):
    """Train an LDA model using Gensim."""
    dictionary = Dictionary(documents)
    bow_corpus = [dictionary.doc2bow(doc) for doc in documents]
    lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=40, random_state=42)
    return lda_model, dictionary, bow_corpus

def get_topic_distribution(lda_model, bow):
    """Get topic distribution for a document or summary."""
    topic_probs = lda_model.get_document_topics(bow, minimum_probability=1e-10)
    topic_dict = {k: v for k, v in topic_probs}  
    return np.array([topic_dict.get(k, 1e-10) for k in range(lda_model.num_topics)])

def kl_divergence(p, q):
    """Compute KL divergence between two probability distributions."""
    return entropy(p, q)

def kl_sum_per_topic(sentences, tokenized_sentences, lda_model, dictionary, summary_length=3):
    """Generate separate extractive summaries for each topic."""
    
    # Step 1: Get topic distribution for each sentence
    sentence_topic_dists = {}
    for i, sentence in enumerate(tokenized_sentences):
        bow = dictionary.doc2bow(sentence)
        if len(bow) > 0:  # Avoid empty docs
            sentence_topic_dists[sentences[i]] = get_topic_distribution(lda_model, bow)

    topic_summaries = {}

    # Step 2: Extract summaries per topic
    for topic_id in range(lda_model.num_topics):
        PD = np.zeros(lda_model.num_topics)  
        num_sentences = 0  

        # Compute PD: The topic distribution of all sentences that strongly belong to this topic
        for sent, dist in sentence_topic_dists.items():
            if dist[topic_id] > 0.25:  # Select sentences with meaningful weight for this topic
                PD += dist  
                num_sentences += 1  

        if num_sentences == 0:
            continue  

        PD /= num_sentences  # Normalize PD

        selected_sentences = []
        PS = np.ones_like(PD) * 1e-10  # Initialize uniform topic distribution

        while len(selected_sentences) < summary_length:
            best_sentence = None
            best_kl_score = float("inf")

            for sentence, topic_dist in sentence_topic_dists.items():
                if sentence in selected_sentences or topic_dist[topic_id] < 0.1:
                    continue  

                new_PS = (PS * len(selected_sentences) + topic_dist) / (len(selected_sentences) + 1)
                kl_score = kl_divergence(PD, new_PS)

                if kl_score < best_kl_score:
                    best_kl_score = kl_score
                    best_sentence = sentence

            if best_sentence:
                selected_sentences.append(best_sentence)
                PS = (PS * (len(selected_sentences) - 1) + sentence_topic_dists[best_sentence]) / len(selected_sentences)

        topic_summaries[topic_id] = selected_sentences

    return topic_summaries

# Load 20 Newsgroups dataset
documents = fetch_20newsgroups(subset="train", remove=('headers', 'footers', 'quotes'))
documents = documents.data  # Use a subset for faster training
documents = [clean_text(doc) for doc in documents]

# Preprocess dataset (get tokenized sentences & original sentences)
tokenized_sentences, original_sentences = preprocess_texts(documents)

# Train LDA on the dataset
lda_model, dictionary, bow_corpus = train_lda(tokenized_sentences, num_topics=12)

# Generate KL-Sum extractive summaries for each topic
topic_summaries = kl_sum_per_topic(original_sentences, tokenized_sentences, lda_model, dictionary, summary_length=2)

# Print summaries per topic
for topic, summary in topic_summaries.items():
    print(f"\nExtractive Summary for Topic {topic}:")
    print("\n".join(summary))


Extractive Summary for Topic 0:
He spoke
 to the racial heart strings of the German, opened the 
 fountain of his national genius, strock down the spirit
 of defeatism...At no period since the World War had Berlin
 conducted so realistic, well organized, and planned policy
 as now, since Hitlers assumption to power...And whatever
 others may think concerning Hitlerism and Fascism as a 
 system of Government, it is proved that they have revitalized
 and regenerated the two states, Germany and Italy.2

1 Captain George Haig, The Case of Palestine, in Hairenik
    Weekly, Friday, September 25, 1936.
He
also noted that the software people were starting to feel management
pressure to cut corners, but hadnt had to give in to it much yet.

Extractive Summary for Topic 1:
Beware of our materialistic, worldly and selfish motives.
Quoth the Moderator


In a short poem God in His mercy made  the fixed pains of Hell,
C. S. Lewis expresses an idea that Im sure was current among others,
but I haven

In [27]:
from datasets import load_dataset

In [29]:
duc_dataset = load_dataset("midas/duc2001", "raw")
duc_documents = duc_dataset['test']['document']
duc_documents = [" ".join(doc) for doc in duc_documents]
duc_documents = [clean_text(doc) for doc in duc_documents]

tokenized_sentences_duc, original_sentences_duc = preprocess_texts(duc_documents)

# Train LDA on the dataset
lda_model_duc, dictionary_duc, bow_corpus_duc = train_lda(tokenized_sentences_duc, num_topics=30)

# Generate KL-Sum extractive summaries for each topic
topic_summaries_duc = kl_sum_per_topic(original_sentences_duc, tokenized_sentences_duc, lda_model_duc, dictionary_duc, summary_length=2)

# Print summaries per topic
for topic, summary in topic_summaries_duc.items():
    print(f"\nExtractive Summary for Topic {topic}:")
    print("\n".join(summary))


Extractive Summary for Topic 0:
It was drawn by three pairs of black horses .
Banners welcoming him back were draped around the arena .

Extractive Summary for Topic 1:
Challenging them will be Uta Pippig , Wanda Panfil and Kim Jones.
Korologos asked the exhibitor ,  Why are you ruining this perfectly legitimate rifle ?

Extractive Summary for Topic 2:
He studiously ignores other major differences between 1791 and today .
Barber Conable is an outstanding person ,  he says .

Extractive Summary for Topic 3:
Rosendo Herrera , flight engineer  Sgt.
Carpio Villarreal , flight engineer  Sgt.

Extractive Summary for Topic 4:
Nevertheless , Venetia originally was to have employed 870 and it now has 764 .
Prediction hinges on spotting anomalous phenomena , or precursors .

Extractive Summary for Topic 5:
In Rotterdam on Sunday , Belayneh Dinsamo won in 20839 .
They are a conservative bunch , not given to Sundaysupplement scare stories .

Extractive Summary for Topic 6:
Then we said ,  Oh no ,

In [None]:
duc_dataset = load_dataset("midas/duc2001", "raw")
duc_documents = duc_dataset['test']['document']
duc_documents = [" ".join(doc) for doc in duc_documents]
duc_documents = [clean_text(doc) for doc in duc_documents]

