# Multi-Doc Summarizing
In this file, we will explore different approaches to summarizing a topic in the news given multiple articles discussing the same thing.

# Preprocessing
Extract sample data and preprocess it

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.firefox.options import Options

from datetime import datetime, timedelta

import time
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import numpy as np

import requests
from bs4 import BeautifulSoup

#from dbconnect import insert_articles,connect_db,insert_similar_articles

#import dbconnect  # Import the module itself
import importlib

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN

import spacy
from keybert import KeyBERT
from collections import defaultdict
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering

import re
import ast

import time
from rouge import Rouge

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import importlib
import re
import ast

import utils

from nltk.corpus import stopwords


In [5]:
#   Multi-doc Summarizer


# Pull every article (with its text) that belongs to a similarity cluster with
# similar_weight >= 0.8, restricted to clusters containing at least one
# article dated within the last two days.
query = """
    SELECT 
        sa.simart_id,
        a.article_id,
        atx.article_content
	FROM similar_articles sa
        JOIN junct_simart_articles jsa ON jsa.simart_id = sa.simart_id
        JOIN articles a ON a.article_id = jsa.article_id
        JOIN article_text atx ON atx.article_id = a.article_id
	WHERE sa.similar_weight >= 0.8
	AND EXISTS (
        SELECT 1
        FROM articles a2 
        JOIN junct_simart_articles jsa2 ON jsa2.article_id = a2.article_id
        WHERE jsa2.simart_id = sa.simart_id
        AND a2.date >= NOW() - INTERVAL '2 days'
	)
	GROUP BY sa.simart_id,a.article_id,atx.article_content
	ORDER BY sa.simart_id;
                       """

connection = utils.connect_db()
try:
    df = pd.read_sql_query(query, con=connection)
finally:
    # Release the connection even when the query raises.
    connection.close()

In [6]:
# Collapse to one row per cluster: `article_content` becomes the list of
# article texts that share the same `simart_id`.
grouped_df = df.groupby('simart_id')['article_content'].apply(list).reset_index()

In [7]:
def preprocess(texts):
    """Clean a list of raw article texts.

    Missing values (None/NaN), the literal string 'nan', and texts that are
    empty after cleaning are skipped. Every remaining text has characters
    outside the allow-list (word characters, whitespace, quotes, currency
    symbols, and ; - : . ,) removed and runs of spaces collapsed to one space.

    Args:
        texts: iterable of article texts (strings or missing values).

    Returns:
        list[str]: the cleaned, non-empty texts in their original order.
    """
    cleaned = []
    for t in texts:
        # Remove missing articles
        if pd.isna(t):
            continue
        # Remove excessive punctuation; keep periods, quotes and currency symbols
        t_clean = re.sub(r'[^\w\s\'\"$£€;\-:.,]', '', str(t))
        # Collapse every run of spaces. A single str.replace('  ', ' ') pass
        # leaves double spaces behind whenever a run has three or more spaces.
        t_clean = re.sub(r' {2,}', ' ', t_clean)
        # Empty texts
        if t_clean.lower() == 'nan' or not t_clean.strip():
            continue

        cleaned.append(t_clean)

    return cleaned

In [8]:
def clean(df):
    """Clean and de-duplicate the article lists of a grouped cluster frame.

    For every row, the texts in 'article_content' are passed through
    `preprocess`, then exact duplicates (compared case-insensitively after
    stripping surrounding whitespace) are removed, keeping the first
    occurrence. Rows left with fewer than two distinct texts are dropped.

    Note: `df` is modified in place and also returned.

    Args:
        df: DataFrame with an 'article_content' column holding lists of texts.

    Returns:
        The same DataFrame object, cleaned and filtered.
    """
    rows_to_drop = []   # Clusters without at least two distinct articles

    for idx, row in df.iterrows():
        cleaned_texts = preprocess(row['article_content'])

        # Order-preserving de-duplication on a normalized key
        seen = set()
        unique_texts = []
        for text in cleaned_texts:
            key = text.strip().lower()
            if key not in seen:
                seen.add(key)
                unique_texts.append(text)

        df.at[idx, 'article_content'] = unique_texts
        # Count the de-duplicated list: counting the pre-dedup list would keep
        # clusters whose articles are all copies of the same text.
        if len(unique_texts) < 2:
            rows_to_drop.append(idx)

    df.drop(rows_to_drop, inplace=True)

    return df

grouped_df = clean(grouped_df)

# Approach 1: Centroid-based Summarization for MDS
- Source: https://aclanthology.org/W00-0403.pdf \
    This study dives into an approach to multi-doc summarization for news articles specifically. \
    It highlights a centroid-based approach to collecting the most important sentences across all articles. \
    We will be replicating this study for approach 1.




In [9]:
from rapidfuzz import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

def _drop_near_duplicates(texts, threshold=85):
    """Return `texts` without near-duplicates.

    Each unordered pair is compared once with `fuzz.ratio`. When the score
    exceeds `threshold`, the shorter text is discarded; on equal length the
    later one is discarded, so one copy of every near-duplicate pair survives.
    """
    removed = set()
    for i in range(len(texts)):
        for j in range(i + 1, len(texts)):
            if fuzz.ratio(texts[i], texts[j]) > threshold:
                removed.add(i if len(texts[i]) < len(texts[j]) else j)
    return [text for k, text in enumerate(texts) if k not in removed]


def fit_vectorizer(similarity_threshold=85, output_path='tfidf_vectorizer.joblib'):
    """Fit a TF-IDF vectorizer on all clustered article texts and save it.

    Loads every article text linked to a similarity cluster, cleans it with
    `clean`, removes near-duplicate texts inside each cluster, drops clusters
    left with fewer than two texts, fits a `TfidfVectorizer` on the remaining
    texts and dumps it with joblib.

    Args:
        similarity_threshold: `fuzz.ratio` score above which two texts in the
            same cluster are treated as near-duplicates.
        output_path: file the fitted vectorizer is written to.

    Raises:
        ValueError: if no article text is left to fit on.
    """
    query = """
        SELECT jsa.simart_id, ax.article_id, ax.article_content 
        FROM junct_simart_articles jsa
        JOIN article_text ax ON jsa.article_id = ax.article_id
    """
    connection = utils.connect_db()
    try:
        df = pd.read_sql_query(query, con=connection)
    finally:
        # Release the connection even when the query raises.
        connection.close()

    grouped_df = df.groupby('simart_id')['article_content'].apply(list).reset_index()
    grouped_df = clean(grouped_df)  # Clean text

    # Remove highly similar duplicates inside each cluster
    grouped_df['article_content'] = grouped_df['article_content'].apply(
        lambda texts: _drop_near_duplicates(texts, similarity_threshold)
    )

    # A cluster needs at least two distinct articles to be kept
    grouped_df = grouped_df[grouped_df['article_content'].apply(len) >= 2]

    print(grouped_df)

    article_texts = [
        article
        for content_list in grouped_df['article_content']
        for article in content_list
    ]
    if not article_texts:
        raise ValueError("No article texts available to fit the TF-IDF vectorizer.")

    vectorizer = TfidfVectorizer(
        stop_words='english',
        token_pattern=r'(?u)\b[\w-]+\b'
    )
    vectorizer.fit(article_texts)
    #   Run once, save vectorizer
    joblib.dump(vectorizer, output_path)

# Queries the database, fits the TF-IDF vectorizer and (over)writes
# 'tfidf_vectorizer.joblib'.
fit_vectorizer()



      simart_id                                    article_content
36         8053  [Tech billionaire Elon Musk said that the soci...
41         8150  [Tech billionaire Elon Musk said that the soci...
43         8185  [March's full "Blood Worm Moon," a phenomenon ...
46         8231  [The mysterious disappearance of an American c...
47         8232  [An Indiana woman who was found alive in her c...
...         ...                                                ...
2731      14266  [Commissioner Richard Trumka Jr. is photograph...
2732      14267  [NIH Director Jayanta Bhattacharya, left, and ...
2733      14268  [Speaker of the House Rep. Mike Johnson, R-La....
2736      14271  [VATICAN CITY AP Pope Leo XIV laid out the vis...
2738      14277  [NUUK, Greenland AP Lisa Sólrun Christiansen g...

[1413 rows x 2 columns]


In [10]:
# Reload the fitted TF-IDF vectorizer from disk.
vectorizer = joblib.load('tfidf_vectorizer.joblib')

#### Calculate cluster centroid
The centroid, as seen in the study, is the average TF-IDF vector of all articles within a cluster of articles on the same topic. \
Thus, $\text{Centroid} = \frac{1}{n}\sum_{i=1}^{n} \mathbf{v}_i$, where $\mathbf{v}_i$ is the TF-IDF vector of the $i$-th of the $n$ articles within a group.

In [11]:
from nltk.tokenize import sent_tokenize
import nltk

from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [12]:
import spacy

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

def tokenize_sentences(article):
    """Split `article` into a list of sentence strings using the spaCy pipeline `nlp`."""
    doc = nlp(article)
    sentence_texts = []
    for span in doc.sents:
        sentence_texts.append(span.text)
    return sentence_texts

def generate_summary(cluster_articles, vectorizer, summary_length=3, redundancy_threshold=0.7):
    """MEAD-style extractive summary for a single cluster of articles.

    Each sentence is scored by the cosine similarity between its TF-IDF vector
    and the cluster centroid (the mean TF-IDF vector of the articles).
    Sentences are taken greedily from the highest score down, skipping any
    whose cosine similarity to an already chosen sentence reaches
    `redundancy_threshold`. The chosen sentences are returned in the order
    they appear in `cluster_articles`.

    Args:
        cluster_articles: list of article texts on the same topic.
        vectorizer: fitted vectorizer exposing `transform`.
        summary_length: maximum number of sentences in the summary.
        redundancy_threshold: similarity at or above which a candidate is
            considered redundant with a selected sentence.

    Returns:
        str: the selected sentences joined by single spaces, or '' when there
        is nothing to summarize.
    """
    if not cluster_articles or summary_length <= 0:
        return ''

    # Flatten to one sentence list; the list index doubles as original position
    sentences = [
        sent
        for article in cluster_articles
        for sent in tokenize_sentences(article)
    ]
    if not sentences:
        return ''

    # Compute cluster centroid (kept 2-D so it can be fed to cosine_similarity)
    tfidf_matrix = vectorizer.transform(cluster_articles)
    centroid = np.asarray(tfidf_matrix.mean(axis=0))

    # Vectorize every sentence once instead of re-transforming inside the loops
    sent_matrix = vectorizer.transform(sentences)
    scores = cosine_similarity(sent_matrix, centroid).ravel()

    # Stable sort so equally scored sentences keep their original order
    ranked = np.argsort(-scores, kind='stable')

    # Remove redundant sentences
    selected = []
    for idx in ranked:
        if selected:
            overlap = cosine_similarity(sent_matrix[idx], sent_matrix[selected])
            if overlap.max() >= redundancy_threshold:
                continue
        selected.append(idx)
        if len(selected) >= summary_length:
            break

    # Preserve chronological order
    return ' '.join(sentences[i] for i in sorted(selected))

In [13]:
# Summarize every cluster with the centroid approach. Rows are collected in a
# list and turned into a DataFrame once, because calling pd.concat on every
# iteration copies the whole frame each time.
a1_records = []
for simart_id, cluster_articles in zip(grouped_df['simart_id'], grouped_df['article_content']):
    summary = generate_summary(cluster_articles, vectorizer, summary_length=5)
    a1_records.append({'simart_id': simart_id, 'summary': summary})

a1_summary_df = pd.DataFrame(a1_records, columns=['simart_id', 'summary'])

#   Approach 2: PRIMERA (Pyramid-based Masked Sentence Pre-training)
PRIMERA is a leading model for multi-doc summarization. \
It is pre-trained with a strategy called Entity Pyramid, which identifies the important sentences based on how frequently their entities appear across documents and how representative each sentence is.

In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the PRIMERA tokenizer and seq2seq model. The model is moved to "cuda",
# so this cell requires a CUDA-capable GPU.
tokenizer = AutoTokenizer.from_pretrained("allenai/PRIMERA")
model = AutoModelForSeq2SeqLM.from_pretrained("allenai/PRIMERA").to("cuda")

In [15]:
def generate_summary2(model, tokenizer, article_clust):
    """Generate an abstractive summary for a cluster of articles with PRIMERA.

    The articles are joined with the `<doc-sep>` token, tokenized (truncated
    to 2048 tokens, so text past that limit is ignored) and passed to
    `model.generate` with beam search.

    Args:
        model: seq2seq model exposing `.device` and `.generate`. It is called
            with `global_attention_mask`, as LED-based models such as PRIMERA
            accept.
        tokenizer: tokenizer matching `model`.
        article_clust: list of article texts on the same topic.

    Returns:
        str: the decoded summary, "No content to summarize." for an empty
        cluster, or "Error during summarization." if generation fails.
    """
    # Check if the input cluster is empty
    if not article_clust:
        return "No content to summarize."

    # Use <doc-sep> to separate articles in the cluster
    article_text = "<doc-sep>".join(article_clust)

    # Tokenize the input, limiting max length and truncating if necessary.
    # Tensors follow the model's device instead of assuming "cuda".
    inputs = tokenizer(article_text, return_tensors="pt", max_length=2048, truncation=True).to(model.device)

    try:
        input_ids = inputs["input_ids"]

        # Global attention on the first token and on every <doc-sep> token,
        # as in PRIMERA's documented usage.
        global_attention_mask = input_ids.new_zeros(input_ids.shape)
        global_attention_mask[:, 0] = 1
        docsep_token_id = tokenizer.convert_tokens_to_ids("<doc-sep>")
        global_attention_mask[input_ids == docsep_token_id] = 1

        # Generate summary with the specified parameters
        #   length_penalty -> exponent applied to the sequence length when scoring beams:
        #                     values > 0 favor longer summaries, values < 0 favor shorter ones
        #   num_beams -> # of beams used in beam search (deciding over summary candidates): high = better quality, slower, less diverse
        summary_ids = model.generate(
            input_ids,
            attention_mask=inputs["attention_mask"],
            global_attention_mask=global_attention_mask,
            max_length=200,
            min_length=50,
            length_penalty=2.0,
            num_beams=6,
        )
        # Decode the summary
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary
    except Exception as e:
        # Best effort: one failing cluster must not abort the whole run
        print(f"Error in generating summary: {e}")
        return "Error during summarization."


In [16]:
# Summarize every cluster with PRIMERA. Rows are collected in a list and
# turned into a DataFrame once, because calling pd.concat on every iteration
# copies the whole frame each time.
a2_records = []
for simart_id, cluster_articles in zip(grouped_df['simart_id'], grouped_df['article_content']):
    summary = generate_summary2(model, tokenizer, cluster_articles)
    a2_records.append({'simart_id': simart_id, 'summary': summary})

a2_summary_df = pd.DataFrame(a2_records, columns=['simart_id', 'summary'])

Input ids are automatically padded from 1737 to 2048 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 1188 to 1536 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 960 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 1108 to 1536 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 1229 to 1536 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 679 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 1012 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 1393 to 1536 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 1906 to 2048 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 1098 to 1536 to be a multi

# Evaluation
To score and compare our approaches, we will use a combination of approaches:
1. Manual comparison
    - I will manually look at a subset of summaries and determine the best ones.
2. ROUGE (Recall-Oriented Understudy for Gisting Evaluation)
    - Combine all documents and evaluate the summary's precision and recall with ROUGE.
    - Precision: how relevant the summarized phrases are to the crux of the sources.
    - Recall: how much of the source material is included in the summary

### Limitations
Our method of evaluating the best approach is very limited. Without the use of an LLM or large-scale human evaluation, it is hard to determine how good a summary is. For this reason, I will rely heavily on my own judgement.

In [17]:
def evaluate_summaries(candidate, reference):
    """Score a candidate summary against a reference text with ROUGE-2.

    Args:
        candidate: the generated summary.
        reference: the text the summary is compared against.

    Returns:
        dict with 'rouge2_precision', 'rouge2_recall', 'rouge2_f1' and
        'duration_seconds'. All ROUGE values are 0.0 when either text is
        empty or whitespace-only.
    """
    start_time = time.perf_counter()

    if not candidate or not candidate.strip() or not reference or not reference.strip():
        # Nothing to compare: report zero overlap instead of letting the
        # scorer fail on an empty text.
        rouge2 = {'p': 0.0, 'r': 0.0, 'f': 0.0}
    else:
        # Only use ROUGE-2 for speed; the default Rouge() also computes
        # ROUGE-1 and ROUGE-L.
        rouge = Rouge(metrics=['rouge-2'])
        rouge2 = rouge.get_scores(candidate, reference)[0]['rouge-2']

    end_time = time.perf_counter()

    return {
        'rouge2_precision': rouge2['p'],
        'rouge2_recall': rouge2['r'],
        'rouge2_f1': rouge2['f'],
        'duration_seconds': end_time - start_time
    }


In [18]:
# Print both approaches' summaries and ROUGE-2 scores for the first few
# clusters. Names avoid shadowing the builtins `id` and `eval`.
N_PREVIEW = 5
for _, r in grouped_df.head(N_PREVIEW).iterrows():
    cluster_id = r['simart_id']

    summary_1 = a1_summary_df[a1_summary_df['simart_id'] == cluster_id]['summary'].iloc[0]
    summary_2 = a2_summary_df[a2_summary_df['simart_id'] == cluster_id]['summary'].iloc[0]

    #   Combine documents and evaluate summary
    combined_articles = " [DOC_SEP] ".join(r['article_content'])
    scores_1 = evaluate_summaries(summary_1, combined_articles)
    scores_2 = evaluate_summaries(summary_2, combined_articles)

    print("SUMMARY (Approach 1): ", summary_1)
    print('precision: ',scores_1['rouge2_precision'])
    print('recall: ',scores_1['rouge2_recall'])

    print("SUMMARY (Approach 2): ", summary_2)
    print('precision: ',scores_2['rouge2_precision'])
    print('recall: ',scores_2['rouge2_recall'])

    print("~~~~")

SUMMARY (Approach 1):  On Monday, Newsom shared a model ordinance for cities and counties to "immediately address dangerous and unhealthy encampments and connect people experiencing homelessness with shelter and services." Gov. Gavin Newsom; people at a homeless encampment in California Getty Images Newsom is also encouraging local leaders to use their authority, affirmed by the U.S. Supreme Court, to address encampments. PROGRESSIVE JOURNALIST SAYS NEWSOM MUST TAKE 'ACCOUNTABILITY' FOR HOW HE 'DESTROYED' CALIFORNIA California Gov. Gavin Newsom MediaNews GroupEast Bay Times via Getty Images Monday's announcement is in addition to the release of $3.3 billion in voter-approved Proposition 1 funding, which Newsom's office said will be made available later today to communities statewide. "Governor Newsom is the first governor to actively address this issue in our state, and he is reversing a crisis that was decades in the making," Newsom's office said. Saying that there are no more excuses

## Verdict
Both summaries are very similar in what they extract from the articles. \
They also share very similar precisions and recalls, but as we stated earlier, this is not a good determining factor. \
However, PRIMERA surpasses approach 1 by creating a more coherent flow of sentence structure. \
While approach 1 is decent at capturing a meaning similar to PRIMERA's, its sentence structure is very choppy, redundant, and messy. \
Lastly, approach 1 runs much faster, though run time is not a big concern for our use case.