In [10]:
import sys

# Expose the parent directory so the project-local `summarizers` package can be
# imported from this notebook. The membership check keeps the cell idempotent:
# re-running it no longer piles up duplicate '..' entries on sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [11]:

import pandas as pd
from tqdm import tqdm
import time

from summarizers.light import SumyTextRankSummarizer, SpacyTextrank 
from summarizers.balanced import SentenceTransformerEmbeddingSummarizer
from summarizers.heavy import ClaudeSummarizer

In [12]:
# Read the unified labeled dataset from disk and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer='../dataset/unified_dataset_labeled.csv')
print(df.shape)

(198, 10)


In [19]:
# Rough baseline summarizer
random_summarizer = SumyTextRankSummarizer("random")

# Remaining summarizers imported from summarizers.light
lsa_summarizer = SumyTextRankSummarizer("lsa")
textrank_summarizer = SumyTextRankSummarizer("text_rank")
spacy_textrank = SpacyTextrank()

# open_ai_embedding_summarizer_small = OpenAiEmbeddingSummarizer(model_name="text-embedding-3-small")
# open_ai_embedding_summarizer_large = OpenAiEmbeddingSummarizer(model_name="text-embedding-3-large")

# Balanced
sentence_transformer_distiluse = SentenceTransformerEmbeddingSummarizer(
    "distiluse-base-multilingual-cased-v2"
)
# sentence_transformer_mpnet = SentenceTransformerEmbeddingSummarizer("paraphrase-multilingual-mpnet-base-v2")

# Heavy
claude_haiku_3_5 = ClaudeSummarizer("claude-3-5-haiku-20241022")
claude_sonnet_4 = ClaudeSummarizer("claude-sonnet-4-20250514")

# Summarizers that the following cells iterate over, keyed by the name used in
# the result column names. Uncomment an entry to include it in the run.
summarizers = {
    # "random": random_summarizer,
    # "lsa": lsa_summarizer,
    # "text_rank": textrank_summarizer,
    # "spacy_textrank": spacy_textrank,
    # "sentence_transformer_mpnet": sentence_transformer_mpnet,
    "sentence_transformer_distiluse": sentence_transformer_distiluse,
    # "claude_haiku_3_5": claude_haiku_3_5,
    # "claude_sonnet_4": claude_sonnet_4,
}

# Extra keyword arguments forwarded to summarize(), keyed by summarizer name.
# Summarizers without an entry are called with no extra arguments.
params = {
    "sentence_transformer_distiluse": {"use_cached_embeddings": False},
}


sumy v2
sumy v2
sumy v2
fast summarizer v3
claude summarizer v3
claude summarizer v3


In [20]:
# When True, a row that already holds a summary for a given summarizer is left
# untouched, so finished work is not recomputed when the cell is re-run.
skip_exists = False

# Make sure every active summarizer has its two output columns before the loop.
for name in summarizers:
    if f"summary_{name}" not in df.columns:
        # Placeholder for the summary value; None keeps the column object-typed.
        df[f"summary_{name}"] = None
    if f"summary_{name}_latency" not in df.columns:
        # NaN (instead of None) keeps the latency column float64, so the
        # statistics computed later operate on real numbers, not objects.
        df[f"summary_{name}_latency"] = float("nan")

# Generate: run each active summarizer on each row and record the summary
# together with the wall-clock time of the summarize() call (in seconds).
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Running summarizers"):
    text, lang = row["cleaned_markdown"], row["language"]
    for name, summarizer in summarizers.items():
        summary_col = f"summary_{name}"
        latency_col = f"{summary_col}_latency"

        # The column is guaranteed to exist by the setup loop above, so only
        # the per-row value needs checking.
        if skip_exists and pd.notna(df.at[idx, summary_col]):
            # tqdm.write prints without corrupting the active progress bar.
            tqdm.write(f"Skipping {name} for index {idx} as it already exists.")
            continue

        kw = params.get(name, {})

        # Time only the summarize() call itself.
        start = time.perf_counter()
        summary = summarizer.summarize(text, lang, **kw)
        latency = time.perf_counter() - start

        df.at[idx, summary_col] = summary
        df.at[idx, latency_col] = latency

Running summarizers:  11%|█         | 21/198 [00:27<03:53,  1.32s/it]


KeyboardInterrupt: 

In [17]:
# Persist the dataframe (including any summary/latency columns) as CSV,
# leaving the row index out of the file.
df.to_csv(path_or_buf="../dataset/results.csv", index=False)

In [18]:
# Show latency statistics (mean / std / min / max, in milliseconds) for every
# active summarizer.
for name in summarizers:
    latency_col = f"summary_{name}_latency"

    # The column only exists once the generation loop has been set up for this
    # summarizer; report that instead of failing with a KeyError.
    if latency_col not in df.columns:
        print(f"{name} - no latency column found")
        continue

    # Coerce to numeric so object-typed columns (e.g. initialised with None)
    # are aggregated as floats; non-numeric entries become NaN and are dropped.
    latencies = pd.to_numeric(df[latency_col], errors="coerce").dropna()
    if latencies.empty:
        print(f"{name} - no latency measurements recorded")
        continue

    # Latencies are stored in seconds (time.perf_counter deltas); report in ms.
    latencies = latencies * 1000
    print(f"{name} - Mean Latency: {latencies.mean():.4f}ms, Std: {latencies.std():.4f}ms, Min: {latencies.min():.4f}ms, Max: {latencies.max():.4f}ms")

random - Mean Latency: 7.7135ms, Std: 19.6819ms, Min: 1.0364ms, Max: 199.1679ms
lsa - Mean Latency: 42.2017ms, Std: 58.2229ms, Min: 1.1941ms, Max: 657.9025ms
text_rank - Mean Latency: 26.0111ms, Std: 29.2877ms, Min: 0.9965ms, Max: 302.5229ms
spacy_textrank - Mean Latency: 202.9161ms, Std: 176.3273ms, Min: 0.5635ms, Max: 1338.0581ms
sentence_transformer_mpnet - Mean Latency: 4164.7493ms, Std: 3483.3543ms, Min: 0.2728ms, Max: 15013.8744ms
sentence_transformer_distiluse - Mean Latency: 248.8491ms, Std: 220.1000ms, Min: 0.2607ms, Max: 1466.2590ms
claude_haiku_3_5 - Mean Latency: 2474.0552ms, Std: 633.0200ms, Min: 1240.2865ms, Max: 4203.4304ms
claude_sonnet_4 - Mean Latency: 3547.7101ms, Std: 623.1898ms, Min: 1851.3053ms, Max: 5852.1279ms
