In [3]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from coherence_utils import compute_topic_coherence
from eval_funcs import (
    perplexity_for_corpora,
    wasserstein_distance_embeddings,
    classify_real_vs_synth,
    compute_stat_properties
)

In [1]:
import kagglehub

# Fetch the CNN/DailyMail summarization dataset through kagglehub. The handle
# carries no version suffix, so the latest published version is requested.
path = kagglehub.dataset_download(
    "gowrishankarp/newspaper-text-summarization-cnn-dailymail"
)

# Report where the dataset files were placed; `path` is reused by later cells
# to locate the CSV files.
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/gowrishankarp/newspaper-text-summarization-cnn-dailymail?dataset_version_number=2...
Downloading from https://www.kaggle.com/api/v1/datasets/download/gowrishankarp/newspaper-text-summarization-cnn-dailymail?dataset_version_number=2...


100%|██████████| 503M/503M [00:20<00:00, 25.6MB/s] 

Extracting files...





Path to dataset files: C:\Users\rynoc\.cache\kagglehub\datasets\gowrishankarp\newspaper-text-summarization-cnn-dailymail\versions\2


In [4]:
# Load train.csv of the CNN/DailyMail dataset from the kagglehub download
# directory. The file location is assembled with pathlib rather than by
# concatenating a hard-coded '/' onto `path`, so the separators stay
# consistent with the OS-native directory returned by kagglehub.
cnn_train = pd.read_csv(Path(path) / "cnn_dailymail" / "train.csv")

# Preview the first two rows (the recorded output shows the columns
# id, article and highlights).
cnn_train.head(2)

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...


In [5]:
# Read the generated articles from gemma_1000_outputs.csv and discard the
# 'Unnamed: 0' column.
# NOTE(review): presumably that column is an index written by an earlier
# to_csv call — confirm.
gemma_data = (
    pd.read_csv('../gemma_1000_outputs.csv')
    .drop(columns='Unnamed: 0')
)

# Preview the first two rows.
gemma_data.head(2)

Unnamed: 0,uuid,topic,generated_article,elapsed_time
0,278aa54d-0ed8-4650-886e-99f285cc78e5,World - United Kingdom,The shocking incident occurred in the early ho...,23.479649
1,767d682b-5fde-4186-b3d9-e2de1f8bba8b,World - Middle East,Thousands of Palestinian protesters clashed wi...,17.532754


In [None]:
# Statistical properties of the generated articles. The recorded output is a
# dict with the keys avg_len_tokens, std_len_tokens, avg_len_chars, ttr and
# hapax_ratio.
stats = compute_stat_properties(
    gemma_data['generated_article']
)

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

{'avg_len_tokens': 417.7130126953125,
 'std_len_tokens': 49.771141052246094,
 'avg_len_chars': 2094.407,
 'ttr': 0.04242625917795233,
 'hapax_ratio': 0.29528269946958585}

In [8]:
# Perplexity over a fixed-seed sample of 1000 CNN/DailyMail articles and over
# the generated articles.
# NOTE(review): the recorded run (device=cpu) ended in KeyboardInterrupt, so
# `ppl` may be undefined in that session — rerun before using it downstream.
ppl = perplexity_for_corpora(
    cnn_train['article'].sample(1000, random_state=42),
    gemma_data['generated_article'],
    batch_size=8,
    max_length=2048,
)

pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

[perplexity] device=cpu batch_size=8 max_length(requested)=2048 max_length(effective)=2048
[perplexity] num_docs: real=1000 synthetic=1000 total_batches=250
[perplexity] num_docs: real=1000 synthetic=1000 total_batches=250


model.safetensors:   0%|          | 0.00/251M [00:00<?, ?B/s]

[perplexity] progress 5/250 | elapsed=49.7s | avg/batch=9.93s | ETA~2433.2s
[perplexity] progress 10/250 | elapsed=108.0s | avg/batch=10.80s | ETA~2591.4s
[perplexity] progress 10/250 | elapsed=108.0s | avg/batch=10.80s | ETA~2591.4s
[perplexity] progress 15/250 | elapsed=172.4s | avg/batch=11.49s | ETA~2701.2s
[perplexity] progress 15/250 | elapsed=172.4s | avg/batch=11.49s | ETA~2701.2s


KeyboardInterrupt: 

In [9]:
# Wasserstein distance between embeddings of a fixed-seed sample of 1000
# CNN/DailyMail articles and the generated articles, using 128 projections.
# NOTE(review): the recorded run ended in KeyboardInterrupt, so `wd` may be
# undefined in that session — rerun before using it downstream.
wd = wasserstein_distance_embeddings(
    cnn_train['article'].sample(1000, random_state=42),
    gemma_data['generated_article'],
    n_projections=128,
)

[embed:real] device=cpu batch_size=8 max_length(req)=2048 max_length(eff)=2048 num_docs=1000


KeyboardInterrupt: 

In [None]:
# Real-vs-synthetic classification on a fixed-seed sample of 1000
# CNN/DailyMail articles against the generated articles, with cv=5.
# NOTE(review): this cell has no execution count in the saved notebook, so
# `clf_res` was presumably never computed in that session — confirm.
clf_res = classify_real_vs_synth(
    cnn_train['article'].sample(1000, random_state=42),
    gemma_data['generated_article'],
    cv=5,
)

## Evaluation Metrics

## Coherence Evaluation

In [11]:

# Topic coherence of the generated articles: 15 topics, a sample size of 1000
# and a fixed random seed for repeatability.
coherence_result = compute_topic_coherence(
    gemma_data['generated_article'],
    sample_size=1000,
    num_topics=15,
    random_seed=42,
)

Processing texts: 100%|██████████| 1000/1000 [00:01<00:00, 934.80it/s]


In [None]:
# Print each metric computed by the earlier cells under its own heading.
# The prints stay separate statements so that a metric that was never
# computed raises NameError only when its own line is reached.
print("Statistical Properties:\n", stats)
print("\nPerplexity:", ppl)
print("\nWasserstein Distance:", wd)
print("\nClassification Results:\n", clf_res)
print("\nTopic Coherence:\n", coherence_result)

# Gather the same metrics into a one-row DataFrame, one column per metric,
# with every value wrapped in a single-element list.
# NOTE(review): this frame is meant to be saved to CSV, but no write call is
# visible in this part of the notebook — confirm that one follows.
gemma_evaluation_results = pd.DataFrame({
    column: [value]
    for column, value in (
        ('Statistical Properties', stats),
        ('Perplexity', ppl),
        ('Wasserstein Distance', wd),
        ('Classification Results', clf_res),
        ('Topic Coherence', coherence_result),
    )
})