# Setup

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import dill as pkl
import nltk
# Fetch the NLTK resources this notebook requests, in a fixed order.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

from eval_funcs import (
        perplexity_for_corpora,
        wasserstein_distance_embeddings,
        classify_real_vs_synth,
        compute_stat_properties,
        compute_opt_embeddings
    )

from coherence_utils import compute_topic_coherence

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\loren\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\loren\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Import Real Dataset

In [2]:
import kagglehub

# Download the latest version of the dataset; `path` receives the value
# returned by kagglehub (printed below as the location of the dataset files).
dataset_handle = "gowrishankarp/newspaper-text-summarization-cnn-dailymail"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: C:\Users\loren\.cache\kagglehub\datasets\gowrishankarp\newspaper-text-summarization-cnn-dailymail\versions\2


In [3]:
# Build the CSV location from the directory returned by kagglehub in the
# previous cell instead of a hard-coded, user-specific absolute path, so the
# notebook runs on any machine / OS after "Restart & Run All".
train_csv_path = Path(path) / 'cnn_dailymail' / 'train.csv'
cnn_train = pd.read_csv(train_csv_path)

In [4]:
# Preview the first two rows to confirm the frame loaded with the expected
# columns (id, article, highlights).
cnn_train.head(2)

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...


# Run Eval Functions

### Running Statistical Properties Metrics on the Real CNN/DailyMail Data

In [7]:
# Average every statistic returned by compute_stat_properties over ten
# independently seeded 1,000-article samples (seeds 0..9).
agg_stats = {}
for seed in range(10):
    article_sample = cnn_train['article'].sample(1000, random_state=seed)
    run_stats = compute_stat_properties(article_sample, max_length=4096)
    # Collect each metric's per-seed value under its key.
    for metric, value in run_stats.items():
        agg_stats.setdefault(metric, []).append(value)

# Collapse each list of per-seed values to its mean.
agg_stats = {metric: np.mean(values) for metric, values in agg_stats.items()}
agg_stats

{'avg_len_tokens': 866.3100036621094,
 'std_len_tokens': 413.8068115234375,
 'avg_len_chars': 4027.174399999999,
 'ttr': 0.036102450097657626,
 'hapax_ratio': 0.22892320829817744}

### Running Coherence Metrics

In [8]:
# Compute topic coherence on a reproducible 1,000-article sample
# (num_topics=15, seed 42 for both the sample and the coherence routine).
coherence_result = compute_topic_coherence(
        cnn_train['article'].sample(1000, random_state=42),
        sample_size=1000,
        num_topics=15,
        random_seed=42
    )

# Make sure the output folder exists before writing; a bare open(..., 'wb')
# raises FileNotFoundError on a fresh checkout where `pickles/` is absent.
# NOTE(review): the filename says "tinyllama" but the input here is the real
# CNN/DailyMail articles -- confirm this does not overwrite another result.
coherence_pickle_path = Path('pickles/tinyllama_coherence_result.pkl')
coherence_pickle_path.parent.mkdir(parents=True, exist_ok=True)
with coherence_pickle_path.open('wb') as f:
        pkl.dump(coherence_result, f)

Processing texts: 100%|██████████| 1000/1000 [00:02<00:00, 457.39it/s]


In [9]:
# Display the result dictionary returned by compute_topic_coherence.
coherence_result

{'coherence_score': 0.30041111187314246,
 'model_info': {'num_topics': 15, 'dictionary_size': 7691, 'num_docs': 1000},
 'data_info': {'input_texts': 1000, 'processed_texts': 1000, 'sampled': True},
 'parameters': {'sample_size': 1000,
  'no_below': 5,
  'no_above': 0.5,
  'num_passes': 5,
  'random_seed': 42},
 'timestamp': '2025-11-02 19:54:34'}