# Setup

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from eval_funcs import (
        perplexity_for_corpora,
        wasserstein_distance_embeddings,
        classify_real_vs_synth,
        compute_stat_properties
    )

# Import Real Dataset

In [2]:
import kagglehub

# Fetch the CNN/DailyMail summarization dataset via kagglehub and keep the
# returned local location in `path`.
path = kagglehub.dataset_download(
    "gowrishankarp/newspaper-text-summarization-cnn-dailymail"
)

print("Path to dataset files:", path)

Path to dataset files: C:\Users\loren\.cache\kagglehub\datasets\gowrishankarp\newspaper-text-summarization-cnn-dailymail\versions\2


In [3]:
# Build the CSV location from the directory kagglehub returned in the download
# cell (`path`) rather than a hard-coded, user-specific absolute Windows path,
# so the notebook runs on other machines/OSes and does not silently pin the
# dataset to one cached version directory.
cnn_train_csv = Path(path) / 'cnn_dailymail' / 'train.csv'
cnn_train = pd.read_csv(cnn_train_csv)

In [4]:
# Preview the first two rows of the real (CNN/DailyMail) training frame.
cnn_train.head(2)

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...


# Import TinyLlama Synthetic Dataset

In [5]:
# Load the TinyLlama generations and discard the 'Unnamed: 0' column that the
# CSV carries alongside the real fields.
tinyllama_data = (
    pd.read_csv('../tinyllama_outputs.csv')
    .drop(columns='Unnamed: 0')
)

In [6]:
# Preview the first two rows of the synthetic (TinyLlama) frame.
tinyllama_data.head(2)

Unnamed: 0,uuid,topic,generated_article,elapsed_time
0,00b75438-5d31-4f38-add3-f873c44fdb65,Politics - Elections,"Fearing a catastrophic defeat at the polls, US...",5.364199
1,3a1296a5-83f0-4945-a105-74a1117758b6,World - Middle East,Bombarded by violent attacks in mid-East: CNN ...,4.014536


In [7]:
# Number of generated articles available (shape of the text column).
tinyllama_data['generated_article'].shape

(1000,)

In [8]:
# Length of the longest generated article, counted in single-space-separated
# tokens. `.str.len()` is the vectorized replacement for the per-row `len`
# lambda; it yields NaN for a missing article (which `max` skips) instead of
# failing on it.
tinyllama_data['generated_article'].str.split(' ').str.len().max()

2396

# Run Eval Functions

### Running Statistical Properties Metrics on TinyLlama Data

In [9]:
# Statistical properties of the synthetic articles, using a 4096 max_length.
stats = compute_stat_properties(
    tinyllama_data['generated_article'],
    max_length=4096,
)

In [10]:
# Display the result returned by compute_stat_properties for the synthetic set.
stats

{'avg_len_tokens': 825.7349853515625,
 'std_len_tokens': 352.1277160644531,
 'avg_len_chars': 4039.864,
 'ttr': 0.031341774298049616,
 'hapax_ratio': 0.26866306027820713}

### Running Perplexity Scores

In [11]:
# Perplexity of a fixed-seed 1000-article real sample vs. the synthetic set.
ppl = perplexity_for_corpora(
    cnn_train['article'].sample(n=1000, random_state=42),
    tinyllama_data['generated_article'],
    batch_size=8,
    max_length=2048,
)

[perplexity] device=cuda batch_size=8 max_length(requested)=2048 max_length(effective)=2048
[perplexity] num_docs: real=1000 synthetic=1000 total_batches=250


  attn_output = torch.nn.functional.scaled_dot_product_attention(


[perplexity] progress 5/250 | elapsed=5.3s | avg/batch=1.05s | ETA~258.3s
[perplexity] progress 10/250 | elapsed=10.6s | avg/batch=1.06s | ETA~253.8s
[perplexity] progress 15/250 | elapsed=16.4s | avg/batch=1.09s | ETA~257.2s
[perplexity] progress 20/250 | elapsed=21.5s | avg/batch=1.07s | ETA~247.2s
[perplexity] progress 25/250 | elapsed=28.7s | avg/batch=1.15s | ETA~258.7s
[perplexity] progress 30/250 | elapsed=32.7s | avg/batch=1.09s | ETA~239.9s
[perplexity] progress 35/250 | elapsed=37.8s | avg/batch=1.08s | ETA~232.5s
[perplexity] progress 40/250 | elapsed=43.1s | avg/batch=1.08s | ETA~226.1s
[perplexity] progress 45/250 | elapsed=48.3s | avg/batch=1.07s | ETA~220.1s
[perplexity] progress 50/250 | elapsed=55.2s | avg/batch=1.10s | ETA~220.8s
[perplexity] progress 55/250 | elapsed=62.8s | avg/batch=1.14s | ETA~222.8s
[perplexity] progress 60/250 | elapsed=68.0s | avg/batch=1.13s | ETA~215.3s
[perplexity] progress 65/250 | elapsed=74.5s | avg/batch=1.15s | ETA~212.0s
[perplexity] p

In [12]:
# Display the perplexity result (one 'corpus_ppl' entry each for 'real' and
# 'synthetic').
ppl

{'real': {'corpus_ppl': 20.093418953077368},
 'synthetic': {'corpus_ppl': 15.954454026445283}}

### Running Wasserstein Distance

In [13]:
# Wasserstein distance between embeddings of the same fixed-seed real sample
# and the synthetic articles, using 128 projections.
wd = wasserstein_distance_embeddings(
    cnn_train['article'].sample(n=1000, random_state=42),
    tinyllama_data['generated_article'],
    n_projections=128,
)

[embed:real] device=cuda batch_size=8 max_length(req)=2048 max_length(eff)=2048 num_docs=1000
[embed:real] progress 5/125 | elapsed=2.7s | avg/batch=0.53s | ETA~63.6s
[embed:real] progress 10/125 | elapsed=5.4s | avg/batch=0.54s | ETA~61.9s
[embed:real] progress 15/125 | elapsed=8.5s | avg/batch=0.57s | ETA~62.2s
[embed:real] progress 20/125 | elapsed=11.0s | avg/batch=0.55s | ETA~58.0s
[embed:real] progress 25/125 | elapsed=14.0s | avg/batch=0.56s | ETA~56.0s
[embed:real] progress 30/125 | elapsed=16.4s | avg/batch=0.55s | ETA~51.8s
[embed:real] progress 35/125 | elapsed=19.0s | avg/batch=0.54s | ETA~48.9s
[embed:real] progress 40/125 | elapsed=21.7s | avg/batch=0.54s | ETA~46.2s
[embed:real] progress 45/125 | elapsed=24.4s | avg/batch=0.54s | ETA~43.3s
[embed:real] progress 50/125 | elapsed=27.6s | avg/batch=0.55s | ETA~41.5s
[embed:real] progress 55/125 | elapsed=30.8s | avg/batch=0.56s | ETA~39.2s
[embed:real] progress 60/125 | elapsed=33.5s | avg/batch=0.56s | ETA~36.3s
[embed:rea

In [14]:
# Display the Wasserstein result: a dict with 'mean_distance' and a 'distances'
# list (presumably one value per projection — confirm in eval_funcs).
# NOTE(review): this prints the whole list; wd['mean_distance'] is the headline number.
wd

{'mean_distance': 0.18677199570207928,
 'distances': [0.03386680243086891,
  0.13545763071156314,
  0.05560612915598394,
  0.11469221387437938,
  0.16262268942037883,
  0.07505124159315679,
  0.17207024716239072,
  0.14755202405419743,
  0.29010557784006163,
  0.22645292630094083,
  0.15862075507372556,
  0.3991889076570622,
  0.01349262700678944,
  0.03022434033998387,
  0.5015110460533206,
  0.22782751710933058,
  0.06646386675173015,
  0.32162927963660026,
  0.05530110350177157,
  0.12391061870170042,
  0.15335110849796013,
  0.24391709586479415,
  0.07359275165058485,
  0.09087225088091751,
  0.2608741821658789,
  0.09904382957606749,
  0.027911155972345975,
  0.10393881749158124,
  0.26968685000376047,
  0.09211645534357973,
  0.06583862784712036,
  0.04253818242350693,
  0.2785969844820127,
  0.18767998163979066,
  0.3047907937181493,
  0.05931221067392144,
  0.21785647935635039,
  0.26609346102297105,
  0.3155713033517564,
  0.03842402754561618,
  0.15832716226575327,
  0.191912

### Running Classification (Real vs. Synthetic)

In [15]:
# Real-vs-synthetic classification on the same fixed-seed real sample and the
# synthetic articles, with cv=5.
clf_res = classify_real_vs_synth(
    cnn_train['article'].sample(n=1000, random_state=42),
    tinyllama_data['generated_article'],
    cv=5,
)

[classify] batch_size=8 max_length=2048 cv=5 Cs=[0.1, 0.5, 1.0, 2.0, 5.0]
[embed:real] device=cuda batch_size=8 max_length(req)=2048 max_length(eff)=2048 num_docs=1000
[embed:real] progress 5/125 | elapsed=2.8s | avg/batch=0.55s | ETA~66.5s
[embed:real] progress 10/125 | elapsed=5.6s | avg/batch=0.56s | ETA~64.3s
[embed:real] progress 15/125 | elapsed=8.8s | avg/batch=0.59s | ETA~64.5s
[embed:real] progress 20/125 | elapsed=11.4s | avg/batch=0.57s | ETA~60.0s
[embed:real] progress 25/125 | elapsed=14.4s | avg/batch=0.58s | ETA~57.7s
[embed:real] progress 30/125 | elapsed=16.9s | avg/batch=0.56s | ETA~53.4s
[embed:real] progress 35/125 | elapsed=19.6s | avg/batch=0.56s | ETA~50.3s
[embed:real] progress 40/125 | elapsed=22.3s | avg/batch=0.56s | ETA~47.5s
[embed:real] progress 45/125 | elapsed=25.0s | avg/batch=0.56s | ETA~44.5s
[embed:real] progress 50/125 | elapsed=28.4s | avg/batch=0.57s | ETA~42.6s
[embed:real] progress 55/125 | elapsed=31.6s | avg/batch=0.57s | ETA~40.2s
[embed:real

In [16]:
# Display the classification result: 'metrics', the per-class 'report',
# 'embeddings_shape' and the fitted 'classifier' pipeline.
# NOTE(review): every score in the recorded output is 1.0 — worth checking
# whether the classifier is separating on formatting/length artifacts.
clf_res

{'metrics': {'accuracy': 1.0, 'macro_f1': 1.0, 'roc_auc': 1.0},
 'report': {'real': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 200.0},
  'synthetic': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 200.0},
  'accuracy': 1.0,
  'macro avg': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 400.0},
  'weighted avg': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 400.0}},
 'embeddings_shape': (2000, 768),
 'classifier': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('logisticregressioncv',
                  LogisticRegressionCV(Cs=[0.1, 0.5, 1.0, 2.0, 5.0], cv=5,
                                       max_iter=5000, n_jobs=-1,
                                       scoring='roc_auc', solver='saga'))])}

# Visualize?