# Setup

In [None]:
import pandas as pd
import numpy as np

from eval_funcs import (
        perplexity_for_corpora,
        wasserstein_distance_embeddings,
        classify_real_vs_synth,
        compute_stat_properties
    )

# Import Real Dataset

In [None]:
import kagglehub

# Kaggle handle of the real CNN/DailyMail summarization dataset.
DATASET_HANDLE = "gowrishankarp/newspaper-text-summarization-cnn-dailymail"

# Fetch the latest version of the dataset and keep the value kagglehub returns
# in `path` so it can be inspected (and reused) below.
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

In [None]:
from pathlib import Path

# Load the training split of the real dataset.
# The location is derived from `path` (returned by kagglehub.dataset_download in the
# cell above) instead of a hard-coded, user-specific absolute path pinned to one
# dataset version, so the notebook runs on any machine / cache location.
cnn_train = pd.read_csv(Path(path) / 'cnn_dailymail' / 'train.csv')

In [None]:
# Preview the first two rows of the real dataset.
cnn_train.head(2)

# Import Qwen Synthetic Dataset

In [None]:
# Read the synthetic articles and discard the 'Unnamed: 0' column stored in the CSV.
qwen_raw = pd.read_csv('../qwen_outputs.csv')
qwen_data = qwen_raw.drop(columns='Unnamed: 0')

In [None]:
# Preview the first two rows of the synthetic dataset.
qwen_data.head(2)

In [None]:
# Shape of the 'generated_article' column, i.e. how many synthetic rows were loaded.
qwen_data['generated_article'].shape

In [None]:
# Longest generated article, measured in pieces obtained by splitting on single spaces.
# `.str.len()` replaces the per-element Python lambda: it is vectorised and yields NaN
# (which `.max()` skips) for a missing article instead of raising TypeError on len(nan).
qwen_data['generated_article'].str.split(' ').str.len().max()

# Run Eval Functions

### Running Statistical Properties Metrics on Qwen Data

In [None]:
# Statistical-property metrics for the synthetic articles only.
synthetic_articles = qwen_data['generated_article']
stats = compute_stat_properties(
    synthetic_articles,
    max_length=4096,
)

In [None]:
# Display the value returned by compute_stat_properties.
stats

### Running Perplexity Scores

In [None]:
# Compare a fixed, seeded sample of 1000 real articles against all synthetic articles.
real_sample = cnn_train['article'].sample(1000, random_state=42)
ppl = perplexity_for_corpora(
    real_sample,
    qwen_data['generated_article'],
    batch_size=8,
    max_length=2048,
)

In [None]:
# Display the value returned by perplexity_for_corpora.
ppl

### Running Wasserstein Distance

In [None]:
# Same seeded 1000-article real sample as the other metrics, versus all synthetic articles.
real_sample = cnn_train['article'].sample(1000, random_state=42)
wd = wasserstein_distance_embeddings(
    real_sample,
    qwen_data['generated_article'],
    n_projections=128,
)

In [None]:
# Display the value returned by wasserstein_distance_embeddings.
wd

### Running Classification (Real vs. Synthetic)

In [None]:
# Real-vs-synthetic classification on the seeded 1000-article real sample and all
# synthetic articles; cv=5 is forwarded to classify_real_vs_synth.
real_sample = cnn_train['article'].sample(1000, random_state=42)
clf_res = classify_real_vs_synth(
    real_sample,
    qwen_data['generated_article'],
    cv=5,
)

In [None]:
# Display the value returned by classify_real_vs_synth.
clf_res

# Data Quality Visualizations

Article Length

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Seeded sample of 1000 real articles versus every synthetic article.
real_texts = cnn_train['article'].sample(1000, random_state=42)
synth_texts = qwen_data['generated_article']

# Article length = number of whitespace-separated pieces.
len_real = real_texts.str.split().str.len()
len_synth = synth_texts.str.split().str.len()

# Overlay both length densities on one axis; the KDE is clipped to [0, 4000].
fig, ax = plt.subplots(figsize=(10, 6))
for lengths, series_label in (
    (len_real, 'Real (CNN)'),
    (len_synth, 'Synthetic (Qwen)'),
):
    sns.kdeplot(lengths, label=series_label, fill=True, alpha=0.5, clip=(0, 4000), ax=ax)

ax.set_title('Distribution of Article Lengths (in Tokens)', fontsize=16)
ax.set_xlabel('Article Length')
ax.set_ylabel('Density')
ax.legend()
fig.savefig('article_length_distribution.png')
plt.show();

Perplexity

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Corpus-level perplexity for each corpus, read from the `ppl` result computed above.
# The synthetic bar is labelled "Qwen" to match the data actually scored (qwen_data)
# and the labels used in the other figures; it previously said "TinyLlama".
ppl_scores = {
    'Real (CNN)': ppl['real']['corpus_ppl'],
    'Synthetic (Qwen)': ppl['synthetic']['corpus_ppl'],
}

# One bar per corpus.
fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(x=list(ppl_scores.keys()), y=list(ppl_scores.values()), ax=ax)

ax.set_title('Perplexity Score Comparison', fontsize=16)
ax.set_ylabel('Perplexity (Lower is Better)')
fig.savefig('perplexity_comparison.png')
plt.show();

Wasserstein Distance

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

mean_dist = wd['mean_distance']

# The histogram needs the individual distances, not the mean: the original cell
# assigned wd['mean_distance'] to both variables, so it plotted a single number.
# NOTE(review): the return schema of wasserstein_distance_embeddings is not visible
# here. This assumes `wd` is dict-like and takes the first 1-D sequence with more
# than one element as the per-projection distances — confirm the actual key and
# index it directly.
distances = None
for key, value in wd.items():
    if key != 'mean_distance' and np.ndim(value) == 1 and np.size(value) > 1:
        distances = np.asarray(value, dtype=float)
        break

if distances is None:
    # Fall back to the original behaviour (mean only) rather than failing.
    print("No per-projection distances found in `wd`; plotting the mean only.")
    distances = np.atleast_1d(mean_dist).astype(float)

fig, ax = plt.subplots(figsize=(10, 6))
# A KDE is meaningless for a single observation, so only draw it for real distributions.
sns.histplot(distances, bins=30, kde=len(distances) > 1, ax=ax)

ax.axvline(mean_dist, color='red', linestyle='--', label=f'Mean Distance: {mean_dist:.3f}')
ax.set_title('Distribution of Wasserstein Distances (128 Projections)', fontsize=16)
ax.set_xlabel('Wasserstein Distance')
ax.set_ylabel('Frequency')
ax.legend()
fig.savefig('wasserstein_distance_distribution.png')
plt.show();

Classifier: t-SNE Plot of Real vs. Synthetic Embeddings

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import numpy as np
import pandas as pd

# compute_opt_embeddings is not imported anywhere in the visible notebook, so this cell
# raised NameError on a fresh kernel.
# NOTE(review): assumed to be exported by eval_funcs alongside the other eval helpers —
# confirm the module.
from eval_funcs import compute_opt_embeddings

# Same seeded 1000-article real sample used for the metrics, versus all synthetic articles.
real_texts = cnn_train['article'].sample(1000, random_state=42)
synth_texts = qwen_data['generated_article']

print("Computing embeddings for REAL texts (for t-SNE)...")
Er = compute_opt_embeddings(
    real_texts,
    batch_size=8,
    max_length=2048,
    verbose=True,
    label="real_tsne"
)

print("\nComputing embeddings for SYNTHETIC texts (for t-SNE)...")
Es = compute_opt_embeddings(
    synth_texts,
    batch_size=8,
    max_length=2048,
    verbose=True,
    label="synth_tsne"
)

# Stack real rows first, then synthetic rows; `labels` follows the same order.
embeddings = np.concatenate([Er, Es], axis=0)
labels = ['Real (CNN)'] * len(Er) + ['Synthetic (Qwen)'] * len(Es)

print(f"\nRunning t-SNE on {embeddings.shape[0]} embeddings")

# Project the embeddings down to 2 dimensions.
# The iteration count is left at the library default (1000, the value previously passed):
# the `n_iter` keyword was renamed to `max_iter` in newer scikit-learn, so naming either
# one breaks on some installed versions.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
tsne_results = tsne.fit_transform(embeddings)

# Long-form frame for seaborn: one row per article with its 2-D coordinates and origin.
df_tsne = pd.DataFrame({
    'tsne_1': tsne_results[:, 0],
    'tsne_2': tsne_results[:, 1],
    'label': labels
})

fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x='tsne_1', y='tsne_2',
    hue='label',
    palette=sns.color_palette("hls", 2),
    data=df_tsne,
    alpha=0.7,
    ax=ax
)

ax.set_title('t-SNE Projection of Real vs. Synthetic Embeddings (OPT-125m)', fontsize=16)
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.legend(loc='best')
fig.savefig('tsne_embeddings.png')
plt.show();