# Evaluations

Aggregate the compression and quality metrics that motivated the focus on short prompts. These cells compute compression ratios, ROUGE scores, and token length distributions for the generated datasets.

In [1]:
import csv
from collections import Counter
from pathlib import Path

# Parent of the current working directory, resolved to an absolute path.
PROJECT_ROOT = Path('..').resolve()
# NOTE(review): the base dataset is read from <root>/training_data while the
# two filtered subsets are read from <root>/src/training_data — confirm that
# this split is intentional.
BASE_DATASET = PROJECT_ROOT.joinpath('training_data', 'dolly-summarization-data-rouge.csv')
SHORT_DATASET = PROJECT_ROOT.joinpath('src', 'training_data', 'dolly-short-prompt-compression.csv')
VERY_SHORT_DATASET = PROJECT_ROOT.joinpath('src', 'training_data', 'dolly-very-short-prompt-compression.csv')


def load_csv(path: Path):
    """Read the UTF-8 CSV file at *path* and return its data rows.

    The first line of the file is taken as the header; every following
    record becomes one dict keyed by those header names, with all values
    left as strings.
    """
    # newline='' hands line-ending handling to the csv module, so quoted
    # fields containing embedded newlines are parsed correctly.
    with path.open(mode='r', encoding='utf-8', newline='') as handle:
        records = list(csv.DictReader(handle))
    return records


def token_value(row, key):
    """Return the integer stored under *key* in *row*.

    A missing key, ``None``, or an empty string yields 0. Any other value is
    passed to ``int()``, which raises ``ValueError`` for non-integer text.
    """
    raw = row.get(key, '')
    if not raw:
        return 0
    return int(raw)


def summarize_compression(rows, label: str):
    """Print and return the overall compressed/original token ratio of *rows*.

    Token counts are read from the ``original_token_count`` and
    ``compressed_token_count`` columns via ``token_value``. The ratio is
    computed over the column totals, not as a mean of per-row ratios.

    Returns the ratio as a float, or ``None`` (after printing a notice) when
    the original token total is zero.
    """

    def column_total(column):
        return sum(token_value(record, column) for record in rows)

    original_sum = column_total('original_token_count')
    compressed_sum = column_total('compressed_token_count')

    # Guard against division by zero (empty dataset or missing column).
    if original_sum == 0:
        print(f'{label}: no token counts available')
        return None

    ratio = compressed_sum / original_sum
    print(f'{label}: {len(rows)} rows | tokens: {original_sum} → {compressed_sum} (ratio={ratio:.4f})')
    return ratio


def summarize_rouge(rows):
    """Print and return the mean ROUGE-1, ROUGE-2 and ROUGE-L scores of *rows*.

    The ROUGE columns are considered present when the first row has a
    ``rouge_1`` key. Within a column, missing, ``None`` and empty cells
    count as 0 and still contribute to the row count in the denominator.

    Returns a ``(rouge_1, rouge_2, rouge_l)`` tuple of floats, or ``None``
    (after printing a notice) when *rows* is empty or lacks the columns.
    """
    has_rouge = bool(rows) and 'rouge_1' in rows[0]
    if not has_rouge:
        print('ROUGE columns not present in this dataset.')
        return None

    row_count = len(rows)

    def column_mean(column):
        return sum(float(record.get(column) or 0) for record in rows) / row_count

    r1 = column_mean('rouge_1')
    r2 = column_mean('rouge_2')
    rl = column_mean('rouge_l')
    print(f'Average ROUGE-1: {r1:.4f} | ROUGE-2: {r2:.4f} | ROUGE-L: {rl:.4f}')
    return r1, r2, rl


def bucket_lengths(rows, label: str):
    """Print a histogram of ``original_token_count`` values for *rows*.

    Rows whose ``original_token_count`` cell is missing or empty are skipped.
    Each remaining count is placed in one of the half-open ranges
    ``(start, end]`` defined by consecutive bucket edges, in a ``≤0`` bucket
    when it does not exceed the first edge, or in a ``>512`` bucket when it
    exceeds the last edge. Only non-empty buckets are printed, always in
    ascending order of token count.

    Prints ``'<label>: no rows'`` and returns when no row carries a count.
    Returns ``None`` in every case.
    """
    from bisect import bisect_left

    counts = [token_value(row, 'original_token_count') for row in rows if row.get('original_token_count')]
    if not counts:
        print(f'{label}: no rows')
        return

    buckets = [0, 16, 32, 48, 64, 96, 128, 160, 256, 512]
    # One label per slot, already in ascending order: an underflow slot, one
    # slot per (start, end] range, and an overflow slot. Printing walks this
    # list, so the display order never depends on how the label text sorts.
    labels = [f'≤{buckets[0]}']
    labels.extend(f'{start + 1}-{end}' for start, end in zip(buckets, buckets[1:]))
    labels.append(f'>{buckets[-1]}')

    counter = Counter()
    for value in counts:
        # bisect_left counts the edges strictly below value: 0 means the
        # value is at or below the first edge, len(buckets) means it is above
        # the last edge, and i in between means buckets[i-1] < value <= buckets[i].
        counter[labels[bisect_left(buckets, value)]] += 1

    print(f'{label} token distribution:')
    for bucket in labels:
        if counter[bucket]:
            print(f'  {bucket}: {counter[bucket]}')

# Load the full dataset and the two length-filtered subsets up front.
base_rows = load_csv(BASE_DATASET)
short_rows = load_csv(SHORT_DATASET)
very_short_rows = load_csv(VERY_SHORT_DATASET)

# Full dataset: compression ratio, mean ROUGE scores, length histogram.
full_label = 'Full synthetic dataset'
summarize_compression(base_rows, full_label)
summarize_rouge(base_rows)
bucket_lengths(base_rows, full_label)

# Filtered subsets: compression ratio and length histogram only.
print('Filtered subsets:')
for subset_rows, subset_label in (
    (short_rows, '≤128 tokens'),
    (very_short_rows, '≤64 tokens'),
):
    summarize_compression(subset_rows, subset_label)
    bucket_lengths(subset_rows, subset_label)

Full synthetic dataset: 14779 rows | tokens: 236923 → 177070 (ratio=0.7474)
Average ROUGE-1: 0.7220 | ROUGE-2: 0.5207 | ROUGE-L: 0.6755
Full synthetic dataset token distribution:
  1-16: 10807
  >512: 7
  17-32: 2934
  33-48: 663
  49-64: 205
  65-96: 112
  97-128: 22
  129-160: 8
  161-256: 9
  257-512: 12
Filtered subsets:
≤128 tokens: 14739 rows | tokens: 247170 → 174772 (ratio=0.7071)
≤128 tokens token distribution:
  1-16: 9766
  17-32: 3641
  33-48: 844
  49-64: 263
  65-96: 189
  97-128: 36
≤64 tokens: 14514 rows | tokens: 228794 → 160613 (ratio=0.7020)
≤64 tokens token distribution:
  1-16: 9766
  17-32: 3641
  33-48: 844
  49-64: 263
