# Evaluations

Aggregate the compression and quality metrics that motivated the focus on short prompts. These cells compute compression ratios, ROUGE scores, and token length distributions for the generated datasets.

In [None]:
import csv
from collections import Counter
from pathlib import Path

PROJECT_ROOT = Path('..').resolve()
BASE_DATASET = PROJECT_ROOT / 'training_data' / 'dolly-summarization-data-rouge.csv'
SHORT_DATASET = PROJECT_ROOT / 'src' / 'training_data' / 'dolly-short-prompt-compression.csv'
VERY_SHORT_DATASET = PROJECT_ROOT / 'src' / 'training_data' / 'dolly-very-short-prompt-compression.csv'


def load_csv(path: Path):
    with path.open('r', encoding='utf-8', newline='') as fh:
        reader = csv.DictReader(fh)
        return list(reader)


def token_value(row, key):
    value = row.get(key, '')
    return int(value) if value else 0


def summarize_compression(rows, label: str):
    total_original = sum(token_value(row, 'original_token_count') for row in rows)
    total_compressed = sum(token_value(row, 'compressed_token_count') for row in rows)
    if not total_original:
        print(f'{label}: no token counts available')
        return None
    ratio = total_compressed / total_original
    print(f'{label}: {len(rows)} rows | tokens: {total_original} → {total_compressed} (ratio={ratio:.4f})')
    return ratio


def summarize_rouge(rows):
    if not rows or 'rouge_1' not in rows[0]:
        print('ROUGE columns not present in this dataset.')
        return None
    r1 = sum(float(row.get('rouge_1', 0) or 0) for row in rows) / len(rows)
    r2 = sum(float(row.get('rouge_2', 0) or 0) for row in rows) / len(rows)
    rl = sum(float(row.get('rouge_l', 0) or 0) for row in rows) / len(rows)
    print(f'Average ROUGE-1: {r1:.4f} | ROUGE-2: {r2:.4f} | ROUGE-L: {rl:.4f}')
    return r1, r2, rl


def bucket_lengths(rows, label: str):
    counts = [token_value(row, 'original_token_count') for row in rows if row.get('original_token_count')]
    if not counts:
        print(f'{label}: no rows')
        return
    buckets = [0, 16, 32, 48, 64, 96, 128, 160, 256, 512]
    counter = Counter()
    for value in counts:
        placed = False
        for start, end in zip(buckets, buckets[1:]):
            if start < value <= end:
                counter[f'{start + 1}-{end}'] += 1
                placed = True
                break
        if not placed:
            counter[f'>{buckets[-1]}'] += 1
    print(f'{label} token distribution:')
    for bucket in sorted(counter.keys(), key=lambda x: (len(x), x)):
        print(f'  {bucket}: {counter[bucket]}')

base_rows = load_csv(BASE_DATASET)
short_rows = load_csv(SHORT_DATASET)
very_short_rows = load_csv(VERY_SHORT_DATASET)

summarize_compression(base_rows, 'Full synthetic dataset')
summarize_rouge(base_rows)
bucket_lengths(base_rows, 'Full synthetic dataset')

print('Filtered subsets:')
summarize_compression(short_rows, '≤128 tokens')
bucket_lengths(short_rows, '≤128 tokens')

summarize_compression(very_short_rows, '≤64 tokens')
bucket_lengths(very_short_rows, '≤64 tokens')


Full synthetic dataset: 14779 rows | tokens: 236923 → 177070 (ratio=0.7474)
Average ROUGE-1: 0.7220 | ROUGE-2: 0.5207 | ROUGE-L: 0.6755
Full synthetic dataset token distribution:
  1-16: 10807
  >512: 7
  17-32: 2934
  33-48: 663
  49-64: 205
  65-96: 112
  97-128: 22
  129-160: 8
  161-256: 9
  257-512: 12
Filtered subsets:
≤128 tokens: 14739 rows | tokens: 247170 → 174772 (ratio=0.7071)
≤128 tokens token distribution:
  1-16: 9766
  17-32: 3641
  33-48: 844
  49-64: 263
  65-96: 189
  97-128: 36
≤64 tokens: 14514 rows | tokens: 228794 → 160613 (ratio=0.7020)
≤64 tokens token distribution:
  1-16: 9766
  17-32: 3641
  33-48: 844
  49-64: 263


In [14]:

# --- Apples-to-apples evaluation: Fine-tuned model on dsp-train.csv ---
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import pandas as pd
import numpy as np
import evaluate
from pathlib import Path
import random
import time
import os

os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # Suppress fork warning

MODEL_DIR = Path('../small-prompt-compression-model').resolve()
DATA_PATH = Path('./training_data/dsp-train.csv').resolve()

max_to_process = 200  # Change as needed

# Load and sample
random.seed(42)
df = pd.read_csv(DATA_PATH)
if max_to_process < len(df):
    df = df.sample(n=max_to_process, random_state=42).reset_index(drop=True)
inputs = df['original'].astype(str).tolist()
refs = df['compressed'].astype(str).tolist()

# Load model/tokenizer
start_load = time.time()
tokenizer = AutoTokenizer.from_pretrained(str(MODEL_DIR))
model = AutoModelForSeq2SeqLM.from_pretrained(str(MODEL_DIR))
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
end_load = time.time()

max_input_length = 512
max_target_length = 256
batch_size = 8
rouge = evaluate.load('rouge')

generated = []
input_token_counts = []
generated_token_counts = []
reference_token_counts = []
gen_times = []

for i in range(0, len(inputs), batch_size):
    batch = inputs[i:i+batch_size]
    batch_refs = refs[i:i+batch_size]
    enc = tokenizer(batch, max_length=max_input_length, truncation=True, padding=True, return_tensors='pt').to(device)
    start_gen = time.time()
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=max_target_length, num_beams=4, no_repeat_ngram_size=3)
    end_gen = time.time()
    decoded = tokenizer.batch_decode(out, skip_special_tokens=True)
    generated.extend(decoded)
    # Calculate token counts per example (no padding)
    batch_input_token_counts = [len(tokenizer(x, add_special_tokens=True).input_ids) for x in batch]
    batch_generated_token_counts = [len(tokenizer(x, add_special_tokens=True).input_ids) for x in decoded]
    batch_reference_token_counts = [len(tokenizer(x, add_special_tokens=True).input_ids) for x in batch_refs]
    input_token_counts.extend(batch_input_token_counts)
    generated_token_counts.extend(batch_generated_token_counts)
    reference_token_counts.extend(batch_reference_token_counts)
    gen_times.extend([end_gen - start_gen] * len(batch))

# ROUGE scores
scores = rouge.compute(predictions=generated, references=refs, use_stemmer=True)
scores = {k: (v.mid.fmeasure if hasattr(v, 'mid') else v) for k, v in scores.items()}

# Token savings
# Use new input_token_counts and generated_token_counts
total_input = sum(input_token_counts)
total_generated = sum(generated_token_counts)
savings_ratio = total_generated / max(1, total_input)

# Timing
avg_gen_time = np.mean(gen_times)
total_time = sum(gen_times)
load_time = end_load - start_load

print(f"\nFine-tuned model evaluation on dsp-train.csv:")
print(f"  Prompts processed: {len(inputs)} (max_to_process={max_to_process})")
print(f"  Total input tokens: {total_input}")
print(f"  Total generated tokens: {total_generated}")
print(f"  Compression ratio (generated/input): {savings_ratio:.4f}")
print(f"  ROUGE scores: {scores}")
print(f"  Model load time: {load_time:.2f} sec")
print(f"  Total generation time: {total_time:.2f} sec")
print(f"  Avg generation time per prompt: {avg_gen_time:.3f} sec")


Fine-tuned model evaluation on dsp-train.csv:
  Prompts processed: 200 (max_to_process=200)
  Total input tokens: 3247
  Total generated tokens: 2748
  Compression ratio (generated/input): 0.8463
  ROUGE scores: {'rouge1': np.float64(0.7729290856386746), 'rouge2': np.float64(0.5648445182454944), 'rougeL': np.float64(0.7392906920885842), 'rougeLsum': np.float64(0.7400663044079726)}
  Model load time: 0.07 sec
  Total generation time: 138.59 sec
  Avg generation time per prompt: 0.693 sec
