# Prompt Compression + Variant Generation
This notebook batches prompts through `gpt-5-nano` to produce compressed prompts, token counts, and two stylistic variants for fine-tuning.


## Workflow Overview
1. (Optional) install dependencies.
2. Configure your OpenAI API key.
3. Configure compression rules and helpers.
4. Run the processing cell to populate new columns in `training_data/very-short-prompt-dolly-data.csv`.

If the run is interrupted, re-run the processing cell: rows that already have a compressed prompt are skipped automatically.


In [None]:
# If running in a fresh environment, install the required packages first.
!pip install --quiet --upgrade openai transformers

In [1]:
import os
from getpass import getpass

# Fail early with an actionable message when the SDK is missing.
try:
    from openai import OpenAI
except ImportError as exc:
    raise ImportError('openai package not found. Install it via %pip install openai before continuing.') from exc

# Prompt only when the key is not already in the environment, so the secret is
# never typed into (or saved with) the notebook source.
if not os.getenv('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('Enter your OpenAI API key: ').strip()
    if not os.environ['OPENAI_API_KEY']:
        # An empty entry would otherwise be stored and reported as "configured",
        # postponing the failure to the first API call.
        os.environ.pop('OPENAI_API_KEY', None)
        raise RuntimeError('No OpenAI API key entered. Set OPENAI_API_KEY and re-run this cell.')

CLIENT = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
print('API key configured. Ready to call gpt-5-nano.')


API key configured. Ready to call gpt-5-nano.


In [90]:

import os
import csv
import shutil
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import islice
from pathlib import Path
from typing import Iterable, Sequence

from openai import OpenAI
from pydantic import BaseModel
from transformers import AutoTokenizer

# --- Run configuration -------------------------------------------------------
# Model identifier passed as `model=` to every `client.responses.parse` call.
MODEL_NAME = 'gpt-5-nano'
# Local tokenizer directory, used when it exists on disk; otherwise
# FALLBACK_TOKENIZER_NAME is handed to AutoTokenizer.from_pretrained instead.
# NOTE(review): this is an absolute path on one developer's machine; other
# environments will always take the fallback branch.
TOKENIZER_PATH = Path('/Users/dotslashderek/workspace/Gravitee/small-prompt-compression')
FALLBACK_TOKENIZER_NAME = 'dotslashderek/short-prompt-compressor'
# Dataset that is read and rewritten in place by the processing functions.
CSV_PATH = Path('training_data/very-short-prompt-dolly-data.csv')
# Column set and order enforced by ensure_csv_schema() and write_rows().
FIELDNAMES = [
    'original',
    'original_token_count',
    'compressed_prompt',
    'compressed_token_count',
    'uncompressed_alt_one',
    'uncompressed_alt_two',
]
# Rewrite the CSV each time this many rows have been processed.
FLUSH_EVERY = 5
# Attempts per prompt; the wait after a failed attempt is RETRY_BACKOFF_SECONDS * attempt number.
MAX_RETRIES = 4
RETRY_BACKOFF_SECONDS = 2
# Forwarded to the API as max_output_tokens / reasoning effort / text verbosity.
MAX_OUTPUT_TOKENS = 5000
REASONING_EFFORT = 'low'
VERBOSITY = 'low'
# Prompts submitted per batch, and the upper bound on worker threads.
BATCH_SIZE = 6
MAX_WORKERS = 6
# Row cap passed as `limit=` by the run cell further down.
LIMIT = 100
# When True, only rows whose compressed_prompt is empty are processed.
PROCESS_ONLY_PENDING = True

# System message for call_model(): asks for two stylistic variants plus a
# compressed prompt, returned as one JSON object whose keys match PromptVariants.
# The text below is sent to the model verbatim, so any edit changes model input.
SYSTEM_PROMPT = """You are PromptCompressor, an expert at rewriting prompts for downstream LLMs while preserving every constraint.

Workflow:
1. Read the original prompt carefully.
2. Produce two stylistic variants of the original. They must keep all facts and constraints but remain slightly more conversational.
3. Generate the most compressed prompt possible, double-checking against the original and both variants so nothing is lost.

Compression rules:
- Strip trailing punctuation (question marks, exclamation points, ellipses, periods) unless removing it alters the meaning.
- Drop optional articles, helper verbs, fillers, or polite wording when they are not required for meaning.
- Preserve entities, dates, numbers, units, modality, polarity, and ordering exactly.
- Prefer minimal punctuation overall; use simple separators such as semicolons only when grouping is ambiguous.
- If the original prompt is already minimal, return it unchanged.

Variant rules:
- Each variant must be semantically identical to the original.
- Include at least one harmless fluff token (e.g., conversational filler, doubled punctuation) in each variant.
- Keep variants roughly the same length as the original prompt.

Output format:
Return a JSON object with keys compressed_prompt, uncompressed_alt_one, uncompressed_alt_two. Do not add commentary or Markdown fences.
"""

# Use the local tokenizer directory when it exists; otherwise hand the fallback
# name to from_pretrained (presumably a Hugging Face Hub id - confirm).
TOKENIZER = AutoTokenizer.from_pretrained(
    str(TOKENIZER_PATH) if TOKENIZER_PATH.exists() else FALLBACK_TOKENIZER_NAME,
    use_fast=True,
)

# Cached OpenAI client; stays None until the first get_client() call creates it.
_CLIENT: OpenAI | None = None


# Structured-output schema for the variants workflow: passed as `text_format`
# to `client.responses.parse` in call_model() and also used there to validate
# the raw JSON text when no parsed object is returned.
# (Documented with comments rather than a docstring so the generated JSON
# schema sent to the API is unchanged.)
class PromptVariants(BaseModel):
    # Compressed rewrite of the original prompt.
    compressed_prompt: str
    # First stylistic variant of the original prompt.
    uncompressed_alt_one: str
    # Second stylistic variant of the original prompt.
    uncompressed_alt_two: str


def get_client() -> OpenAI:
    """Return the shared OpenAI client, creating it from OPENAI_API_KEY on first use.

    Returns:
        The module-level client stored in ``_CLIENT``.

    Raises:
        RuntimeError: if no client exists yet and OPENAI_API_KEY is unset or empty.
    """
    global _CLIENT
    if _CLIENT is not None:
        return _CLIENT
    key = os.getenv('OPENAI_API_KEY')
    if key:
        _CLIENT = OpenAI(api_key=key)
        return _CLIENT
    raise RuntimeError('OPENAI_API_KEY environment variable is not set.')


def ensure_csv_schema(csv_path: Path, fieldnames: Sequence[str]):
    """Load ``csv_path`` and coerce every row to exactly the columns in ``fieldnames``.

    Columns not listed in ``fieldnames`` are dropped and missing ones are filled
    with ``''``. ``original_token_count`` is recomputed for every row. The file
    is rewritten only when its header, or the key set of any row, differs from
    ``fieldnames``.

    Args:
        csv_path: CSV file to read (and possibly rewrite in place).
        fieldnames: Expected column names, in order.

    Returns:
        The normalized rows as a list of dicts.

    Raises:
        FileNotFoundError: if ``csv_path`` does not exist.
    """
    if not csv_path.exists():
        raise FileNotFoundError(f"{csv_path} not found.")

    with csv_path.open('r', encoding='utf-8', newline='') as source:
        csv_reader = csv.DictReader(source)
        raw_rows = list(csv_reader)
        header = csv_reader.fieldnames or []

    expected_keys = set(fieldnames)
    # A rewrite is needed when the header is off or any single row's keys are.
    schema_drift = header != list(fieldnames) or any(
        set(raw.keys()) != expected_keys for raw in raw_rows
    )
    projected = [{name: raw.get(name, '') for name in fieldnames} for raw in raw_rows]
    result = refresh_original_token_counts(projected)

    if schema_drift:
        with csv_path.open('w', encoding='utf-8', newline='') as sink:
            csv_writer = csv.DictWriter(sink, fieldnames=fieldnames)
            csv_writer.writeheader()
            csv_writer.writerows(result)
    return result


def load_rows(csv_path: Path):
    """Read every data row of ``csv_path`` into a list of dicts keyed by the header row."""
    with csv_path.open('r', encoding='utf-8', newline='') as source:
        return [record for record in csv.DictReader(source)]


def write_rows(csv_path: Path, rows: Iterable[dict]):
    """Replace ``csv_path`` with ``rows``, written under the FIELDNAMES header.

    ``original_token_count`` is recomputed for every row before writing. The
    data is first written to a temporary file in the same directory as the
    destination and then swapped in with a single rename, so an interrupted
    run leaves the previous CSV intact instead of a truncated one.

    Args:
        csv_path: Destination CSV file.
        rows: Row dicts to persist; consumed once.

    Raises:
        ValueError: from ``csv.DictWriter`` if a row has keys outside FIELDNAMES.
    """
    rows = refresh_original_token_counts(list(rows))
    destination = Path(csv_path)
    # Stage next to the destination: a rename inside one directory never
    # crosses filesystems, which a temp file in the system temp dir might.
    tmp = tempfile.NamedTemporaryFile(
        'w',
        delete=False,
        encoding='utf-8',
        newline='',
        dir=destination.parent,
        prefix=f'.{destination.name}.',
        suffix='.tmp',
    )
    staged = Path(tmp.name)
    try:
        with tmp:
            writer = csv.DictWriter(tmp, fieldnames=FIELDNAMES)
            writer.writeheader()
            writer.writerows(rows)
        staged.replace(destination)
    except BaseException:
        # Do not leave a stray staging file behind when writing or renaming fails.
        staged.unlink(missing_ok=True)
        raise


def count_tokens(text: str) -> int:
    """Return the number of tokens TOKENIZER produces for ``text``.

    Special tokens are not added. Falsy input (``''`` or ``None``) counts as 0
    without touching the tokenizer.
    """
    return len(TOKENIZER.encode(text, add_special_tokens=False)) if text else 0


def refresh_original_token_counts(rows: Iterable[dict]):
    """Return copies of ``rows`` with ``original_token_count`` recomputed.

    The count is taken from the stripped ``original`` value (missing or None is
    treated as empty) and stored as a string. Input rows are not modified.
    """
    return [
        {**row, 'original_token_count': str(count_tokens((row.get('original') or '').strip()))}
        for row in rows
    ]


def batched(iterable, size: int):
    """Yield consecutive lists of up to ``size`` items from ``iterable``.

    The final list may be shorter; an empty iterable yields nothing.

    Args:
        iterable: Any iterable; consumed lazily.
        size: Maximum number of items per batch; must be at least 1.

    Raises:
        ValueError: if ``size`` is less than 1. Because this is a generator,
            the error surfaces when iteration starts. Previously a size of 0
            silently produced no batches, which made a misconfigured run look
            like a successful empty one.
    """
    if size < 1:
        raise ValueError(f'size must be at least 1, got {size!r}')
    iterator = iter(iterable)
    while batch := list(islice(iterator, size)):
        yield batch


def extract_output_text(response):
    """Return the text carried by a response object, stripped of outer whitespace.

    A truthy ``response.output_text`` is used directly. Otherwise the text of
    every ``output_text`` fragment found under ``response.output[*].content``
    is concatenated; fragments may be dicts or attribute-style objects.
    Returns ``''`` when no text is found.
    """
    direct = getattr(response, 'output_text', None)
    if direct:
        return direct.strip()

    pieces = []
    for item in getattr(response, 'output', []) or []:
        for part in getattr(item, 'content', []) or []:
            if isinstance(part, dict):
                if part.get('type') == 'output_text':
                    pieces.append(part.get('text', ''))
            elif getattr(part, 'type', None) == 'output_text':
                pieces.append(getattr(part, 'text', '') or '')
    return ''.join(pieces).strip()


def call_model(prompt_text: str):
    """Ask the model for a compressed prompt plus two variants of ``prompt_text``.

    The request uses SYSTEM_PROMPT and the PromptVariants structured-output
    schema. When the SDK returns no parsed object, the raw text is validated
    against PromptVariants instead. Any exception counts as a failed attempt;
    up to MAX_RETRIES attempts are made, waiting RETRY_BACKOFF_SECONDS * attempt
    seconds between them.

    Args:
        prompt_text: The original prompt to rewrite.

    Returns:
        A ``(parsed, raw_text)`` tuple: the PromptVariants instance and the raw
        response text.

    Raises:
        RuntimeError: when every attempt fails; the last underlying error is
            attached as ``__cause__``.
    """
    messages: tuple[dict[str, str], ...] = (
        {
            'role': 'system',
            'content': SYSTEM_PROMPT,
        },
        {
            'role': 'user',
            'content': f"""Original prompt:
{prompt_text}

First craft the two variants, then produce the compressed prompt. Return only the JSON object with the required keys.""",
        },
    )
    client = get_client()
    last_error: Exception | None = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = client.responses.parse(
                model=MODEL_NAME,
                input=messages,
                reasoning={'effort': REASONING_EFFORT},
                text={'verbosity': VERBOSITY},
                max_output_tokens=MAX_OUTPUT_TOKENS,
                text_format=PromptVariants,
            )
            raw_text = extract_output_text(response)
            parsed = response.output_parsed
            if parsed is None:
                if not raw_text:
                    raise RuntimeError('Model returned no text to parse for structured output.')
                parsed = PromptVariants.model_validate_json(raw_text)
            return parsed, raw_text
        except Exception as err:
            last_error = err
            if attempt >= MAX_RETRIES:
                # Nothing left to retry: skip the pointless sleep and the
                # misleading "Retrying" message before giving up.
                print(f'Attempt {attempt} failed ({err}). No retries left.')
                break
            wait_for = RETRY_BACKOFF_SECONDS * attempt
            print(f'Attempt {attempt} failed ({err}). Retrying in {wait_for:.1f}s...')
            time.sleep(wait_for)
    raise RuntimeError(
        f'Failed to generate compression after {MAX_RETRIES} attempts: {last_error}'
    ) from last_error


def process_dataset(limit: int | None = None):
    """Fill the compression and variant columns of CSV_PATH using call_model().

    Rows are normalized with ensure_csv_schema(), then submitted to a thread
    pool in batches of BATCH_SIZE. The CSV is rewritten every FLUSH_EVERY
    processed rows and at the end of each batch. If a row fails, or the run is
    interrupted, results gathered since the last write are saved before the
    error propagates, so a re-run resumes from the pending rows.

    Args:
        limit: Maximum number of target rows to process; ``None`` means all.

    Raises:
        RuntimeError: if any row still fails after call_model()'s retries.
    """
    rows = ensure_csv_schema(CSV_PATH, FIELDNAMES)
    total_rows = len(rows)
    pending_rows = [
        (idx, row) for idx, row in enumerate(rows)
        if not (row.get('compressed_prompt') or '').strip()
    ]
    target_rows = pending_rows if PROCESS_ONLY_PENDING else list(enumerate(rows))
    print(f'Total rows: {total_rows} | Pending: {len(pending_rows)}')
    if limit is not None:
        target_rows = target_rows[:limit]
        print(f'Processing up to {len(target_rows)} rows (limit={limit}).')
    if not target_rows:
        print('Nothing to do!')
        return

    processed = 0
    unsaved = 0  # rows updated in memory since the last successful write
    max_workers = min(MAX_WORKERS, len(target_rows)) or 1
    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for batch in batched(target_rows, BATCH_SIZE):
                print(f'Submitting batch of {len(batch)} prompts (starting row {batch[0][0] + 1}).')
                futures = {
                    executor.submit(call_model, (row.get('original') or '').strip()): (row_idx, row)
                    for row_idx, row in batch
                }
                for future in as_completed(futures):
                    row_idx, row = futures[future]
                    original_prompt = (row.get('original') or '').strip()
                    try:
                        parsed, raw_content = future.result()
                    except Exception as err:
                        raise RuntimeError(f'Row {row_idx + 1} failed after retries: {err}') from err
                    data = parsed.model_dump()
                    # Fall back to the original text if the model returned an empty compression.
                    compressed = data.get('compressed_prompt') or original_prompt
                    row['compressed_prompt'] = compressed
                    row['compressed_token_count'] = str(count_tokens(compressed))
                    row['uncompressed_alt_one'] = data.get('uncompressed_alt_one', '')
                    row['uncompressed_alt_two'] = data.get('uncompressed_alt_two', '')
                    processed += 1
                    unsaved += 1
                    print(f"Row {row_idx + 1}: compressed tokens {row['compressed_token_count']} | Raw response: {raw_content[:120]}...")
                    if processed % FLUSH_EVERY == 0:
                        write_rows(CSV_PATH, rows)
                        unsaved = 0
                        print(f'Progress saved through row {row_idx + 1}. Processed so far: {processed}.')
                if unsaved:
                    # Skip the rewrite when the periodic flush already covered this batch.
                    write_rows(CSV_PATH, rows)
                    unsaved = 0
                print(f'Progress saved through row {batch[-1][0] + 1}. Processed so far: {processed}.')
    finally:
        # Reached with unsaved work only when an error or interrupt cut the
        # loop short; keep what was already paid for.
        if unsaved:
            write_rows(CSV_PATH, rows)
            print(f'Saved {unsaved} unsaved row(s) before exiting. Processed so far: {processed}.')
    print(f'Run complete. Processed {processed} rows.')



In [91]:

# Alternative helpers: compression only (no stylistic variants)

# System message for call_model_compression_only(): compression rules followed
# by worked `original -> compressed` examples. The text is sent to the model
# verbatim, so any edit changes model input.
# NOTE(review): the first example contains "hisa teams" - presumably copied
# as-is from the dataset; confirm before correcting it.
COMPRESS_ONLY_SYSTEM_PROMPT = """You are PromptCompressor, an expert at compressing prompts for downstream LLMs while preserving every constraint.

Workflow:
1. Read the original prompt carefully.
2. Come up with a shorter version of the prompt that conveys the same meaning as the original, with no loss of semantic context.

Compression rules:
- Strip trailing punctuation (question marks, exclamation points, ellipses, periods) at the end of the prompt.
- Reduce fluff words - 'of', 'a', 'the', 'an', etc - wherever possible
- Drop optional articles, helper verbs, fillers, or polite wording when they are not required for meaning.
- Preserve entities, dates, numbers, units, modality, polarity, and ordering exactly.
- Prefer minimal punctuation overall; use simple separators such as semicolons only when grouping is ambiguous.

Examples:

Who was Kyle Van Zyl playing against when he scored 36 of hisa teams 61 points? -> Who was Kyle Van Zyl playing when he scored 36 of teams 61 points
From the passage list down the areas for which Dar es Salaam is Tanzania's most prominent city. List the results in comma separated format -> From passage list areas where Dar es Salaam is Tanzania's most prominent city list results comma separated
What is a polygon? -> What is a polygon
Which episodes of season four of Game of Thrones did Michelle MacLaren direct? -> season four episodes Game of Thrones did Michelle MacLaren direct
What is process mining -> What is process mining
What are some unique curtain tie backs that you can make yourself -> list unique curtain tie backs you can make yourself
Who gave the UN the land in NY to build their HQ? -> Who gave UN land in NY to build HQ
"""


# Structured-output schema for the compression-only workflow: passed as
# `text_format` to `client.responses.parse` and used to validate raw JSON text
# when no parsed object is returned.
# (Documented with comments rather than a docstring so the generated JSON
# schema sent to the API is unchanged.)
class CompressedPromptOnly(BaseModel):
    # Compressed rewrite of the original prompt.
    compressed_prompt: str


def call_model_compression_only(prompt_text: str):
    """Ask the model for a compressed version of ``prompt_text`` (no variants).

    The request uses COMPRESS_ONLY_SYSTEM_PROMPT and the CompressedPromptOnly
    structured-output schema. When the SDK returns no parsed object, the raw
    text is validated against CompressedPromptOnly instead. Any exception
    counts as a failed attempt; up to MAX_RETRIES attempts are made, waiting
    RETRY_BACKOFF_SECONDS * attempt seconds between them.

    Args:
        prompt_text: The original prompt to compress.

    Returns:
        A ``(parsed, raw_text)`` tuple: the CompressedPromptOnly instance and
        the raw response text.

    Raises:
        RuntimeError: when every attempt fails; the last underlying error is
            attached as ``__cause__``.
    """
    messages: tuple[dict[str, str], ...] = (
        {
            'role': 'system',
            'content': COMPRESS_ONLY_SYSTEM_PROMPT,
        },
        {
            'role': 'user',
            'content': f"""Original prompt:
{prompt_text}

Return only JSON with key compressed_prompt.""",
        },
    )
    client = get_client()
    last_error: Exception | None = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = client.responses.parse(
                model=MODEL_NAME,
                input=messages,
                reasoning={'effort': REASONING_EFFORT},
                text={'verbosity': VERBOSITY},
                max_output_tokens=MAX_OUTPUT_TOKENS,
                text_format=CompressedPromptOnly,
            )
            raw_text = extract_output_text(response)
            parsed = response.output_parsed
            if parsed is None:
                if not raw_text:
                    raise RuntimeError('Model returned no text to parse for compression-only output.')
                parsed = CompressedPromptOnly.model_validate_json(raw_text)
            return parsed, raw_text
        except Exception as err:
            last_error = err
            if attempt >= MAX_RETRIES:
                # Nothing left to retry: skip the pointless sleep and the
                # misleading "Retrying" message before giving up.
                print(f'Attempt {attempt} failed ({err}). No retries left.')
                break
            wait_for = RETRY_BACKOFF_SECONDS * attempt
            print(f'Attempt {attempt} failed ({err}). Retrying in {wait_for:.1f}s...')
            time.sleep(wait_for)
    raise RuntimeError(
        f'Compression-only call failed after {MAX_RETRIES} attempts: {last_error}'
    ) from last_error


# Just process and add compression columns to the CSV, assuming extra columns may exist

def _write_csv_atomically(csv_path, fieldnames, rows):
    """Write ``rows`` under ``fieldnames`` to ``csv_path`` via a sibling temp file and a rename."""
    destination = Path(csv_path)
    tmp = tempfile.NamedTemporaryFile(
        'w',
        delete=False,
        encoding='utf-8',
        newline='',
        dir=destination.parent,
        prefix=f'.{destination.name}.',
        suffix='.tmp',
    )
    staged = Path(tmp.name)
    try:
        with tmp:
            # Rows that lack a column are written with DictWriter's default
            # restval (''), so unprocessed rows get empty compression cells.
            writer = csv.DictWriter(tmp, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)
        staged.replace(destination)
    except BaseException:
        staged.unlink(missing_ok=True)
        raise


def process_dataset_compression_only(limit: int | None = None):
    """Fill ``compressed_prompt`` / ``compressed_token_count`` in CSV_PATH.

    Every column already present in the file is kept; the two compression
    columns are appended to the header when absent. Rows are submitted to a
    thread pool in batches of BATCH_SIZE through call_model_compression_only().
    The CSV is rewritten every FLUSH_EVERY processed rows and at the end of
    each batch, always through a temp file so an interrupted write cannot
    truncate the dataset. If a row fails, or the run is interrupted, results
    gathered since the last write are saved before the error propagates.

    Args:
        limit: Maximum number of target rows to process; ``None`` means all.

    Raises:
        RuntimeError: if any row still fails after the model call's retries.
    """
    with open(CSV_PATH, 'r', encoding='utf-8', newline='') as handle:
        reader = csv.DictReader(handle)
        rows = list(reader)
        fieldnames = list(reader.fieldnames or [])
    # Taking the header from rows[0] breaks when row 0 has not been processed
    # yet and the file has no compression columns: other rows then carry keys
    # the writer does not know and DictWriter raises ValueError.
    for column in ('compressed_prompt', 'compressed_token_count'):
        if column not in fieldnames:
            fieldnames.append(column)

    total_rows = len(rows)
    pending_rows = [
        (idx, row) for idx, row in enumerate(rows)
        if not (row.get('compressed_prompt') or '').strip()
    ]
    target_rows = pending_rows if PROCESS_ONLY_PENDING else list(enumerate(rows))
    print(f'Total rows: {total_rows} | Pending: {len(pending_rows)}')
    if limit is not None:
        target_rows = target_rows[:limit]
        print(f'Processing up to {len(target_rows)} rows (limit={limit}).')
    if not target_rows:
        print('Nothing to do!')
        return

    processed = 0
    unsaved = 0  # rows updated in memory since the last successful write
    max_workers = min(MAX_WORKERS, len(target_rows)) or 1
    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for batch in batched(target_rows, BATCH_SIZE):
                print(f'Submitting batch of {len(batch)} prompts (starting row {batch[0][0] + 1}).')
                futures = {
                    executor.submit(call_model_compression_only, (row.get('original') or '').strip()): (row_idx, row)
                    for row_idx, row in batch
                }
                for future in as_completed(futures):
                    row_idx, row = futures[future]
                    original_prompt = (row.get('original') or '').strip()
                    try:
                        parsed, raw_content = future.result()
                    except Exception as err:
                        raise RuntimeError(f'Row {row_idx + 1} failed after retries: {err}') from err
                    data = parsed.model_dump()
                    # Fall back to the original text if the model returned an empty compression.
                    compressed = data.get('compressed_prompt') or original_prompt
                    row['compressed_prompt'] = compressed
                    row['compressed_token_count'] = str(count_tokens(compressed))
                    processed += 1
                    unsaved += 1
                    print(f"Row {row_idx + 1}: compressed tokens {row['compressed_token_count']} | Raw response: {raw_content[:120]}...")
                    if processed % FLUSH_EVERY == 0:
                        _write_csv_atomically(CSV_PATH, fieldnames, rows)
                        unsaved = 0
                        print(f'Progress saved through row {row_idx + 1}. Processed so far: {processed}.')
                if unsaved:
                    # Skip the rewrite when the periodic flush already covered this batch.
                    _write_csv_atomically(CSV_PATH, fieldnames, rows)
                    unsaved = 0
                print(f'Progress saved through row {batch[-1][0] + 1}. Processed so far: {processed}.')
    finally:
        # Reached with unsaved work only when an error or interrupt cut the
        # loop short; keep what was already paid for.
        if unsaved:
            _write_csv_atomically(CSV_PATH, fieldnames, rows)
            print(f'Saved {unsaved} unsaved row(s) before exiting. Processed so far: {processed}.')
    print(f'Compression-only run complete. Processed {processed} rows.')



In [92]:
# Run the compression-only pipeline on at most LIMIT rows. With
# PROCESS_ONLY_PENDING enabled, rows that already have a compressed_prompt are
# skipped, so re-running this cell continues where the previous run stopped.
process_dataset_compression_only(limit=LIMIT)

Total rows: 14609 | Pending: 14589
Processing up to 100 rows (limit=100).
Submitting batch of 6 prompts (starting row 21).
Row 21: compressed tokens 8 | Raw response: {"compressed_prompt":"Give me top 5 golf equipment company names"}...
Row 23: compressed tokens 12 | Raw response: {"compressed_prompt":"Which Dutch artist painted Girl with a Pearl Earring"}...
Row 25: compressed tokens 3 | Raw response: {"compressed_prompt":"What is verb"}...
Row 26: compressed tokens 27 | Raw response: {"compressed_prompt":"Extract owner of Lamborghini and listing of different types of Huracan cars Lamborghini has produc...
Row 22: compressed tokens 9 | Raw response: {"compressed_prompt":"Who has won the most Olympic gold medals"}...
Progress saved through row 22. Processed so far: 5.
Row 24: compressed tokens 7 | Raw response: {"compressed_prompt":"What happens when the sun goes down"}...
Progress saved through row 26. Processed so far: 6.
Submitting batch of 6 prompts (starting row 27).
Row 30: compr

In [62]:
# Optional: inspect a few rows after processing
# import pandas as pd
# df = pd.read_csv(CSV_PATH)
# df.head()

# Test a single compression prompt and print the full API response for troubleshooting
# Sends one hard-coded prompt through the same request shape that
# call_model_compression_only() uses, without retries, and prints the response
# object, the extracted text, and the parsed model.
# NOTE(review): the output saved with this cell reports max_output_tokens=128
# and reasoning effort 'medium', which does not match the current constants -
# it appears to come from an earlier run; re-execute before relying on it.
example_prompt = "Summarize the following review: The food was great but the service was slow."
try:
    messages = [
        {'role': 'system', 'content': COMPRESS_ONLY_SYSTEM_PROMPT},
        {'role': 'user', 'content': f"""Original prompt:\n{example_prompt}\n\nReturn only JSON with key compressed_prompt."""}
    ]
    client = get_client()
    response = client.responses.parse(
        model=MODEL_NAME,
        input=messages,
        reasoning={'effort': REASONING_EFFORT},
        text={'verbosity': VERBOSITY},
        max_output_tokens=MAX_OUTPUT_TOKENS,
        text_format=CompressedPromptOnly,
    )
    print("[TEST] Full API response:", response)
    raw_text = extract_output_text(response)
    print("[TEST] Raw model response:", raw_text)
    # output_parsed may be None, e.g. when the response carries no output text.
    parsed = response.output_parsed
    print("[TEST] Parsed output:", parsed)
except Exception as e:
    # Diagnostic cell: report the failure instead of stopping the notebook.
    print("[TEST] Exception during single prompt test:", e)

[TEST] Full API response: ParsedResponse[CompressedPromptOnly](id='resp_0f492f7d6c6a72d70068fabe5cc838819c8b1e51e3ef15aa7e', created_at=1761263196.0, error=None, incomplete_details=IncompleteDetails(reason='max_output_tokens'), instructions=None, metadata={}, model='gpt-5-nano-2025-08-07', object='response', output=[ResponseReasoningItem(id='rs_0f492f7d6c6a72d70068fabe5d285c819c88595c1c83fe8bd6', summary=[], type='reasoning', content=None, encrypted_content=None, status=None)], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[], top_p=1.0, background=False, conversation=None, max_output_tokens=128, max_tool_calls=None, previous_response_id=None, prompt=None, prompt_cache_key=None, reasoning=Reasoning(effort='medium', generate_summary=None, summary=None), safety_identifier=None, service_tier='default', status='incomplete', text=ResponseTextConfig(format=ResponseFormatTextJSONSchemaConfig(name='CompressedPromptOnly', schema_={'properties': {'compressed_prompt': {'tit

In [106]:

# Analyze first 120 rows for token savings, filtering out rows with original_token_count > 64
import csv
from pathlib import Path
import pandas as pd

CSV_PATH = Path('training_data/very-short-prompt-dolly-data.csv')

def _token_count(row, column):
    """Return the integer stored in ``row[column]``, or None if it is missing, blank, or not a number."""
    try:
        return int(row.get(column) or '')
    except (TypeError, ValueError):
        return None


# Load first 120 rows
with CSV_PATH.open('r', encoding='utf-8', newline='') as handle:
    reader = csv.DictReader(handle)
    rows = [row for _, row in zip(range(120), reader)]

# Filter rows with original_token_count <= 64.
# Rows without both counts are left out: a blank cell used to crash int(), and
# counting a not-yet-compressed row as 0 tokens would overstate the savings.
filtered_rows = []
for row in rows:
    original_count = _token_count(row, 'original_token_count')
    compressed_count = _token_count(row, 'compressed_token_count')
    if original_count is None or compressed_count is None:
        continue
    if original_count <= 64:
        filtered_rows.append(row)

total_original_tokens = sum(_token_count(row, 'original_token_count') for row in filtered_rows)
total_compressed_tokens = sum(_token_count(row, 'compressed_token_count') for row in filtered_rows)

# Guard the division for an empty selection.
if total_original_tokens > 0:
    savings_pct = 100 * (1 - total_compressed_tokens / total_original_tokens)
else:
    savings_pct = 0.0

print(f"First 120 rows (filtered, original_token_count <= 64):")
print(f"Total original tokens: {total_original_tokens}")
print(f"Total compressed tokens: {total_compressed_tokens}")
print(f"Overall savings: {savings_pct:.2f}%")

# Compare first 120 rows (with <=64 tokens) from both CSVs, calculate token counts and compression ratio
import csv
from pathlib import Path
from transformers import AutoTokenizer

# Input files for the A/B comparison: A is the ROUGE-based dataset, B is the
# dataset produced by this notebook.
SOURCE_CSV = Path('training_data/dolly-summarization-data-rouge.csv')
COMPRESSED_CSV = Path('training_data/very-short-prompt-dolly-data.csv')

# Tokenizer for the comparison: local directory when present, otherwise the fallback name.
TOKENIZER_PATH = Path('/Users/dotslashderek/workspace/Gravitee/small-prompt-compression')
FALLBACK_TOKENIZER_NAME = 'dotslashderek/short-prompt-compressor'
tokenizer = AutoTokenizer.from_pretrained(
    str(TOKENIZER_PATH) if TOKENIZER_PATH.exists() else FALLBACK_TOKENIZER_NAME,
    use_fast=True,
)

def count_tokens(text):
    """Return the number of tokens the module-level ``tokenizer`` produces for ``text``.

    Special tokens are not added; falsy input (``''`` or ``None``) counts as 0.
    """
    if text:
        return len(tokenizer.encode(text, add_special_tokens=False))
    return 0

# Step 1: Load and filter first 150 rows from both files, keep only those with <=64 tokens (on the fly), select first 120
def load_and_filter(csv_path, tokenizer, max_rows=150, max_tokens=64, select_rows=120):
    """Return rows from the start of ``csv_path`` whose ``original`` text is short enough.

    At most ``max_rows`` data rows are read. A row is kept when its stripped
    ``original`` value encodes to ``max_tokens`` tokens or fewer (special tokens
    excluded; empty text counts as 0). Reading stops as soon as ``select_rows``
    rows have been kept. Each kept row gains an ``original_token_count_actual``
    entry holding the measured count.

    The earlier early-exit test reduced to ``max_rows < select_rows``, which
    never depends on how many rows were read, so ``max_rows`` had no effect as
    a scan limit; it is now enforced. The ``tokenizer`` argument is also used
    for counting now, instead of a module-level helper.

    Args:
        csv_path: ``pathlib.Path`` of the CSV to read.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``.
        max_rows: Maximum number of data rows to scan.
        max_tokens: Inclusive token limit for the ``original`` column.
        select_rows: Maximum number of rows to return.

    Returns:
        A list of at most ``select_rows`` row dicts, in file order.
    """
    filtered = []
    with csv_path.open('r', encoding='utf-8', newline='') as handle:
        for rows_read, row in enumerate(csv.DictReader(handle), 1):
            if rows_read > max_rows or len(filtered) >= select_rows:
                break
            orig = (row.get('original') or '').strip()
            orig_tokens = len(tokenizer.encode(orig, add_special_tokens=False)) if orig else 0
            if orig_tokens <= max_tokens:
                row['original_token_count_actual'] = orig_tokens
                filtered.append(row)
    return filtered

# Select the short-prompt rows from each file (A = SOURCE_CSV, B = COMPRESSED_CSV).
filtered_rows_a = load_and_filter(SOURCE_CSV, tokenizer)
filtered_rows_b = load_and_filter(COMPRESSED_CSV, tokenizer)

# Step 2: Compare and calculate token counts for both approaches
# Rows are paired by position with zip(), so the comparison covers only as many
# rows as the shorter list and assumes both files list prompts in the same
# order - TODO confirm.
print(f"Comparing first 120 rows with <=64 tokens from both files (each is a different compression approach):")
print(f"{'Idx':>3} | {'A Orig':<40} | {'A OrigTok':>8} | {'A Compr':<40} | {'A CompTok':>8} | {'B Orig':<40} | {'B OrigTok':>8} | {'B Compr':<40} | {'B CompTok':>8}")
print('-'*200)
total_a_orig = total_a_comp = total_b_orig = total_b_comp = 0
for idx, (row_a, row_b) in enumerate(zip(filtered_rows_a, filtered_rows_b), 1):
    a_orig = (row_a.get('original') or '').strip()
    a_orig_tok = count_tokens(a_orig)
    # The compressed text may live in either a 'compressed' or a 'compressed_prompt' column.
    a_comp = (row_a.get('compressed') or row_a.get('compressed_prompt') or '').strip()
    a_comp_tok = count_tokens(a_comp)
    b_orig = (row_b.get('original') or '').strip()
    b_orig_tok = count_tokens(b_orig)
    b_comp = (row_b.get('compressed') or row_b.get('compressed_prompt') or '').strip()
    b_comp_tok = count_tokens(b_comp)
    total_a_orig += a_orig_tok
    total_a_comp += a_comp_tok
    total_b_orig += b_orig_tok
    total_b_comp += b_comp_tok
    # Text columns are truncated to 40 characters to keep the table readable.
    print(f"{idx:3} | {a_orig[:40]:<40} | {a_orig_tok:8} | {a_comp[:40]:<40} | {a_comp_tok:8} | {b_orig[:40]:<40} | {b_orig_tok:8} | {b_comp[:40]:<40} | {b_comp_tok:8}")

print('-'*200)
print(f"Totals:")
print(f"A: Original tokens: {total_a_orig} | Compressed tokens: {total_a_comp}")
print(f"B: Original tokens: {total_b_orig} | Compressed tokens: {total_b_comp}")
# Ratio = compressed / original; savings = 1 - ratio, as a percentage.
if total_a_orig > 0:
    savings_a = 100 * (1 - total_a_comp / total_a_orig)
    print(f"A: Compression ratio: {total_a_comp/total_a_orig:.2f} | Savings: {savings_a:.2f}%")
else:
    print("A: No tokens in original column.")
if total_b_orig > 0:
    savings_b = 100 * (1 - total_b_comp / total_b_orig)
    print(f"B: Compression ratio: {total_b_comp/total_b_orig:.2f} | Savings: {savings_b:.2f}%")
else:
    print("B: No tokens in original column.")

def _compressed_text(row):
    """Return the row's compressed prompt, read from 'compressed' or 'compressed_prompt'."""
    return (row.get('compressed') or row.get('compressed_prompt') or '').strip()


def _collect_pairs(rows_a, rows_b, keep):
    """Pair rows by position and keep those whose (a_tokens, b_tokens) satisfy ``keep``.

    Returns a list of ``(idx, row_a, row_b, a_comp_tok, b_comp_tok)`` tuples,
    with ``idx`` counting pairs from 1.
    """
    kept = []
    for idx, (row_a, row_b) in enumerate(zip(rows_a, rows_b), 1):
        a_comp_tok = count_tokens(_compressed_text(row_a))
        b_comp_tok = count_tokens(_compressed_text(row_b))
        if keep(a_comp_tok, b_comp_tok):
            kept.append((idx, row_a, row_b, a_comp_tok, b_comp_tok))
    return kept


def _print_pairs(title, pairs, width=200):
    """Print ``pairs`` as a table whose header and data columns share the same widths."""
    print(f"\n{title}")
    print(f"{'Idx':>3} | {'A Orig':<{width}} | {'A Compr':<{width}} | {'A CompTok':>9} | {'B Compr':<{width}} | {'B CompTok':>9}")
    # Three text columns, one 3-wide and two 9-wide columns, five 3-character separators.
    print('-' * (3 * width + 36))
    for idx, row_a, row_b, a_comp_tok, b_comp_tok in pairs:
        a_orig = (row_a.get('original') or '').strip()
        a_comp = _compressed_text(row_a)
        b_comp = _compressed_text(row_b)
        print(f"{idx:3} | {a_orig[:width]:<{width}} | {a_comp[:width]:<{width}} | {a_comp_tok:9} | {b_comp[:width]:<{width}} | {b_comp_tok:9}")


# zip() stops at the shorter list, so that is the number of rows actually compared.
compared_pairs = min(len(filtered_rows_a), len(filtered_rows_b))

# After the previous comparison, filter and display rows where A's compressed prompt is shorter than B's
shorter_in_a = _collect_pairs(filtered_rows_a, filtered_rows_b, lambda a_tok, b_tok: a_tok < b_tok)
_print_pairs("Rows where A's compressed prompt is shorter than B's:", shorter_in_a)
print(f"Total rows where A's compression is shorter: {len(shorter_in_a)} out of {compared_pairs}")

# Also display rows where B's compressed prompt is shorter than A's by at least 2 tokens
shorter_in_b = _collect_pairs(filtered_rows_a, filtered_rows_b, lambda a_tok, b_tok: b_tok < a_tok - 1)
_print_pairs("Rows where B's compressed prompt is shorter than A's by at least 2 tokens:", shorter_in_b)
print(f"Total rows where B's compression is shorter by 2+: {len(shorter_in_b)} out of {compared_pairs}")

First 120 rows (filtered, original_token_count <= 64):
Total original tokens: 1769
Total compressed tokens: 1463
Overall savings: 17.30%
Comparing first 120 rows with <=64 tokens from both files (each is a different compression approach):
Idx | A Orig                                   | A OrigTok | A Compr                                  | A CompTok | B Orig                                   | B OrigTok | B Compr                                  | B CompTok
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  1 | When did Virgin Australia start operatin |        7 | When did Virgin Australia start operatin |        7 | When did Virgin Australia start operatin |        7 | When did Virgin Australia start operatin |        6
  2 | Which is a species of fish? Tope or Rope |       13 | Which is a species of fish? Tope or Rope |       13 | Wh