# Short Prompt Dolly Data Creation
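This notebook explores the token count distribution in the Dolly summarization dataset, writes filtered CSVs of short (<=128 token) and very short (<=64 token) prompts, recomputes token counts and compression ratios, post-processes the compressed prompts, and splits the resulting compression datasets into 90/10 train/test sets.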

In [4]:
# Explore token count distribution and averages using CSV column
import csv

csv_path = 'training_data/dolly-summarization-data-rouge.csv'

# Pull every row's token count into memory once; all statistics below derive from this list.
with open(csv_path, 'r', encoding='utf-8') as infile:
    counts_all = [int(record['original_token_count']) for record in csv.DictReader(infile)]

# Thresholds are strict: a row with exactly 180 (or 80) tokens is not counted as "under".
counts_under_180 = [c for c in counts_all if c < 180]
counts_under_80 = [c for c in counts_all if c < 80]

total, sum_all = len(counts_all), sum(counts_all)
under_180, sum_under_180 = len(counts_under_180), sum(counts_under_180)
under_80, sum_under_80 = len(counts_under_80), sum(counts_under_80)

# Each average falls back to 0 when its group is empty, avoiding a division by zero.
avg_all = sum_all / total if total else 0
avg_under_180 = sum_under_180 / under_180 if under_180 else 0
avg_under_80 = sum_under_80 / under_80 if under_80 else 0

# Report the group sizes first, then the per-group averages.
print(f'Total examples: {total}')
print(f'Examples with original_token_count < 180: {under_180}')
print(f'Examples with original_token_count < 80: {under_80}')
print(f'Average token count (all): {avg_all:.2f}')
print(f'Average token count (<180): {avg_under_180:.2f}')
print(f'Average token count (<80): {avg_under_80:.2f}')

Total examples: 14779
Examples with original_token_count < 180: 14755
Examples with original_token_count < 80: 14685
Average token count (all): 16.03
Average token count (<180): 15.10
Average token count (<80): 14.68


In [3]:
# Filter and save prompts with <=128 tokens
import csv

input_path = 'training_data/dolly-summarization-data-rouge.csv'
output_path = 'training_data/short-prompt-dolly-data.csv'

# Copy the prompt text and its stored token count for every row of at most 128 tokens.
# The input is opened first so a missing source file does not leave an empty output behind.
with open(input_path, 'r', encoding='utf-8') as infile:
    with open(output_path, 'w', encoding='utf-8', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=['original', 'original_token_count'])
        writer.writeheader()
        for record in csv.DictReader(infile):
            if int(record['original_token_count']) > 128:
                continue  # too long for the short-prompt set
            writer.writerow({
                'original': record['original'],
                'original_token_count': record['original_token_count'],
            })

print(f'Filtered prompts saved to {output_path}')

Filtered prompts saved to training_data/short-prompt-dolly-data.csv


In [5]:
# Analyze token count distribution in short-prompt-dolly-data.csv


import csv
import statistics
from collections import Counter

input_path = 'training_data/short-prompt-dolly-data.csv'

# Load the token count of every row in the filtered file.
with open(input_path, 'r', encoding='utf-8') as infile:
    token_counts = [int(row['original_token_count']) for row in csv.DictReader(infile)]

print(f"Total rows: {len(token_counts)}")
if token_counts:
    print(f"Min token count: {min(token_counts)}")
    print(f"Max token count: {max(token_counts)}")
    # statistics.median averages the two middle values when the row count is even;
    # indexing sorted(...)[n // 2] would report the upper-middle value instead.
    print(f"Median token count: {statistics.median(token_counts)}")

    # Bucket edges: each bucket covers (lower, upper], labelled "lower+1-upper".
    buckets = [0, 32, 64, 96, 128]
    edge_pairs = list(zip(buckets, buckets[1:]))
    low_label = f"0-{buckets[0]}"
    over_label = f">{buckets[-1]}"
    bucket_counts = Counter()
    for count in token_counts:
        if count <= buckets[0]:
            bucket_counts[low_label] += 1
        elif count > buckets[-1]:
            # Values past the last edge are tallied rather than silently dropped,
            # so the distribution always sums to the total row count.
            bucket_counts[over_label] += 1
        else:
            for lower, upper in edge_pairs:
                if lower < count <= upper:
                    bucket_counts[f"{lower+1}-{upper}"] += 1
                    break

    print("Token count distribution:")
    for bucket in [low_label] + [f"{lower+1}-{upper}" for lower, upper in edge_pairs]:
        print(f"  {bucket}: {bucket_counts[bucket]}")
    # Only shown when the file actually contains rows above the last bucket edge.
    if bucket_counts[over_label]:
        print(f"  {over_label}: {bucket_counts[over_label]}")
else:
    print("No rows found in file.")

Total rows: 14743
Min token count: 1
Max token count: 128
Median token count: 12
Token count distribution:
  0-0: 0
  1-32: 13741
  33-64: 868
  65-96: 112
  97-128: 22


In [2]:
# Filter prompts <=64 tokens, save, and analyze distribution
import csv
import statistics
from collections import Counter

input_path = 'training_data/short-prompt-dolly-data.csv'
output_path = 'training_data/very-short-prompt-dolly-data.csv'
token_counts = []
rows = []

# Keep only rows of at most 64 tokens, remembering both the row and its parsed count.
with open(input_path, 'r', encoding='utf-8') as infile:
    for row in csv.DictReader(infile):
        count = int(row['original_token_count'])
        if count <= 64:
            rows.append({'original': row['original'], 'original_token_count': row['original_token_count']})
            token_counts.append(count)

# Write the filtered subset with the same two-column layout as the input.
with open(output_path, 'w', encoding='utf-8', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=['original', 'original_token_count'])
    writer.writeheader()
    writer.writerows(rows)

print(f"Filtered prompts saved to {output_path}")
print(f"Total rows: {len(token_counts)}")
if token_counts:
    print(f"Min token count: {min(token_counts)}")
    print(f"Max token count: {max(token_counts)}")
    # statistics.median averages the two middle values when the row count is even;
    # indexing sorted(...)[n // 2] would report the upper-middle value instead.
    print(f"Median token count: {statistics.median(token_counts)}")

    # Bucket edges: each bucket covers (lower, upper], labelled "lower+1-upper".
    # Every count is <= 64 here because of the filter above, so no overflow bucket is needed.
    buckets = [0, 16, 32, 48, 64]
    edge_pairs = list(zip(buckets, buckets[1:]))
    low_label = f"0-{buckets[0]}"
    bucket_counts = Counter()
    for count in token_counts:
        if count <= buckets[0]:
            bucket_counts[low_label] += 1
            continue
        for lower, upper in edge_pairs:
            if lower < count <= upper:
                bucket_counts[f"{lower+1}-{upper}"] += 1
                break

    print("Token count distribution:")
    for bucket in [low_label] + [f"{lower+1}-{upper}" for lower, upper in edge_pairs]:
        print(f"  {bucket}: {bucket_counts[bucket]}")
else:
    print("No rows found with token count <= 64.")

Filtered prompts saved to training_data/very-short-prompt-dolly-data.csv
Total rows: 14609
Min token count: 1
Max token count: 64
Median token count: 12
Token count distribution:
  0-0: 0
  1-16: 10807
  17-32: 2934
  33-48: 663
  49-64: 205


In [1]:
# ---
# 1. Recalculate Token Counts and Compression Ratios, output to new file (with Rouge columns)
import csv
from pathlib import Path
from transformers import AutoTokenizer

def count_tokens(text):
    """Return how many tokens the module-level `tokenizer` produces for `text`.

    Empty or missing text (``''`` / ``None``) counts as 0 without touching the
    tokenizer. Special tokens are excluded via ``add_special_tokens=False``.
    """
    # Only the length of the encoding is used. The tokenizer may still log a
    # "sequence length is longer than the specified maximum" warning for long
    # inputs, as seen in this cell's recorded output.
    return len(tokenizer.encode(text, add_special_tokens=False)) if text else 0

SOURCE_PATH = Path('training_data/dolly-summarization-data-rouge.csv')
OUTPUT_PATH = Path('training_data/dolly-prompt-compression.csv')
# NOTE(review): machine-specific absolute path. When it does not exist, the
# fallback tokenizer identifier below is loaded instead.
TOKENIZER_PATH = Path('/Users/dotslashderek/workspace/Gravitee/small-prompt-compression')
FALLBACK_TOKENIZER_NAME = 'dotslashderek/short-prompt-compressor'

# Prefer the local tokenizer directory; otherwise load by name.
tokenizer_source = str(TOKENIZER_PATH) if TOKENIZER_PATH.exists() else FALLBACK_TOKENIZER_NAME
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, use_fast=True)

rows = []
with SOURCE_PATH.open('r', encoding='utf-8', newline='') as infile:
    reader = csv.DictReader(infile)
    # Work on a copy of the header: appending to reader.fieldnames itself would
    # mutate the reader's own column list while it is still being iterated.
    fieldnames = list(reader.fieldnames or [])
    # Ensure the recomputed columns are present in the output header.
    for col in ('original_token_count', 'compressed_token_count', 'compression_ratio'):
        if col not in fieldnames:
            fieldnames.append(col)

    for row in reader:
        # Token counts are computed on whitespace-stripped text; the compressed
        # text may live under either 'compressed' or 'compressed_prompt'.
        orig = (row.get('original') or '').strip()
        comp = (row.get('compressed') or row.get('compressed_prompt') or '').strip()
        orig_tok = count_tokens(orig)
        comp_tok = count_tokens(comp)
        row['original_token_count'] = str(orig_tok)
        row['compressed_token_count'] = str(comp_tok)
        # The ratio is left blank when the original is empty (no division by zero).
        row['compression_ratio'] = f"{comp_tok/orig_tok:.4f}" if orig_tok > 0 else ''
        rows.append(row)

with OUTPUT_PATH.open('w', encoding='utf-8', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

print(f"Updated token counts and compression ratios for {len(rows)} rows in {OUTPUT_PATH}.")


Token indices sequence length is longer than the specified maximum sequence length for this model (650 > 512). Running this sequence through the model will result in indexing errors


Updated token counts and compression ratios for 14779 rows in training_data/dolly-prompt-compression.csv.


In [3]:

# ---
# 2. Create Filtered Very-Short-Prompt CSV (v2) with Post-Processing
# Keeps rows whose original prompt is at most 64 tokens and post-processes the compressed text.
# NOTE: this cell uses `Path`, `csv` and `count_tokens` without defining them; they must
# already be in the kernel from the cell that recalculates token counts.
import re

# Input is the file written by the token-count recalculation cell.
INPUT_PATH = Path('training_data/dolly-prompt-compression.csv')
OUTPUT_PATH = Path('training_data/very-short-prompt-dolly-data-v2.csv')

def postprocess_compressed(text):
    """Drop English articles and trailing sentence punctuation from a compressed prompt.

    Texts of three words or fewer are returned unchanged apart from having
    their surrounding whitespace stripped.

    Args:
        text: Compressed prompt string.

    Returns:
        The remaining words joined by single spaces, with the whole trailing
        run of '.', '?', '!' or '…' (and any whitespace between those marks)
        removed.
    """
    stripped = text.strip()
    words = stripped.split()
    if len(words) <= 3:
        return stripped
    kept = [w for w in words if w.lower() not in {'the', 'an', 'a'}]
    # Strip the entire trailing run of punctuation and whitespace in one pass.
    # Removing a single trailing token and then a single punctuation run leaves
    # marks behind for inputs such as "w x y z . . .".
    return re.sub(r'[\s.?!…]+$', '', ' '.join(kept))

# Stream the recomputed dataset, keep rows of at most 64 original tokens, and
# write them with the post-processed compressed text and its fresh token count.
with INPUT_PATH.open('r', encoding='utf-8', newline='') as infile, OUTPUT_PATH.open('w', encoding='utf-8', newline='') as outfile:
    reader = csv.DictReader(infile)
    fieldnames = ['original', 'original_token_count', 'compressed', 'compressed_token_count']
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    kept = 0
    for row in reader:
        # `or 0` also covers blank cells ('') and short rows (None), which a
        # .get() default alone does not; this matches the text fallbacks below.
        orig_tok = int(row.get('original_token_count') or 0)
        if orig_tok > 64:
            continue
        orig = (row.get('original') or '').strip()
        comp = (row.get('compressed') or row.get('compressed_prompt') or '').strip()
        comp_post = postprocess_compressed(comp)
        writer.writerow({
            'original': orig,
            'original_token_count': orig_tok,
            'compressed': comp_post,
            # Recounted because post-processing changes the compressed text.
            'compressed_token_count': count_tokens(comp_post),
        })
        kept += 1
print(f"Wrote {kept} filtered and post-processed rows to {OUTPUT_PATH}.")

Wrote 14514 filtered and post-processed rows to training_data/very-short-prompt-dolly-data-v2.csv.


In [4]:
# ---
# 3. Create Filtered Short-Prompt CSV with Post-Processing
# Same pipeline as the very-short-prompt (v2) cell, but keeps rows of up to 128 tokens
# and writes dolly-short-prompt-compression.csv.
# NOTE: this cell uses `Path`, `csv` and `count_tokens` without defining them; they must
# already be in the kernel from the cell that recalculates token counts.
import re

# Input is the file written by the token-count recalculation cell.
INPUT_PATH = Path('training_data/dolly-prompt-compression.csv')
OUTPUT_PATH = Path('training_data/dolly-short-prompt-compression.csv')

def postprocess_compressed(text):
    """Drop English articles and trailing sentence punctuation from a compressed prompt.

    Texts of three words or fewer are returned unchanged apart from having
    their surrounding whitespace stripped.

    Args:
        text: Compressed prompt string.

    Returns:
        The remaining words joined by single spaces, with the whole trailing
        run of '.', '?', '!' or '…' (and any whitespace between those marks)
        removed.
    """
    stripped = text.strip()
    words = stripped.split()
    if len(words) <= 3:
        return stripped
    kept = [w for w in words if w.lower() not in {'the', 'an', 'a'}]
    # Strip the entire trailing run of punctuation and whitespace in one pass.
    # Removing a single trailing token and then a single punctuation run leaves
    # marks behind for inputs such as "w x y z . . .".
    return re.sub(r'[\s.?!…]+$', '', ' '.join(kept))

# Stream the recomputed dataset, keep rows of at most 128 original tokens, and
# write them with the post-processed compressed text and its fresh token count.
with INPUT_PATH.open('r', encoding='utf-8', newline='') as infile, OUTPUT_PATH.open('w', encoding='utf-8', newline='') as outfile:
    reader = csv.DictReader(infile)
    fieldnames = ['original', 'original_token_count', 'compressed', 'compressed_token_count']
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    kept = 0
    for row in reader:
        # `or 0` also covers blank cells ('') and short rows (None), which a
        # .get() default alone does not; this matches the text fallbacks below.
        orig_tok = int(row.get('original_token_count') or 0)
        if orig_tok > 128:
            continue
        orig = (row.get('original') or '').strip()
        comp = (row.get('compressed') or row.get('compressed_prompt') or '').strip()
        comp_post = postprocess_compressed(comp)
        writer.writerow({
            'original': orig,
            'original_token_count': orig_tok,
            'compressed': comp_post,
            # Recounted because post-processing changes the compressed text.
            'compressed_token_count': count_tokens(comp_post),
        })
        kept += 1
print(f"Wrote {kept} filtered and post-processed rows to {OUTPUT_PATH}.")

Wrote 14739 filtered and post-processed rows to training_data/dolly-short-prompt-compression.csv.


In [6]:
# ---
# Split short and very-short prompt compression datasets into train/test (90/10)
import csv
import random
from pathlib import Path

def split_and_save(input_path, train_path, test_path, seed=42, train_frac=0.9):
    """Shuffle a CSV's rows deterministically and write train/test splits.

    Args:
        input_path: CSV file to read (first line is the header).
        train_path: Destination for the first ``int(n_rows * train_frac)`` shuffled rows.
        test_path: Destination for the remaining shuffled rows.
        seed: Seed for the private ``random.Random`` used to shuffle.
        train_frac: Fraction of rows assigned to the training split.

    Both output files get the input's header, even when the input has no data
    rows. A one-line summary of the split sizes is printed.
    """
    with open(input_path, 'r', encoding='utf-8', newline='') as infile:
        reader = csv.DictReader(infile)
        # Take the header from the reader rather than from the first row, so it
        # is preserved for a header-only file instead of becoming an empty line.
        fieldnames = list(reader.fieldnames or [])
        rows = list(reader)

    # A dedicated Random instance keeps the shuffle reproducible without
    # touching the global random state.
    random.Random(seed).shuffle(rows)
    n_train = int(len(rows) * train_frac)
    train_rows, test_rows = rows[:n_train], rows[n_train:]

    for path, subset in ((train_path, train_rows), (test_path, test_rows)):
        with open(path, 'w', encoding='utf-8', newline='') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(subset)
    print(f"{input_path}: {len(train_rows)} train, {len(test_rows)} test rows written.")

# Each entry is (input CSV, train output, test output); all use the default seed and 90/10 split.
# NOTE(review): these paths carry a 'src/' prefix, while the cells above read and write under
# 'training_data/' directly. No visible cell writes 'dolly-very-short-prompt-compression.csv'
# (the very-short cell writes 'very-short-prompt-dolly-data-v2.csv'). Confirm the working
# directory and file names before relying on Restart & Run All.
for source_csv, train_csv, test_csv in (
    (
        'src/training_data/dolly-short-prompt-compression.csv',
        'src/training_data/dsp-train.csv',
        'src/training_data/dsp-test.csv',
    ),
    (
        'src/training_data/dolly-very-short-prompt-compression.csv',
        'src/training_data/dvsp-train.csv',
        'src/training_data/dvsp-test.csv',
    ),
):
    split_and_save(source_csv, train_csv, test_csv)

src/training_data/dolly-short-prompt-compression.csv: 13265 train, 1474 test rows written.
src/training_data/dolly-very-short-prompt-compression.csv: 13062 train, 1452 test rows written.
