In [1]:
# Cell to load Dolly 15k dataset and print first ten entries (rows)
from datasets import load_dataset

# Pull the Dolly 15k dataset from Hugging Face and preview its first ten train rows
try:
    dolly = load_dataset("databricks/databricks-dolly-15k")
    train_split = dolly["train"]
    separator = "-" * 40
    # Rows are fetched by 0-based position but labelled starting from 1
    for number in range(1, 11):
        entry = train_split[number - 1]
        print(f"Entry {number}:")
        print(entry)
        print(separator)
except Exception as e:
    print(f"Error loading dataset: {e}")

  from .autonotebook import tqdm as notebook_tqdm


Entry 1:
{'instruction': 'When did Virgin Australia start operating?', 'context': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.", 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}
----------------------------------------
Entry 2:
{'instruction': 'Which is a species of fish? Tope or Rope', 'context': '', 'response': 'Tope', 'category': 'classification'}
----------------------------------------
Entry 3:
{'instruction': 'Why can camels 

In [2]:
# Cell to create dolly-15k-summarizations.csv with 'original' column from Dolly instructions
from datasets import load_dataset
import csv

# Load Dolly 15k dataset and seed a two-column CSV: every instruction goes in
# "original", and "summarization" is left blank to be filled in later.
import os  # imported here so this cell can create its output directory on its own

try:
    dolly = load_dataset("databricks/databricks-dolly-15k")
    instructions = dolly["train"]["instruction"]
    output_path = "training_data/dolly-15k-summarizations.csv"
    # open(..., "w") does not create missing parent directories, so make sure
    # training_data/ exists before writing (no-op when it is already there).
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # newline="" lets the csv module control line endings itself.
    with open(output_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["original", "summarization"])
        for instr in instructions:
            writer.writerow([instr, ""])
    print(f"Created {output_path} with {len(instructions)} rows.")
except Exception as e:
    print(f"Error creating CSV: {e}")

Created training_data/dolly-15k-summarizations.csv with 15011 rows.


In [5]:
!pip install --quiet openai tiktoken
import csv
import openai
import tiktoken
import time
import os
import shutil
import concurrent.futures

# Read the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret never has to be typed into (or committed with) the notebook.
# Falls back to an empty string when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# CSV produced by the earlier cell (one instruction per row in "original").
input_path = "training_data/dolly-15k-summarizations.csv"
# CSV this cell fills in with compressions and token statistics.
output_path = "training_data/dolly-15k-compressions-gpt-test.csv"
# Scratch file that is written first and then moved over output_path.
temp_output_path = output_path + ".tmp"

# System message sent with every request; it defines the compression task.
system_prompt = (
    "You are a prompt compression model. Rewrite user prompts for chatbots and LLMs to be as short as possible, using the fewest possible words and tokens, while preserving every detail, nuance, and requirement. "
    "Prefer entity-only answers or keyword lists when possible. Do not add any labels, explanations, or extra words. If the prompt cannot be compressed without losing information, return it unchanged. "
    "The output must never be longer than the input."
)

# Pre-populate output file if it doesn't exist: copy every "original" value
# from the input CSV and leave the compression/statistics columns blank.
if not os.path.exists(output_path):
    fieldnames = ["original", "compression", "original_token_count", "compression_token_count", "compression_ratio"]
    # Both files are opened with newline="" as the csv module requires, so
    # newlines embedded in quoted instructions are read back unchanged.
    # The copy is built under the temp name and only moved into place once it
    # is complete; an interrupted run therefore cannot leave a truncated
    # output file that later runs would mistake for the full dataset.
    with open(input_path, "r", encoding="utf-8", newline="") as infile, \
            open(temp_output_path, "w", encoding="utf-8", newline="") as outfile:
        reader = csv.DictReader(infile)
        writer = csv.DictWriter(outfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        for row in reader:
            writer.writerow({
                "original": row["original"],
                "compression": "",
                "original_token_count": "",
                "compression_token_count": "",
                "compression_ratio": ""
            })
    shutil.move(temp_output_path, output_path)

# Read all rows from output file (to resume if interrupted).
# Rows whose "compression" cell is already filled are treated as done later on.
# newline="" is what the csv module expects for reading; without it, newlines
# embedded inside quoted fields may be translated and the text altered.
with open(output_path, "r", encoding="utf-8", newline="") as outfile:
    reader = csv.DictReader(outfile)
    rows = list(reader)

# Pick the tokenizer used for all token counts. If tiktoken cannot resolve the
# model name, fall back to the cl100k_base encoding so counting still works
# (counts are then an approximation for this model — NOTE(review): confirm the
# fallback encoding is the one you want).
try:
    enc = tiktoken.encoding_for_model("gpt-5-nano")
except Exception:
    enc = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    """Return the number of tokens `text` encodes to under the module-level `enc`.

    Args:
        text: The string to measure.

    Returns:
        int: Length of the encoded token sequence.
    """
    # By default tiktoken raises ValueError when the text contains a
    # special-token string such as "<|endoftext|>". Dataset rows and model
    # replies are arbitrary text, so encode such strings as ordinary text
    # instead of letting a single row abort the whole run.
    return len(enc.encode(text, disallowed_special=()))

def clean_summary(summary):
    """Strip known leading labels from a model reply and normalise whitespace.

    The labels "Summary:", "Question:" and "Riddle:" are checked once each, in
    that order; a label is removed only if the (trimmed) text starts with it at
    the moment it is checked. Afterwards every run of whitespace is collapsed
    to a single space and the ends are trimmed.

    Args:
        summary: Raw text to clean.

    Returns:
        str: The cleaned, single-spaced text.
    """
    remaining = summary
    for label in ("Summary:", "Question:", "Riddle:"):
        candidate = remaining.strip()
        if not candidate.startswith(label):
            continue
        remaining = candidate[len(label):].strip()
    # split() with no arguments already discards leading/trailing whitespace
    return " ".join(remaining.split())

def compress_instruction(instruction):
    """Ask the chat model to compress a single instruction.

    Sends `system_prompt` plus the instruction to the "gpt-5-nano" model and
    returns the reply after passing it through `clean_summary`. The function
    reads the module-level `openai` client and `system_prompt` but does not
    modify any shared variables, which is why it is run from worker threads.

    Args:
        instruction: The prompt text to compress (sent as the user message).

    Returns:
        str: The cleaned model reply, or the string "[ERROR: <exception>]" if
        anything goes wrong. Exceptions are reported this way rather than
        raised, so one failed call does not abort a whole batch.
    """
    # Each log line is emitted with its newline in the same string: print()
    # otherwise writes the text and the newline separately, and lines coming
    # from concurrent workers end up fused together.
    print(f"Calling LLM for instruction: {instruction}\n", end="")
    try:
        response = openai.chat.completions.create(
            model="gpt-5-nano",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": instruction}
            ]
        )
        content = response.choices[0].message.content
        if content is None:
            # Report a meaningful error instead of an AttributeError on None.
            raise ValueError("model returned no message content")
        return clean_summary(content.strip())
    except Exception as e:
        print(f"Error compressing: {e}\n", end="")
        return f"[ERROR: {e}]"

# Batch driver: compress every row that still has an empty "compression" cell,
# `batch_size` rows at a time with `max_workers` concurrent LLM calls, and
# checkpoint the whole table to disk after each batch so an interrupted run can
# resume where it left off.
batch_size = 100
max_workers = 5  # Number of concurrent LLM calls
output_fieldnames = ["original", "compression", "original_token_count", "compression_token_count", "compression_ratio"]
error_prefix = "[ERROR:"  # prefix compress_instruction() uses to report a failed call


def _needs_compression(row):
    """Return True when the row's "compression" cell is missing or blank."""
    return not (row.get("compression") or "").strip()


def _write_checkpoint(all_rows):
    """Write every row to the temp file, then move it over the output file."""
    with open(temp_output_path, "w", encoding="utf-8", newline="") as tmpfile:
        writer = csv.DictWriter(tmpfile, fieldnames=output_fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows(all_rows)
    shutil.move(temp_output_path, output_path)


# `rows` is updated in place and is the single source of truth. Work is driven
# from a precomputed list of pending positions, so each row is visited exactly
# once and rows that were already compressed keep their position in the file.
pending_indices = [idx for idx, row in enumerate(rows) if _needs_compression(row)]
processed = 0
failed = 0
for batch_start in range(0, len(pending_indices), batch_size):
    batch_indices = pending_indices[batch_start:batch_start + batch_size]
    batch_rows = [rows[idx]["original"] for idx in batch_indices]
    # Run concurrent LLM calls for this batch (executor.map preserves input order)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        compressions = list(executor.map(compress_instruction, batch_rows))
    for idx, instr, comp in zip(batch_indices, batch_rows, compressions):
        if comp.startswith(error_prefix):
            # A failed call must not be stored as if it were a compression;
            # leaving the cell blank means the next run retries this row.
            failed += 1
            print(f"Row {idx+1}: LLM call failed, left blank for retry\n{'-'*40}")
            continue
        orig_tokens = count_tokens(instr)
        comp_clean = clean_summary(comp)
        # Keep the original text when the reply is empty or not actually shorter.
        if not comp_clean or count_tokens(comp_clean) >= orig_tokens:
            comp_clean = instr
        comp_tokens = count_tokens(comp_clean)
        compression_ratio = round(comp_tokens / orig_tokens, 2) if orig_tokens else 1.0
        rows[idx] = {
            "original": instr,
            "compression": comp_clean,
            "original_token_count": orig_tokens,
            "compression_token_count": comp_tokens,
            "compression_ratio": compression_ratio
        }
        processed += 1
        print(f"Row {idx+1}: orig_tokens={orig_tokens}, comp_tokens={comp_tokens}, compression_ratio={compression_ratio}\n{'-'*40}")
    # Checkpoint the full table after every batch
    _write_checkpoint(rows)
    print(f"Batch of {len(batch_rows)} compressions written to {output_path} (up to row {batch_indices[-1]+1})")

# Final write so the file reflects `rows` even when nothing was pending
results = rows
_write_checkpoint(results)
if failed:
    print(f"Compressed {processed} rows; {failed} rows failed and were left blank for a re-run. Output written to {output_path}")
else:
    print(f"Completed compression for all {len(results)} rows. Output written to {output_path}")



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Calling LLM for instruction: When did Virgin Australia start operating?Calling LLM for instruction: Which is a species of fish? Tope or Rope

Calling LLM for instruction: Why can camels survive for long without water?
Calling LLM for instruction: Alice's parents have three daughters: Amy, Jessy, and what’s the name of the third daughter?
Calling LLM for instruction: When was Tomoaki Komorida born?
Calling LLM for instruction: If I have more pieces at the time of stalemate, have I won?
Calling LLM for instruction: Given a reference text about Lollapalooza, where does it take place, who started it and what is it?
Calling LLM for instruction: Who gave the UN the land in NY to build their HQ
Calling LLM for instruction: Why mobile is bad fo