In [3]:
import csv

input_path = 'training_data/validation_examples.csv'
output_path = 'training_data/validation_examples_repaired.csv'

with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    header = next(reader)
    writer.writerow(header)
    good, bad = 0, 0
    for row in reader:
        if len(row) == 4:
            writer.writerow(row)
            good += 1
        else:
            bad += 1
print(f"Repair complete. {good} good rows written, {bad} bad rows skipped. Cleaned file: {output_path}")


Repair complete. 1527 good rows written, 4 bad rows skipped. Cleaned file: training_data/validation_examples_repaired.csv


In [4]:
import pandas as pd

input_path = 'training_data/validation_examples_repaired.csv'
output_path = 'training_data/validation_examples_trimmed.csv'

def trim_csv(input_path, output_path, chunk_size=500):
    first = True
    for chunk in pd.read_csv(input_path, chunksize=chunk_size):
        trimmed = chunk[['original', 'summarization']]
        trimmed.to_csv(output_path, mode='w' if first else 'a', index=False, header=first)
        first = False

trim_csv(input_path, output_path)
print(f"Trimmed CSV written to {output_path}")


Trimmed CSV written to training_data/validation_examples_trimmed.csv


In [5]:
import pandas as pd
from transformers import pipeline
from evaluate import load

# Load trimmed CSV
input_path = 'training_data/validation_examples_trimmed.csv'
df = pd.read_csv(input_path)

# Randomly select 10 examples
sample = df.sample(n=10, random_state=42)

# Load summarization pipeline (facebook/bart-large-cnn)
summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
rouge = load('rouge')

generated_summaries = []
reference_summaries = []

for idx, row in sample.iterrows():
    prompt = row['original']
    reference = row['summarization']
    # Optionally prepend 'summarize: ' for consistency with test-summarization-model
    # prompt = 'summarize: ' + prompt
    result = summarizer(prompt, max_length=200, min_length=10, do_sample=False)
    generated = result[0]['summary_text']
    generated_summaries.append(generated)
    reference_summaries.append(reference)
    print(f"\nPrompt: {prompt}\nGenerated: {generated}\nReference: {reference}")

# Compute ROUGE scores
scores = rouge.compute(predictions=generated_summaries, references=reference_summaries)
print("\nROUGE scores (aggregated):")
for k, v in scores.items():
    print(f"{k}: {v:.4f}")

print("\nROUGE-L for each example:")
for i, (gen, ref) in enumerate(zip(generated_summaries, reference_summaries)):
    score = rouge.compute(predictions=[gen], references=[ref])
    print(f"Example {i+1}: ROUGE-L: {score['rougeL']:.4f}")


  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0
Your max_length is set to 200, but your input_length is only 57. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max_length is set to 200, but your input_length is only 17. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)



Prompt: We’re a small tech company migrating from on-prem infrastructure to AWS. Our team lacks cloud expertise but must ensure security and uptime during transition. Could you propose a phased migration plan with architecture recommendations, compliance checkpoints, and rollback procedures that allow zero downtime?
Generated: Small tech company migrating from on-prem infrastructure to AWS. Company lacks cloud expertise but must ensure security and uptime. Could you propose a phased migration plan with architecture recommendations, compliance checkpoints, and rollback procedures that allow zero downtime?
Reference: Create a phased AWS migration plan for a small tech firm including architecture design, security compliance, and rollback options ensuring zero downtime.


Your max_length is set to 200, but your input_length is only 58. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=29)



Prompt: Describe the steps to create a secure password and tips for remembering it.
Generated: . Describe the steps to create a secure password and tips for remembering it.
Reference: List steps to create a secure password and tips for remembering it.


Your max_length is set to 200, but your input_length is only 59. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=29)



Prompt: I’m drafting a policy memo on the ethics of AI surveillance in public spaces. Include background, stakeholder analysis, risk taxonomy (privacy, chilling effects, bias), and potential regulatory responses. Suggest international comparisons and propose a balanced framework preserving safety and civil liberties.
Generated: I’m drafting a policy memo on the ethics of AI surveillance in public spaces. Include background, stakeholder analysis, risk taxonomy (privacy, chilling effects, bias) Suggest international comparisons and propose a balanced framework preserving safety and civil liberties.
Reference: I’m drafting a policy memo on the ethics of AI surveillance in public spaces. Include background, stakeholder analysis, risk taxonomy (privacy, chilling effects, bias), and potential regulatory responses.


Your max_length is set to 200, but your input_length is only 45. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)



Prompt: Could you help design a science club for high school students? Please include: (1) ideas for weekly activities, (2) tips for recruiting members, (3) advice for involving parents, (4) ways to showcase student projects, and (5) resources for learning more.
Generated: Could you help design a science club for high school students? Please include: (1) ideas for weekly activities, (2) tips for recruiting members, (3) advice for involving parents.
Reference: High school science club: weekly activities, member recruitment, parent involvement, project showcases, and learning resources.


Your max_length is set to 200, but your input_length is only 60. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)



Prompt: I’m preparing an investor brief for a company developing solid‑state batteries. Outline technology readiness, supply‑chain dependencies, competitive analysis, and scaling challenges. Include risk factors and policy incentives supporting adoption.
Generated: I’m preparing an investor brief for a company developing solid‑state batteries. Outline technology readiness, supply‑chain dependencies and competitive analysis. Include risk factors and policy incentives supporting adoption.
Reference: I’m preparing an investor brief for a company developing solid‑state batteries. Outline technology readiness, supply‑chain dependencies, competitive analysis, and scaling challenges.


Your max_length is set to 200, but your input_length is only 38. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)



Prompt: Write a technical explainer for a general audience comparing neural radiance fields (NeRFs) and traditional 3D modeling. Cover underlying principles, computational trade-offs, and practical use cases (AR/VR, film, gaming). End with a future outlook section and open research questions.
Generated: Write a technical explainer for a general audience comparing neural radiance fields (NeRFs) and traditional 3D modeling. Cover underlying principles, computational trade-offs, and practical use cases. End with a future outlook section and open research questions.
Reference: Write a technical explainer for a general audience comparing neural radiance fields (NeRFs) and traditional 3D modeling. Cover underlying principles, computational trade-offs, and practical use cases (AR/VR, film, gaming).


Your max_length is set to 200, but your input_length is only 42. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)



Prompt: I’m helping a national park service redesign its visitor experience to balance conservation and tourism. Could you propose visitor flow management, interpretive design, and partnerships with local businesses?
Generated: A national park service is redesigning its visitor experience to balance conservation and tourism. Could you propose visitor flow management, interpretive design, and partnerships with local businesses?
Reference: Develop a sustainable park tourism strategy combining visitor flow design, educational interpretation, and local business collaboration.


Your max_length is set to 200, but your input_length is only 33. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)



Prompt: I’m creating an interactive public website to visualize air pollution exposure data by neighborhood. Could you propose design principles, accessibility features, and communication strategies that balance scientific accuracy with clarity for the public?
Generated: I’m creating an interactive public website to visualize air pollution exposure data by neighborhood. Could you propose design principles, accessibility features, and communication strategies that balance scientific accuracy with clarity for the public?
Reference: Develop a public-facing air pollution visualization site with accessible design, clear storytelling, and scientifically accurate communication.


Your max_length is set to 200, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)



Prompt: Our performing arts center wants to become a carbon-neutral venue within five years. Could you outline operational upgrades, renewable procurement options, and audience engagement programs?
Generated: Performing arts center wants to become a carbon-neutral venue within five years. Could you outline operational upgrades, renewable procurement options, and audience engagement programs?
Reference: Create a five-year carbon-neutral plan for a performing arts venue including facility upgrades, renewable sourcing, and audience engagement.

Prompt: Our university’s research office wants a standardized way to showcase faculty projects online. Could you design a data schema, submission workflow, and tagging system that help students and partners discover relevant work? Include long-term maintenance ideas.
Generated: Our university’s research office wants a standardized way to showcase faculty projects online. Could you design a data schema, submission workflow, and tagging system? Incl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Example 4: ROUGE-L: 0.1860
Example 5: ROUGE-L: 0.8148
Example 6: ROUGE-L: 0.7941
Example 7: ROUGE-L: 0.3721
Example 8: ROUGE-L: 0.2353
Example 9: ROUGE-L: 0.4000
Example 10: ROUGE-L: 0.3529


In [None]:
!pip install evaluate rouge_score

# 1. Log in to Hugging Face
from huggingface_hub import notebook_login
from google.colab import userdata

userdata.get('HF_TOKEN')
notebook_login()

# 2. Read CSV and split into train/test
import pandas as pd
from sklearn.model_selection import train_test_split

csv_path = 'sample_data/validation_examples_trimmed.csv'
df = pd.read_csv(csv_path)
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)

# 3. Convert to Hugging Face Dataset
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

# 4. Load tokenizer and model
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# 5. Preprocess data (updated target tokenization)
max_input_length = 512
max_target_length = 128


def preprocess_function(examples):
    inputs = examples["original"]
    targets = examples["summarization"]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True
    )
    with tokenizer.as_target_tokenizer():
        pass  # no-op to silence very old notebooks; using text_target below
    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_datasets = dataset.map(preprocess_function, batched=True,
                                 remove_columns=[c for c in df.columns if c not in ("original", "summarization")])

# 6. Load model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# 7. Metrics
import evaluate

rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # predictions may come as tuple(logits, ...) depending on HF internals
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them
    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Return mid.fmeasure for each ROUGE variant
    return {k: v.mid.fmeasure if hasattr(v, "mid") else v for k, v in result.items()}


# 8. Training args
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",  # <-- match eval
    predict_with_generate=True,
    generation_max_length=128,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=2,
    push_to_hub=True,
    hub_model_id="dotslashderek/bart-large-cnn-finetuned-summarization",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    logging_strategy="epoch"  # optional: keeps logs aligned
)

# 9. Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 10. Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 11. Train and push to hub
trainer.train()
trainer.push_to_hub()


In [6]:
# Another attempt and include some baseline testing against the main model (again, run in colab)

# 0. Repro & device helpers
import random, numpy as np, torch


def set_seed(seed=42):
    random.seed(seed);
    np.random.seed(seed);
    torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)


set_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
supports_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8  # Ampere+

# 1. Log in to Hugging Face (token handling up to you)
from huggingface_hub import notebook_login
from google.colab import userdata

_ = userdata.get('HF_TOKEN')  # optional if you use Secrets
notebook_login()

# 2. Read CSV and split into train/test
import pandas as pd
from sklearn.model_selection import train_test_split

csv_path = 'sample_data/validation_examples_trimmed.csv'
df = pd.read_csv(csv_path)
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)

# 3. Convert to Hugging Face Dataset
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

# 4. Tokenizer & model
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments, Seq2SeqTrainer
)

model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

# Generation defaults (used by Trainer.generate & our baseline)
# These live on the model's generation config
model.generation_config.num_beams = 4
model.generation_config.no_repeat_ngram_size = 3
model.generation_config.length_penalty = 1.0

# 5. Preprocess
max_input_length = 512
max_target_length = 128


def preprocess_function(examples):
    inputs = examples["original"]
    targets = examples["summarization"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# drop original columns; keep tokenized fields only
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

# 6. Metrics
import evaluate

rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # replace -100 for decoding
    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: (v.mid.fmeasure if hasattr(v, "mid") else v) for k, v in result.items()}


# 7. Baseline evaluation on the *untuned* base model
@torch.inference_mode()
def baseline_eval(texts, refs, batch_size=8):
    preds = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        enc = tokenizer(batch, max_length=max_input_length, truncation=True, padding=True, return_tensors="pt").to(
            device)
        gen = model.generate(
            **enc,
            max_new_tokens=max_target_length,  # similar length budget as training
            num_beams=model.generation_config.num_beams,
            no_repeat_ngram_size=model.generation_config.no_repeat_ngram_size,
            length_penalty=model.generation_config.length_penalty
        )
        preds.extend(tokenizer.batch_decode(gen, skip_special_tokens=True))
    scores = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    return {k: (v.mid.fmeasure if hasattr(v, "mid") else v) for k, v in scores.items()}, preds


# Prepare test texts/refs for baseline
test_texts = test_df["original"].tolist()
test_refs = test_df["summarization"].tolist()
baseline_scores, baseline_preds = baseline_eval(test_texts, test_refs)
print("Baseline ROUGE (base BART, no fine-tune):", baseline_scores)
print("\nSample baseline predictions:")
for i in range(min(3, len(test_texts))):
    print(f"\n# {i + 1}\nINPUT: {test_texts[i][:200]}...")
    print(f"PRED : {baseline_preds[i][:200]}...")
    print(f"REF  : {test_refs[i][:200]}...")

# 8. Training args
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",  # must match when load_best_model_at_end=True
    predict_with_generate=True,
    generation_max_length=max_target_length,
    generation_num_beams=model.generation_config.num_beams,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    label_smoothing_factor=0.1,
    warmup_ratio=0.1,
    save_total_limit=2,
    push_to_hub=True,
    hub_model_id="dotslashderek/bart-large-cnn-finetuned-summarization",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    logging_strategy="epoch",
    gradient_checkpointing=True,
    fp16=not supports_bf16 and torch.cuda.is_available(),
    bf16=supports_bf16
)

# 9. Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 10. Trainer (use processing_class to silence deprecation)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,  # replaces deprecated "tokenizer" arg
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 11. Train and push to hub
trainer.train()
trainer.push_to_hub()


In [None]:
# This gave us dotslashderek/bart-large-cnn-prompt-summarization-v2 (most promising)
# Going to use this for initial quantization POC efforts
# Improvements would likely require more / better data

# Epoch	Training Loss	Validation Loss	Rouge1	Rouge2	Rougel	Rougelsum
# 1	2.381100	2.161340	0.594487	0.447341	0.549184	0.549877
# 2	1.905400	2.088969	0.612007	0.476901	0.571949	0.572764
# 3	1.796500	2.091182	0.617459	0.485547	0.579515	0.579718


# %%
# 🔧 Full training cell for BART summarization (Colab-ready)

# 0) Repro + device helpers
import os, random, numpy as np, torch, datetime as dt, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

def set_seed(seed=42):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
set_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
supports_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8  # Ampere+

# 2) Load data
csv_path = 'sample_data/validation_examples_trimmed.csv'
df = pd.read_csv(csv_path)
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)

# 3) Build HF datasets (keep raw `dataset` around for callbacks)
from datasets import Dataset, DatasetDict
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset  = Dataset.from_pandas(test_df,  preserve_index=False)
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

# 4) Tokenizer & model
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments, Seq2SeqTrainer,
    EarlyStoppingCallback, TrainerCallback
)

model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

# Generation defaults on the model (used by Trainer.generate & baseline)
gen_conf = model.generation_config
gen_conf.num_beams = 4
gen_conf.no_repeat_ngram_size = 3
gen_conf.length_penalty = 1.0
# Optional: encourage non-trivial summaries
# gen_conf.min_new_tokens = 15

# 5) Preprocess/tokenize
max_input_length  = 512
max_target_length = 128

def preprocess_function(examples):
    inputs  = examples["original"]
    targets = examples["summarization"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(
    preprocess_function, batched=True,
    remove_columns=dataset["train"].column_names
)

# 6) Metrics
import evaluate
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: (v.mid.fmeasure if hasattr(v, "mid") else v) for k, v in result.items()}

# 7) Baseline ROUGE with *untuned* base model
@torch.inference_mode()
def baseline_eval(texts, refs, batch_size=8):
    preds = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
            batch, max_length=max_input_length, truncation=True, padding=True, return_tensors="pt"
        ).to(device)
        gen = model.generate(
            **enc,
            max_new_tokens=max_target_length,
            num_beams=gen_conf.num_beams,
            no_repeat_ngram_size=gen_conf.no_repeat_ngram_size,
            length_penalty=gen_conf.length_penalty
        )
        preds.extend(tokenizer.batch_decode(gen, skip_special_tokens=True))
    scores = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    return {k: (v.mid.fmeasure if hasattr(v, "mid") else v) for k, v in scores.items()}, preds

test_texts = test_df["original"].tolist()
test_refs  = test_df["summarization"].tolist()
baseline_scores, baseline_preds = baseline_eval(test_texts, test_refs)
print("📊 Baseline ROUGE (base BART, no fine-tune):", baseline_scores)

# 8) Training args (matched eval/save; warmup + label smoothing; mixed precision; grad ckpt; better generation)
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    predict_with_generate=True,
    generation_max_length=max_target_length,
    generation_num_beams=gen_conf.num_beams,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    label_smoothing_factor=0.1,
    warmup_ratio=0.1,
    save_total_limit=2,
    push_to_hub=True,
    hub_model_id="dotslashderek/bart-large-cnn-prompt-summarization-v2",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    logging_strategy="epoch",
    gradient_checkpointing=True,
    fp16=(torch.cuda.is_available() and not supports_bf16),
    bf16=supports_bf16
)

# 9) Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 10) Rolling sample-predictions callback (appends to one log file)
# ✅ Drop-in fix: cast indices to Python ints (no NumPy ints)


class RollingSamplePredictionCallback(TrainerCallback):
    def __init__(self, tokenizer, dataset, num_samples=3, max_len=128, output_dir="./results"):
        self.tokenizer = tokenizer
        self.dataset = dataset          # <-- raw dataset with "original"/"summarization"
        self.num_samples = num_samples
        self.max_len = max_len
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        self.roll_path = os.path.join(self.output_dir, "samples_all.txt")

    def on_evaluate(self, args, state, control, model=None, **kwargs):
        model.eval()
        epoch = int(state.epoch or 0)
        stamp = dt.datetime.now().isoformat(timespec="seconds")

        # pick Python ints, not NumPy ints
        k = min(self.num_samples, len(self.dataset))
        idxs = random.sample(range(len(self.dataset)), k=k)   # <-- avoids np.int64

        header = f"\n\n📘 Epoch {epoch} — {stamp}\n" + ("-" * 100) + "\n"
        print(header)
        with open(self.roll_path, "a", encoding="utf-8") as f:
            f.write(header)
            for i, idx in enumerate(idxs, start=1):
                # ensure pure Python int indexing
                ex = self.dataset[int(idx)]
                inp = ex["original"]
                ref = ex["summarization"]

                enc = self.tokenizer(inp, return_tensors="pt", truncation=True, max_length=512).to(model.device)
                with torch.no_grad():
                    out = model.generate(
                        **enc,
                        max_new_tokens=self.max_len,
                        num_beams=4,
                        no_repeat_ngram_size=3,
                        length_penalty=1.0
                    )
                pred = self.tokenizer.decode(out[0], skip_special_tokens=True)

                entry = (
                        f"🟢 Sample {i}\n"
                        f"Input: {inp[:500]}...\n"
                        f"Pred : {pred[:500]}...\n"
                        f"Ref  : {ref[:500]}...\n"
                        + ("-" * 100) + "\n"
                )
                print(entry)
                f.write(entry)

        print(f"✅ Appended to {self.roll_path}")


# 11) Trainer + callbacks (processing_class fixes deprecation)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=2),
        RollingSamplePredictionCallback(tokenizer, dataset["test"], num_samples=3, max_len=128, output_dir="./results"),
    ],
)

# 12) Train + push
trainer.train()

for g in trainer.optimizer.param_groups:
    g["lr"] = 1e-5  # was 2e-5

trainer.args.generation_num_beams = 6
trainer.args.generation_max_length = 160
trainer.args.repetition_penalty = 1.1

trainer.args.num_train_epochs += 1
trainer.train(resume_from_checkpoint=True)

trainer.push_to_hub()

# 13) Quick plots (loss & ROUGE)
logs = pd.DataFrame(trainer.state.log_history)
logs.to_csv("./results/trainer_log_history.csv", index=False)

train_logs = logs[logs["loss"].notna()][["step", "loss"]].reset_index(drop=True)
eval_logs  = logs[logs["eval_loss"].notna()].reset_index(drop=True)

plt.figure(); plt.plot(train_logs["step"], train_logs["loss"])
plt.title("Training Loss vs Step"); plt.xlabel("Step"); plt.ylabel("Loss"); plt.grid(True); plt.show()

plt.figure(); plt.plot(eval_logs["epoch"], eval_logs["eval_loss"], marker="o")
plt.title("Validation Loss vs Epoch"); plt.xlabel("Epoch"); plt.ylabel("Eval Loss"); plt.grid(True); plt.show()

plt.figure()
for k, label in [("eval_rougeL","ROUGE-L"), ("eval_rouge1","ROUGE-1"), ("eval_rouge2","ROUGE-2")]:
    if k in eval_logs: plt.plot(eval_logs["epoch"], eval_logs[k], marker="o", label=label)
plt.title("ROUGE vs Epoch"); plt.xlabel("Epoch"); plt.ylabel("Score"); plt.legend(); plt.grid(True); plt.show()

print("Saved raw trainer logs to ./results/trainer_log_history.csv")


In [None]:
# %%
# 🚀 ONNX export + INT8 quant + CPU ROUGE compare (Colab-ready)

# --- 0) Install deps (quiet) ---
import sys, subprocess, pkgutil
def pip_install(pkgs):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-qU"] + pkgs)

need = []
for p in ["optimum", "onnxruntime", "onnx", "evaluate", "datasets", "transformers"]:
    if pkgutil.find_loader(p) is None:
        need.append(p)
if need:
    pip_install(["optimum[onnxruntime]", "onnx", "onnxruntime", "evaluate", "datasets", "transformers"])

# --- 1) Imports ---
import os, time, json, numpy as np, torch, pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from onnxruntime.quantization import quantize_dynamic, QuantType
import evaluate

torch.set_num_threads(1)  # less noisy timings; tune as you like

# --- 2) Config ---
MODEL_ID = "dotslashderek/bart-large-cnn-prompt-summarization-v2"
ONNX_DIR = "onnx-bart"
INT8_DIR = "onnx-bart-int8-dynamic"
CSV_PATH = "sample_data/validation_examples_trimmed.csv"
CALC_SAMPLES = 128                      # number of test rows for quick ROUGE
GEN_KW = dict(num_beams=4, no_repeat_ngram_size=3, length_penalty=1.0, max_new_tokens=160)

os.makedirs(ONNX_DIR, exist_ok=True)
os.makedirs(INT8_DIR, exist_ok=True)

# --- 3) Tokenizer & small eval set ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
df = pd.read_csv(CSV_PATH)
if not {"original","summarization"}.issubset(df.columns):
    raise ValueError("CSV must contain 'original' and 'summarization' columns.")
test_df = df.sample(n=min(CALC_SAMPLES, len(df)), random_state=42).reset_index(drop=True)
texts = test_df["original"].astype(str).tolist()
refs  = test_df["summarization"].astype(str).tolist()

# --- 4) Export to ONNX (encoder/decoder/decoder_with_past) ---
# If ONNX files already exist, we won't re-export.
expected = ["encoder_model.onnx", "decoder_model.onnx", "decoder_with_past_model.onnx"]
missing = [f for f in expected if not os.path.exists(os.path.join(ONNX_DIR, f))]
if missing:
    print("🔧 Exporting to ONNX...")
    # Load PyTorch model and export straight to ONNX using Optimum
    ort_model = ORTModelForSeq2SeqLM.from_pretrained(
        MODEL_ID,
        export=True,
        use_external_data_format=False,    # set True if any file would exceed 2GB
        provider="CPUExecutionProvider"
    )
    ort_model.save_pretrained(ONNX_DIR)
    # Also copy tokenizer/config stuff for convenience
    tokenizer.save_pretrained(ONNX_DIR)
else:
    print("✅ ONNX already exported, skipping.")

# --- 5) Dynamic INT8 quantization (weight-only) ---
def dyn_quant(in_path, out_path):
    quantize_dynamic(
        model_input=in_path,
        model_output=out_path,
        per_channel=True,
        reduce_range=False,
        weight_type=QuantType.QInt8
    )

print("🧮 Quantizing (dynamic INT8)...")
for fname in expected:
    src = os.path.join(ONNX_DIR, fname)
    dst = os.path.join(INT8_DIR, fname)
    if not os.path.exists(dst):
        dyn_quant(src, dst)
print("✅ Dynamic quant complete.")

# Copy config/tokenizer assets to INT8 dir so loading works seamlessly
for fname in ["config.json","generation_config.json","tokenizer.json","tokenizer_config.json","vocab.json","merges.txt","special_tokens_map.json"]:
    src = os.path.join(ONNX_DIR, fname)
    if os.path.exists(src):
        try:
            import shutil
            shutil.copy(src, os.path.join(INT8_DIR, fname))
        except Exception:
            pass

# --- 6) Helper: run generation with an ORT model on CPU ---
def generate_rouge(model_dir, texts, refs, batch_size=8, gen_kw=GEN_KW):
    ort = ORTModelForSeq2SeqLM.from_pretrained(model_dir, provider="CPUExecutionProvider")
    preds = []
    start = time.time()
    with torch.inference_mode():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
            out = ort.generate(**enc, **gen_kw)
            preds.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
    elapsed = time.time() - start
    rouge = evaluate.load("rouge")
    scores = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    scores = {k: (v.mid.fmeasure if hasattr(v, "mid") else v) for k, v in scores.items()}
    return scores, elapsed, preds

# --- 7) Compare FP32 ONNX vs INT8 ONNX on CPU ---
print("🏁 Running FP32 ONNX on CPU...")
fp32_scores, fp32_time, _ = generate_rouge(ONNX_DIR, texts, refs)

print("🏁 Running INT8 ONNX on CPU...")
int8_scores, int8_time, _ = generate_rouge(INT8_DIR, texts, refs)

# --- 8) Report ---
def fmt_scores(s): return {k: round(v, 4) for k, v in s.items()}
print("\n=== CPU ROUGE (sampled set) ===")
print("FP32:", fmt_scores(fp32_scores), f" | time: {fp32_time:.2f}s")
print("INT8:", fmt_scores(int8_scores), f" | time: {int8_time:.2f}s")
speedup = fp32_time / int8_time if int8_time > 0 else float("inf")
print(f"⚡ INT8 speedup vs FP32: {speedup:.2f}x on this sample")

# (Optional) peek one example
print("\nExample generation (INT8):")
print("INPUT:", texts[0][:300], "...")
print("REF  :", refs[0][:300], "...")
# quick single example gen
ort_int8 = ORTModelForSeq2SeqLM.from_pretrained(INT8_DIR, provider="CPUExecutionProvider")
enc = tokenizer(texts[0], return_tensors="pt", truncation=True, max_length=512)
out = ort_int8.generate(**enc, **GEN_KW)
print("PRED :", tokenizer.decode(out[0], skip_special_tokens=True)[:400], "...")


In [2]:
# %% [markdown]
# Local benchmark: PyTorch vs ONNX (FP32/INT8) on macOS (M3)

# %% Install (run once; restart kernel if needed)
# %pip install -qU "optimum[onnxruntime]" onnxruntime transformers evaluate datasets

# %% Imports & config
import os, time, numpy as np, pandas as pd, torch
from pathlib import Path
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from optimum.onnxruntime import ORTModelForSeq2SeqLM
import evaluate

HF_PT_REPO   = "dotslashderek/bart-large-cnn-prompt-summarization"              # PyTorch fine-tuned
HF_ONNX_REPO = "dotslashderek/bart-large-cnn-prompt-summarization-onnx"         # ONNX repo with subfolders
CSV_PATH     = "training_data/validation_examples_trimmed.csv"

SUB_FP32 = "fp32"            # subfolder in ONNX repo
SUB_INT8 = "int8-dynamic"    # subfolder in ONNX repo

MAX_SAMPLES = 128            # quick comparison; raise for fuller eval
MAX_SRC_LEN = 512
MAX_NEW     = 160            # match your training/serving length
GEN_KW = dict(num_beams=4, no_repeat_ngram_size=3, length_penalty=1.0, max_new_tokens=MAX_NEW)

# OnnxRuntime threading for fair timing; tune if you like
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")

# %% Load data
df = pd.read_csv(CSV_PATH)
assert {"original","summarization"}.issubset(df.columns), "CSV must have 'original' and 'summarization'."
test_df = df.sample(n=min(MAX_SAMPLES, len(df)), random_state=42).reset_index(drop=True)
texts = test_df["original"].astype(str).tolist()
refs  = test_df["summarization"].astype(str).tolist()

rouge = evaluate.load("rouge")  # HF Evaluate ROUGE

# %% Tokenizer (shared)
tokenizer = AutoTokenizer.from_pretrained(HF_PT_REPO)

# %% Helper: ROUGE compute
def rouge_scores(preds, refs):
    res = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    # make it pretty
    return {k: (v.mid.fmeasure if hasattr(v, "mid") else v) for k, v in res.items()}

# %% 1) PyTorch model (CPU or MPS)
use_mps = torch.backends.mps.is_available() and torch.backends.mps.is_built()
device = torch.device("mps" if use_mps else "cpu")
pt_model = AutoModelForSeq2SeqLM.from_pretrained(HF_PT_REPO).to(device)
pt_model.eval()

def run_pt(model, texts, batch_size=8):
    preds = []
    t0 = time.perf_counter()
    with torch.inference_mode():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=MAX_SRC_LEN)
            enc = {k: v.to(device) for k, v in enc.items()}
            out = model.generate(**enc, **GEN_KW)
            preds.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
            if use_mps:
                torch.mps.synchronize()
    elapsed = time.perf_counter() - t0
    return preds, elapsed

# %% 2) ONNX FP32 (CPU)
def load_onnx_dir(repo_id, subfolder):
    # Pull from Hub; Optimum handles local cache
    model = ORTModelForSeq2SeqLM.from_pretrained(
        repo_id,
        subfolder=subfolder,
        provider="CPUExecutionProvider",
        encoder_file_name="encoder_model.onnx",
        decoder_file_name="decoder_model.onnx",
        decoder_with_past_file_name="decoder_with_past_model.onnx",
    )
    return model

onnx_fp32 = load_onnx_dir(HF_ONNX_REPO, SUB_FP32)

def run_onnx(ort_model, texts, batch_size=8):
    preds = []
    t0 = time.perf_counter()
    with torch.inference_mode():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=MAX_SRC_LEN)
            out = ort_model.generate(**enc, **GEN_KW)
            preds.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
    elapsed = time.perf_counter() - t0
    return preds, elapsed

# %% 3) ONNX INT8 (CPU)
onnx_int8 = load_onnx_dir(HF_ONNX_REPO, SUB_INT8)

# %% Run all
print(f"PyTorch device: {'MPS' if use_mps else 'CPU'}")

pt_preds,   pt_time   = run_pt(pt_model, texts)
fp32_preds, fp32_time = run_onnx(onnx_fp32, texts)
int8_preds, int8_time = run_onnx(onnx_int8, texts)

pt_rouge   = rouge_scores(pt_preds, refs)
fp32_rouge = rouge_scores(fp32_preds, refs)
int8_rouge = rouge_scores(int8_preds, refs)

# %% Summarize
def round_scores(s): return {k: round(v, 4) for k, v in s.items()}

summary = pd.DataFrame([
    dict(model="PyTorch (MPS)" if use_mps else "PyTorch (CPU)", time_s=pt_time,   **round_scores(pt_rouge)),
    dict(model="ONNX FP32 (CPU)",                                time_s=fp32_time, **round_scores(fp32_rouge)),
    dict(model="ONNX INT8 (CPU)",                                time_s=int8_time, **round_scores(int8_rouge)),
]).sort_values("time_s").reset_index(drop=True)

display(summary)
print("\nSpeedups vs ONNX FP32:")
print(f"INT8: {fp32_time/int8_time:.2f}x faster" if int8_time>0 else "INT8: n/a")
print(f"PT  : {fp32_time/pt_time:.2f}x faster than ONNX FP32" if pt_time>0 else "PT: n/a")




PyTorch device: MPS


Unnamed: 0,model,time_s,rouge1,rouge2,rougeL,rougeLsum
0,ONNX INT8 (CPU),81.54719,0.5477,0.3779,0.496,0.497
1,PyTorch (MPS),119.115752,0.6054,0.4718,0.5659,0.5668
2,ONNX FP32 (CPU),175.124224,0.6143,0.4734,0.5714,0.5711



Speedups vs ONNX FP32:
INT8: 2.15x faster
PT  : 1.47x faster than ONNX FP32


In [10]:
# %% Local benchmark: PyTorch (CPU & MPS) vs ONNX (FP32 & INT8) on macOS
!pip install -U "optimum[onnxruntime]" onnxruntime transformers evaluate datasets

import os, time, numpy as np, pandas as pd, torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from optimum.onnxruntime import ORTModelForSeq2SeqLM
import evaluate

HF_PT_REPO   = "dotslashderek/bart-large-cnn-prompt-summarization"              # PyTorch fine-tuned repo
HF_ONNX_REPO = "dotslashderek/bart-large-cnn-prompt-summarization-onnx"         # ONNX repo with subfolders
SUB_FP32     = "fp32"
SUB_INT8     = "int8-dynamic"
CSV_PATH     = "training_data/validation_examples_trimmed.csv"

MAX_SAMPLES = 128
MAX_SRC_LEN = 512
MAX_NEW     = 160
BATCH_SIZE  = 8
GEN_KW = dict(num_beams=4, no_repeat_ngram_size=3, length_penalty=1.0, max_new_tokens=MAX_NEW)

# Make CPU timing less noisy
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")

print("🔹 Loading CSV…")
df = pd.read_csv(CSV_PATH)
assert {"original", "summarization"}.issubset(df.columns), "CSV must have 'original' & 'summarization'."
test_df = df.sample(n=min(MAX_SAMPLES, len(df)), random_state=42).reset_index(drop=True)
texts = test_df["original"].astype(str).tolist()
refs  = test_df["summarization"].astype(str).tolist()
print(f"   → Using {len(texts)} samples.\n")

rouge = evaluate.load("rouge")

print("🔹 Loading tokenizer…")
tokenizer = AutoTokenizer.from_pretrained(HF_PT_REPO)

def run_rouge(preds, refs):
    res = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    return {k: (v.mid.fmeasure if hasattr(v, "mid") else v) for k, v in res.items()}

def timeit(fn, label):
    t0 = time.perf_counter()
    out = fn()
    dt = time.perf_counter() - t0
    print(f"   ✓ {label} finished in {dt:.2f}s")
    return out, dt

def pt_generate(model, device, label):
    print(f"   • {label}: warm-up…")
    with torch.inference_mode():
        enc = tokenizer(texts[:min(4, len(texts))], return_tensors="pt", padding=True, truncation=True, max_length=MAX_SRC_LEN)
        enc = {k: v.to(device) for k, v in enc.items()}
        _ = model.generate(**enc, **GEN_KW)
        if device.type == "mps":
            torch.mps.synchronize()

    print(f"   • {label}: timed inference…")
    def _run():
        preds = []
        with torch.inference_mode():
            for i in range(0, len(texts), BATCH_SIZE):
                if i % (BATCH_SIZE * 4) == 0:
                    print(f"     - batch {i//BATCH_SIZE + 1}/{(len(texts)+BATCH_SIZE-1)//BATCH_SIZE}")
                batch = texts[i:i+BATCH_SIZE]
                enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=MAX_SRC_LEN)
                enc = {k: v.to(device) for k, v in enc.items()}
                out = model.generate(**enc, **GEN_KW)
                preds.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
                if device.type == "mps":
                    torch.mps.synchronize()
        return preds
    return timeit(_run, label)

def ort_generate(repo_id, subfolder, label):
    print(f"   • {label}: loading ORT model ({subfolder})…")
    ort_model = ORTModelForSeq2SeqLM.from_pretrained(
        repo_id,
        subfolder=subfolder,
        provider="CPUExecutionProvider",
        encoder_file_name="encoder_model.onnx",
        decoder_file_name="decoder_model.onnx",
        decoder_with_past_file_name="decoder_with_past_model.onnx",
    )
    # warm-up
    print(f"   • {label}: warm-up…")
    with torch.inference_mode():
        enc = tokenizer(texts[:min(4, len(texts))], return_tensors="pt", padding=True, truncation=True, max_length=MAX_SRC_LEN)
        _ = ort_model.generate(**enc, **GEN_KW)

    print(f"   • {label}: timed inference…")
    def _run():
        preds = []
        with torch.inference_mode():
            for i in range(0, len(texts), BATCH_SIZE):
                if i % (BATCH_SIZE * 4) == 0:
                    print(f"     - batch {i//BATCH_SIZE + 1}/{(len(texts)+BATCH_SIZE-1)//BATCH_SIZE}")
                batch = texts[i:i+BATCH_SIZE]
                enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=MAX_SRC_LEN)
                out = ort_model.generate(**enc, **GEN_KW)
                preds.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
        return preds
    return timeit(_run, label)

# ---------------------------
# 1) PyTorch CPU
# ---------------------------
print("🔹 Loading PyTorch model (CPU)…")
pt_cpu = AutoModelForSeq2SeqLM.from_pretrained(HF_PT_REPO).to("cpu").eval()
(pt_cpu_preds, pt_cpu_time) = pt_generate(pt_cpu, torch.device("cpu"), "PyTorch (CPU)")

# ---------------------------
# 2) PyTorch MPS (optional)
# ---------------------------
use_mps = torch.backends.mps.is_available() and torch.backends.mps.is_built()
pt_mps_preds = None
pt_mps_time  = None
if use_mps:
    print("\n🔹 Loading PyTorch model (MPS)…")
    pt_mps = AutoModelForSeq2SeqLM.from_pretrained(HF_PT_REPO).to("mps").eval()
    (pt_mps_preds, pt_mps_time) = pt_generate(pt_mps, torch.device("mps"), "PyTorch (MPS)")
else:
    print("\nℹ️ MPS not available — skipping GPU run. (PyTorch MPS is Apple’s Metal backend.)")  # MPS backend info. :contentReference[oaicite:1]{index=1}

# ---------------------------
# 3) ONNX FP32 (CPU)
# ---------------------------
print("\n🔹 Running ONNX FP32 (CPU)…")
(onnx_fp32_preds, onnx_fp32_time) = ort_generate(HF_ONNX_REPO, SUB_FP32, "ONNX FP32 (CPU)")

# ---------------------------
# 4) ONNX INT8 (CPU)
# ---------------------------
print("\n🔹 Running ONNX INT8 (CPU)…")
(onnx_int8_preds, onnx_int8_time) = ort_generate(HF_ONNX_REPO, SUB_INT8, "ONNX INT8 (CPU)")

# ---------------------------
# Scores
# ---------------------------
print("\n🔹 Computing ROUGE…")
pt_cpu_rouge   = run_rouge(pt_cpu_preds, refs)
onnx_fp32_rouge = run_rouge(onnx_fp32_preds, refs)
onnx_int8_rouge = run_rouge(onnx_int8_preds, refs)
if pt_mps_preds is not None:
    pt_mps_rouge = run_rouge(pt_mps_preds, refs)

def r4(d): return {k: round(v, 4) for k, v in d.items()}

rows = [
    dict(model="PyTorch (CPU)",     time_s=pt_cpu_time,   **r4(pt_cpu_rouge)),
    dict(model="ONNX FP32 (CPU)",   time_s=onnx_fp32_time, **r4(onnx_fp32_rouge)),
    dict(model="ONNX INT8 (CPU)",   time_s=onnx_int8_time, **r4(onnx_int8_rouge)),
]
if pt_mps_preds is not None:
    rows.append(dict(model="PyTorch (MPS)", time_s=pt_mps_time, **r4(pt_mps_rouge)))

summary = pd.DataFrame(rows).sort_values("time_s").reset_index(drop=True)
print("\n✅ Benchmark complete.\n")
display(summary)

print("\n🔹 Speedups vs ONNX FP32 (CPU):")
print(f"   - PyTorch (CPU): {onnx_fp32_time/pt_cpu_time:.2f}x faster" if pt_cpu_time>0 else "   - PyTorch (CPU): n/a")
print(f"   - ONNX INT8 (CPU): {onnx_fp32_time/onnx_int8_time:.2f}x faster" if onnx_int8_time>0 else "   - ONNX INT8 (CPU): n/a")
if pt_mps_time:
    print(f"   - PyTorch (MPS): {onnx_fp32_time/pt_mps_time:.2f}x faster")

# Peek one example per model
i = 0
print("\n🔎 Sample comparison (index 0):")
print("INPUT:", texts[i][:300], "…")
print("REF  :", refs[i][:300], "…")
print("\nPT-CPU :", pt_cpu_preds[i][:400], "…")
if pt_mps_preds is not None:
    print("PT-MPS :", pt_mps_preds[i][:400], "…")
print("ONNX32 :", onnx_fp32_preds[i][:400], "…")
print("INT8   :", onnx_int8_preds[i][:400], "…")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting transformers
  Using cached transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
🔹 Loading CSV…
   → Using 128 samples.

🔹 Loading tokenizer…
🔹 Loading PyTorch model (CPU)…




   • PyTorch (CPU): warm-up…
   • PyTorch (CPU): timed inference…
     - batch 1/16
     - batch 5/16
     - batch 9/16
     - batch 13/16
   ✓ PyTorch (CPU) finished in 217.33s

🔹 Loading PyTorch model (MPS)…
   • PyTorch (MPS): warm-up…
   • PyTorch (MPS): timed inference…
     - batch 1/16
     - batch 5/16
     - batch 9/16
     - batch 13/16
   ✓ PyTorch (MPS) finished in 138.71s

🔹 Running ONNX FP32 (CPU)…
   • ONNX FP32 (CPU): loading ORT model (fp32)…




   • ONNX FP32 (CPU): warm-up…
   • ONNX FP32 (CPU): timed inference…
     - batch 1/16
     - batch 5/16
     - batch 9/16
     - batch 13/16
   ✓ ONNX FP32 (CPU) finished in 168.42s

🔹 Running ONNX INT8 (CPU)…
   • ONNX INT8 (CPU): loading ORT model (int8-dynamic)…




   • ONNX INT8 (CPU): warm-up…
   • ONNX INT8 (CPU): timed inference…
     - batch 1/16
     - batch 5/16
     - batch 9/16
     - batch 13/16
   ✓ ONNX INT8 (CPU) finished in 75.49s

🔹 Computing ROUGE…

✅ Benchmark complete.



Unnamed: 0,model,time_s,rouge1,rouge2,rougeL,rougeLsum
0,ONNX INT8 (CPU),75.49452,0.5477,0.3779,0.496,0.497
1,PyTorch (MPS),138.705917,0.6054,0.4718,0.5659,0.5668
2,ONNX FP32 (CPU),168.417562,0.6143,0.4734,0.5714,0.5711
3,PyTorch (CPU),217.331788,0.6054,0.4718,0.5659,0.5668



🔹 Speedups vs ONNX FP32 (CPU):
   - PyTorch (CPU): 0.77x faster
   - ONNX INT8 (CPU): 2.23x faster
   - PyTorch (MPS): 1.21x faster

🔎 Sample comparison (index 0):
INPUT: We’re a small tech company migrating from on-prem infrastructure to AWS. Our team lacks cloud expertise but must ensure security and uptime during transition. Could you propose a phased migration plan with architecture recommendations, compliance checkpoints, and rollback procedures that allow zero  …
REF  : Create a phased AWS migration plan for a small tech firm including architecture design, security compliance, and rollback options ensuring zero downtime. …

PT-CPU : Create a phased AWS migration plan for a small tech company including architecture recommendations, compliance checkpoints, and rollback procedures for zero-downtime transition that ensure security and uptime during the transition from on-prem toAWS with phased rollout and phased rollout. …
PT-MPS : Create a phased AWS migration plan for a small tech

In [15]:
# One-prompt-at-a-time latency test for INT8 ONNX (CPU)
# Requires: pip install -U "optimum[onnxruntime]" onnxruntime transformers evaluate pandas

import os, time, statistics, numpy as np, pandas as pd, torch
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM

# --------------------
# Config
# --------------------
HF_ONNX_REPO = "dotslashderek/bart-large-cnn-prompt-summarization-onnx"
SUB_INT8     = "fp32"   # your INT8 subfolder on the Hub
CSV_PATH     = "training_data/validation_examples_trimmed.csv"  # same CSV as before
N_SAMPLES    = 64               # number of prompts to probe (1-by-1)
MAX_SRC_LEN  = 512
MAX_NEW      = 160              # match your serving budget
GEN_KW = dict(num_beams=4, no_repeat_ngram_size=3, length_penalty=1.0, max_new_tokens=MAX_NEW)

# Keep CPU latency stable (ORT recommends tuning threads for your use case)
# You can experiment with 1–2 for latency-sensitive single-request workloads.
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")

print("Loading tokenizer…")
tokenizer = AutoTokenizer.from_pretrained("dotslashderek/bart-large-cnn-prompt-summarization")

print("Loading INT8 ONNX model (CPU)…")
model = ORTModelForSeq2SeqLM.from_pretrained(
    HF_ONNX_REPO,
    subfolder=SUB_INT8,
    provider="CPUExecutionProvider",  # CPU EP on macOS is the standard path :contentReference[oaicite:1]{index=1}
    encoder_file_name="encoder_model.onnx",
    decoder_file_name="decoder_model.onnx",
    decoder_with_past_file_name="decoder_with_past_model.onnx",
)

print("Reading dataset…")
df = pd.read_csv(CSV_PATH)
assert {"original","summarization"}.issubset(df.columns), "CSV must have 'original' and 'summarization'."
texts = df["original"].astype(str).tolist()[:N_SAMPLES]
refs  = df["summarization"].astype(str).tolist()[:N_SAMPLES]

# --------------------
# Warm-up (crucial for fair latency)
# --------------------
print("Warm-up (unmeasured)…")
with torch.inference_mode():
    enc = tokenizer(texts[:2], return_tensors="pt", padding=True, truncation=True, max_length=MAX_SRC_LEN)
    _ = model.generate(**enc, **GEN_KW)

# --------------------
# Per-request latency
# --------------------
latencies = []
preds = []

print("Timing per-request (1 prompt → 1 summary)…")
for i, txt in enumerate(texts, 1):
    enc = tokenizer(txt, return_tensors="pt", truncation=True, max_length=MAX_SRC_LEN)
    t0 = time.perf_counter()
    with torch.inference_mode():
        out = model.generate(**enc, **GEN_KW)
    dt = time.perf_counter() - t0
    pred = tokenizer.decode(out[0], skip_special_tokens=True)
    preds.append(pred)
    latencies.append(dt)
    if i % 8 == 0 or i == 1:
        print(f"  • {i}/{len(texts)}  latency={dt*1000:.1f} ms")

# --------------------
# Stats
# --------------------
def pct(a, p): return np.percentile(a, p)
mean_s = statistics.mean(latencies)
p50_s  = pct(latencies, 50)
p90_s  = pct(latencies, 90)
p99_s  = pct(latencies, 99)

print("\n=== Single-request latency (INT8 ONNX, CPU) ===")
print(f"mean: {mean_s*1000:.1f} ms | p50: {p50_s*1000:.1f} ms | p90: {p90_s*1000:.1f} ms | p99: {p99_s*1000:.1f} ms")
print(f"n={len(latencies)} requests")

# peek a couple of generations
print("\nSample outputs:")
for i in range(min(5, len(texts))):
    print(f"\nINPUT : {texts[i][:300]}…")
    print(f"PRED  : {preds[i][:300]}…")
    print(f"REF   : {refs[i][:300]}…")


Loading tokenizer…
Loading INT8 ONNX model (CPU)…




Reading dataset…
Warm-up (unmeasured)…
Timing per-request (1 prompt → 1 summary)…
  • 1/64  latency=2881.4 ms
  • 8/64  latency=2423.4 ms
  • 16/64  latency=1171.2 ms
  • 24/64  latency=1299.1 ms
  • 32/64  latency=1188.0 ms
  • 40/64  latency=1241.5 ms
  • 48/64  latency=1137.1 ms
  • 56/64  latency=1160.0 ms
  • 64/64  latency=1216.7 ms

=== Single-request latency (INT8 ONNX, CPU) ===
mean: 1443.0 ms | p50: 1178.9 ms | p90: 2414.4 ms | p99: 3699.1 ms
n=64 requests

Sample outputs:

INPUT : I'm handing you a crusty legacy service that was written in Java 8 circa 2016 and sprinkled with singletons, static utils, and a homegrown futures thing. I need a straight path to modernizing it without breaking prod. Please sketch a refactor plan that: (1) migrates to Java 21 or Kotlin (I'm leaning…
PRED  : Design a phased legacy java to kotlin refactor plan for an AWS legacy service: migrate to Java 21/Kotlin, replace async with Project Loom/ coroutines, standardize DI (Spring 4.x to Boot 3.x or 

In [18]:
# One-prompt-at-a-time latency test for INT8 ONNX (CPU)
# Requires: pip install -U "optimum[onnxruntime]" onnxruntime transformers evaluate pandas

import os, time, statistics, numpy as np, pandas as pd, torch
from transformers import AutoTokenizer, GenerationConfig
from optimum.onnxruntime import ORTModelForSeq2SeqLM

REPO = "dotslashderek/bart-large-cnn-prompt-summarization-onnx"
SUB  = "int8-dynamic"

# Load tokenizer/model FROM THE SAME SUBFOLDER
tok = AutoTokenizer.from_pretrained(REPO, subfolder=SUB)

ort = ORTModelForSeq2SeqLM.from_pretrained(
    REPO, subfolder=SUB, provider="CPUExecutionProvider",
    encoder_file_name="encoder_model.onnx",
    decoder_file_name="decoder_model.onnx",
    decoder_with_past_file_name="decoder_with_past_model.onnx",
)

# Print what we actually have
gc = ort.generation_config
print("Loaded IDs:", dict(
    bos=tok.bos_token_id, eos=tok.eos_token_id, pad=tok.pad_token_id,
    dec_start=gc.decoder_start_token_id, forced_bos=gc.forced_bos_token_id
))

# Force a safe BART setup
gc.pad_token_id  = tok.pad_token_id
gc.eos_token_id  = tok.eos_token_id
gc.bos_token_id  = (tok.bos_token_id or 0)
gc.decoder_start_token_id = gc.decoder_start_token_id or tok.eos_token_id or 2
gc.forced_bos_token_id    = 0 if gc.forced_bos_token_id is None else gc.forced_bos_token_id

# Also pass them explicitly to generate (belt & suspenders)
enc = tok("Summarize this: " + texts[0], return_tensors="pt", truncation=True, max_length=512)
out = ort.generate(
    **enc, num_beams=4, no_repeat_ngram_size=3, length_penalty=1.0, max_new_tokens=160,
    decoder_start_token_id=gc.decoder_start_token_id,
    forced_bos_token_id=gc.forced_bos_token_id,
    eos_token_id=gc.eos_token_id,
    pad_token_id=gc.pad_token_id,
)
print(tok.decode(out[0], skip_special_tokens=True))


print("Loading INT8 ONNX model (CPU)…")
model = ORTModelForSeq2SeqLM.from_pretrained(
    HF_ONNX_REPO,
    subfolder=SUB_INT8,
    provider="CPUExecutionProvider",  # CPU EP on macOS is the standard path :contentReference[oaicite:1]{index=1}
    encoder_file_name="encoder_model.onnx",
    decoder_file_name="decoder_model.onnx",
    decoder_with_past_file_name="decoder_with_past_model.onnx",
)

print("Reading dataset…")
df = pd.read_csv(CSV_PATH)
assert {"original","summarization"}.issubset(df.columns), "CSV must have 'original' and 'summarization'."
texts = df["original"].astype(str).tolist()[:N_SAMPLES]
refs  = df["summarization"].astype(str).tolist()[:N_SAMPLES]

# --------------------
# Warm-up (crucial for fair latency)
# --------------------
print("Warm-up (unmeasured)…")
with torch.inference_mode():
    enc = tokenizer(texts[:2], return_tensors="pt", padding=True, truncation=True, max_length=MAX_SRC_LEN)
    _ = model.generate(**enc, **GEN_KW)

# --------------------
# Per-request latency
# --------------------
latencies = []
preds = []

print("Timing per-request (1 prompt → 1 summary)…")
for i, txt in enumerate(texts, 1):
    enc = tokenizer(txt, return_tensors="pt", truncation=True, max_length=MAX_SRC_LEN)
    t0 = time.perf_counter()
    with torch.inference_mode():
        out = model.generate(**enc, **GEN_KW)
    dt = time.perf_counter() - t0
    pred = tokenizer.decode(out[0], skip_special_tokens=True)
    preds.append(pred)
    latencies.append(dt)
    if i % 8 == 0 or i == 1:
        print(f"  • {i}/{len(texts)}  latency={dt*1000:.1f} ms")

# --------------------
# Stats
# --------------------
def pct(a, p): return np.percentile(a, p)
mean_s = statistics.mean(latencies)
p50_s  = pct(latencies, 50)
p90_s  = pct(latencies, 90)
p99_s  = pct(latencies, 99)

print("\n=== Single-request latency (INT8 ONNX, CPU) ===")
print(f"mean: {mean_s*1000:.1f} ms | p50: {p50_s*1000:.1f} ms | p90: {p90_s*1000:.1f} ms | p99: {p99_s*1000:.1f} ms")
print(f"n={len(latencies)} requests")

# peek a couple of generations
print("\nSample outputs:")
for i in range(min(5, len(texts))):
    print(f"\nINPUT : {texts[i][:300]}…")
    print(f"PRED  : {preds[i][:300]}…")
    print(f"REF   : {refs[i][:300]}…")




Loaded IDs: {'bos': 0, 'eos': 2, 'pad': 1, 'dec_start': 2, 'forced_bos': 0}
Deliver a phased legacy service refactor plan with zero-downtime rollout on AWS (ECS + RDS Postgres): specific deliverables (tests, risks, rollback), sample code, and sample code for one endpoint with concurrency + tracing.
Loading INT8 ONNX model (CPU)…




Reading dataset…
Warm-up (unmeasured)…
Timing per-request (1 prompt → 1 summary)…
  • 1/64  latency=1873.3 ms
  • 8/64  latency=1418.1 ms
  • 16/64  latency=477.1 ms
  • 24/64  latency=466.7 ms
  • 32/64  latency=433.3 ms
  • 40/64  latency=498.1 ms
  • 48/64  latency=480.9 ms
  • 56/64  latency=467.6 ms
  • 64/64  latency=502.9 ms

=== Single-request latency (INT8 ONNX, CPU) ===
mean: 656.9 ms | p50: 493.8 ms | p90: 1367.0 ms | p99: 1899.6 ms
n=64 requests

Sample outputs:

INPUT : I'm handing you a crusty legacy service that was written in Java 8 circa 2016 and sprinkled with singletons, static utils, and a homegrown futures thing. I need a straight path to modernizing it without breaking prod. Please sketch a refactor plan that: (1) migrates to Java 21 or Kotlin (I'm leaning…
PRED  : I'm handing you a crusty legacy service that was written in Java 8 circa 2016 and sprinkled with singletons, static utils, and a homegrown futures thing. I need a straight path to modernize it without b

In [21]:
# %% Runtime self-check for PT vs ONNX FP32 vs ONNX INT8 (special tokens + generation_config + quick decode)
import os, json, torch, pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from optimum.onnxruntime import ORTModelForSeq2SeqLM

HF_PT_REPO   = "dotslashderek/bart-large-cnn-prompt-summarization"
HF_ONNX_REPO = "dotslashderek/bart-large-cnn-prompt-summarization-onnx"
SUB_FP32     = "fp32"
SUB_INT8     = "int8-dynamic"
CSV_PATH     = "training_data/validation_examples_trimmed.csv"

# ---------- helpers ----------
def to_scalarish(v):
    if v is None:
        return None
    if isinstance(v, (list, tuple)):
        try:
            return [int(x) for x in v]
        except Exception:
            return list(v)
    try:
        return int(v)
    except Exception:
        return v  # leave as-is (e.g., tensor) for visibility

def brief(d):
    return {k: to_scalarish(v) for k, v in d.items()}

NEEDED = [
    "generation_config.json", "tokenizer.json", "tokenizer_config.json",
    "vocab.json", "merges.txt", "special_tokens_map.json"
]

def list_present(repo_id, subfolder=None):
    tok = AutoTokenizer.from_pretrained(repo_id, subfolder=subfolder)
    local_dir = Path(tok.init_kwargs.get("name_or_path"))
    present = [f for f in NEEDED if (local_dir / f).exists()]
    return local_dir, present

# ---------- probe text ----------
df = pd.read_csv(CSV_PATH)
probe = df["original"].astype(str).iloc[0]
print("🔎 Probe text:", probe[:160], "...\n")

# ---------- 1) PyTorch ----------
print("====== PyTorch (fine-tuned) ======")
tok_pt = AutoTokenizer.from_pretrained(HF_PT_REPO)
pt = AutoModelForSeq2SeqLM.from_pretrained(HF_PT_REPO).eval()

ids_tok_pt = dict(bos=tok_pt.bos_token_id, eos=tok_pt.eos_token_id, pad=tok_pt.pad_token_id)
ids_gc_pt  = dict(
    dec_start=pt.generation_config.decoder_start_token_id,
    forced_bos=pt.generation_config.forced_bos_token_id,
    eos=pt.generation_config.eos_token_id,
    pad=pt.generation_config.pad_token_id,
)
print("tokenizer IDs :", brief(ids_tok_pt))
print("generation cfg:", brief(ids_gc_pt))

enc = tok_pt(probe, return_tensors="pt", truncation=True, max_length=512)
out_vanilla = pt.generate(**enc, max_new_tokens=80)
print("PT vanilla :", tok_pt.decode(out_vanilla[0], skip_special_tokens=True)[:160], "...")
out_forced = pt.generate(
    **enc, max_new_tokens=80,
    decoder_start_token_id=ids_gc_pt["dec_start"] if ids_gc_pt["dec_start"] is not None else tok_pt.eos_token_id,
    forced_bos_token_id=ids_gc_pt["forced_bos"] if ids_gc_pt["forced_bos"] is not None else (tok_pt.bos_token_id or 0),
    eos_token_id=tok_pt.eos_token_id, pad_token_id=tok_pt.pad_token_id
)
print("PT forced  :", tok_pt.decode(out_forced[0], skip_special_tokens=True)[:160], "...")
print()

# ---------- 2) ONNX FP32 ----------
print("====== ONNX FP32 (CPU) ======")
tok_32 = AutoTokenizer.from_pretrained(HF_ONNX_REPO, subfolder=SUB_FP32)
onnx32 = ORTModelForSeq2SeqLM.from_pretrained(
    HF_ONNX_REPO, subfolder=SUB_FP32, provider="CPUExecutionProvider",
    encoder_file_name="encoder_model.onnx",
    decoder_file_name="decoder_model.onnx",
    decoder_with_past_file_name="decoder_with_past_model.onnx",
)

local_fp32_dir, fp32_present = list_present(HF_ONNX_REPO, SUB_FP32)
print("Local FP32 dir:", local_fp32_dir)
print("Sidecars present:", fp32_present)

ids_tok_32 = dict(bos=tok_32.bos_token_id, eos=tok_32.eos_token_id, pad=tok_32.pad_token_id)
gc32 = onnx32.generation_config
ids_gc_32 = dict(
    dec_start=gc32.decoder_start_token_id,
    forced_bos=gc32.forced_bos_token_id,
    eos=gc32.eos_token_id,
    pad=gc32.pad_token_id,
)
print("tokenizer IDs :", brief(ids_tok_32))
print("generation cfg:", brief(ids_gc_32))

gc_path_32 = Path(local_fp32_dir) / "generation_config.json"
if gc_path_32.exists():
    print("generation_config.json (FP32) snippet:", gc_path_32.read_text()[:200].replace("\n"," "), "...")

enc32 = tok_32(probe, return_tensors="pt", truncation=True, max_length=512)
out32_v = onnx32.generate(**enc32, max_new_tokens=80)
print("ONNX32 vanilla:", tok_32.decode(out32_v[0], skip_special_tokens=True)[:160], "...")
out32_f = onnx32.generate(
    **enc32, max_new_tokens=80,
    decoder_start_token_id=ids_gc_32["dec_start"] if ids_gc_32["dec_start"] is not None else tok_32.eos_token_id,
    forced_bos_token_id=ids_gc_32["forced_bos"] if ids_gc_32["forced_bos"] is not None else (tok_32.bos_token_id or 0),
    eos_token_id=tok_32.eos_token_id, pad_token_id=tok_32.pad_token_id
)
print("ONNX32 forced :", tok_32.decode(out32_f[0], skip_special_tokens=True)[:160], "...")
print()

# ---------- 3) ONNX INT8 ----------
print("====== ONNX INT8 (CPU) ======")
tok_8 = AutoTokenizer.from_pretrained(HF_ONNX_REPO, subfolder=SUB_INT8)
onnx8 = ORTModelForSeq2SeqLM.from_pretrained(
    HF_ONNX_REPO, subfolder=SUB_INT8, provider="CPUExecutionProvider",
    encoder_file_name="encoder_model.onnx",
    decoder_file_name="decoder_model.onnx",
    decoder_with_past_file_name="decoder_with_past_model.onnx",
)

local_int8_dir, int8_present = list_present(HF_ONNX_REPO, SUB_INT8)
print("Local INT8 dir:", local_int8_dir)
print("Sidecars present:", int8_present)

ids_tok_8 = dict(bos=tok_8.bos_token_id, eos=tok_8.eos_token_id, pad=tok_8.pad_token_id)
gc8 = onnx8.generation_config
ids_gc_8 = dict(
    dec_start=gc8.decoder_start_token_id,
    forced_bos=gc8.forced_bos_token_id,
    eos=gc8.eos_token_id,
    pad=gc8.pad_token_id,
)
print("tokenizer IDs :", brief(ids_tok_8))
print("generation cfg:", brief(ids_gc_8))

gc_path_8 = Path(local_int8_dir) / "generation_config.json"
if gc_path_8.exists():
    print("generation_config.json (INT8) snippet:", gc_path_8.read_text()[:200].replace("\n"," "), "...")

enc8 = tok_8(probe, return_tensors="pt", truncation=True, max_length=512)
out8_v = onnx8.generate(**enc8, max_new_tokens=80)
print("INT8 vanilla :", tok_8.decode(out8_v[0], skip_special_tokens=True)[:160], "...")
out8_f = onnx8.generate(
    **enc8, max_new_tokens=80,
    decoder_start_token_id=ids_gc_8["dec_start"] if ids_gc_8["dec_start"] is not None else tok_8.eos_token_id,
    forced_bos_token_id=ids_gc_8["forced_bos"] if ids_gc_8["forced_bos"] is not None else (tok_8.bos_token_id or 0),
    eos_token_id=tok_8.eos_token_id, pad_token_id=tok_8.pad_token_id
)
print("INT8 forced  :", tok_8.decode(out8_f[0], skip_special_tokens=True)[:160], "...")
# mutate cfg then re-run
gc8.pad_token_id  = tok_8.pad_token_id
gc8.eos_token_id  = tok_8.eos_token_id
gc8.bos_token_id  = tok_8.bos_token_id or 0
gc8.decoder_start_token_id = gc8.decoder_start_token_id or tok_8.eos_token_id or 2
gc8.forced_bos_token_id    = 0 if gc8.forced_bos_token_id is None else gc8.forced_bos_token_id
out8_cfg = onnx8.generate(**enc8, max_new_tokens=80)
print("INT8 cfg-set :", tok_8.decode(out8_cfg[0], skip_special_tokens=True)[:160], "...")


🔎 Probe text: I'm handing you a crusty legacy service that was written in Java 8 circa 2016 and sprinkled with singletons, static utils, and a homegrown futures thing. I need ...





tokenizer IDs : {'bos': 0, 'eos': 2, 'pad': 1}
generation cfg: {'dec_start': 2, 'forced_bos': 0, 'eos': [2], 'pad': 1}
PT vanilla : I'm handing you a legacy service that was written in Java 8 circa 2016 and sprinkled with singletons, static utils, and a homegrown futures thing. I need a stra ...
PT forced  : I'm handing you a legacy service that was written in Java 8 circa 2016 and sprinkled with singletons, static utils, and a homegrown futures thing. I need a stra ...





Local FP32 dir: dotslashderek/bart-large-cnn-prompt-summarization-onnx
Sidecars present: []
tokenizer IDs : {'bos': 0, 'eos': 2, 'pad': 1}
generation cfg: {'dec_start': 2, 'forced_bos': 0, 'eos': [2], 'pad': 1}
ONNX32 vanilla: Design a phased legacy java to kotlin refactor plan for an AWS legacy service: migrate to Java 21/Kotlin, replace async with Loom/Coroutines, standardize DI (Sp ...
ONNX32 forced : Design a phased legacy java to kotlin refactor plan for an AWS legacy service: migrate to Java 21/Kotlin, replace async with Loom/Coroutines, standardize DI (Sp ...





Local INT8 dir: dotslashderek/bart-large-cnn-prompt-summarization-onnx
Sidecars present: []
tokenizer IDs : {'bos': 0, 'eos': 2, 'pad': 1}
generation cfg: {'dec_start': 2, 'forced_bos': 0, 'eos': [2], 'pad': 1}
INT8 vanilla : I'm handing you a crusty legacy service that was written in Java 8 circa 2016 and sprinkled with singletons, static utils, and a homegrown futures thing. I need ...
INT8 forced  : I'm handing you a crusty legacy service that was written in Java 8 circa 2016 and sprinkled with singletons, static utils, and a homegrown futures thing. I need ...
INT8 cfg-set : I'm handing you a crusty legacy service that was written in Java 8 circa 2016 and sprinkled with singletons, static utils, and a homegrown futures thing. I need ...


In [25]:
# Python
from datasets import load_dataset

# Load the Dolly 15k dataset from Hugging Face
dolly = load_dataset("databricks/databricks-dolly-15k")

# Print the first ten entries from the train split
for i, entry in enumerate(dolly["train"][:100]):
    print(f"Entry {i+1}:")
    print(entry)
    print("-" * 40)



Entry 1:
instruction
----------------------------------------
Entry 2:
context
----------------------------------------
Entry 3:
response
----------------------------------------
Entry 4:
category
----------------------------------------
