In [3]:
import csv

input_path = 'training_data/validation_examples.csv'
output_path = 'training_data/validation_examples_repaired.csv'

# Copy the header and every row that has exactly 4 fields into a repaired CSV;
# rows with any other field count are counted and dropped.
# Both files are opened with newline='' as the csv module requires, so that
# line breaks embedded inside quoted fields reach the parser untranslated
# (otherwise such rows can be mis-split and wrongly counted as bad).
with open(input_path, 'r', encoding='utf-8', newline='') as infile, \
        open(output_path, 'w', encoding='utf-8', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    # The first row is the header and is copied through unconditionally.
    header = next(reader)
    writer.writerow(header)
    good, bad = 0, 0
    for row in reader:
        if len(row) == 4:
            writer.writerow(row)
            good += 1
        else:
            bad += 1
print(f"Repair complete. {good} good rows written, {bad} bad rows skipped. Cleaned file: {output_path}")


Repair complete. 1527 good rows written, 4 bad rows skipped. Cleaned file: training_data/validation_examples_repaired.csv


In [4]:
import pandas as pd

# Input is the repaired CSV written by the repair cell; output is the
# column-trimmed copy that the later cells load.
input_path = 'training_data/validation_examples_repaired.csv'
output_path = 'training_data/validation_examples_trimmed.csv'

def trim_csv(input_path, output_path, chunk_size=500):
    """Stream a CSV in chunks, keeping only the 'original' and 'summarization' columns.

    Args:
        input_path: CSV file to read.
        output_path: CSV file to write; overwritten by the first chunk.
        chunk_size: number of rows read per chunk.

    The first chunk is written with a header in 'w' mode; every later chunk is
    appended without a header.
    """
    for chunk_no, chunk in enumerate(pd.read_csv(input_path, chunksize=chunk_size)):
        is_first = chunk_no == 0
        write_mode = 'w' if is_first else 'a'
        chunk[['original', 'summarization']].to_csv(
            output_path, mode=write_mode, index=False, header=is_first
        )

# Run the trim with the default chunk size and report where the result went.
trim_csv(input_path, output_path)
print(f"Trimmed CSV written to {output_path}")


Trimmed CSV written to training_data/validation_examples_trimmed.csv


In [5]:
import pandas as pd
from transformers import pipeline
from evaluate import load

# Load trimmed CSV
input_path = 'training_data/validation_examples_trimmed.csv'
df = pd.read_csv(input_path)

# Randomly select 10 examples (fixed random_state so the same rows are picked on re-run)
sample = df.sample(n=10, random_state=42)

# Load summarization pipeline (facebook/bart-large-cnn)
summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
rouge = load('rouge')

generated_summaries = []
reference_summaries = []

for idx, row in sample.iterrows():
    prompt = row['original']
    reference = row['summarization']
    # Optionally prepend 'summarize: ' for consistency with test-summarization-model
    # prompt = 'summarize: ' + prompt
    # truncation=True keeps an over-long prompt within the model's input limit
    # instead of failing; short prompts are unaffected.
    result = summarizer(prompt, max_length=200, min_length=10, do_sample=False, truncation=True)
    generated = result[0]['summary_text']
    generated_summaries.append(generated)
    reference_summaries.append(reference)
    print(f"\nPrompt: {prompt}\nGenerated: {generated}\nReference: {reference}")

# Compute ROUGE scores
scores = rouge.compute(predictions=generated_summaries, references=reference_summaries)
print("\nROUGE scores (aggregated):")
for k, v in scores.items():
    print(f"{k}: {v:.4f}")

# Per-example scores come from a single un-aggregated call (one value per
# prediction/reference pair) rather than one aggregated call per example.
print("\nROUGE-L for each example:")
per_example_scores = rouge.compute(
    predictions=generated_summaries, references=reference_summaries, use_aggregator=False
)
for i, rouge_l in enumerate(per_example_scores['rougeL']):
    print(f"Example {i+1}: ROUGE-L: {rouge_l:.4f}")


  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0
Your max_length is set to 200, but your input_length is only 57. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max_length is set to 200, but your input_length is only 17. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)



Prompt: We’re a small tech company migrating from on-prem infrastructure to AWS. Our team lacks cloud expertise but must ensure security and uptime during transition. Could you propose a phased migration plan with architecture recommendations, compliance checkpoints, and rollback procedures that allow zero downtime?
Generated: Small tech company migrating from on-prem infrastructure to AWS. Company lacks cloud expertise but must ensure security and uptime. Could you propose a phased migration plan with architecture recommendations, compliance checkpoints, and rollback procedures that allow zero downtime?
Reference: Create a phased AWS migration plan for a small tech firm including architecture design, security compliance, and rollback options ensuring zero downtime.


Your max_length is set to 200, but your input_length is only 58. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=29)



Prompt: Describe the steps to create a secure password and tips for remembering it.
Generated: . Describe the steps to create a secure password and tips for remembering it.
Reference: List steps to create a secure password and tips for remembering it.


Your max_length is set to 200, but your input_length is only 59. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=29)



Prompt: I’m drafting a policy memo on the ethics of AI surveillance in public spaces. Include background, stakeholder analysis, risk taxonomy (privacy, chilling effects, bias), and potential regulatory responses. Suggest international comparisons and propose a balanced framework preserving safety and civil liberties.
Generated: I’m drafting a policy memo on the ethics of AI surveillance in public spaces. Include background, stakeholder analysis, risk taxonomy (privacy, chilling effects, bias) Suggest international comparisons and propose a balanced framework preserving safety and civil liberties.
Reference: I’m drafting a policy memo on the ethics of AI surveillance in public spaces. Include background, stakeholder analysis, risk taxonomy (privacy, chilling effects, bias), and potential regulatory responses.


Your max_length is set to 200, but your input_length is only 45. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)



Prompt: Could you help design a science club for high school students? Please include: (1) ideas for weekly activities, (2) tips for recruiting members, (3) advice for involving parents, (4) ways to showcase student projects, and (5) resources for learning more.
Generated: Could you help design a science club for high school students? Please include: (1) ideas for weekly activities, (2) tips for recruiting members, (3) advice for involving parents.
Reference: High school science club: weekly activities, member recruitment, parent involvement, project showcases, and learning resources.


Your max_length is set to 200, but your input_length is only 60. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)



Prompt: I’m preparing an investor brief for a company developing solid‑state batteries. Outline technology readiness, supply‑chain dependencies, competitive analysis, and scaling challenges. Include risk factors and policy incentives supporting adoption.
Generated: I’m preparing an investor brief for a company developing solid‑state batteries. Outline technology readiness, supply‑chain dependencies and competitive analysis. Include risk factors and policy incentives supporting adoption.
Reference: I’m preparing an investor brief for a company developing solid‑state batteries. Outline technology readiness, supply‑chain dependencies, competitive analysis, and scaling challenges.


Your max_length is set to 200, but your input_length is only 38. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)



Prompt: Write a technical explainer for a general audience comparing neural radiance fields (NeRFs) and traditional 3D modeling. Cover underlying principles, computational trade-offs, and practical use cases (AR/VR, film, gaming). End with a future outlook section and open research questions.
Generated: Write a technical explainer for a general audience comparing neural radiance fields (NeRFs) and traditional 3D modeling. Cover underlying principles, computational trade-offs, and practical use cases. End with a future outlook section and open research questions.
Reference: Write a technical explainer for a general audience comparing neural radiance fields (NeRFs) and traditional 3D modeling. Cover underlying principles, computational trade-offs, and practical use cases (AR/VR, film, gaming).


Your max_length is set to 200, but your input_length is only 42. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)



Prompt: I’m helping a national park service redesign its visitor experience to balance conservation and tourism. Could you propose visitor flow management, interpretive design, and partnerships with local businesses?
Generated: A national park service is redesigning its visitor experience to balance conservation and tourism. Could you propose visitor flow management, interpretive design, and partnerships with local businesses?
Reference: Develop a sustainable park tourism strategy combining visitor flow design, educational interpretation, and local business collaboration.


Your max_length is set to 200, but your input_length is only 33. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)



Prompt: I’m creating an interactive public website to visualize air pollution exposure data by neighborhood. Could you propose design principles, accessibility features, and communication strategies that balance scientific accuracy with clarity for the public?
Generated: I’m creating an interactive public website to visualize air pollution exposure data by neighborhood. Could you propose design principles, accessibility features, and communication strategies that balance scientific accuracy with clarity for the public?
Reference: Develop a public-facing air pollution visualization site with accessible design, clear storytelling, and scientifically accurate communication.


Your max_length is set to 200, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)



Prompt: Our performing arts center wants to become a carbon-neutral venue within five years. Could you outline operational upgrades, renewable procurement options, and audience engagement programs?
Generated: Performing arts center wants to become a carbon-neutral venue within five years. Could you outline operational upgrades, renewable procurement options, and audience engagement programs?
Reference: Create a five-year carbon-neutral plan for a performing arts venue including facility upgrades, renewable sourcing, and audience engagement.

Prompt: Our university’s research office wants a standardized way to showcase faculty projects online. Could you design a data schema, submission workflow, and tagging system that help students and partners discover relevant work? Include long-term maintenance ideas.
Generated: Our university’s research office wants a standardized way to showcase faculty projects online. Could you design a data schema, submission workflow, and tagging system? Incl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Example 4: ROUGE-L: 0.1860
Example 5: ROUGE-L: 0.8148
Example 6: ROUGE-L: 0.7941
Example 7: ROUGE-L: 0.3721
Example 8: ROUGE-L: 0.2353
Example 9: ROUGE-L: 0.4000
Example 10: ROUGE-L: 0.3529


In [None]:
!pip install evaluate rouge_score

# 1. Log in to Hugging Face
from huggingface_hub import notebook_login
from google.colab import userdata

# NOTE(review): the value returned by userdata.get is discarded, so nothing in
# this cell receives the token from this call — confirm it is still needed.
userdata.get('HF_TOKEN')
notebook_login()

# 2. Read CSV and split into train/test
import pandas as pd
from sklearn.model_selection import train_test_split

csv_path = 'sample_data/validation_examples_trimmed.csv'
df = pd.read_csv(csv_path)
# 15% of the rows are held out as the test split; random_state is fixed at 42.
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)

# 3. Convert to Hugging Face Dataset
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

# 4. Import the training classes and load the tokenizer
#    (the model itself is loaded in step 6, after tokenization)
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# 5. Preprocess data (updated target tokenization)
# max_length values handed to the tokenizer for source texts and for target
# summaries respectively (see preprocess_function).
max_input_length = 512
max_target_length = 128


def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    Args:
        examples: batch mapping with an "original" column (source texts) and a
            "summarization" column (target texts).

    Returns:
        The tokenizer output for the source texts (truncated to
        `max_input_length`) with an added "labels" entry holding the target
        token ids (truncated to `max_target_length`).
    """
    inputs = examples["original"]
    targets = examples["summarization"]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True
    )
    # Targets are tokenized through `text_target=`. The deprecated
    # `as_target_tokenizer()` context manager that used to sit here wrapped
    # nothing (its body was `pass`), so it has been dropped; it would only
    # raise on tokenizers that no longer provide it.
    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Tokenize both splits in batches.
# NOTE(review): remove_columns only lists columns other than "original" and
# "summarization", so those two raw-text columns stay in the tokenized dataset.
tokenized_datasets = dataset.map(preprocess_function, batched=True,
                                 remove_columns=[c for c in df.columns if c not in ("original", "summarization")])

# 6. Load model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# 7. Metrics
import evaluate

rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Decode predicted and reference token ids and score them with ROUGE.

    Args:
        eval_pred: a (predictions, labels) pair of token-id sequences. When
            predictions is a tuple, its first element is used.

    Returns:
        Dict mapping each ROUGE key to a number: the mid F-measure when the
        metric returns aggregate objects, otherwise the value as returned.
    """
    predictions, labels = eval_pred
    # predictions may come as tuple(logits, ...) depending on HF internals
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    def _decode(sequences):
        # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
        # decoded. It is swapped for the pad token in predictions as well as
        # labels, since gathered predictions can also be padded with -100.
        cleaned = [[(tok if tok != -100 else pad_id) for tok in seq] for seq in sequences]
        return tokenizer.batch_decode(cleaned, skip_special_tokens=True)

    decoded_preds = _decode(predictions)
    decoded_labels = _decode(labels)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Return mid.fmeasure for each ROUGE variant
    return {k: v.mid.fmeasure if hasattr(v, "mid") else v for k, v in result.items()}


# 8. Training args
# Evaluation, checkpointing and logging are all set to run once per epoch, and
# evaluation scores generated text (predict_with_generate=True). rougeL is the
# metric named for best-model selection, with higher treated as better.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",  # <-- match eval
    predict_with_generate=True,
    generation_max_length=128,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=2,
    push_to_hub=True,
    hub_model_id="dotslashderek/bart-large-cnn-finetuned-summarization",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    logging_strategy="epoch"  # optional: keeps logs aligned
)

# 9. Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 10. Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # NOTE(review): a later cell in this notebook passes processing_class=
    # instead and describes tokenizer= as deprecated — confirm against the
    # installed transformers version.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 11. Train and push to hub
trainer.train()
trainer.push_to_hub()


In [6]:
# Second attempt: also evaluates the untuned base model as a baseline before fine-tuning (again, run in Colab)

# 0. Repro & device helpers
import random, numpy as np, torch


def set_seed(seed=42):
    """Seed Python's `random`, NumPy and PyTorch with `seed`.

    When CUDA is available, every CUDA device is seeded as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)

# Use the GPU when one is visible; bf16 is only enabled when the CUDA device
# reports compute capability major version 8 or higher.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
supports_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8  # Ampere+

# 1. Log in to Hugging Face (token handling up to you)
from huggingface_hub import notebook_login
from google.colab import userdata

# The secret's value is bound to `_` and not used again in this cell.
_ = userdata.get('HF_TOKEN')  # optional if you use Secrets
notebook_login()

# 2. Read CSV and split into train/test
import pandas as pd
from sklearn.model_selection import train_test_split

csv_path = 'sample_data/validation_examples_trimmed.csv'
df = pd.read_csv(csv_path)
# 15% of the rows are held out as the test split; random_state is fixed at 42.
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)

# 3. Convert to Hugging Face Dataset
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

# 4. Tokenizer & model
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments, Seq2SeqTrainer
)

model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

# Generation defaults (used by Trainer.generate & our baseline)
# These live on the model's generation config
model.generation_config.num_beams = 4
model.generation_config.no_repeat_ngram_size = 3
model.generation_config.length_penalty = 1.0

# 5. Preprocess
# max_length values handed to the tokenizer for source texts and for target
# summaries; max_target_length is also reused as the generation length budget.
max_input_length = 512
max_target_length = 128


def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    Source texts come from the "original" column (truncated to
    `max_input_length`); target texts come from the "summarization" column
    (truncated to `max_target_length`). The target token ids are stored under
    "labels" in the returned tokenizer output.
    """
    sources, summaries = examples["original"], examples["summarization"]
    encoded = tokenizer(sources, max_length=max_input_length, truncation=True)
    target_encoding = tokenizer(text_target=summaries, max_length=max_target_length, truncation=True)
    encoded["labels"] = target_encoding["input_ids"]
    return encoded


# Drop the original columns; keep tokenized fields only.
# The column list is taken from the train split and applied to every split by
# this single map call.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

# 6. Metrics
import evaluate

rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Decode predicted and reference token ids and score them with ROUGE.

    Args:
        eval_pred: a (predictions, labels) pair of token-id sequences. When
            predictions is a tuple, its first element is used.

    Returns:
        Dict mapping each ROUGE key to a number: the mid F-measure when the
        metric returns aggregate objects, otherwise the value as returned.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    def _decode(sequences):
        # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
        # decoded. It is swapped for the pad token in predictions as well as
        # labels, since gathered predictions can also be padded with -100.
        cleaned = [[(tok if tok != -100 else pad_id) for tok in seq] for seq in sequences]
        return tokenizer.batch_decode(cleaned, skip_special_tokens=True)

    decoded_preds = _decode(predictions)
    decoded_labels = _decode(labels)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: (v.mid.fmeasure if hasattr(v, "mid") else v) for k, v in result.items()}


# 7. Baseline evaluation on the *untuned* base model
@torch.inference_mode()
def baseline_eval(texts, refs, batch_size=8):
    """Generate summaries for `texts` in batches and score them against `refs`.

    Args:
        texts: list of source texts to summarize with the global `model`.
        refs: list of reference summaries, aligned with `texts`.
        batch_size: number of texts tokenized and generated per step.

    Returns:
        (scores, preds): a dict of ROUGE values (mid F-measure when the metric
        returns aggregate objects) and the list of decoded predictions.
    """
    preds = []
    for start in range(0, len(texts), batch_size):
        enc = tokenizer(
            texts[start:start + batch_size],
            max_length=max_input_length,
            truncation=True,
            padding=True,
            return_tensors="pt",
        ).to(device)
        gen_cfg = model.generation_config
        gen = model.generate(
            **enc,
            max_new_tokens=max_target_length,  # similar length budget as training
            num_beams=gen_cfg.num_beams,
            no_repeat_ngram_size=gen_cfg.no_repeat_ngram_size,
            length_penalty=gen_cfg.length_penalty,
        )
        preds += tokenizer.batch_decode(gen, skip_special_tokens=True)

    raw_scores = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    flat_scores = {}
    for name, value in raw_scores.items():
        flat_scores[name] = value.mid.fmeasure if hasattr(value, "mid") else value
    return flat_scores, preds


# Prepare test texts/refs for baseline
test_texts = test_df["original"].tolist()
test_refs = test_df["summarization"].tolist()
# Scored before any training, so these numbers describe the untuned checkpoint.
baseline_scores, baseline_preds = baseline_eval(test_texts, test_refs)
print("Baseline ROUGE (base BART, no fine-tune):", baseline_scores)
print("\nSample baseline predictions:")
# Show at most three examples, each field cut to its first 200 characters.
for i in range(min(3, len(test_texts))):
    print(f"\n# {i + 1}\nINPUT: {test_texts[i][:200]}...")
    print(f"PRED : {baseline_preds[i][:200]}...")
    print(f"REF  : {test_refs[i][:200]}...")

# 8. Training args
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",  # must match when load_best_model_at_end=True
    predict_with_generate=True,
    generation_max_length=max_target_length,
    generation_num_beams=model.generation_config.num_beams,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    label_smoothing_factor=0.1,
    warmup_ratio=0.1,
    save_total_limit=2,
    push_to_hub=True,
    hub_model_id="dotslashderek/bart-large-cnn-finetuned-summarization",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    logging_strategy="epoch",
    gradient_checkpointing=True,
    # Mixed precision: bf16 when the GPU supports it, otherwise fp16 on any
    # other CUDA device; both are False on CPU.
    fp16=not supports_bf16 and torch.cuda.is_available(),
    bf16=supports_bf16
)

# 9. Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 10. Trainer (use processing_class to silence deprecation)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,  # replaces deprecated "tokenizer" arg
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 11. Train and push to hub
trainer.train()
trainer.push_to_hub()


In [None]:
# This gave us dotslashderek/bart-large-cnn-prompt-summarization-v2 (most promising)
# Going to use this for initial quantization POC efforts
# Improvements would likely require more / better data

# Epoch	Training Loss	Validation Loss	Rouge1	Rouge2	Rougel	Rougelsum
# 1	2.381100	2.161340	0.594487	0.447341	0.549184	0.549877
# 2	1.905400	2.088969	0.612007	0.476901	0.571949	0.572764
# 3	1.796500	2.091182	0.617459	0.485547	0.579515	0.579718


# %%
# 🔧 Full training cell for BART summarization (Colab-ready)

# 0) Repro + device helpers
import os, random, numpy as np, torch, datetime as dt, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

def set_seed(seed=42):
    """Seed Python's `random`, NumPy and PyTorch with `seed`.

    When CUDA is available, every CUDA device is seeded as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed(42)

# Use the GPU when one is visible; bf16 is only enabled when the CUDA device
# reports compute capability major version 8 or higher.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
supports_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8  # Ampere+

# 2) Load data
csv_path = 'sample_data/validation_examples_trimmed.csv'
df = pd.read_csv(csv_path)
# 15% of the rows are held out as the test split; random_state is fixed at 42.
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)

# 3) Build HF datasets (keep raw `dataset` around for callbacks)
from datasets import Dataset, DatasetDict
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset  = Dataset.from_pandas(test_df,  preserve_index=False)
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

# 4) Tokenizer & model
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments, Seq2SeqTrainer,
    EarlyStoppingCallback, TrainerCallback
)

model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

# Generation defaults on the model (used by Trainer.generate & baseline)
# gen_conf is an alias for the model's own generation config object, so the
# assignments below change the model's defaults in place.
gen_conf = model.generation_config
gen_conf.num_beams = 4
gen_conf.no_repeat_ngram_size = 3
gen_conf.length_penalty = 1.0
# Optional: encourage non-trivial summaries
# gen_conf.min_new_tokens = 15

# 5) Preprocess/tokenize
# max_length values handed to the tokenizer for source texts and for target
# summaries; max_target_length is also reused as the generation length budget.
max_input_length  = 512
max_target_length = 128

def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    Source texts come from the "original" column (truncated to
    `max_input_length`); target texts come from the "summarization" column
    (truncated to `max_target_length`). The target token ids are stored under
    "labels" in the returned tokenizer output.
    """
    sources, summaries = examples["original"], examples["summarization"]
    encoded = tokenizer(sources, max_length=max_input_length, truncation=True)
    target_encoding = tokenizer(text_target=summaries, max_length=max_target_length, truncation=True)
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Tokenize every split in batches and drop the raw text columns; the column
# list is taken from the train split and applied to every split by this call.
tokenized_datasets = dataset.map(
    preprocess_function, batched=True,
    remove_columns=dataset["train"].column_names
)

# 6) Metrics
import evaluate
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Decode predicted and reference token ids and score them with ROUGE.

    Args:
        eval_pred: a (predictions, labels) pair of token-id sequences. When
            predictions is a tuple, its first element is used.

    Returns:
        Dict mapping each ROUGE key to a number: the mid F-measure when the
        metric returns aggregate objects, otherwise the value as returned.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    def _decode(sequences):
        # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
        # decoded. It is swapped for the pad token in predictions as well as
        # labels, since gathered predictions can also be padded with -100.
        cleaned = [[(tok if tok != -100 else pad_id) for tok in seq] for seq in sequences]
        return tokenizer.batch_decode(cleaned, skip_special_tokens=True)

    decoded_preds = _decode(predictions)
    decoded_labels = _decode(labels)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: (v.mid.fmeasure if hasattr(v, "mid") else v) for k, v in result.items()}

# 7) Baseline ROUGE with *untuned* base model
@torch.inference_mode()
def baseline_eval(texts, refs, batch_size=8):
    """Generate summaries for `texts` in batches and score them against `refs`.

    Args:
        texts: list of source texts to summarize with the global `model`.
        refs: list of reference summaries, aligned with `texts`.
        batch_size: number of texts tokenized and generated per step.

    Returns:
        (scores, preds): a dict of ROUGE values (mid F-measure when the metric
        returns aggregate objects) and the list of decoded predictions.
    """
    preds = []
    for start in range(0, len(texts), batch_size):
        enc = tokenizer(
            texts[start:start + batch_size],
            max_length=max_input_length,
            truncation=True,
            padding=True,
            return_tensors="pt",
        ).to(device)
        # Beam/repetition/length settings are read from the shared `gen_conf`.
        gen = model.generate(
            **enc,
            max_new_tokens=max_target_length,
            num_beams=gen_conf.num_beams,
            no_repeat_ngram_size=gen_conf.no_repeat_ngram_size,
            length_penalty=gen_conf.length_penalty,
        )
        preds += tokenizer.batch_decode(gen, skip_special_tokens=True)

    raw_scores = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    flat_scores = {}
    for name, value in raw_scores.items():
        flat_scores[name] = value.mid.fmeasure if hasattr(value, "mid") else value
    return flat_scores, preds

# Score the untuned checkpoint on the held-out split before any training.
test_texts = test_df["original"].tolist()
test_refs  = test_df["summarization"].tolist()
baseline_scores, baseline_preds = baseline_eval(test_texts, test_refs)
print("📊 Baseline ROUGE (base BART, no fine-tune):", baseline_scores)

# 8) Training args (matched eval/save; warmup + label smoothing; mixed precision; grad ckpt; better generation)
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    predict_with_generate=True,
    generation_max_length=max_target_length,
    generation_num_beams=gen_conf.num_beams,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    label_smoothing_factor=0.1,
    warmup_ratio=0.1,
    save_total_limit=2,
    push_to_hub=True,
    hub_model_id="dotslashderek/bart-large-cnn-prompt-summarization-v2",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    logging_strategy="epoch",
    gradient_checkpointing=True,
    # Mixed precision: bf16 when the GPU supports it, otherwise fp16 on any
    # other CUDA device; both are False on CPU.
    fp16=(torch.cuda.is_available() and not supports_bf16),
    bf16=supports_bf16
)

# 9) Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 10) Rolling sample-predictions callback (appends to one log file)
# ✅ Drop-in fix: cast indices to Python ints (no NumPy ints)


class RollingSamplePredictionCallback(TrainerCallback):
    """Print and log a few sample predictions after every evaluation.

    On each `on_evaluate` event, a random handful of rows from `dataset` is
    summarized with the model being trained; the input, prediction and
    reference are printed and appended to `<output_dir>/samples_all.txt`.
    """

    _RULE = "-" * 100

    def __init__(self, tokenizer, dataset, num_samples=3, max_len=128, output_dir="./results"):
        """Store the settings and make sure the log directory exists.

        Args:
            tokenizer: tokenizer used to encode inputs and decode generations.
            dataset: raw dataset whose rows have "original" and "summarization".
            num_samples: maximum number of rows sampled per evaluation.
            max_len: `max_new_tokens` budget for each generation.
            output_dir: directory that holds the rolling log file.
        """
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.num_samples = num_samples
        self.max_len = max_len
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        self.roll_path = os.path.join(output_dir, "samples_all.txt")

    def _summarize(self, model, text):
        """Generate one summary for `text` with `model` and return it decoded."""
        enc = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
        with torch.no_grad():
            out = model.generate(
                **enc,
                max_new_tokens=self.max_len,
                num_beams=4,
                no_repeat_ngram_size=3,
                length_penalty=1.0,
            )
        return self.tokenizer.decode(out[0], skip_special_tokens=True)

    def on_evaluate(self, args, state, control, model=None, **kwargs):
        """Sample rows, generate predictions, then print and append them to the log."""
        model.eval()
        epoch = int(state.epoch or 0)
        stamp = dt.datetime.now().isoformat(timespec="seconds")

        # random.sample over a range yields plain Python ints (no NumPy ints).
        n_rows = len(self.dataset)
        picks = random.sample(range(n_rows), k=min(self.num_samples, n_rows))

        header = f"\n\n📘 Epoch {epoch} — {stamp}\n" + self._RULE + "\n"
        print(header)
        with open(self.roll_path, "a", encoding="utf-8") as log:
            log.write(header)
            for rank, row_idx in enumerate(picks, start=1):
                example = self.dataset[int(row_idx)]
                source = example["original"]
                ref = example["summarization"]
                pred = self._summarize(model, source)

                # Each field is cut to its first 500 characters for the log.
                entry = "".join([
                    f"🟢 Sample {rank}\n",
                    f"Input: {source[:500]}...\n",
                    f"Pred : {pred[:500]}...\n",
                    f"Ref  : {ref[:500]}...\n",
                    self._RULE + "\n",
                ])
                print(entry)
                log.write(entry)

        print(f"✅ Appended to {self.roll_path}")


# 11) Trainer + callbacks (processing_class fixes deprecation)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=2),
        RollingSamplePredictionCallback(tokenizer, dataset["test"], num_samples=3, max_len=128, output_dir="./results"),
    ],
)

# 12) Train + push
trainer.train()

# NOTE(review): the resumed run below restores optimizer and LR-scheduler state
# from the checkpoint, so this manual learning-rate change is presumably
# overwritten — verify the logged learning rate before relying on it.
for g in trainer.optimizer.param_groups:
    g["lr"] = 1e-5  # was 2e-5

# Wider beam and a longer length budget for the evaluations of the extra epoch.
trainer.args.generation_num_beams = 6
trainer.args.generation_max_length = 160
# repetition_penalty is a generation setting, not a Seq2SeqTrainingArguments
# field: assigning it on trainer.args only created an attribute nothing reads.
# Setting it on the model's generation config makes generate() apply it.
# NOTE(review): it will presumably also be saved with the pushed model — confirm that is intended.
trainer.model.generation_config.repetition_penalty = 1.1

# Run one more epoch, continuing from the latest checkpoint in output_dir.
trainer.args.num_train_epochs += 1
trainer.train(resume_from_checkpoint=True)

trainer.push_to_hub()

# 13) Quick plots (loss & ROUGE)
# Dump the trainer's full log history to CSV, then split it into the rows that
# carry a training loss and the rows that carry an evaluation loss.
logs = pd.DataFrame(trainer.state.log_history)
logs.to_csv("./results/trainer_log_history.csv", index=False)

train_logs = logs[logs["loss"].notna()][["step", "loss"]].reset_index(drop=True)
eval_logs  = logs[logs["eval_loss"].notna()].reset_index(drop=True)

# Training loss per step
plt.figure()
plt.plot(train_logs["step"], train_logs["loss"])
plt.title("Training Loss vs Step")
plt.xlabel("Step")
plt.ylabel("Loss")
plt.grid(True)
plt.show()

# Validation loss per epoch
plt.figure()
plt.plot(eval_logs["epoch"], eval_logs["eval_loss"], marker="o")
plt.title("Validation Loss vs Epoch")
plt.xlabel("Epoch")
plt.ylabel("Eval Loss")
plt.grid(True)
plt.show()

# ROUGE per epoch; a metric is drawn only if its column exists in the eval logs
plt.figure()
for k, label in [("eval_rougeL", "ROUGE-L"), ("eval_rouge1", "ROUGE-1"), ("eval_rouge2", "ROUGE-2")]:
    if k in eval_logs:
        plt.plot(eval_logs["epoch"], eval_logs[k], marker="o", label=label)
plt.title("ROUGE vs Epoch")
plt.xlabel("Epoch")
plt.ylabel("Score")
plt.legend()
plt.grid(True)
plt.show()

print("Saved raw trainer logs to ./results/trainer_log_history.csv")
