# Small Prompt Fine-Tuning

A concise Hugging Face `Seq2SeqTrainer` setup for fine-tuning on either the short or the very-short prompt dataset. Adjust the parameters in the cells below before launching training, ideally on a GPU-enabled runtime such as Google Colab.

In [None]:
from __future__ import annotations

import numpy as np
import torch
from datasets import DatasetDict, load_dataset
from pathlib import Path
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq,
                          Seq2SeqTrainer, Seq2SeqTrainingArguments)

# Parent of the current working directory, resolved to an absolute path.
PROJECT_ROOT = Path('..').resolve()

# All four CSV files live in the same folder under the project root.
_DATA_DIR = PROJECT_ROOT.joinpath('src', 'training_data')
SHORT_TRAIN = _DATA_DIR / 'dsp-train.csv'
SHORT_TEST = _DATA_DIR / 'dsp-test.csv'
VERY_SHORT_TRAIN = _DATA_DIR / 'dvsp-train.csv'
VERY_SHORT_TEST = _DATA_DIR / 'dvsp-test.csv'

# Checkpoint to fine-tune and the token limits applied when tokenizing
# source texts and target texts respectively.
BASE_MODEL = 'Falconsai/text_summarization'
MAX_SOURCE_LENGTH = 512
MAX_TARGET_LENGTH = 128

# Seed NumPy and PyTorch (CPU, plus every CUDA device when one is available)
# with the same fixed value.
_SEED = 42
np.random.seed(_SEED)
torch.manual_seed(_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(_SEED)


def load_datasets(use_very_short: bool = False) -> DatasetDict:
    """Load one train/test CSV pair through the ``datasets`` CSV loader.

    Args:
        use_very_short: When True, read ``VERY_SHORT_TRAIN`` and
            ``VERY_SHORT_TEST``; otherwise read ``SHORT_TRAIN`` and
            ``SHORT_TEST``.

    Returns:
        The result of ``load_dataset('csv', ...)`` with the training file
        under the ``'train'`` key and the test file under ``'validation'``.
    """
    train_path, eval_path = (
        (VERY_SHORT_TRAIN, VERY_SHORT_TEST)
        if use_very_short
        else (SHORT_TRAIN, SHORT_TEST)
    )
    # The test CSV is registered as the 'validation' split.
    return load_dataset(
        'csv',
        data_files={'train': str(train_path), 'validation': str(eval_path)},
    )


def prepare_tokenizer():
    """Load the tokenizer for ``BASE_MODEL``, requesting the fast implementation."""
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
    return tokenizer


def preprocess_factory(tokenizer):
    """Build a batch preprocessing function bound to ``tokenizer``.

    Args:
        tokenizer: Callable tokenizer used to encode both sources and targets.

    Returns:
        A function taking a batch (a mapping with ``'original'`` and
        ``'compressed_prompt'`` entries) and returning the tokenized sources
        with the target token ids stored under ``'labels'``. Sources are
        truncated to ``MAX_SOURCE_LENGTH`` tokens and targets to
        ``MAX_TARGET_LENGTH`` tokens. No padding is applied here.
    """
    def preprocess(batch):
        inputs = tokenizer(batch['original'], truncation=True, max_length=MAX_SOURCE_LENGTH)
        # Encode the targets through ``text_target`` so that tokenizers with
        # target-specific processing build the labels in target mode; for
        # tokenizers without such a mode the result is the same as a plain call.
        labels = tokenizer(
            text_target=batch['compressed_prompt'],
            truncation=True,
            max_length=MAX_TARGET_LENGTH,
        )
        inputs['labels'] = labels['input_ids']
        return inputs
    return preprocess

def build_trainer(dataset: DatasetDict, tokenizer):
    """Assemble a ``Seq2SeqTrainer`` for ``BASE_MODEL``.

    Args:
        dataset: Mapping with ``'train'`` and ``'validation'`` entries, used
            as the training and evaluation datasets.
        tokenizer: Tokenizer handed to both the data collator and the trainer.

    Returns:
        A configured ``Seq2SeqTrainer``. Training is not started here.
    """
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
    collator = DataCollatorForSeq2Seq(tokenizer, model=seq2seq_model)

    hyperparameters = {
        'output_dir': 'results',
        # Evaluate, checkpoint and log on the same per-epoch schedule.
        'evaluation_strategy': 'epoch',
        'save_strategy': 'epoch',
        'logging_strategy': 'epoch',
        'save_total_limit': 2,
        # Optimisation settings.
        'learning_rate': 3e-5,
        'weight_decay': 0.01,
        'num_train_epochs': 3,
        'per_device_train_batch_size': 8,
        'per_device_eval_batch_size': 8,
        # Generation settings used for prediction.
        'predict_with_generate': True,
        'generation_max_length': MAX_TARGET_LENGTH,
        # Best checkpoint is selected by eval_loss, where lower is better.
        'load_best_model_at_end': True,
        'metric_for_best_model': 'eval_loss',
        'greater_is_better': False,
    }
    trainer_kwargs = {
        'model': seq2seq_model,
        'args': Seq2SeqTrainingArguments(**hyperparameters),
        'train_dataset': dataset['train'],
        'eval_dataset': dataset['validation'],
        'tokenizer': tokenizer,
        'data_collator': collator,
    }
    return Seq2SeqTrainer(**trainer_kwargs)

In [None]:
# Pick which CSV pair to load: the short-prompt files by default, or the
# very-short-prompt files when the flag below is True.
use_very_short = False  # Set True to train on the ≤64 token subset
datasets = load_datasets(use_very_short=use_very_short)
# Print the loaded object so it can be inspected before tokenizing.
print(datasets)

In [None]:
# Tokenize every split in batches using the tokenizer for the base model.
tokenizer = prepare_tokenizer()
preprocess_fn = preprocess_factory(tokenizer)
processed = datasets.map(preprocess_fn, batched=True)

# Drop every column except the three listed here; the list of columns to
# remove is computed from the 'train' split.
keep_columns = ['input_ids', 'attention_mask', 'labels']
extra_columns = list(filter(lambda name: name not in keep_columns,
                            processed['train'].column_names))
processed = processed.remove_columns(extra_columns)

trainer = build_trainer(processed, tokenizer)
print('Trainer ready. Uncomment trainer.train() when running on GPU.')

In [None]:
# Left commented out on purpose: uncomment both lines on a GPU runtime to run
# training and then save the model to 'results/final'.
# trainer.train()
# trainer.save_model('results/final')