In [1]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [15]:
import os
import json
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from datasets import Dataset
import torch

### Define Paths and Model Name

In [3]:
DATA_PATH = "/content/fine_tune_dataset.json"
OUTPUT_DIR = "/content/fine_tuned_model"

### Load Data Function

In [4]:
def load_data(data_path):
    """Load the dataset from JSON."""
    with open(data_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return Dataset.from_list(data)

### Tokenization Function

In [5]:
def tokenize_data(examples, tokenizer, max_input_length=512, max_output_length=128):
    """
    Tokenize the dataset.
    - Inputs: Subject + Text so far
    - Outputs: Next words
    """
    inputs = examples["input"]  # Extract the 'input' field from the batch
    outputs = examples["output"]  # Extract the 'output' field from the batch
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
    )
    labels = tokenizer(
        outputs,
        max_length=max_output_length,
        padding="max_length",
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

### Load Dataset and Model

In [6]:
MODEL_NAME = 'facebook/bart-base'

In [7]:
# Load dataset
print(f"Loading dataset from {DATA_PATH}...")
dataset = load_data(DATA_PATH)
print(f"Loaded {len(dataset)} examples.")

# Load tokenizer and model
print(f"Loading model and tokenizer ({MODEL_NAME})...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

Loading dataset from /content/fine_tune_dataset.json...
Loaded 21674 examples.
Loading model and tokenizer (facebook/bart-base)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

### Tokenize the Dataset

In [8]:
# Tokenize dataset
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(
    lambda examples: tokenize_data(examples, tokenizer),
    batched=True,
    remove_columns=["input", "output"],
)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Tokenizing dataset...


Map:   0%|          | 0/21674 [00:00<?, ? examples/s]

### Define Training Arguments

In [22]:
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="steps",  # Ensure evaluation happens during training
    eval_steps=2000,  # Adjust based on dataset size and training speed
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,  # Set a higher number; early stopping will stop earlier if needed
    save_total_limit=2,
    logging_steps=2000,
    save_steps=2000,
    save_strategy="steps",
    fp16=True if torch.cuda.is_available() else False,
    load_best_model_at_end=True,  # Enable loading the best model
    metric_for_best_model="eval_loss",  # Metric to monitor (change as needed)
    greater_is_better=False,  # Set to False for loss, True for accuracy, etc.
    report_to="wandb",  # If using Weights and Biases
)

Setting up training arguments...




### Initialize and Run Trainer

In [23]:
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Stop after 3 non-improving evals
)

Initializing Trainer...


In [None]:
# Train
print("Starting fine-tuning...")
trainer.train()

Starting fine-tuning...


Step,Training Loss,Validation Loss
500,0.0665,0.162446
1000,0.1142,0.135349
1500,0.1765,0.122891
2000,0.1637,0.107169
2500,0.1536,0.095309
3000,0.1287,0.083425
3500,0.113,0.075364
4000,0.1098,0.067262
4500,0.1028,0.061101
5000,0.0983,0.054704


Step,Training Loss,Validation Loss
500,0.0665,0.162446
1000,0.1142,0.135349
1500,0.1765,0.122891
2000,0.1637,0.107169
2500,0.1536,0.095309
3000,0.1287,0.083425
3500,0.113,0.075364
4000,0.1098,0.067262
4500,0.1028,0.061101
5000,0.0983,0.054704


### Save the Fine-Tuned Model

In [None]:
print(f"Saving model to {OUTPUT_DIR}...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Model saved.")