# Fine-Tuning GPT

In [1]:
import tensorflow as tf, math, numpy as np
from tensorflow.keras import mixed_precision
from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel, create_optimizer
from pathlib import Path

mixed_precision.set_global_policy("mixed_float16")   # GPU speed-up
import tensorflow as tf, os
print(tf.config.list_physical_devices("GPU"))


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# ── 1.  tokenizer with course-specific special tokens ──────────────
SPECIAL = ["<|question|>", "<|answer|>", "<|statement|>", "<|end|>"]
tok = GPT2TokenizerFast.from_pretrained("gpt2")
tok.pad_token = tok.eos_token                     # GPT-2 needs explicit pad
tok.add_special_tokens({"additional_special_tokens": SPECIAL})

# ── 2.  read the corpus and pre-tokenise in one call ───────────────
BLOCK = 512                                       # GPT-2 context span
txt_path = "data/clean_corpus.txt"
lines = Path(txt_path).read_text().splitlines()     # ≈ 1 line = 1 sample

enc = tok(lines,
          truncation=True,
          max_length=BLOCK,
          padding="max_length",                     # left-padded to 1024
          return_tensors="np")                      # gives NumPy arrays

input_ids      = enc["input_ids"]                   # shape (N, 1024)
attention_mask = enc["attention_mask"]

# ── 3.  wrap the arrays in tf.data  ────────────────────────────────
def as_ds(arr):          # helper: slice a 2-D NumPy array
    return tf.data.Dataset.from_tensor_slices(arr)

ds_ids  = as_ds(input_ids)
ds_mask = as_ds(attention_mask)

dataset = tf.data.Dataset.zip((ds_ids, ds_mask)).map(
    lambda ids, mask: {"input_ids": ids,
                       "attention_mask": mask,
                       "labels": ids},      # causal-LM target = ids
    num_parallel_calls=tf.data.AUTOTUNE)

# ── 4.  train / valid split, shuffle, batch ────────────────────────
SIZE   = tf.data.experimental.cardinality(dataset).numpy()
split  = int(0.95 * SIZE)

train_ds = (dataset.take(split)
                     .shuffle(10_000)
                     .batch(8, drop_remainder=True)
                     .prefetch(tf.data.AUTOTUNE))

valid_ds = (dataset.skip(split)
                     .batch(8, drop_remainder=True)
                     .prefetch(tf.data.AUTOTUNE))

# ── 5.  build & compile the model ──────────────────────────────────
model = TFGPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tok))             # **critical**

EPOCHS  = 3
STEPS   = tf.data.experimental.cardinality(train_ds).numpy() * EPOCHS
WARMUP  = int(0.1 * STEPS)

opt, lr_schedule = create_optimizer(
        init_lr=5e-5,
        num_train_steps=STEPS,
        num_warmup_steps=WARMUP,
        weight_decay_rate=0.01)

model.compile(optimizer=opt)                        # HF supplies loss

# ── 6.  train ──────────────────────────────────────────────────────
hist = model.fit(
        train_ds,
        validation_data=valid_ds,
        epochs=EPOCHS,
        # callbacks=[tf.keras.callbacks.EarlyStopping(
        #                monitor="val_loss",
        #                patience=2,
        #                restore_best_weights=True)]
)

print("final validation perplexity:",
      round(math.exp(hist.history["val_loss"][-1]), 2))

# ── 7.  save checkpoint ────────────────────────────────────────────
SAVE_DIR = "phpe400_finetuned"
model.save_pretrained(SAVE_DIR)
tok.save_pretrained(SAVE_DIR)
print("✓ saved to", SAVE_DIR)


All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3


2025-05-06 21:44:18.381807: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3
final validation perplexity: 4.18
✓ saved to phpe400_finetuned


In [4]:
from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
tok   = GPT2TokenizerFast.from_pretrained("phpe400_finetuned")
model = TFGPT2LMHeadModel.from_pretrained("phpe400_finetuned")


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at phpe400_finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [11]:
prompt = "<|question|> What is a rational preference? <|answer|> "
inputs  = tok(prompt, return_tensors="tf")

eos_id  = tok.convert_tokens_to_ids("<|end|>")      # the end-marker you added
gen_ids = model.generate(
            **inputs,
            max_new_tokens=240,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            eos_token_id=eos_id,
            pad_token_id=eos_id,
)

print(tok.decode(gen_ids[0], skip_special_tokens=True))


 What is a rational preference?  ��� in���� the�U����anceteness���et�
