In [1]:
!pip install -U "transformers>=4.41" "trl>=0.9.6" "peft>=0.11.0" \
  accelerate datasets bitsandbytes wandb

Collecting transformers>=4.41
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl>=0.9.6
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-4.2.0-py3-none-any.whl.metadata (18 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading transformers-4.57.1-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m139.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.24.0-py3-none-any.whl (423 kB)
[2K   [90m━━━━━━━━━━━━━━

In [7]:
from inspect import signature
from trl import DPOConfig

print("=== DPOConfig signature ===")
sig = signature(DPOConfig.__init__)
for name, param in sig.parameters.items():
    print(f"{name}: {param}")

=== DPOConfig signature ===
self: self
output_dir: output_dir: Optional[str] = None
overwrite_output_dir: overwrite_output_dir: bool = False
do_train: do_train: bool = False
do_eval: do_eval: bool = False
do_predict: do_predict: bool = False
eval_strategy: eval_strategy: Union[transformers.trainer_utils.IntervalStrategy, str] = 'no'
prediction_loss_only: prediction_loss_only: bool = False
per_device_train_batch_size: per_device_train_batch_size: int = 8
per_device_eval_batch_size: per_device_eval_batch_size: int = 8
per_gpu_train_batch_size: per_gpu_train_batch_size: Optional[int] = None
per_gpu_eval_batch_size: per_gpu_eval_batch_size: Optional[int] = None
gradient_accumulation_steps: gradient_accumulation_steps: int = 1
eval_accumulation_steps: eval_accumulation_steps: Optional[int] = None
eval_delay: eval_delay: float = 0
torch_empty_cache_steps: torch_empty_cache_steps: Optional[int] = None
learning_rate: learning_rate: float = 1e-06
weight_decay: weight_decay: float = 0.0
adam_bet

In [4]:
import os
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import DPOTrainer, DPOConfig
import wandb

# ------------------------
# Configuration
# ------------------------
MODEL_REPO          = "meta-llama/Llama-3.1-8B-Instruct"
DATA_JSONL          = "dpo_pairs.jsonl"
OUTPUT_DIR          = "dpo_llama31_med"

USE_QLORA           = True
BETA                = 0.2
LR                  = 1e-5
EPOCHS              = 3
BATCH_SIZE          = 2
GRAD_ACCUM          = 8
MAX_SEQ_LEN         = 768
MAX_PROMPT_LEN      = 736
MAX_COMPLETION_LEN  = 8   # single-token-style completions are short

SEED                = 42
SAVE_STEPS          = 200
EVAL_STEPS          = 200
LOGGING_STEPS       = 20

LORA_R              = 16
LORA_ALPHA          = 32
LORA_DROPOUT        = 0.05
LORA_TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

# Weights & Biases
USE_WANDB           = True
WANDB_PROJECT       = "llama31-med-dpo"
WANDB_RUN_NAME      = "dpo-qlora-8b-med"
WANDB_ENTITY        = None  # set your org/team or keep None

# Hugging Face Hub (make sure you already ran huggingface_hub.login(...))
PUSH_TO_HUB         = True
HUB_MODEL_ID        = "Easonwangzk/dpo-llama31-med-adapter"
HUB_PRIVATE_REPO    = True

# ------------------------
# Tokenizer
# ------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# ------------------------
# Load base model (LoRA or QLoRA)
# ------------------------
def load_model():
    if USE_QLORA:
        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_REPO, quantization_config=quant_cfg, device_map="auto"
        )
        model = prepare_model_for_kbit_training(model)
    else:
        use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
        torch_dtype = torch.bfloat16 if use_bf16 else torch.float16
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_REPO, torch_dtype=torch_dtype, device_map="auto"
        )
    return model

# ------------------------
# Attach LoRA adapters
# ------------------------
def attach_lora(model):
    lora_cfg = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=LORA_TARGET_MODULES,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_cfg)
    model.print_trainable_parameters()
    return model

# ------------------------
# Dataset Loading
# ------------------------
def load_datasets(path: str):
    data = load_dataset("json", data_files=path, split="train")
    n = len(data)
    eval_size = max(1, int(0.15 * n))
    train_ds = data.select(range(n - eval_size))
    eval_ds  = data.select(range(n - eval_size, n))
    print(f"Train samples: {len(train_ds)}, Eval samples: {len(eval_ds)}")
    return train_ds, eval_ds

# ------------------------
# WANDB init (optional)
# ------------------------
def maybe_init_wandb():
    if not USE_WANDB:
        os.environ["WANDB_DISABLED"] = "true"
        return
    wandb.init(
        project=WANDB_PROJECT,
        name=WANDB_RUN_NAME,
        entity=WANDB_ENTITY,
        config={
            "model_repo": MODEL_REPO,
            "use_qlora": USE_QLORA,
            "beta": BETA,
            "lr": LR,
            "epochs": EPOCHS,
            "batch_size": BATCH_SIZE,
            "grad_accum": GRAD_ACCUM,
            "max_seq_len": MAX_SEQ_LEN,
            "max_prompt_len": MAX_PROMPT_LEN,
            "max_completion_len": MAX_COMPLETION_LEN,
            "lora_r": LORA_R,
            "lora_alpha": LORA_ALPHA,
            "lora_dropout": LORA_DROPOUT,
            "save_steps": SAVE_STEPS,
            "eval_steps": EVAL_STEPS,
        },
    )

# ------------------------
# Main Training
# ------------------------
def main():
    torch.manual_seed(SEED)
    maybe_init_wandb()

    train_ds, eval_ds = load_datasets(DATA_JSONL)
    model = load_model()
    model = attach_lora(model)

    # Build training configuration using DPOConfig (per your TRL signature)
    dpo_args = DPOConfig(
        # General trainer args
        output_dir=OUTPUT_DIR,
        do_train=True,
        do_eval=True,
        eval_strategy="steps",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM,
        learning_rate=LR,
        num_train_epochs=EPOCHS,
        logging_steps=LOGGING_STEPS,
        save_steps=SAVE_STEPS,
        eval_steps=EVAL_STEPS,
        save_total_limit=3,
        warmup_ratio=0.05,
        lr_scheduler_type="cosine",
        bf16=True if torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8 else False,
        fp16=not (torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8),
        seed=SEED,
        report_to=("wandb" if USE_WANDB else "none"),
        run_name=WANDB_RUN_NAME if USE_WANDB else None,

        # Push-to-Hub
        push_to_hub=PUSH_TO_HUB,
        hub_model_id=HUB_MODEL_ID,
        hub_private_repo=HUB_PRIVATE_REPO,
        hub_strategy="every_save",

        # DPO-specific args present in your DPOConfig
        beta=BETA,
        max_length=MAX_SEQ_LEN,
        max_prompt_length=MAX_PROMPT_LEN,
        max_completion_length=MAX_COMPLETION_LEN,
        label_smoothing=0.0,
        loss_type=["sigmoid"],  # list[str] required by your signature
        # Optional: disable_dropout=True keeps training stable on small data
        disable_dropout=True,
    )

    print("Starting DPO fine-tuning...")
    trainer = DPOTrainer(
        model=model,
        args=dpo_args,              # DPOConfig goes here
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        processing_class=tokenizer, # tokenizer must be passed via 'processing_class'
        # ref_model=None,           # let TRL handle the reference model internally (frozen clone)
    )

    trainer.train()

    # Save LoRA adapter and tokenizer locally
    trainer.model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"DPO training completed. Adapter saved to: {OUTPUT_DIR}")

    # Push adapter to Hugging Face Hub
    if PUSH_TO_HUB:
        trainer.push_to_hub(commit_message="Upload DPO LoRA adapter")

    if USE_WANDB:
        wandb.finish()

if __name__ == "__main__":
    main()

Generating train split: 0 examples [00:00, ? examples/s]

Train samples: 255, Eval samples: 45


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196
Starting DPO fine-tuning...


Extracting prompt in train dataset:   0%|          | 0/255 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/255 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/255 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/45 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/45 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/45 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss,Validation Loss


DPO training completed. Adapter saved to: dpo_llama31_med


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...a31_med/training_args.bin: 100%|##########| 6.87kB / 6.87kB            

  ...adapter_model.safetensors:  25%|##4       | 41.9MB /  168MB            

  ...lama31_med/tokenizer.json: 100%|##########| 17.2MB / 17.2MB            

No files have been modified since last commit. Skipping to prevent empty commit.


0,1
train/epoch,▁▆█
train/global_step,▁▆█
train/grad_norm,▁█
train/learning_rate,█▁
train/logits/chosen,█▁
train/logits/rejected,█▁
train/logps/chosen,█▁
train/logps/rejected,█▁
train/loss,█▁
train/rewards/accuracies,▁█

0,1
total_flos,0
train/epoch,3
train/global_step,48
train/grad_norm,9.4304
train/learning_rate,0.0
train/logits/chosen,1.14709
train/logits/rejected,0.82157
train/logps/chosen,-3.07736
train/logps/rejected,-13.12426
train/loss,0.4124
