# Install and Import Libraries
Install necessary libraries like wandb, transformers, and datasets. Then, import them.

In [1]:
# Import libraries
import wandb
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch

In [2]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcehrett[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Do LoRA with W&B logging

In [3]:
import os
import random
import time
import wandb
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    TrainerCallback,
    logging
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import torch


In [4]:
# Initialize WandB
wandb.init(project="llms_finetune", job_type="training")

# Access configuration
config = wandb.config
config.model_name = "Qwen/Qwen2.5-0.5B-Instruct"
config.dataset_name = "HuggingFaceH4/MATH-500"
config.lora_r = 8
config.lora_alpha = 32
config.lora_dropout = 0.05
config.per_device_train_batch_size = 8
config.learning_rate = 1e-4
config.num_train_epochs = 2
config.fp16 = True
config.bf16 = False
config.save_steps = 10
config.eval_steps = 10
config.save_total_limit = 1
config.optim = "paged_adamw_32bit"


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


## Prepare data (just as in previous noteboooks)

In [5]:

# Load the dataset
ds = load_dataset(config.dataset_name)
train_val_dataset = ds["test"].train_test_split(test_size=0.1)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]

# Load the model and tokenizer
model_name = config.model_name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"
)

# The model may not have a pad token set by default, so set it (using the EOS token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator

data_collator_fn = lambda features: data_collator(features, tokenizer=tokenizer)

# Map the formatting function over the dataset.
train_dataset_tokenized = train_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
eval_dataset_tokenized = eval_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})

# Get a sample dataset so we can examine model generations before and after training
sample_dataset = eval_dataset.select(range(3))
sample_dataset_tokenized = sample_dataset.map(tokenize_for_generation, batched=False, fn_kwargs={"tokenizer": tokenizer})

train_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
eval_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])

from peft import LoraConfig, get_peft_model, TaskType

# Define LoRA Config
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    inference_mode=False, 
    r=config.lora_r, 
    lora_alpha=config.lora_alpha, 
    lora_dropout=config.lora_dropout,
    target_modules=["q_proj", "v_proj"] # Replace with the target modules of your model
)

# Add LoRA adapter to the model
model = get_peft_model(model, lora_config)

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

## Perform training (while logging to W&B)

In [6]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./qwen-lora-math",          # Output directory
    num_train_epochs=config.num_train_epochs,              # Number of training epochs
    per_device_train_batch_size=config.per_device_train_batch_size,   # Batch size per device during training
    optim=config.optim,        # Optimizer, you might need to install accelerate: pip install accelerate -U
    save_steps=config.save_steps,                   # Save checkpoint every X updates steps
    eval_steps=config.eval_steps,                   # Evaluate every X updates steps
    eval_strategy="steps",           # Evaluation strategy
    save_total_limit=config.save_total_limit,              # Limit the total amount of checkpoints
    load_best_model_at_end=True,     # Load the best model when finished training (default is True)
    logging_steps=10,                # Log every X updates steps
    learning_rate=config.learning_rate,              # Learning rate
    fp16=config.fp16,                       # Use mixed precision training
    bf16=config.bf16,                      # Use bfloat16 training
    report_to="wandb"
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn,     # Data collator if needed
)

# Train the model
trainer.train()

# Finish the WandB run
wandb.finish()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2025-02-28 14:48:49,984] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nnc5bwnritue/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.2.0/anaconda3-2023.09-0-3mhml42fa64byxqyd5fig5tbih625dp2/lib/libstdc++.so.6: undefined reference to `fesetround@GLIBC_2.2.5'
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nnc5bwnritue/lib64/libcufile.so: undefined reference to `dlvsym'
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nnc5bwnritue/lib64/libcufile.so: undefined reference to `dlop

Step,Training Loss,Validation Loss
10,0.7208,0.717406
20,0.7266,0.700726
30,0.6636,0.695522
40,0.7218,0.692667
50,0.7412,0.691015
60,0.7893,0.689844
70,0.7092,0.688991
80,0.7331,0.688324
90,0.6329,0.688021
100,0.7031,0.687885


0,1
eval/loss,█▄▃▂▂▁▁▁▁▁▁
eval/runtime,▅█▁▁▅▆▅▇▆▄▅
eval/samples_per_second,▄▁██▄▃▄▂▃▅▄
eval/steps_per_second,▄▁██▄▃▄▂▃▅▄
train/epoch,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▆▆▆▆▇▇███
train/global_step,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▆▆▆▆▇▇███
train/grad_norm,█▁▃▅▂▁▂▄▂▂▂
train/learning_rate,█▇▇▆▅▅▄▃▂▂▁
train/loss,▅▅▂▅▆█▄▅▁▄▄

0,1
eval/loss,0.68783
eval/runtime,0.9019
eval/samples_per_second,55.437
eval/steps_per_second,7.761
total_flos,1266800085433344.0
train/epoch,2.0
train/global_step,114.0
train/grad_norm,0.73051
train/learning_rate,0.0
train/loss,0.7058


# Run a WandB Sweep
Define a sweep configuration with hyperparameters to tune. Use `wandb.sweep()` to create a sweep and `wandb.agent()` to run the sweep agent, optimizing the hyperparameters.

In [7]:
# Set up the config we'll use for each run during the sweep

config_defaults = {
    "model_name": "Qwen/Qwen2.5-0.5B-Instruct",
    "dataset_name": "HuggingFaceH4/MATH-500",
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "num_train_epochs": 2,
    "fp16": True,
    "bf16": False,
    "save_steps": 20,
    "eval_steps": 20,
    "eval_strategy": "steps",
    "save_total_limit": 1,
    "epochs": 2,
    "optim": "paged_adamw_32bit"
}


In [8]:
# Define a sweep configuration with hyperparameters to tune
sweep_config = {
    "method": "random",  # Random search; other options include "grid", "bayesian", etc.
    "metric": {
        "name": "eval/loss",
        "goal": "minimize"
    },
    "parameters": {
        "learning_rate": {
            "values": [1e-5, 1e-4, 1e-3]
        },
        "batch_size": {
            "values": [4, 8]
        },
        "lora_r": {
            "values": [4, 8, 16]
        }
    }
}

# Create a sweep
sweep_id = wandb.sweep(sweep_config, project="llm-finetuning")

# Define the training function
def train():
    # Initialize a new wandb run
    wandb.init(config=config_defaults)
    config = wandb.config
    
    # Load the dataset
    ds = load_dataset(wandb.config.dataset_name)
    train_val_dataset = ds["test"].train_test_split(test_size=0.1)
    train_dataset = train_val_dataset["train"]
    eval_dataset = train_val_dataset["test"]
    
    # Load the model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    model = AutoModelForCausalLM.from_pretrained(config.model_name, device_map="auto")

    # The model may not have a pad token set by default, so set it (using the EOS token)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator
    
    data_collator_fn = lambda features: data_collator(features, tokenizer=tokenizer)
    
    # Map the formatting function over the dataset.
    train_dataset_tokenized = train_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
    eval_dataset_tokenized = eval_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
    
    # Get a sample dataset so we can examine model generations before and after training
    sample_dataset = eval_dataset.select(range(3))
    sample_dataset_tokenized = sample_dataset.map(tokenize_for_generation, batched=False, fn_kwargs={"tokenizer": tokenizer})
    
    train_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
    eval_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
    sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
    
    from peft import LoraConfig, get_peft_model, TaskType
    
    # Define LoRA Config
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM, 
        inference_mode=False, 
        r=config.lora_r, 
        lora_alpha=config.lora_alpha, 
        lora_dropout=config.lora_dropout,
        target_modules=["q_proj", "v_proj"] # Replace with the target modules of your model
    )
    
    # Add LoRA adapter to the model
    model = get_peft_model(model, lora_config)
    
    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=config.epochs,
        per_device_train_batch_size=config.batch_size,
        save_steps=config.save_steps,
        eval_steps=config.eval_steps,
        eval_strategy=config.eval_strategy,
        save_total_limit=1,
        learning_rate=config.learning_rate,
        fp16=config.fp16,
        bf16=config.bf16,
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        report_to="wandb",
    )
    
    
    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset_tokenized,
        eval_dataset=eval_dataset_tokenized,
        tokenizer=tokenizer,
        data_collator=data_collator_fn,
    )
    
    # Train the model
    trainer.train()
    
    # Save the model
    # model.save_pretrained(f"./model_{wandb.run.id}")
    # tokenizer.save_pretrained(f"./model_{wandb.run.id}")
    
    # # Log model checkpoint as artifact
    # artifact = wandb.Artifact(f"model_{wandb.run.id}", type="model")
    # artifact.add_dir(f"./model_{wandb.run.id}")
    # wandb.log_artifact(artifact)
    
    # Finish the WandB run
    wandb.finish()

# Run the sweep agent
wandb.agent(sweep_id, function=train, count=3)

Create sweep with ID: ciseumso
Sweep URL: https://wandb.ai/cehrett/llm-finetuning/sweeps/ciseumso


[34m[1mwandb[0m: Agent Starting Run: 7vf3ial4 with config:
[34m[1mwandb[0m: 	batch_size: 4
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	lora_r: 16
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
20,0.78,0.750669
40,0.6827,0.74214
60,0.7347,0.738769
80,0.6836,0.737239
100,0.6947,0.736443
120,0.6586,0.735757
140,0.7472,0.734871
160,0.6498,0.734308
180,0.829,0.734044
200,0.7718,0.733845


0,1
eval/loss,█▄▃▂▂▂▁▁▁▁▁
eval/runtime,█▁▁▁▁▁▁▁▁▁▁
eval/samples_per_second,▁██████████
eval/steps_per_second,▁██████████
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▃▂▂█▂▂▃▁▂▂▁▂▁▃▃▁▂▃▂▂▁▃
train/learning_rate,██▇▇▇▆▆▆▅▅▅▄▄▄▃▃▃▂▂▂▁▁
train/loss,█▆▁▃█▅▅▃▅▄▂▃▄▅▁▂▃▇▃▆▃▂

0,1
eval/loss,0.73368
eval/runtime,0.5781
eval/samples_per_second,86.495
eval/steps_per_second,12.109
total_flos,1050475838421504.0
train/epoch,2.0
train/global_step,226.0
train/grad_norm,1.02329
train/learning_rate,0.0
train/loss,0.6218


[34m[1mwandb[0m: Agent Starting Run: u8d1dtkt with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	lora_r: 4
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
20,0.6229,0.747199
40,0.7151,0.739966
60,0.6146,0.737876
80,0.6415,0.736833
100,0.7246,0.736279


0,1
eval/loss,█▃▂▁▁
eval/runtime,▁▅█▂▂
eval/samples_per_second,█▄▁▇▇
eval/steps_per_second,█▄▁▇▇
train/epoch,▁▂▂▂▃▃▄▄▄▅▆▆▆▇▇██
train/global_step,▁▂▂▂▃▃▄▄▄▅▆▆▆▇▇██
train/grad_norm,▅█▃▁▂▁▂▂▃▃▆
train/learning_rate,█▇▇▆▅▅▄▃▂▂▁
train/loss,█▁▆▄▅▁▄▂▅▅▂

0,1
eval/loss,0.73628
eval/runtime,0.7221
eval/samples_per_second,69.242
eval/steps_per_second,9.694
total_flos,1297336262238720.0
train/epoch,2.0
train/global_step,114.0
train/grad_norm,1.87369
train/learning_rate,0.0
train/loss,0.6332


[34m[1mwandb[0m: Agent Starting Run: il0iyxef with config:
[34m[1mwandb[0m: 	batch_size: 4
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	lora_r: 8
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
20,0.8724,0.837065
40,0.7487,0.811215
60,0.7924,0.789239
80,0.7239,0.775309
100,0.7285,0.768237
120,0.6983,0.763917
140,0.7962,0.761057
160,0.6938,0.759
180,0.8757,0.757795
200,0.8106,0.75698


0,1
eval/loss,█▆▄▃▂▂▁▁▁▁▁
eval/runtime,█▁▂▁▂▂▂▂▁▂▁
eval/samples_per_second,▁█▇█▇▇▇▆█▇█
eval/steps_per_second,▁█▇█▇▇▇▆█▇█
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▆▄▅█▅▅▇▂▅▃▁▃▁▅▆▂▂▄▃▂▁▄
train/learning_rate,██▇▇▇▆▆▆▅▅▅▄▄▄▃▃▃▂▂▂▁▁
train/loss,▇▇▂▃█▅▅▃▅▃▁▂▃▅▁▂▃▇▂▅▂▁

0,1
eval/loss,0.75657
eval/runtime,0.7512
eval/samples_per_second,66.556
eval/steps_per_second,9.318
total_flos,1048893678598656.0
train/epoch,2.0
train/global_step,226.0
train/grad_norm,1.5932
train/learning_rate,0.0
train/loss,0.6544
