# Install and Import Libraries
Install necessary libraries like wandb, transformers, and datasets. Then, import them.

In [1]:
# Import libraries
import wandb
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch

## Do LoRA with W&B logging

In [2]:
import os
import random
import time
import wandb
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    TrainerCallback,
    logging
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import torch


In [3]:
# Initialize WandB
wandb.init(project="llms_finetune", job_type="training")

# Access configuration
config = wandb.config
config.model_name = "Qwen/Qwen2.5-0.5B-Instruct"
config.dataset_name = "HuggingFaceH4/MATH-500"
config.lora_r = 8
config.lora_alpha = 32
config.lora_dropout = 0.05
config.per_device_train_batch_size = 4
config.gradient_accumulation_steps = 1
config.learning_rate = 1e-4
config.weight_decay = 0.001
config.warmup_ratio = 0.03
config.lr_scheduler_type = "cosine"
config.num_train_epochs = 1
config.fp16 = True
config.bf16 = False
config.max_grad_norm = 0.3
config.group_by_length = True
config.save_steps = 20
config.eval_steps = 20
config.save_total_limit = 1
config.optim = "paged_adamw_32bit"


[34m[1mwandb[0m: Currently logged in as: [33mcehrett[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


## Prepare data (just as in previous noteboooks)

In [4]:

# Load the dataset
ds = load_dataset(config.dataset_name)
train_val_dataset = ds["test"].train_test_split(test_size=0.1)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]

# Load the model and tokenizer
model_name = config.model_name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"
)

# The model may not have a pad token set by default, so set it (using the EOS token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator

data_collator_fn = lambda features: data_collator(features, tokenizer=tokenizer)

# Map the formatting function over the dataset.
train_dataset_tokenized = train_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
eval_dataset_tokenized = eval_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})

# Get a sample dataset so we can examine model generations before and after training
sample_dataset = eval_dataset.select(range(3))
sample_dataset_tokenized = sample_dataset.map(tokenize_for_generation, batched=False, fn_kwargs={"tokenizer": tokenizer})

train_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
eval_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])

from peft import LoraConfig, get_peft_model, TaskType

# Define LoRA Config
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    inference_mode=False, 
    r=config.lora_r, 
    lora_alpha=config.lora_alpha, 
    lora_dropout=config.lora_dropout,
    target_modules=["q_proj", "v_proj"] # Replace with the target modules of your model
)

# Add LoRA adapter to the model
model = get_peft_model(model, lora_config)

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

## Perform training (while logging to W&B)

In [5]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./qwen-lora-math",          # Output directory
    num_train_epochs=config.num_train_epochs,              # Number of training epochs
    per_device_train_batch_size=config.per_device_train_batch_size,   # Batch size per device during training
    gradient_accumulation_steps=config.gradient_accumulation_steps,   # Number of updates steps to accumulate before performing a backward/update pass
    optim=config.optim,        # Optimizer, you might need to install accelerate: pip install accelerate -U
    save_steps=config.save_steps,                   # Save checkpoint every X updates steps
    eval_steps=config.eval_steps,                   # Evaluate every X updates steps
    eval_strategy="steps",           # Evaluation strategy
    save_total_limit=config.save_total_limit,              # Limit the total amount of checkpoints
    load_best_model_at_end=True,     # Load the best model when finished training (default is True)
    logging_steps=10,                # Log every X updates steps
    learning_rate=config.learning_rate,              # Learning rate
    weight_decay=config.weight_decay,              # Weight decay
    fp16=config.fp16,                       # Use mixed precision training
    bf16=config.bf16,                      # Use bfloat16 training
    max_grad_norm=config.max_grad_norm,               # Gradient clipping max norm
    max_steps=-1,                    # If > 0: set total number of training steps to perform. Override num_train_epochs.
    warmup_ratio=config.warmup_ratio,               # Linear warmup over warmup_ratio fraction of the total number of training steps.
    group_by_length=config.group_by_length,            # Group sequences of roughly the same length together for more efficient training
    lr_scheduler_type=config.lr_scheduler_type,       # Learning rate scheduler type
    report_to="wandb"
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn,     # Data collator if needed
)

# Train the model
trainer.train()

# Finish the WandB run
wandb.finish()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2025-02-21 13:00:28,354] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)




/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nnc5bwnritue/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nnc5bwnritue/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nnc5bwnritue/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nn

Step,Training Loss,Validation Loss
20,0.9954,3.187785
40,0.71,0.306875
60,0.6443,0.318583
80,0.7439,0.322805
100,0.7492,0.302879


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


0,1
eval/loss,█▁▁▁▁
eval/runtime,▄█▄▁▇
eval/samples_per_second,▅▁▅█▂
eval/steps_per_second,▅▁▅█▂
train/epoch,▁▂▂▂▃▃▄▄▄▅▆▆▆▇▇██
train/global_step,▁▂▂▂▃▃▄▄▄▅▆▆▆▇▇██
train/grad_norm,▄█▂▂▃▁▂▃▁▂▃
train/learning_rate,██▇▆▆▅▄▃▂▁▁
train/loss,█▃▂▂▂▁▂▂▁▂▁

0,1
eval/loss,0.30288
eval/runtime,0.538
eval/samples_per_second,92.941
eval/steps_per_second,13.012
total_flos,317532316214784.0
train/epoch,1.0
train/global_step,113.0
train/grad_norm,2.18722
train/learning_rate,0.0
train/loss,0.6771


# Run a WandB Sweep
Define a sweep configuration with hyperparameters to tune. Use `wandb.sweep()` to create a sweep and `wandb.agent()` to run the sweep agent, optimizing the hyperparameters.

In [6]:
# Set up the config we'll use for each run during the sweep

config_defaults = {
    "model_name": "Qwen/Qwen2.5-0.5B-Instruct",
    "dataset_name": "HuggingFaceH4/MATH-500",
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "gradient_accumulation_steps": 1,
    "learning_rate": 1e-4,
    "weight_decay": 0.001,
    "warmup_ratio": 0.03,
    "lr_scheduler_type": "cosine",
    "fp16": True,
    "bf16": False,
    "max_grad_norm": 0.3,
    "group_by_length": True,
    "save_steps": 20,
    "eval_steps": 20,
    "save_total_limit": 1,
    "epochs": 2,
    "optim": "paged_adamw_32bit"
}

In [9]:
# Define a sweep configuration with hyperparameters to tune
sweep_config = {
    "method": "random",  # Random search; other options include "grid", "bayesian", etc.
    "metric": {
        "name": "eval/loss",
        "goal": "minimize"
    },
    "parameters": {
        "learning_rate": {
            "values": [1e-5, 1e-4, 1e-3]
        },
        "batch_size": {
            "values": [4, 8, 16]
        },
        "lora_r": {
            "values": [4, 8, 16]
        }
    }
}

# Create a sweep
sweep_id = wandb.sweep(sweep_config, project="llm-finetuning")

# Define the training function
def train():
    # Initialize a new wandb run
    wandb.init(config=config_defaults)
    config = wandb.config
    
    # Load the dataset
    ds = load_dataset(wandb.config.dataset_name)
    train_val_dataset = ds["test"].train_test_split(test_size=0.1)
    train_dataset = train_val_dataset["train"]
    eval_dataset = train_val_dataset["test"]
    
    # Load the model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(wandb.config.model_name)
    model = AutoModelForCausalLM.from_pretrained(wandb.config.model_name, device_map="auto")

    # The model may not have a pad token set by default, so set it (using the EOS token)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator
    
    data_collator_fn = lambda features: data_collator(features, tokenizer=tokenizer)
    
    # Map the formatting function over the dataset.
    train_dataset_tokenized = train_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
    eval_dataset_tokenized = eval_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
    
    # Get a sample dataset so we can examine model generations before and after training
    sample_dataset = eval_dataset.select(range(3))
    sample_dataset_tokenized = sample_dataset.map(tokenize_for_generation, batched=False, fn_kwargs={"tokenizer": tokenizer})
    
    train_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
    eval_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
    sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
    
    from peft import LoraConfig, get_peft_model, TaskType
    
    # Define LoRA Config
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM, 
        inference_mode=False, 
        r=config.lora_r, 
        lora_alpha=config.lora_alpha, 
        lora_dropout=config.lora_dropout,
        target_modules=["q_proj", "v_proj"] # Replace with the target modules of your model
    )
    
    # Add LoRA adapter to the model
    model = get_peft_model(model, lora_config)
    
    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=wandb.config.epochs,
        per_device_train_batch_size=wandb.config.batch_size,
        learning_rate=wandb.config.learning_rate,
        weight_decay=wandb.config.weight_decay,
        gradient_accumulation_steps=wandb.config.gradient_accumulation_steps,
        fp16=wandb.config.fp16,
        bf16=wandb.config.bf16,
        max_grad_norm=wandb.config.max_grad_norm,
        warmup_ratio=wandb.config.warmup_ratio,
        lr_scheduler_type=wandb.config.lr_scheduler_type,
        logging_dir="./logs",
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to="wandb",
    )
    
    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset_tokenized,
        eval_dataset=eval_dataset_tokenized,
        tokenizer=tokenizer,
        data_collator=data_collator_fn,
    )
    
    # Train the model
    trainer.train()
    
    # Save the model
    model.save_pretrained(f"./model_{wandb.run.id}")
    tokenizer.save_pretrained(f"./model_{wandb.run.id}")
    
    # # Log model checkpoint as artifact
    # artifact = wandb.Artifact(f"model_{wandb.run.id}", type="model")
    # artifact.add_dir(f"./model_{wandb.run.id}")
    # wandb.log_artifact(artifact)
    
    # Finish the WandB run
    wandb.finish()

# Run the sweep agent
wandb.agent(sweep_id, function=train)

Create sweep with ID: 2pumb6fb
Sweep URL: https://wandb.ai/cehrett/llm-finetuning/sweeps/2pumb6fb


[34m[1mwandb[0m: Agent Starting Run: l4ppcmc6 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	lora_r: 4


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,4.8782,0.334904
2,0.2547,0.310008


0,1
eval/loss,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▂▄▄▅▇██
train/global_step,▁▂▄▄▅▇██
train/grad_norm,▆█▁▁▁
train/learning_rate,█▇▅▃▁
train/loss,█▄▁▁▁

0,1
eval/loss,0.31001
eval/runtime,0.5172
eval/samples_per_second,96.682
eval/steps_per_second,13.535
total_flos,1495595392270848.0
train/epoch,2.0
train/global_step,58.0
train/grad_norm,0.41889
train/learning_rate,1e-05
train/loss,0.2547


[34m[1mwandb[0m: Agent Starting Run: 9ee38iuh with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	lora_r: 4


  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,5.8052,0.347682
2,0.2415,0.317483


0,1
eval/loss,█▁
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
train/epoch,▁▂▄▄▅▇██
train/global_step,▁▂▄▄▅▇██
train/grad_norm,▇█▁▁▁
train/learning_rate,█▇▅▃▁
train/loss,█▄▁▁▁

0,1
eval/loss,0.31748
eval/runtime,0.5383
eval/samples_per_second,92.888
eval/steps_per_second,13.004
total_flos,1518546826418688.0
train/epoch,2.0
train/global_step,58.0
train/grad_norm,0.37967
train/learning_rate,1e-05
train/loss,0.2415


[34m[1mwandb[0m: Agent Starting Run: p48y19bp with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	lora_r: 8


  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,5.0255,0.346627
2,0.2422,0.318864


0,1
eval/loss,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▂▄▄▅▇██
train/global_step,▁▂▄▄▅▇██
train/grad_norm,█▇▁▁▁
train/learning_rate,█▇▅▃▁
train/loss,█▄▁▁▁

0,1
eval/loss,0.31886
eval/runtime,0.536
eval/samples_per_second,93.284
eval/steps_per_second,13.06
total_flos,1519692985096704.0
train/epoch,2.0
train/global_step,58.0
train/grad_norm,0.25393
train/learning_rate,1e-05
train/loss,0.2422


[34m[1mwandb[0m: Agent Starting Run: p7o9k0k0 with config:
[34m[1mwandb[0m: 	batch_size: 4
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	lora_r: 16


  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,0.3453,0.309762
2,0.3106,0.308633


0,1
eval/loss,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▁▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇▇███
train/grad_norm,▆█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,████▇▇▇▆▆▆▅▅▄▄▃▃▂▂▂▁▁▁
train/loss,█▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,0.30863
eval/runtime,0.5347
eval/samples_per_second,93.518
eval/steps_per_second,13.093
total_flos,1050475838421504.0
train/epoch,2.0
train/global_step,226.0
train/grad_norm,0.49755
train/learning_rate,0.0
train/loss,0.3106


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
