In [1]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
)
import torch

# Load the dataset.
# MATH-500 ships only a "test" split, so for demonstration purposes the training
# and validation sets are carved out of that split.
ds = load_dataset("HuggingFaceH4/MATH-500")

# Split the dataset into training and validation sets.
# The shuffle is seeded so the 90/10 split (and the sample problems selected from
# the validation set later on) is identical on every Restart & Run All.
train_val_dataset = ds["test"].train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]

# Load the model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"
)

# The model may not have a pad token set by default, so set it (using the EOS token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [2]:
from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator

def data_collator_fn(features):
    """Collate ``features`` with ``data_collator``, supplying the notebook's tokenizer.

    The Trainer calls its collator with a single argument, so this wrapper
    forwards that argument and fills in ``tokenizer=`` from the module-level
    ``tokenizer`` (looked up at call time, exactly like the former lambda).
    """
    return data_collator(features, tokenizer=tokenizer)

In [3]:
# Tokenize each split by mapping the workshop helpers over it.
# `tokenize_and_mask` turns every math problem into a prompt and its solution into
# the response the model should learn to generate, and tokenizes the result.
# Examples are processed one at a time (batched=False) to keep things simple;
# for a large dataset you might switch to batched=True.
tokenizer_kwargs = {"tokenizer": tokenizer}

train_dataset_tokenized = train_dataset.map(
    tokenize_and_mask,
    batched=False,
    fn_kwargs=tokenizer_kwargs,
)
eval_dataset_tokenized = eval_dataset.map(
    tokenize_and_mask,
    batched=False,
    fn_kwargs=tokenizer_kwargs,
)

# Keep the first three validation problems aside so model generations can be
# inspected before and after training.
sample_dataset = eval_dataset.select(range(3))
sample_dataset_tokenized = sample_dataset.map(
    tokenize_for_generation,
    batched=False,
    fn_kwargs=tokenizer_kwargs,
)

# Return torch tensors, restricted to the columns each consumer needs:
# the training/validation splits carry labels, the generation samples do not.
for tokenized_split in (train_dataset_tokenized, eval_dataset_tokenized):
    tokenized_split.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [4]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag (e.g. a ``torch.nn.Module``).

    Prints one line with the trainable count, the total count and the
    trainable percentage. A model without any parameters is reported as
    0.00% instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        all_params += num_params
        if param.requires_grad:
            trainable_params += num_params
    # Guard the percentage: a parameter-free model would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / all_params if all_params else 0.0
    print(f"trainable params: {trainable_params:,d} || all params: {all_params:,d} || trainable%: {trainable_pct:.2f}%")

# Baseline: report the trainable share of the base model before any adapter is attached.
print_trainable_parameters(model)

trainable params: 494,032,768 || all params: 494,032,768 || trainable%: 100.00%


In [5]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings for causal language modelling.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],  # Replace with the target modules of your model
    r=8,                   # Rank of the low-rank update matrices
    lora_alpha=32,         # Scaling factor applied to the LoRA update
    lora_dropout=0.05,     # Dropout applied inside the LoRA layers
    inference_mode=False,  # We are going to train the adapter
)

# Wrap the base model with the adapter, then show how few parameters remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 540,672 || all params: 494,573,440 || trainable%: 0.1093


In [6]:
from transformers import Trainer, TrainingArguments

# Training configuration for the LoRA run.
training_args = TrainingArguments(
    # --- Where results go and how they are reported ---
    output_dir="./qwen-lora-math",
    report_to="none",                # Do not report to external experiment trackers
    logging_steps=10,                # Log the training loss every 10 update steps

    # --- Schedule ---
    num_train_epochs=3,
    max_steps=-1,                    # A value > 0 would override num_train_epochs
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,   # Batches accumulated before each optimizer update
    group_by_length=True,            # Batch sequences of similar length together to reduce padding

    # --- Optimizer and learning-rate schedule ---
    optim="paged_adamw_32bit",       # Paged AdamW; this optimizer is provided by bitsandbytes
    learning_rate=1e-4,
    weight_decay=0.001,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,               # Warm up over the first 3% of training steps
    max_grad_norm=0.3,               # Gradient clipping threshold

    # --- Precision ---
    fp16=True,                       # float16 mixed precision
    bf16=False,                      # bfloat16 mixed precision disabled

    # --- Evaluation and checkpointing ---
    eval_strategy="steps",
    eval_steps=20,                   # Evaluate every 20 update steps
    save_steps=20,                   # Checkpoint every 20 update steps
    save_total_limit=1,              # Cap the number of checkpoints kept on disk
    load_best_model_at_end=True,     # Reload the best checkpoint after training (off by default)
)

# The Trainer ties together the model, the tokenized splits and the collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn,  # Builds each batch using the notebook's tokenizer
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2025-02-28 14:20:58,190] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nnc5bwnritue/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.2.0/anaconda3-2023.09-0-3mhml42fa64byxqyd5fig5tbih625dp2/lib/libstdc++.so.6: undefined reference to `fesetround@GLIBC_2.2.5'
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nnc5bwnritue/lib64/libcufile.so: undefined reference to `dlvsym'
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nnc5bwnritue/lib64/libcufile.so: undefined reference to `dlop

Step,Training Loss,Validation Loss
20,0.7482,0.686028
40,0.8187,0.691287
60,0.706,0.672091
80,0.7559,0.704082
100,0.7215,0.680891
120,0.6395,0.701159
140,0.7753,0.698023
160,0.6918,0.678332


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=171, training_loss=0.7346053506895812, metrics={'train_runtime': 36.2842, 'train_samples_per_second': 37.206, 'train_steps_per_second': 4.713, 'total_flos': 996680617810944.0, 'train_loss': 0.7346053506895812, 'epoch': 3.0})

In [7]:
# Persist the LoRA fine-tune: the PEFT-wrapped model and the tokenizer are written
# to the same folder so they can be reloaded together.
output_dir = "./qwen-lora-math-final"
model.save_pretrained(save_directory=output_dir)
tokenizer.save_pretrained(save_directory=output_dir)

('./qwen-lora-math-final/tokenizer_config.json',
 './qwen-lora-math-final/special_tokens_map.json',
 './qwen-lora-math-final/vocab.json',
 './qwen-lora-math-final/merges.txt',
 './qwen-lora-math-final/added_tokens.json',
 './qwen-lora-math-final/tokenizer.json')

In [None]:
# Load the LORA model
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Rebuild the fine-tuned model from disk: a fresh base model plus the adapter saved in `output_dir`.
# `model_name` is the checkpoint id used for training, so the base model stays in sync with the adapter.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
model = PeftModel.from_pretrained(model, output_dir)

In [8]:
# Generate and print model outputs after training.
# For each sample this prints the problem, the reference ("True") solution and the model's solution.
generate_and_print(sample_dataset, sample_dataset_tokenized, model, tokenizer)

Problem:
Consider the function $z(x,y)$ describing the paraboloid \[z = (2x - y)^2 - 2y^2
- 3y.\]Archimedes and Brahmagupta are playing a game.  Archimedes first chooses
$x.$  Afterwards, Brahmagupta chooses $y.$  Archimedes wishes to minimize $z$
while Brahmagupta wishes to maximize $z.$  Assuming that Brahmagupta will play
optimally, what value of $x$ should Archimedes choose?

True Solution:
Expanding $z,$ we get \begin{align*} z &= 4x^2 - 4xy + y^2 - 2y^2 - 3y \\ &=
-y^2 - (4x + 3) y + 4x^2. \end{align*}After Archimedes chooses $x,$ Brahmagupta
will choose \[y = -\frac{4x + 3}{2}\]in order to maximize $z.$  Then
\begin{align*} z &= -\left( -\frac{4x + 3}{2} \right)^2 - (4x + 3) \left(
-\frac{4x + 3}{2} \right)^2 + 4x^2 \\ &= 8x^2 + 6x + \frac{9}{4}. \end{align*}To
minimize this expression, Archimedes should choose $x = -\frac{6}{16} =
\boxed{-\frac{3}{8}}.$

Model's Solution:
We want to minimize or maximize the function $f(x) = z(x,y) = (2x-y)^2 -
2y^2-3y$.  The derivative is  \[\f

In [9]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
)
import torch
from transformers import BitsAndBytesConfig

# Load the dataset.
# MATH-500 ships only a "test" split, so for demonstration purposes the training
# and validation sets are carved out of that split.
ds = load_dataset("HuggingFaceH4/MATH-500")

# Split the dataset into training and validation sets.
# The shuffle is seeded so this split is reproducible across runs.
# NOTE: the Trainer cell below still consumes `train_dataset_tokenized` /
# `eval_dataset_tokenized` built earlier in the notebook; re-run the tokenization
# cell after this one if training should use the datasets created here.
train_val_dataset = ds["test"].train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]

# Configuration for bitsandbytes (QLoRA)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16, # Or torch.bfloat16 if supported
)

# Load the model and tokenizer (the base model is loaded with the 4-bit config above)
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# The model may not have a pad token set by default, so set it (using the EOS token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [10]:
from peft import prepare_model_for_kbit_training


# Prepare Model for QLoRA
# NOTE(review): per the PEFT docs this helper readies a quantized model for training
# (e.g. freezing the base weights) — confirm the details for your PEFT version.
# The cell rebinds `model`, so re-running it applies the preparation to the already-prepared model.
model = prepare_model_for_kbit_training(model)

In [11]:
# LoRA adapter settings for the quantized base model.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],  # Adjust for your model
    r=8,                   # Rank of the low-rank update matrices
    lora_alpha=32,         # Scaling factor applied to the LoRA update
    lora_dropout=0.05,     # Dropout applied inside the LoRA layers
    bias="none",           # Leave bias terms untouched
    inference_mode=False,  # We are going to train the adapter
)

# Attach the adapter to the prepared model and report the trainable parameter share.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 540,672 || all params: 494,573,440 || trainable%: 0.1093


In [13]:
# Training configuration for the QLoRA run.
training_args = TrainingArguments(
    # --- Where results go and how they are reported ---
    output_dir="./qwen-qlora-math",
    report_to="none",                # Do not report to external experiment trackers
    logging_steps=10,                # Log the training loss every 10 update steps

    # --- Schedule ---
    num_train_epochs=3,
    max_steps=-1,                    # A value > 0 would override num_train_epochs
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,   # Batches accumulated before each optimizer update
    group_by_length=True,            # Batch sequences of similar length together to reduce padding

    # --- Optimizer and learning-rate schedule ---
    optim="paged_adamw_32bit",       # Paged AdamW; this optimizer is provided by bitsandbytes
    learning_rate=1e-4,
    weight_decay=0.001,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,               # Warm up over the first 3% of training steps
    max_grad_norm=0.3,               # Gradient clipping threshold

    # --- Precision ---
    fp16=True,                       # float16 mixed precision
    bf16=False,                      # bfloat16 mixed precision disabled

    # --- Evaluation and checkpointing ---
    eval_strategy="steps",
    eval_steps=20,                   # Evaluate every 20 update steps
    save_steps=20,                   # Checkpoint every 20 update steps
    save_total_limit=1,              # Cap the number of checkpoints kept on disk
    load_best_model_at_end=True,     # Reload the best checkpoint after training (off by default)
)

# The Trainer ties together the quantized PEFT model, the tokenized splits and the collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn,  # Builds each batch using the notebook's tokenizer
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
20,0.7493,0.724248
40,0.842,0.739596
60,0.7325,0.719729
80,0.7908,0.754252
100,0.7394,0.729048
120,0.6635,0.749899
140,0.7987,0.746554
160,0.7216,0.726491


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)


TrainOutput(global_step=171, training_loss=0.756615131919147, metrics={'train_runtime': 58.6451, 'train_samples_per_second': 23.02, 'train_steps_per_second': 2.916, 'total_flos': 996680617810944.0, 'train_loss': 0.756615131919147, 'epoch': 3.0})

In [14]:
# Persist the QLoRA fine-tune: the PEFT-wrapped model and the tokenizer are written
# to the same folder so they can be reloaded together.
output_dir = "./qwen-qlora-math-final"
model.save_pretrained(save_directory=output_dir)
tokenizer.save_pretrained(save_directory=output_dir)

('./qwen-qlora-math-final/tokenizer_config.json',
 './qwen-qlora-math-final/special_tokens_map.json',
 './qwen-qlora-math-final/vocab.json',
 './qwen-qlora-math-final/merges.txt',
 './qwen-qlora-math-final/added_tokens.json',
 './qwen-qlora-math-final/tokenizer.json')

In [15]:
# Load the QLoRA model
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Rebuild the fine-tuned model from disk: a fresh 4-bit base model plus the adapter saved in `output_dir`.
# `model_name` is the checkpoint id used for training, so the base model stays in sync with the adapter.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=bnb_config)
model = PeftModel.from_pretrained(model, output_dir)


In [16]:
# Generate and print model outputs after training.
# For each sample this prints the problem, the reference ("True") solution and the model's solution.
generate_and_print(sample_dataset, sample_dataset_tokenized, model, tokenizer)

Problem:
Consider the function $z(x,y)$ describing the paraboloid \[z = (2x - y)^2 - 2y^2
- 3y.\]Archimedes and Brahmagupta are playing a game.  Archimedes first chooses
$x.$  Afterwards, Brahmagupta chooses $y.$  Archimedes wishes to minimize $z$
while Brahmagupta wishes to maximize $z.$  Assuming that Brahmagupta will play
optimally, what value of $x$ should Archimedes choose?

True Solution:
Expanding $z,$ we get \begin{align*} z &= 4x^2 - 4xy + y^2 - 2y^2 - 3y \\ &=
-y^2 - (4x + 3) y + 4x^2. \end{align*}After Archimedes chooses $x,$ Brahmagupta
will choose \[y = -\frac{4x + 3}{2}\]in order to maximize $z.$  Then
\begin{align*} z &= -\left( -\frac{4x + 3}{2} \right)^2 - (4x + 3) \left(
-\frac{4x + 3}{2} \right)^2 + 4x^2 \\ &= 8x^2 + 6x + \frac{9}{4}. \end{align*}To
minimize this expression, Archimedes should choose $x = -\frac{6}{16} =
\boxed{-\frac{3}{8}}.$

Model's Solution:
The minimum value of $z$ occurs when $y$ is at its maximum.  To find this, we
take the partial derivative w

## Prompt tuning

In [17]:
from peft import PromptTuningConfig, get_peft_model

# Define Prompt Tuning Config
prompt_tuning_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=20,  # Number of trainable virtual prompt tokens
    prompt_tuning_init="TEXT",  # Initialise the virtual tokens from the text below
    prompt_tuning_init_text="Solve the following math problem:",
    tokenizer_name_or_path=model_name,  # Tokenizer used to encode the init text
)

# Start from a fresh, un-adapted base model (same checkpoint id as the tokenizer above)
# and add the prompt tuning adapter to it.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
model = get_peft_model(model, prompt_tuning_config)
model.print_trainable_parameters()

trainable params: 17,920 || all params: 494,050,688 || trainable%: 0.0036


In [18]:
# Training Arguments (example - adjust as needed)
training_args = TrainingArguments(
    output_dir="./qwen-prompt-tuning-math",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",       # Paged AdamW; this optimizer is provided by bitsandbytes
    save_steps=20,
    eval_steps=20,
    eval_strategy="steps",           # `evaluation_strategy` is the deprecated spelling; matches the LoRA/QLoRA cells
    save_total_limit=1,
    load_best_model_at_end=True,     # Reload the best checkpoint after training (off by default)
    logging_steps=10,
    # Deliberately larger than the 1e-4 used for LoRA/QLoRA: here only the
    # virtual prompt embeddings are trained.
    learning_rate=5e-3,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,                    # A value > 0 would override num_train_epochs
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="none",                # Do not report to external experiment trackers
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn,
)

# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
20,0.7369,0.68912
40,0.8216,0.697645
60,0.7177,0.682443
80,0.7788,0.713648
100,0.741,0.687617
120,0.6682,0.709285
140,0.8036,0.705559
160,0.7197,0.685136


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=171, training_loss=0.7510400850173325, metrics={'train_runtime': 33.8498, 'train_samples_per_second': 39.882, 'train_steps_per_second': 5.052, 'total_flos': 995177216596992.0, 'train_loss': 0.7510400850173325, 'epoch': 3.0})

In [19]:
# Persist the prompt-tuning run: the PEFT-wrapped model and the tokenizer are written
# to the same folder so they can be reloaded together.
output_dir = "./qwen-prompt-tuning-math-final"
model.save_pretrained(save_directory=output_dir)
tokenizer.save_pretrained(save_directory=output_dir)


('./qwen-prompt-tuning-math-final/tokenizer_config.json',
 './qwen-prompt-tuning-math-final/special_tokens_map.json',
 './qwen-prompt-tuning-math-final/vocab.json',
 './qwen-prompt-tuning-math-final/merges.txt',
 './qwen-prompt-tuning-math-final/added_tokens.json',
 './qwen-prompt-tuning-math-final/tokenizer.json')

In [20]:

# Load the model
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Rebuild the tuned model from disk: a fresh base model plus the adapter saved in `output_dir`.
# `model_name` is the checkpoint id used for training, so the base model stays in sync with the adapter.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
model = PeftModel.from_pretrained(model, output_dir)


In [21]:

# Generate and print model outputs after training.
# For each sample this prints the problem, the reference ("True") solution and the model's solution.
generate_and_print(sample_dataset, sample_dataset_tokenized, model, tokenizer)



Problem:
Consider the function $z(x,y)$ describing the paraboloid \[z = (2x - y)^2 - 2y^2
- 3y.\]Archimedes and Brahmagupta are playing a game.  Archimedes first chooses
$x.$  Afterwards, Brahmagupta chooses $y.$  Archimedes wishes to minimize $z$
while Brahmagupta wishes to maximize $z.$  Assuming that Brahmagupta will play
optimally, what value of $x$ should Archimedes choose?

True Solution:
Expanding $z,$ we get \begin{align*} z &= 4x^2 - 4xy + y^2 - 2y^2 - 3y \\ &=
-y^2 - (4x + 3) y + 4x^2. \end{align*}After Archimedes chooses $x,$ Brahmagupta
will choose \[y = -\frac{4x + 3}{2}\]in order to maximize $z.$  Then
\begin{align*} z &= -\left( -\frac{4x + 3}{2} \right)^2 - (4x + 3) \left(
-\frac{4x + 3}{2} \right)^2 + 4x^2 \\ &= 8x^2 + 6x + \frac{9}{4}. \end{align*}To
minimize this expression, Archimedes should choose $x = -\frac{6}{16} =
\boxed{-\frac{3}{8}}.$

Model's Solution:
The maximum value occurs when $x=1,$ so we want to find the minimum of the
quadratic in $y.$  The vertex is