In [2]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
)
import torch

# Load the dataset.
# MATH-500 only ships a "test" split, so for demonstration purposes we carve our
# own train/validation split out of it.
ds = load_dataset("HuggingFaceH4/MATH-500")
# A fixed seed makes the 90/10 split reproducible across kernel restarts, so the
# same problems end up in train/eval every time the notebook is re-run.
train_val_dataset = ds["test"].train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]

# Load the model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
)

# The tokenizer may not have a pad token set by default, so fall back to the EOS token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [3]:
from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator

def data_collator_fn(features):
    """Collate `features` with the workshop `data_collator`, passing the notebook's tokenizer.

    The global `tokenizer` is looked up at call time, so whichever tokenizer is
    bound to that name when a batch is collated is the one that gets used.
    """
    return data_collator(features, tokenizer=tokenizer)

In [4]:
# Run the workshop tokenization helpers over every split, one example at a time.
# (batched=False keeps things simple; for a large dataset batched=True may be preferable.)
# Every helper receives the same tokenizer through fn_kwargs.
tokenizer_kwargs = {"tokenizer": tokenizer}

# Training / validation examples go through `tokenize_and_mask`.
train_dataset_tokenized = train_dataset.map(
    tokenize_and_mask,
    batched=False,
    fn_kwargs=tokenizer_kwargs,
)
eval_dataset_tokenized = eval_dataset.map(
    tokenize_and_mask,
    batched=False,
    fn_kwargs=tokenizer_kwargs,
)

# Keep the first three validation problems aside so model generations can be
# compared before and after training; these go through `tokenize_for_generation`.
sample_dataset = eval_dataset.select(range(3))
sample_dataset_tokenized = sample_dataset.map(
    tokenize_for_generation,
    batched=False,
    fn_kwargs=tokenizer_kwargs,
)

# Expose the tokenized columns as torch tensors.
# The generation sample has no "labels" column in its format.
for tokenized_split in (train_dataset_tokenized, eval_dataset_tokenized):
    tokenized_split.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [5]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Iterates over ``model.named_parameters()``, sums ``param.numel()`` over all
    parameters and, separately, over those with ``requires_grad`` set, then
    prints both totals together with the trainable share as a percentage.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    Returns:
        None. The summary line is written to stdout.

    Note:
        Totals are based on ``numel()`` of the stored tensors.
        NOTE(review): for packed/quantized weights this presumably counts stored
        elements rather than logical weights -- confirm before comparing totals
        between quantized and non-quantized models.
    """
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        all_params += num_params
        if param.requires_grad:
            trainable_params += num_params
    # A model without any parameters would otherwise divide by zero here.
    trainable_pct = 100 * trainable_params / all_params if all_params else 0.0
    print(f"trainable params: {trainable_params:,d} || all params: {all_params:,d} || trainable%: {trainable_pct:.2f}%")

# Baseline: trainable vs. total parameter counts of the freshly loaded model,
# before any adapter is attached in the cells that follow.
print_trainable_parameters(model)

trainable params: 494,032,768 || all params: 494,032,768 || trainable%: 100.00%


In [6]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings for causal language modelling
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],  # Replace with the target modules of your model
    r=8,                                  # LoRA rank
    lora_alpha=32,                        # LoRA scaling parameter
    lora_dropout=0.05,                    # Dropout applied inside the LoRA layers
    inference_mode=False,                 # The adapter is going to be trained
)

# Wrap the model with the LoRA adapter and report the trainable parameter count.
# Note that `model` is rebound to the PEFT-wrapped model from here on.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 540,672 || all params: 494,573,440 || trainable%: 0.1093


In [7]:
from transformers import Trainer, TrainingArguments

# Training configuration for the LoRA run, grouped by concern.
training_args = TrainingArguments(
    # --- Output and checkpointing ---
    output_dir="./qwen-lora-math",     # Where checkpoints are written
    save_steps=20,                     # Save a checkpoint every 20 update steps
    save_total_limit=1,                # Limit on the number of checkpoints kept
    load_best_model_at_end=True,       # Reload the best checkpoint when training ends (library default is False)
    # --- Evaluation and logging ---
    eval_strategy="steps",             # Evaluate on a step schedule
    eval_steps=20,                     # Evaluate every 20 update steps
    logging_steps=10,                  # Log every 10 update steps
    report_to="none",                  # Do not report to any experiment-tracking integration
    # --- Schedule length and batching ---
    num_train_epochs=3,                # Number of training epochs
    max_steps=-1,                      # If > 0, total number of training steps (overrides num_train_epochs)
    per_device_train_batch_size=4,     # Batch size per device during training
    gradient_accumulation_steps=1,     # Batches accumulated before each optimizer update
    group_by_length=True,              # Batch sequences of roughly the same length together
    # --- Optimisation ---
    optim="paged_adamw_32bit",         # Paged AdamW optimizer (provided through bitsandbytes)
    learning_rate=1e-4,                # Peak learning rate
    lr_scheduler_type="cosine",        # Learning rate scheduler type
    warmup_ratio=0.03,                 # Fraction of total training steps used for warmup
    weight_decay=0.001,                # Weight decay
    max_grad_norm=0.3,                 # Gradient clipping max norm
    # --- Numeric precision ---
    fp16=True,                         # Use fp16 mixed precision training
    bf16=False,                        # Do not use bfloat16 training
)

# Build the Trainer from the LoRA-wrapped model, the tokenized splits and the
# tokenizer-aware collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator_fn,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
)

# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2025-02-24 08:57:49,284] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)




/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nnc5bwnritue/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nnc5bwnritue/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nnc5bwnritue/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nn

Step,Training Loss,Validation Loss
20,1.1627,5.697272
40,0.6983,0.292774
60,0.5854,0.271366
80,0.7519,0.256127
100,0.758,0.293317
120,0.5391,0.264649
140,0.6633,0.2929
160,0.7919,0.276542
180,0.5449,0.27103
200,0.6618,0.261133


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=339, training_loss=0.7342726417699043, metrics={'train_runtime': 54.3279, 'train_samples_per_second': 24.849, 'train_steps_per_second': 6.24, 'total_flos': 939082372732416.0, 'train_loss': 0.7342726417699043, 'epoch': 3.0})

In [None]:
# Save the LoRA model and the tokenizer side by side in one directory.
# `output_dir` is read again by the reload cell that follows.
# NOTE(review): for a PEFT-wrapped model, save_pretrained presumably writes only
# the adapter weights, not the full base model -- confirm against the PEFT docs.
output_dir = "./qwen-lora-math-final"
model.save_pretrained(output_dir)
# Kept as the last expression: its return value (the written tokenizer file
# paths) is what the cell displays.
tokenizer.save_pretrained(output_dir)

('./qwen-lora-math-final/tokenizer_config.json',
 './qwen-lora-math-final/special_tokens_map.json',
 './qwen-lora-math-final/vocab.json',
 './qwen-lora-math-final/merges.txt',
 './qwen-lora-math-final/added_tokens.json',
 './qwen-lora-math-final/tokenizer.json')

In [None]:
# Load the LoRA model: start from a fresh copy of the base model, then attach
# the adapter that was saved to `output_dir` by the previous cell.
from peft import PeftModel
from transformers import AutoModelForCausalLM

# `model` is rebound here, replacing the model that was just trained.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
model = PeftModel.from_pretrained(model, output_dir)

In [13]:
# Generate and print model outputs after training.
# Uses the three held-out problems selected earlier (`sample_dataset`) together
# with their generation-ready tokenized form (`sample_dataset_tokenized`).
generate_and_print(sample_dataset, sample_dataset_tokenized, model, tokenizer)

Problem:
What is $\frac{9}{2}$ expressed as a decimal?

True Solution:
We can solve this problem by division.  Alternatively, we can multiply the
numerator and denominator by 5, yielding $\frac{45}{10}$.  Since dividing a
number by 10 shifts the decimal point to the left by one place, this yields
$\boxed{4.5}$.

Model's Solution:
We can write $9/2$ as $4\cdot 1 + (3)\cdot 0.5$. Thus, $9/2 = \boxed{4.5}$.

--------------------------------------------------------------------------------

Problem:
When rolling a certain unfair six-sided die with faces numbered 1, 2, 3, 4, 5,
and 6, the probability of obtaining face $F$ is greater than $1/6$, the
probability of obtaining the face opposite face $F$ is less than $1/6$, the
probability of obtaining each of the other faces is $1/6$, and the sum of the
numbers on each pair of opposite faces is 7. When two such dice are rolled, the
probability of obtaining a sum of 7 is $ \frac{47}{288} $. Given that the
probability of obtaining face $F$ is $m/n

In [19]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
)
import torch
from transformers import BitsAndBytesConfig

# Load the dataset.
# MATH-500 only ships a "test" split, so for demonstration purposes we carve our
# own train/validation split out of it.
ds = load_dataset("HuggingFaceH4/MATH-500")
# A fixed seed makes the 90/10 split reproducible across re-runs of this cell.
# NOTE(review): the training cell further down consumes `train_dataset_tokenized`
# and `eval_dataset_tokenized`; no re-tokenization of this new split is visible
# in this part of the notebook -- confirm that reusing the earlier tokenized
# data is intended.
train_val_dataset = ds["test"].train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]

# Configuration for bitsandbytes (QLoRA): load the weights in 4-bit NF4 with
# double quantization, computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Or torch.bfloat16 if supported
)

# Load the tokenizer and the quantized model
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# The tokenizer may not have a pad token set by default, so fall back to the EOS token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [37]:
# Parameter summary for the model loaded with the 4-bit quantization config,
# taken before k-bit preparation and before a LoRA adapter is attached
# (in notebook order).
print_trainable_parameters(model)

trainable params: 0 || all params: 315,660,160 || trainable%: 0.00%


In [None]:
from peft import prepare_model_for_kbit_training


# Prepare Model for QLoRA: run PEFT's k-bit training preparation on the
# quantized model and rebind `model` to the result.
# NOTE(review): the later training log reports gradient checkpointing being
# active; presumably it is enabled by this helper -- confirm against the PEFT docs.
model = prepare_model_for_kbit_training(model)

In [22]:
# Parameter summary again, now after k-bit preparation and still before the
# LoRA adapter is attached (in notebook order).
print_trainable_parameters(model)

trainable params: 0 || all params: 315,119,488 || trainable%: 0.00%


In [None]:
# LoRA adapter settings for the quantized (QLoRA) model
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],  # Adjust for your model
    r=8,                                  # LoRA rank
    lora_alpha=32,                        # LoRA scaling parameter
    lora_dropout=0.05,                    # Dropout applied inside the LoRA layers
    bias="none",                          # Bias setting passed to LoRA
    inference_mode=False,                 # The adapter is going to be trained
)

# Wrap the prepared model with the adapter and report the trainable parameter count.
# `model` is rebound to the PEFT-wrapped model from here on.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 540,672 || all params: 494,573,440 || trainable%: 0.1093


In [None]:
# Training configuration for the QLoRA run, grouped by concern.
training_args = TrainingArguments(
    # --- Output and checkpointing ---
    output_dir="./qwen-qlora-math",    # Where checkpoints are written
    save_steps=20,                     # Save a checkpoint every 20 update steps
    save_total_limit=1,                # Limit on the number of checkpoints kept
    load_best_model_at_end=True,       # Reload the best checkpoint when training ends (library default is False)
    # --- Evaluation and logging ---
    eval_strategy="steps",             # Evaluate on a step schedule
    eval_steps=20,                     # Evaluate every 20 update steps
    logging_steps=10,                  # Log every 10 update steps
    report_to="none",                  # Do not report to any experiment-tracking integration
    # --- Schedule length and batching ---
    num_train_epochs=3,                # Number of training epochs
    max_steps=-1,                      # If > 0, total number of training steps (overrides num_train_epochs)
    per_device_train_batch_size=4,     # Batch size per device during training
    gradient_accumulation_steps=1,     # Batches accumulated before each optimizer update
    group_by_length=True,              # Batch sequences of roughly the same length together
    # --- Optimisation ---
    optim="paged_adamw_32bit",         # Paged AdamW optimizer (provided through bitsandbytes)
    learning_rate=1e-4,                # Peak learning rate
    lr_scheduler_type="cosine",        # Learning rate scheduler type
    warmup_ratio=0.03,                 # Fraction of total training steps used for warmup
    weight_decay=0.001,                # Weight decay
    max_grad_norm=0.3,                 # Gradient clipping max norm
    # --- Numeric precision ---
    fp16=True,                         # Use fp16 mixed precision training
    bf16=False,                        # Do not use bfloat16 training
)

# Build the Trainer from the QLoRA model, the tokenized splits and the
# tokenizer-aware collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator_fn,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
)

# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
20,1.4365,4.844313
40,0.8249,0.470934
60,0.6271,0.389089
80,0.8031,0.332961
100,0.8436,0.348609
120,0.6354,0.323884
140,0.713,0.325938
160,0.7585,0.335284
180,0.6888,0.333518
200,0.6534,0.340771


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class ins

TrainOutput(global_step=339, training_loss=0.7846824117114762, metrics={'train_runtime': 105.4254, 'train_samples_per_second': 12.805, 'train_steps_per_second': 3.216, 'total_flos': 940712552322048.0, 'train_loss': 0.7846824117114762, 'epoch': 3.0})

In [25]:
# Save the QLoRA model and the tokenizer side by side in one directory.
# `output_dir` is rebound here and read again by the reload cell that follows.
output_dir = "./qwen-qlora-math-final"
model.save_pretrained(output_dir)
# Kept as the last expression: its return value (the written tokenizer file
# paths) is what the cell displays.
tokenizer.save_pretrained(output_dir)

('./qwen-qlora-math-final/tokenizer_config.json',
 './qwen-qlora-math-final/special_tokens_map.json',
 './qwen-qlora-math-final/vocab.json',
 './qwen-qlora-math-final/merges.txt',
 './qwen-qlora-math-final/added_tokens.json',
 './qwen-qlora-math-final/tokenizer.json')

In [33]:
# Load the QLoRA model: reload the base model with the same quantization config
# (`bnb_config`) that was used for training, then attach the adapter saved in
# `output_dir`.
from peft import PeftModel
from transformers import AutoModelForCausalLM

# `model` is rebound here, replacing the model that was just trained.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto", quantization_config=bnb_config)
model = PeftModel.from_pretrained(model, output_dir)


In [36]:
# Generate and print model outputs after training.
# Same three held-out problems (`sample_dataset`) as in the LoRA section, now
# answered by the reloaded QLoRA model.
generate_and_print(sample_dataset, sample_dataset_tokenized, model, tokenizer)

Problem:
What is $\frac{9}{2}$ expressed as a decimal?

True Solution:
We can solve this problem by division.  Alternatively, we can multiply the
numerator and denominator by 5, yielding $\frac{45}{10}$.  Since dividing a
number by 10 shifts the decimal point to the left by one place, this yields
$\boxed{4.5}$.

Model's Solution:
We can divide 9 into two equal parts, and then add them together to get
$\boxed{4.5}$.

--------------------------------------------------------------------------------

Problem:
When rolling a certain unfair six-sided die with faces numbered 1, 2, 3, 4, 5,
and 6, the probability of obtaining face $F$ is greater than $1/6$, the
probability of obtaining the face opposite face $F$ is less than $1/6$, the
probability of obtaining each of the other faces is $1/6$, and the sum of the
numbers on each pair of opposite faces is 7. When two such dice are rolled, the
probability of obtaining a sum of 7 is $ \frac{47}{288} $. Given that the
probability of obtaining face 

## Prompt tuning

In [38]:
from peft import PromptTuningConfig, get_peft_model

# Define Prompt Tuning Config
prompt_tuning_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=20,  # Length of the prompt, in virtual tokens
    prompt_tuning_init="TEXT",  # Initialise the prompt from the text below
    prompt_tuning_init_text="Solve the following math problem:",
    tokenizer_name_or_path=model_name,  # Tokenizer used for the init text
)

# Start again from a fresh base model, then add the prompt tuning adapter to it.
# The model id comes from `model_name`, the same value the config above uses for
# its tokenizer, so the two cannot drift apart.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
model = get_peft_model(model, prompt_tuning_config)
model.print_trainable_parameters()

trainable params: 17,920 || all params: 494,050,688 || trainable%: 0.0036


In [None]:
# Training arguments for the prompt-tuning run (example - adjust as needed).
# Same schedule as the LoRA/QLoRA runs in this notebook, except for the output
# directory and the learning rate.
training_args = TrainingArguments(
    output_dir="./qwen-prompt-tuning-math",  # Where checkpoints are written
    num_train_epochs=3,                      # Number of training epochs
    per_device_train_batch_size=4,           # Batch size per device during training
    gradient_accumulation_steps=1,           # Batches accumulated before each optimizer update
    optim="paged_adamw_32bit",               # Paged AdamW optimizer (provided through bitsandbytes)
    save_steps=20,                           # Save a checkpoint every 20 update steps
    eval_steps=20,                           # Evaluate every 20 update steps
    eval_strategy="steps",                   # Same spelling as the other training cells; `evaluation_strategy` is the deprecated name
    save_total_limit=1,                      # Limit on the number of checkpoints kept
    load_best_model_at_end=True,             # Reload the best checkpoint when training ends (library default is False)
    logging_steps=10,                        # Log every 10 update steps
    learning_rate=5e-3,                      # 50x the 1e-4 used for the LoRA/QLoRA runs in this notebook
    weight_decay=0.001,                      # Weight decay
    fp16=True,                               # Use fp16 mixed precision training
    bf16=False,                              # Do not use bfloat16 training
    max_grad_norm=0.3,                       # Gradient clipping max norm
    max_steps=-1,                            # If > 0, total number of training steps (overrides num_train_epochs)
    warmup_ratio=0.03,                       # Fraction of total training steps used for warmup
    group_by_length=True,                    # Batch sequences of roughly the same length together
    lr_scheduler_type="cosine",              # Learning rate scheduler type
    report_to="none",                        # Do not report to any experiment-tracking integration
)

# Build the Trainer from the prompt-tuning model, the tokenized splits and the
# tokenizer-aware collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn,
)

# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
20,0.969,0.57349
40,0.728,0.486395
60,0.6082,0.348999
80,0.7578,0.526887
100,0.7992,0.473614
120,0.5966,0.290938
140,0.7106,0.314572
160,0.7323,0.33645
180,0.6587,0.305356
200,0.6085,0.333414


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=339, training_loss=0.7338813905519018, metrics={'train_runtime': 51.3114, 'train_samples_per_second': 26.31, 'train_steps_per_second': 6.607, 'total_flos': 939293573796864.0, 'train_loss': 0.7338813905519018, 'epoch': 3.0})

In [40]:
# Save the prompt-tuning model and the tokenizer side by side in one directory.
# `output_dir` is rebound here and read again by the reload cell that follows.
output_dir = "./qwen-prompt-tuning-math-final"
model.save_pretrained(output_dir)
# Kept as the last expression: its return value (the written tokenizer file
# paths) is what the cell displays.
tokenizer.save_pretrained(output_dir)


('./qwen-prompt-tuning-math-final/tokenizer_config.json',
 './qwen-prompt-tuning-math-final/special_tokens_map.json',
 './qwen-prompt-tuning-math-final/vocab.json',
 './qwen-prompt-tuning-math-final/merges.txt',
 './qwen-prompt-tuning-math-final/added_tokens.json',
 './qwen-prompt-tuning-math-final/tokenizer.json')

In [41]:

# Load the model: start from a fresh copy of the base model, then attach the
# prompt-tuning adapter saved in `output_dir`.
from peft import PeftModel
from transformers import AutoModelForCausalLM

# `model` is rebound here, replacing the model that was just trained.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
model = PeftModel.from_pretrained(model, output_dir)


In [42]:

# Generate and print model outputs after training.
# Same three held-out problems (`sample_dataset`) as in the earlier sections,
# now answered by the reloaded prompt-tuning model.
generate_and_print(sample_dataset, sample_dataset_tokenized, model, tokenizer)



Problem:
What is $\frac{9}{2}$ expressed as a decimal?

True Solution:
We can solve this problem by division.  Alternatively, we can multiply the
numerator and denominator by 5, yielding $\frac{45}{10}$.  Since dividing a
number by 10 shifts the decimal point to the left by one place, this yields
$\boxed{4.5}$.

Model's Solution:
$\frac{9}{2} = 4.5$

--------------------------------------------------------------------------------

Problem:
When rolling a certain unfair six-sided die with faces numbered 1, 2, 3, 4, 5,
and 6, the probability of obtaining face $F$ is greater than $1/6$, the
probability of obtaining the face opposite face $F$ is less than $1/6$, the
probability of obtaining each of the other faces is $1/6$, and the sum of the
numbers on each pair of opposite faces is 7. When two such dice are rolled, the
probability of obtaining a sum of 7 is $ \frac{47}{288} $. Given that the
probability of obtaining face $F$ is $m/n$, where $m$ and $n$ are relatively
prime positive integ