In [5]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
)
import torch

# Load the dataset
ds = load_dataset("HuggingFaceH4/MATH-500")
# Split the dataset into training and validation sets
train_val_dataset = ds["test"].train_test_split(test_size=0.1)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]

 # This example only has a test split, so we use that, for demonstration purposes.

 # Load the model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"
)

# The model may not have a pad token set by default, so set it (using the EOS token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [6]:
!module list


Currently Loaded Modules:
  1) anaconda3/2023.09-0   2) cuda/12.3.0

 



In [7]:
from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator

data_collator_fn = lambda features: data_collator(features, tokenizer=tokenizer) 

In [8]:
# Map the formatting function over the dataset.
# This applies the formatting function to each example in the dataset.
# The result is that we have a dataset where each math problem is formatted as a prompt for the model,
# and the solution is formatted as a response that the model should generate.
# Each example is also tokenized
# (If your dataset is large you might use batched=True; here we keep it simple.)
train_dataset_tokenized = train_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
eval_dataset_tokenized = eval_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})

# Get a sample dataset so we can examine model generations before and after training
sample_dataset = eval_dataset.select(range(3))
sample_dataset_tokenized = sample_dataset.map(tokenize_for_generation, batched=False, fn_kwargs={"tokenizer": tokenizer})

train_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
eval_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [9]:
from transformers import Trainer, TrainingArguments

# Set up training arguments.
training_args = TrainingArguments(
    output_dir="./qwen-finetuned-math",
    per_device_train_batch_size=8,  # Adjust as needed
    num_train_epochs=2,
    logging_steps=10,
    save_steps=10,
    fp16=True,  # Use mixed precision if supported.
    eval_strategy="steps",  # Evaluate every eval_steps
    eval_steps=10,  # Evaluate every x steps
    save_total_limit=1, # Only save one checkpoint
    load_best_model_at_end=True, # Load the best model at the end of training
    report_to="none"
)

# Set up the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn,
)

# Start fine-tuning.
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2025-02-25 10:39:34,352] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nnc5bwnritue/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.2.0/anaconda3-2023.09-0-3mhml42fa64byxqyd5fig5tbih625dp2/lib/libstdc++.so.6: undefined reference to `fesetround@GLIBC_2.2.5'
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nnc5bwnritue/lib64/libcufile.so: undefined reference to `dlvsym'
/home/cehrett/.conda/envs/LLMsFT/compiler_compat/ld: /software/slurm/spackages/linux-rocky8-x86_64/gcc-12.3.0/cuda-12.3.0-p2hoh7xwcu52zilqglv3nnc5bwnritue/lib64/libcufile.so: undefined reference to `dlop

Step,Training Loss,Validation Loss
10,0.8999,1.035551
20,0.9773,1.005583
30,0.9294,0.998815
40,0.9068,0.975086
50,0.837,0.977636
60,0.7431,0.97849
70,0.4962,1.031475
80,0.4917,1.017838
90,0.4438,1.045021
100,0.4556,0.986534


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=171, training_loss=0.526299475403557, metrics={'train_runtime': 242.6472, 'train_samples_per_second': 5.564, 'train_steps_per_second': 0.705, 'total_flos': 1916930919720960.0, 'train_loss': 0.526299475403557, 'epoch': 3.0})

In [10]:
# Save the model and tokenizer
model_path = "./qwen-finetuned-math-final"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


('./qwen-finetuned-math-final/tokenizer_config.json',
 './qwen-finetuned-math-final/special_tokens_map.json',
 './qwen-finetuned-math-final/vocab.json',
 './qwen-finetuned-math-final/merges.txt',
 './qwen-finetuned-math-final/added_tokens.json',
 './qwen-finetuned-math-final/tokenizer.json')

In [11]:
# Load the saved model
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./qwen-finetuned-math-final"
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [12]:
# Generate and print model outputs after training
generate_and_print(sample_dataset, sample_dataset_tokenized, model, tokenizer)

Problem:
Roslyn has ten boxes. Five of the boxes contain pencils, four of the boxes
contain pens, and two of the boxes contain both pens and pencils. How many boxes
contain neither pens nor pencils?

True Solution:
Of the 5 boxes with pencils, 2 have pens also, so $5-2=3$ have pencils only.
Similarly, $4-2 =2$ of the boxes have pens only:   [asy] unitsize(0.05cm);
label("Pencils", (2,74)); label("Pens", (80,74)); draw(Circle((30,45), 22));
draw(Circle((58, 45), 22)); label("$2$", (44, 45));
label(scale(0.8)*"$3$",(28,58)); label(scale(0.8)*"$2$",(63,58)); [/asy]  That
gives us $3+2+2=7$ boxes with pens, pencils, or both.  This leaves $10-7 =
\boxed{3}$ with neither.

Model's Solution:
The total number of boxes that contain pencils is $5-2=3$. The total number of
boxes that contain pens is $4-1=3$. Therefore, there are $3+3=\boxed{6}$ boxes
that contain both pens and pencils.

--------------------------------------------------------------------------------

Problem:
William Sydney Porte

In [13]:
import gc
import torch

# Clear CUDA cache
torch.cuda.empty_cache()
# Garbage collection
gc.collect()

24

The `Trainer` class takes care of a lot of things under the hood. If you'd rather deal with these details directly yourself, you can avoid using the `Trainer` class and set up the training logic yourself.

In [15]:
from torch.utils.data import DataLoader
from tqdm import tqdm

# Define your dataloaders using the custom data_collator to pad variable-length sequences
train_dataloader = DataLoader(train_dataset_tokenized, batch_size=4, shuffle=True, collate_fn=data_collator_fn)
eval_dataloader = DataLoader(eval_dataset_tokenized, batch_size=4, shuffle=False, collate_fn=data_collator_fn)

# Define your optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
num_epochs = 2
device = model.device

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # Move batch tensors to the right device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average training loss: {avg_loss:.4f}")

    # Evaluation loop (optional)
    model.eval()  # Set the model to evaluation mode
    eval_loss = 0
    with torch.no_grad():  # Disable gradient calculation during evaluation
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            # Move batch tensors to the right device
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            loss = outputs.loss
            eval_loss += loss.item()

    avg_eval_loss = eval_loss / len(eval_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average evaluation loss: {avg_eval_loss:.4f}")

print("Training complete!")

Epoch 1/2: 100%|██████████| 113/113 [00:52<00:00,  2.17it/s]


Epoch 1/2 - Average training loss: 0.3830


Evaluating: 100%|██████████| 13/13 [00:01<00:00,  7.32it/s]


Epoch 1/2 - Average evaluation loss: 1.3583


Epoch 2/2: 100%|██████████| 113/113 [00:51<00:00,  2.21it/s]


Epoch 2/2 - Average training loss: 0.2716


Evaluating: 100%|██████████| 13/13 [00:01<00:00,  7.32it/s]

Epoch 2/2 - Average evaluation loss: 1.4568
Training complete!



