In [1]:
!pip install transformers torch torchvision torchaudio pandas peft datasets trl

Collecting torchvision
  Downloading torchvision-0.22.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.7.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting pillow!=8.3.*,>=5.3.0 (from torchvision)
  Using cached pillow-11.2.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.9 kB)
Downloading torchvision-0.22.1-cp312-cp312-macosx_11_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchaudio-2.7.1-cp312-cp312-macosx_11_0_arm64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached pillow-11.2.1-cp312-cp312-macosx_11_0_arm64.whl (3.0 MB)
Installing collected packages: pillow, torchvision, torchaudio
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [torchaudio]3[0m [torchaudio]]
[1A[2KSuccessfully instal

In [1]:
from transformers import TrainingArguments
import os
import logging
import sys
from pathlib import Path
from peft import LoraConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,  # or DEBUG
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    force=True  # Overwrites previous configs
)

In [3]:
WORKING_DIR = Path(os.getcwd())
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
resume = True
training_eval = False
output_dir = WORKING_DIR / f"training-output/{model_name}"
output_dir

PosixPath('/Users/patrik/Documents/Masterarbeit/code/studies/math-infer-finetune/fine_tuning/training-output/Qwen/Qwen2.5-0.5B-Instruct')

In [4]:
from datasets import Dataset as HFDataset
import pandas as pd


def get_dataset():
    p = WORKING_DIR / "train_data.xlsx"
    df = pd.read_excel(p)
    df = df.dropna(axis=0)
    ds = HFDataset.from_pandas(df)
    ds = ds.train_test_split(test_size=0.1, seed=42)
    return ds

ds = get_dataset()

2025-06-10 11:58:12,317 - INFO - datasets - PyTorch version 2.7.1 available.


In [5]:
def format_dataset(ds):
    def format_chat(example):
        user_message = example["input"]
        assistant_message = example["output"]
        messages = [
            {"role": "user", "content": user_message},
            {"role": "assistant", "content": assistant_message}
        ]
        return {
            "messages": messages
        }
    ds = ds.map(format_chat, batched=False)
    return ds

ds = format_dataset(ds)

Map: 100%|██████████| 2475/2475 [00:00<00:00, 33769.03 examples/s]
Map: 100%|██████████| 276/276 [00:00<00:00, 21273.28 examples/s]


In [6]:
import torch

if torch.backends.mps.is_available():
    print("MPS is available")
    mps_device = torch.device("mps")
    print(f"MPS device: {mps_device}")

    try:
        # Try creating a bfloat16 tensor on MPS
        x = torch.zeros(1, 1, dtype=torch.bfloat16, device=mps_device)
        print("bfloat16 is supported on MPS!")
    except Exception as e:
        print(f"bfloat16 is NOT fully supported on MPS or an error occurred: {e}")

    # Check PyTorch version
    print(f"PyTorch version: {torch.__version__}")
    print(f"MPS backend built: {torch.backends.mps.is_built()}")
    if hasattr(torch.backends.mps, 'is_macos_or_newer'):
        print(f"macOS version: {torch.backends.mps.is_macos_or_newer(12, 3)}") # Checks if macOS is 12.3 or newer
else:
    print("MPS is not available or built with MPS enabled.")

MPS is available
MPS device: mps
bfloat16 is supported on MPS!
PyTorch version: 2.7.1
MPS backend built: True
macOS version: False


  return torch._C._mps_is_on_macos_or_newer(major, minor)


In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

if torch.cuda.is_available():
    print("CUDA is available! Using GPU.")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print(f"Current CUDA device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
    device_map_strategy = "cuda"
elif torch.backends.mps.is_available():
    print("MPS (Metal Performance Shaders) is available! Using Apple Silicon GPU.")
    device_map_strategy = "mps"
else:
    print("CUDA is not available. Using CPU.")
    num_threads = int(os.environ.get("SLURM_CPUS_PER_TASK", 1)) # Default to 1 if not in Slurm
    torch.set_num_threads(num_threads)
    print(f"PyTorch using {torch.get_num_threads()} CPU threads.")
    device_map_strategy = "cpu"

MPS (Metal Performance Shaders) is available! Using Apple Silicon GPU.


In [25]:
training_args = TrainingArguments(
    output_dir=output_dir.as_posix(),
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    
    num_train_epochs=3, # we use EarlyStoppingCallback to stop training if eval_loss doesn't improve for 3 evals

    logging_steps=25,

    save_strategy="steps",
    save_steps=100,

    save_total_limit=2,
    eval_strategy="steps" if training_eval else "no",
    eval_steps=100 if training_eval else None,

    load_best_model_at_end=True if training_eval else False,
    metric_for_best_model="eval_loss" if training_eval else None,
    greater_is_better=False if training_eval else None,
    logging_dir="./logs",
    report_to="none",
    fp16=True,
    bf16=False
)


lora_config = LoraConfig(
    r=4,
    lora_alpha=8,
    lora_dropout=0.05,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj", 
        "gate_proj", "up_proj", "down_proj"
    ],
    task_type="CAUSAL_LM",
    bias="lora_only",
    modules_to_save=["lm_head", "embed_token"],
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map_strategy,
    torch_dtype=torch.float16 if device_map_strategy=="mps" else torch.float32,
    load_in_8bit=True if device_map_strategy == "cuda" else False,
)

In [26]:
from trl import SFTTrainer, SFTConfig
from transformers import EarlyStoppingCallback


training_args = SFTConfig(packing=True, **training_args.to_dict())
os.makedirs(output_dir, exist_ok=True)
with open(f"{output_dir}/training_args.json", "w") as f:
    f.write(training_args.to_json_string())

trainer = SFTTrainer(
    model=model,
    args=training_args,
    peft_config=lora_config, # is None if not using LoRA
    train_dataset=ds["train"],
    eval_dataset=ds["test"] if training_eval else None,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] if training_eval else None,
)

if resume:
    trainer.train(resume_from_checkpoint=True)
else:
    trainer.train()
    
trainer.save_model(output_dir.as_posix())

Converting train dataset to ChatML: 100%|██████████| 2475/2475 [00:00<00:00, 8639.59 examples/s]
Applying chat template to train dataset: 100%|██████████| 2475/2475 [00:00<00:00, 15845.97 examples/s]
Tokenizing train dataset: 100%|██████████| 2475/2475 [00:01<00:00, 1666.28 examples/s]
Packing train dataset: 100%|██████████| 2475/2475 [00:00<00:00, 1374953.96 examples/s]


ValueError: fp16 mixed precision requires a GPU (not 'mps').

In [None]:
import shutil

# Zip the folder (e.g., 'my_folder' → 'my_folder.zip')
shutil.make_archive(output_dir.as_posix(), 'zip', output_dir)

In [None]:
from google.colab import files
files.download(output_dir.as_posix() + '.zip')