In [1]:
import os
import torch
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoModel
from trl import SFTConfig, SFTTrainer

In [2]:
repo_id = "Qwen/Qwen3-4B-Instruct-2507"
model = AutoModelForCausalLM.from_pretrained(repo_id, 
                                             torch_dtype='auto',
                                             device_map='mps')

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
print(f'Model memory footprint: {model.get_memory_footprint()/1e6} GB')

Model memory footprint: 8044.936448 GB


In [4]:
total_params = sum(p.numel() for p in model.parameters())
print(total_params/1e9)

4.022468096


In [5]:
from collections import defaultdict

layer_params = defaultdict(int)
# Accumulate parameters per layer (by prefix)
for name, param in model.named_parameters():
    if param.requires_grad:
        # Extract the layer/module name (e.g., "transformer.h.0.attn")
        layer_name = ".".join(name.split(".")[:3])  # Adjust depth as needed
        layer_params[layer_name] += param.numel()

# Print parameter count per layer
for layer_name, param_count in sorted(layer_params.items()):
    print(f"{layer_name:<60} {param_count:,} parameters")

model.embed_tokens.weight                                    388,956,160 parameters
model.layers.0                                               100,930,816 parameters
model.layers.1                                               100,930,816 parameters
model.layers.10                                              100,930,816 parameters
model.layers.11                                              100,930,816 parameters
model.layers.12                                              100,930,816 parameters
model.layers.13                                              100,930,816 parameters
model.layers.14                                              100,930,816 parameters
model.layers.15                                              100,930,816 parameters
model.layers.16                                              100,930,816 parameters
model.layers.17                                              100,930,816 parameters
model.layers.18                                              100,930,816 par

In [6]:
config = LoraConfig(r = 8,
                    lora_alpha=16,
                    bias='none',
                    lora_dropout=0.05,
                    task_type='CAUSAL_LM',
                    target_modules=['o_proj', 'qkv_proj', 'gate_up_proj', 'down_proj'],
)

In [7]:
model = get_peft_model(model, config)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [8]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 2560)
        (layers): ModuleList(
          (0-35): 36 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
              (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
              (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
              (o_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=2560, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2560, b

In [9]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(total_params/1e9)

0.005455872


In [10]:
train_p, tot_p = model.get_nb_trainable_parameters()
print(f'Trainable parameters:      {train_p/1e6:.2f}M')
print(f'Total parameters:          {tot_p/1e6:.2f}M')
print(f'% of trainable parameters: {100*train_p/tot_p:.2f}%')

Trainable parameters:      5.46M
Total parameters:          4027.92M
% of trainable parameters: 0.14%


In [11]:
ds = load_dataset("voidful/reasoning_gemini_300k", num_proc=8)
ds = ds['train']

In [12]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Hello! I'm Qwen, a large-scale language model independently developed by the Tongyi Lab under Alibaba Group. I can assist you with answering questions, writing stories, creating documents, coding, and more. I'm also capable of logical reasoning, programming, and even playing games! If you have any questions or need help, just let me know, and I'll do my best to support you. 😊<|im_end|>


In [13]:
def format_dataset(example):
    converted_sample = [
            {"role": "user", "content": example['message']},
            {"role": "assistant", "content": '<think>' + example['reasoning'] + '</think>' + example['answer']},
        ]
    return {'messages': converted_sample}

In [14]:
ds = ds.map(format_dataset)
ds = ds.remove_columns(['message', 'reasoning', 'answer'])

In [15]:
print(tokenizer.chat_template)

{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
    {%- set index = (messages|length - 

In [16]:
print(tokenizer.apply_chat_template(ds[0]['messages'], tokenize=False))

<|im_start|>user
Ashley wants a champagne toast at her wedding.  She wants to serve 2 glasses of champagne to each of her 120 wedding guests.  1 bottle of champagne has 6 servings.  How many bottles of champagne will she need?<|im_end|>
<|im_start|>assistant
<think>
Let's break this problem down step by step.

First, I need to figure out the total number of champagne servings Ashley needs.
She has 120 guests and wants to serve 2 glasses to each guest.
Number of servings = Number of guests * Servings per guest
Number of servings = 120 * 2
Number of servings = 240

Second, I need to figure out how many servings are in one bottle of champagne.
The problem states that 1 bottle of champagne has 6 servings.

Third, I need to figure out how many bottles of champagne Ashley needs to get 240 servings.
Number of bottles = Total number of servings / Servings per bottle
Number of bottles = 240 / 6

Now let's do the calculation:
Number of bottles = 240 / 6
To divide 240 by 6, we can think of it as 

In [22]:
sft_config = SFTConfig(
    ## GROUP 1: Memory usage
    # These arguments will squeeze the most out of your GPU's RAM
    # Checkpointing
    gradient_checkpointing=True,    # this saves a LOT of memory
    # Set this to avoid exceptions in newer versions of PyTorch
    gradient_checkpointing_kwargs={'use_reentrant': False}, 
    # Gradient Accumulation / Batch size
    # Actual batch (for updating) is same (1x) as micro-batch size
    gradient_accumulation_steps=1,  
    # The initial (micro) batch size to start off with
    per_device_train_batch_size=8, 
    # If batch size would cause OOM, halves its size until it works
    auto_find_batch_size=True,
    bf16=False,

    ## GROUP 2: Dataset-related
    # Dataset
    # packing a dataset means no padding is needed
    packing=True,


    ## GROUP 3: These are typical training parameters
    num_train_epochs=10,
    learning_rate=3e-4,
    # Optimizer
    # 8-bit Adam optimizer - doesn't help much if you're using LoRA!
    optim='paged_adamw_8bit',       
    
    ## GROUP 4: Logging parameters
    logging_steps=10,
    logging_dir='./logs',
    output_dir='./qwen3_adapter',
    report_to='none',
    use_mps_device=True
)

In [None]:
fasdf

In [23]:
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=sft_config,
    train_dataset=ds
)



In [25]:
dl = trainer.get_train_dataloader()
batch = next(iter(dl))



In [26]:
batch['input_ids'][0], batch['labels'][0]

(tensor([151644,    872,    198,  ...,    914,  43613,     25], device='mps:0'),
 tensor([151644,    872,    198,  ...,    914,  43613,     25], device='mps:0'))

In [27]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


KeyboardInterrupt: 