In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<style>
/* Force the ipywidget output background to be transparent. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Point the Jupyter widget colour and font size at the VS Code editor's
   own CSS variables (presumably so widgets match the editor theme). */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
import openai
from dotenv import load_dotenv

import art
from art.local import LocalBackend

# Load settings via python-dotenv before anything else is constructed
# (presumably credentials from a local .env file — confirm what the run needs).
load_dotenv()

# Backend instance that the model is registered with below.
backend = LocalBackend()
# Trainable model handle for this experiment. `name` and `project` label the
# run; `base_model` is the checkpoint identifier passed to the library.
model = art.TrainableModel(
    name="009",
    project="yes-no-maybe",
    base_model="Qwen/Qwen3-30B-A3B-Instruct-2507",
    # _internal_config=art.dev.InternalModelConfig(
    #     _decouple_vllm_and_unsloth=True,
    #     engine_args=art.dev.EngineArgs(gpu_memory_utilization=0.7),
    # ),
)
# Top-level `await`: valid in a notebook cell, not in a plain Python script.
# NOTE(review): the vLLM / Unsloth start-up logs in the cell output presumably
# originate from this registration step — confirm.
await model.register(backend)


async def rollout(client: openai.AsyncOpenAI, prompt: str) -> art.Trajectory:
    """Run one single-turn rollout for ``prompt`` and score the reply.

    The prompt is sent as a lone user message to ``model.name`` (the
    module-level ``model``) with ``max_tokens=100`` and ``timeout=100``.

    Reward is decided by an exact string match on the reply text:
    ``"yes"`` -> 0.5, ``"no"`` -> 0.75, ``"maybe"`` -> 1.0, anything else -> 0.0.

    Args:
        client: Async OpenAI client used to request the chat completion.
        prompt: Text placed in the user message.

    Returns:
        An ``art.Trajectory`` holding the user message followed by the first
        completion choice, together with the computed reward.

    Raises:
        AssertionError: If the first choice's message content is not a string.
    """
    # Exact-match reward table; any reply not listed here scores 0.0.
    reward_by_reply = {"yes": 0.5, "no": 0.75, "maybe": 1.0}

    messages: art.Messages = [{"role": "user", "content": prompt}]
    response = await client.chat.completions.create(
        model=model.name,
        messages=messages,
        max_tokens=100,
        timeout=100,
    )
    first_choice = response.choices[0]
    reply = first_choice.message.content
    assert isinstance(reply, str)

    return art.Trajectory(
        messages_and_choices=[*messages, first_choice],
        reward=reward_by_reply.get(reply, 0.0),
    )


def with_quotes(w):
    """Return ``w`` wrapped in single quotes, e.g. ``yes`` becomes ``'yes'``."""
    return "'{}'".format(w)


# Every training prompt: prefix x quoting flag x answer ordering, in that
# nesting order. Three-word orderings are comma-separated and honour
# `use_quotes`; every other ordering is rendered as "<first> or <second>"
# with no quoting applied.
# NOTE(review): since the quoting flag has no effect on the two-word form,
# each two-word prompt is emitted once per `use_quotes` value, and
# ["yes", "no"] is additionally listed twice below — confirm that this extra
# weighting of those prompts is intended.
prompts = [
    prefix
    + " with "
    + (
        ", ".join(with_quotes(w) if use_quotes else w for w in words)
        if len(words) == 3
        else " or ".join(words[:2])
    )
    for prefix in ("respond", "just respond")
    for use_quotes in (True, False)
    for words in (
        ["yes", "no", "maybe"],
        ["maybe", "yes", "no"],
        ["no", "yes", "maybe"],
        ["yes", "maybe", "no"],
        ["yes", "no"],
        ["maybe", "no"],
        ["no", "maybe"],
        ["no", "yes"],
        ["yes", "no"],
    )
]

# Client obtained from the model; it is handed to every `rollout` call below.
openai_client = model.openai_client()
# Iterate from the model's current step up to (but not including) 1,000.
# NOTE(review): presumably `get_step` returns completed training steps, so a
# re-run of this cell resumes instead of restarting — confirm.
for _ in range(await model.get_step(), 1_000):
    # Build one trajectory group per prompt, each containing 32 rollouts of
    # that prompt, and gather them all (progress bar labelled "gather").
    train_groups = await art.gather_trajectory_groups(
        (
            art.TrajectoryGroup(rollout(openai_client, prompt) for _ in range(32))
            for prompt in prompts
        ),
        pbar_desc="gather",
    )
    # Train on this iteration's gathered groups with a fixed learning rate.
    await model.train(
        train_groups,
        config=art.TrainConfig(learning_rate=1e-4),
        # _config=art.dev.TrainConfig(
        #     precalculate_logprobs=True,
        # ),
    )

[34m[1mwandb[0m: Currently logged in as: [33mbradhilton[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


INFO 09-17 23:49:13 [__init__.py:235] Automatically detected platform cuda.



Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth  # type: ignore # noqa: F401


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 09-17 23:49:19 [__init__.py:235] Automatically detected platform cuda.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.9.6: Fast Qwen3_Moe patching. Transformers: 4.53.2. vLLM: 0.10.0.
   \\   /|    NVIDIA H200. Num GPUs = 1. Max memory: 139.811 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.
Unsloth: Patching vLLM v1 graph capture
Unsloth: Patching vLLM v0 graph capture
Unsloth: vLLM loading unsloth/Qwen3-30B-A3B-Instruct-2507 with actual GPU utilization = 78.66%
Unsloth: Your GPU has CUDA compute capability 9.0 with V

Loading safetensors checkpoint shards:   0% Completed | 0/16 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:   6% Completed | 1/16 [00:00<00:10,  1.40it/s]
Loading safetensors checkpoint shards:  12% Completed | 2/16 [00:01<00:10,  1.31it/s]
Loading safetensors checkpoint shards:  19% Completed | 3/16 [00:02<00:10,  1.29it/s]
Loading safetensors checkpoint shards:  25% Completed | 4/16 [00:03<00:09,  1.30it/s]
Loading safetensors checkpoint shards:  31% Completed | 5/16 [00:03<00:08,  1.30it/s]
Loading safetensors checkpoint shards:  38% Completed | 6/16 [00:04<00:07,  1.30it/s]
Loading safetensors checkpoint shards:  44% Completed | 7/16 [00:05<00:06,  1.30it/s]
Loading safetensors checkpoint shards:  50% Completed | 8/16 [00:06<00:06,  1.30it/s]
Loading safetensors checkpoint shards:  56% Completed | 9/16 [00:06<00:05,  1.29it/s]
Loading safetensors checkpoint shards:  62% Completed | 10/16 [00:07<00:04,  1.29it/s]
Loading safetensors checkpoint shards:  69% Completed | 11/16

INFO 09-17 23:49:45 [default_loader.py:262] Loading weights took 11.81 seconds
INFO 09-17 23:49:45 [punica_selector.py:19] Using PunicaWrapperGPU.
INFO 09-17 23:49:46 [model_runner.py:1115] Model loading took 56.9483 GiB and 12.457003 seconds
INFO 09-17 23:49:46 [fused_moe.py:688] Using configuration from /home/sky/sky_workdir/.venv/lib/python3.10/site-packages/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json for MoE layer.
INFO 09-17 23:49:47 [worker.py:295] Memory profiling takes 0.94 seconds
INFO 09-17 23:49:47 [worker.py:295] the current vLLM instance can use total_gpu_memory (139.81GiB) x gpu_memory_utilization (0.79) = 109.98GiB
INFO 09-17 23:49:47 [worker.py:295] model weights take 56.95GiB; non_torch_memory takes 0.16GiB; PyTorch activation peak memory takes 2.19GiB; the rest of the memory reserved for KV Cache is 50.68GiB.
INFO 09-17 23:49:47 [executor_base.py:113] # cuda blocks: 34597, # CPU blocks: 4096
INFO 09-17 23:49:47 [executor_base.

Capturing CUDA graph shapes: 100%|██████████| 53/53 [00:09<00:00,  5.86it/s]


INFO 09-17 23:49:58 [model_runner.py:1537] Graph capturing finished in 9 secs, took 1.34 GiB
INFO 09-17 23:49:58 [vllm_utils.py:728] Unsloth: Patched vLLM v0 graph capture finished in 9 secs.
INFO 09-17 23:49:59 [llm_engine.py:424] init engine (profile, create kv cache, warmup model) took 13.06 seconds
Unsloth: Just some info: will skip parsing ['q_norm', 'layer_norm2', 'pre_feedforward_layernorm', 'post_attention_layernorm', 'input_layernorm', 'post_layernorm', 'norm1', 'k_norm', 'post_feedforward_layernorm', 'layer_norm1', 'norm2']
Unsloth: Just some info: will skip parsing ['cross_attn_post_attention_layernorm', 'q_norm', 'layer_norm2', 'pre_feedforward_layernorm', 'post_attention_layernorm', 'input_layernorm', 'post_layernorm', 'norm1', 'k_norm', 'cross_attn_input_layernorm', 'post_feedforward_layernorm', 'layer_norm1', 'norm2']
Unsloth: Making `model.base_model.model.model` require gradients


gather:   0%|          | 0/1152 [00:00<?, ?it/s]



Packed 832 trajectories into 2 sequences of length 2048


train:   0%|          | 0/2 [00:00<?, ?it/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,000,000 | Num Epochs = 3 | Total steps = 30,000,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 6,684,672 of 1,549,357,056 (0.43% trained)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1024x2048 and 1x128)