In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-3B')
instruct_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-3B-Instruct')

print(base_tokenizer.chat_template)

In [3]:
def make_prefix(numbers, target, template_type):
    # NOTE: also need to change reward_score/countdown.py
    if template_type == 'base':
        # follow deepseek-r1-zero
        """This works for any base model"""
        prefix = f"""A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer.
User: Using the numbers {numbers}, create an equation that equals {target}. You can use basic arithmetic operations (+, -, *, /) and each number can only be used once. Show your work in <think> </think> tags. And return the final answer in <answer> </answer> tags, for example <answer> (1 + 2) / 3 </answer>.
Assistant: Let me solve this step by step.
<think>"""
    elif template_type == 'qwen-instruct':
        """This works for Qwen Instruct Models"""
        prefix = f"""<|im_start|>system\nYou are a helpful assistant. You first thinks about the reasoning process in the mind and then provides the user with the answer.<|im_end|>\n<|im_start|>user\n Using the numbers {numbers}, create an equation that equals {target}. You can use basic arithmetic operations (+, -, *, /) and each number can only be used once. Show your work in <think> </think> tags. And return the final answer in <answer> </answer> tags, for example <answer> (1 + 2) / 3 </answer>.<|im_end|>\n<|im_start|>assistant\nLet me solve this step by step.\n<think>"""
    return prefix

In [4]:
numbers = [ 44, 19, 35 ]
target = 99

In [5]:
base_prompt = make_prefix(numbers, target, 'base')
print(base_prompt)

A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer.
User: Using the numbers [44, 19, 35], create an equation that equals 99. You can use basic arithmetic operations (+, -, *, /) and each number can only be used once. Show your work in <think> </think> tags. And return the final answer in <answer> </answer> tags, for example <answer> (1 + 2) / 3 </answer>.
Assistant: Let me solve this step by step.
<think>


In [6]:
instruct_prompt = make_prefix(numbers, target, 'qwen-instruct')
print(instruct_prompt)

<|im_start|>system
You are a helpful assistant. You first thinks about the reasoning process in the mind and then provides the user with the answer.<|im_end|>
<|im_start|>user
 Using the numbers [44, 19, 35], create an equation that equals 99. You can use basic arithmetic operations (+, -, *, /) and each number can only be used once. Show your work in <think> </think> tags. And return the final answer in <answer> </answer> tags, for example <answer> (1 + 2) / 3 </answer>.<|im_end|>
<|im_start|>assistant
Let me solve this step by step.
<think>


### base model inference

In [7]:
from vllm import LLM, SamplingParams

2025-04-07 21:33:03,268	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
No module named 'vllm._version'
  from vllm.version import __version__ as VLLM_VERSION


In [8]:
sampling_params = SamplingParams(
    temperature=0.6, 
    max_tokens=4096
)
base_llm = LLM(model='Qwen/Qwen2.5-3B', max_model_len=4096)
base_resp = base_llm.generate(base_prompt, sampling_params)[0]
print(base_resp.outputs[0].text)

INFO 04-07 21:33:11 llm_engine.py:237] Initializing an LLM engine (vdev) with config: model='Qwen/Qwen2.5-3B', speculative_config=None, tokenizer='Qwen/Qwen2.5-3B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=Qwen/Qwen2.5-3B, use_v2_block_manager=True, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.65it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  2.05it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.97it/s]


INFO 04-07 21:33:19 model_runner.py:1071] Loading model weights took 5.7915 GB





INFO 04-07 21:33:21 gpu_executor.py:122] # GPU blocks: 23035, # CPU blocks: 7281
INFO 04-07 21:33:21 gpu_executor.py:126] Maximum concurrency for 32768 tokens per request: 11.25x
INFO 04-07 21:33:23 model_runner.py:1402] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 04-07 21:33:23 model_runner.py:1406] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 04-07 21:33:32 model_runner.py:1530] Graph capturing finished in 9 secs.


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.00it/s, est. speed input: 142.82 toks/s, output: 113.65 toks/s]

 We need to use the numbers 44, 19, and 35 exactly once to create an equation that equals 99. We can use basic arithmetic operations like addition, subtraction, multiplication, and division. Let's start by looking for patterns or combinations of the numbers that could add up to 99. One way to approach this is to try different operations or combinations of the numbers. </think>
The final answer is: <answer> 44 + 35 + 19 = 99 </answer>





In [11]:
test_resp = base_llm.generate('The captail of China is', sampling_params)[0]
print(test_resp.outputs[0].text)

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.39it/s, est. speed input: 8.38 toks/s, output: 97.75 toks/s]

 Beijing, and the capital of the USA is Washington DC.  Given the question "What is the population of the capital of the country with 200,000 people?", is "200,000" a valid answer?

Select from: [I] Yes. [II] No.
Answer:

[I]





In [13]:
test_resp = base_llm.generate('My name is', sampling_params)[0]
print(test_resp.outputs[0].text)

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it, est. speed input: 1.42 toks/s, output: 105.22 toks/s]

 Kellie and I am a mom to 2 daughters, 2 step-sons, and I am the Director of the girls’ softball team. I have been in the United States Army for 20 years. I have been a mother for 15 years. I am currently in my 5th year of service in the United States Army Reserves. I have been married for 15 years, and my husband is currently in the United States Army Reserve. I also have a 3 year old step-daughter who is the youngest in her class and has gotten her class President and class Vice President two years in a row. I am the Vice President of the school board, and I also serve on the board of the local Chamber of Commerce. I am the president of the local veteran’s group. I am also a member of the local school board and I am a member of the local chamber of commerce. I am the president of the local veteran’s group. I am also a member of the local school board, and I am a member of the local chamber of commerce.





In [12]:
test_resp = base_llm.generate('Long long ago, there', sampling_params)[0]
# print(test_resp.outputs[0].text)

Processed prompts: 100%|██████████| 1/1 [01:12<00:00, 72.50s/it, est. speed input: 0.07 toks/s, output: 112.99 toks/s]

 lived a little frog in a village. He was very lazy and never did any work. He just sat in his hole and waited for food to come to him. Sometimes a bird would come and take a little look at him, but he was too ugly (丑陋) for the bird to eat. The frog was very sad because he was always hungry. One morning, the frog saw a beautiful butterfly (蝴蝶). It was dancing in the wind. The butterfly was so beautiful that he wanted to eat it at once. But it was so quick that he could not catch it. The frog was very sad. Just then, a big snake came. The snake was hungry, and the frog was happy to see that. He told the snake all about the butterfly. “That’s easy,” the snake said. “I’ll (将要) eat you and then eat the butterfly.” The frog was very sad. He knew it was useless to cry. He said to the snake, “I know you are very hungry. But you can’t eat me. I have very sharp teeth. And if you try to catch me, you will get hurt.” The snake thought for a moment and said, “You are right. You are too strong. But




### instruct model inference

In [10]:
# instruct_llm = LLM(model='Qwen/Qwen2.5-3B-Instruct', max_model_len=4096)
# instruct_resp = instruct_llm.generate(instruct_prompt, sampling_params)[0]
# print(instruct_resp.outputs[0].text)