In [1]:
import transformers
from unsloth import FastLanguageModel
from typing import Optional
import os
import torch
import random
import time

# Load the pre-quantized (bnb 4-bit) DeepSeek-R1 distilled Llama-8B checkpoint
# together with its tokenizer, using a 4096-token sequence length.
model, tokenizer = FastLanguageModel.from_pretrained(
    max_seq_length=4096,
    model_name="unsloth/DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit",
)

# Switch the model to Unsloth's inference mode. This call is the cell's last
# expression, so whatever it returns is rendered as the cell output.
FastLanguageModel.for_inference(model)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 02-23 17:04:45 __init__.py:190] Automatically detected platform cuda.
==((====))==  Unsloth 2025.2.5: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.621 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((409

In [17]:
prompt = "Welcome to the 5-Round Decision Simulation!\n\nYour Task:\nYou will participate in a game that unfolds over 5 rounds. In each round, you will see one or more active cues positioned in different quadrants. Among these cues, one quadrant is biased—the cue there appears with a 90% probability of one color and a 10% chance of another, while cues in the other quadrants follow a 50/50 color distribution. Keep in mind that at least one cue is active in every round, and each cue will disappear after a random duration.\n\nYour Goal:\nAfter completing all 5 rounds, you must identify which quadrant was biased.\n- A correct answer awards +100 points.\n- An incorrect answer deducts -100 points.\n\nRound 1 Instructions:\nThe available cue is in Quadrant B. Based on the information provided so far, select one cue by simply responding with the corresponding letter.\n\nPress your chosen letter to continue."
# The chat markers are written out by hand, so the tokenizer is told not to add
# its own special tokens on top of them.
tokenized_prompt = tokenizer.encode(f"<｜begin▁of▁sentence｜><｜User｜>{prompt}<｜Assistant｜>", return_tensors="pt", add_special_tokens=False).to(model.device)

# Thinking budget. Both limits are wall-clock seconds measured with time.time(),
# so the amount of generated text depends on how fast this machine generates.
min_thinking_time = 20   # minimum thinking time in seconds
max_thinking_time = 40   # maximum thinking time in seconds
batch_tokens = 20        # Generate tokens in small batches
replacement_text = "Wait"  # Replacement text if end is premature

# Encode markers. Only the first id of each encoding is used.
# NOTE(review): this assumes "<think>" / "</think>" each map to a single token
# in this tokenizer -- confirm if the tokenizer is ever swapped.
start_think_token = tokenizer.encode("<think>", add_special_tokens=False)[0]
end_think_token = tokenizer.encode("</think>", add_special_tokens=False)[0]
replacement_ids = tokenizer.encode(replacement_text, add_special_tokens=False)

is_thinking = True       # Are we in the thinking phase?
finished = False
final_tokens = tokenized_prompt

start_time = time.time()  # Record the start time

while not finished:
    # Sample the next small batch, conditioned on everything accepted so far.
    outputs = model.generate(
        input_ids=final_tokens,
        max_new_tokens=batch_tokens,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
    )
    new_tokens = outputs[:, final_tokens.shape[1]:]  # Only newly generated tokens

    accepted_ids = []  # ids from this batch that will be appended to final_tokens
    # One .tolist() transfer instead of a .item() call per token.
    for token_id in new_tokens[0].tolist():
        elapsed_time = time.time() - start_time

        if elapsed_time >= max_thinking_time:
            # Budget exhausted: force the end-of-thinking marker and stop.
            accepted_ids.append(end_think_token)
            finished = True
            break

        if token_id == end_think_token:
            if elapsed_time < min_thinking_time:
                # Premature </think>: substitute the replacement text. Every
                # token after this point in the batch was sampled with
                # </think> in its context, so it must be dropped and
                # regenerated from the corrected sequence instead of being
                # appended after the replacement.
                accepted_ids.extend(replacement_ids)
            else:
                # Thought long enough: keep </think> and stop.
                accepted_ids.append(token_id)
                is_thinking = False
                finished = True
            break

        accepted_ids.append(token_id)

    if accepted_ids:
        # Single concatenation per batch, built directly on the target device.
        final_tokens = torch.cat(
            [
                final_tokens,
                torch.tensor([accepted_ids], dtype=final_tokens.dtype, device=final_tokens.device),
            ],
            dim=1,
        )

output_text = tokenizer.decode(final_tokens[0], skip_special_tokens=True)
print(output_text)

<｜User｜>Welcome to the 5-Round Decision Simulation!

Your Task:
You will participate in a game that unfolds over 5 rounds. In each round, you will see one or more active cues positioned in different quadrants. Among these cues, one quadrant is biased—the cue there appears with a 90% probability of one color and a 10% chance of another, while cues in the other quadrants follow a 50/50 color distribution. Keep in mind that at least one cue is active in every round, and each cue will disappear after a random duration.

Your Goal:
After completing all 5 rounds, you must identify which quadrant was biased.
- A correct answer awards +100 points.
- An incorrect answer deducts -100 points.

Round 1 Instructions:
The available cue is in Quadrant B. Based on the information provided so far, select one cue by simply responding with the corresponding letter.

Press your chosen letter to continue.<｜Assistant｜><think>
Okay, so I'm trying to figure out which quadrant was biased across 5 rounds of thi

In [18]:
# Use the output_text as the new prompt to generate a single answer token.
conclusive_text = "\nThe answer is <<"
input_ids = tokenizer.encode(output_text+conclusive_text, return_tensors="pt").to(final_tokens.device)
answer_output = model.generate(input_ids=input_ids, max_new_tokens=2, do_sample=True, temperature=0.6, top_p=0.95)
# The returned sequence starts with the prompt, so the first generated token
# sits at index len(prompt). Indexing from the end (-2) is only correct when
# exactly two new tokens were produced; if generation stops after one token it
# would pick up the last prompt token instead of the answer.
answer_text = tokenizer.decode(answer_output[0, input_ids.shape[1]], skip_special_tokens=True)
print(answer_text)

B
