In [None]:
# @title 1. Setup: Install Libraries
# Notebook-only cell: the leading "!" hands each line to the shell, so this
# is not valid in a plain Python script.
# Quietly (-q) install the libraries this notebook depends on.
!pip install transformers torch accelerate bitsandbytes sentencepiece -q
# Upgrade bitsandbytes (used via BitsAndBytesConfig below) to the newest
# release, regardless of the version the previous command left installed.
!pip install -U bitsandbytes

# @title 2. Import Libraries
# (Keep as is)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import gc
import re

# @title 3. Configuration: Model Names, Quantization, Device
# Checkpoint ids of the three models whose logits are combined at decode time:
# base (M), expert (M+) and anti-expert (M-).
model_base_name = "Qwen/Qwen2.5-1.5B"
model_expert_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model_anti_expert_name = "Qwen/Qwen2.5-Math-1.5B"

# Echo the selection so the cell output records which checkpoints were used.
print(
    "--- Model Configuration ---",
    f"Base (M):         {model_base_name}",
    f"Expert (M+):      {model_expert_name}",
    f"Anti-Expert (M-): {model_anti_expert_name}",
    "-------------------------",
    sep="\n",
)

# Prefer the GPU; fall back to CPU (with a warning) when CUDA is missing.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")
if device.type == "cpu":
    print("WARNING: CUDA not available, running on CPU will be extremely slow.")

# One shared quantization config: 4-bit NF4 weights with double quantization,
# computing in bfloat16. The same object is passed to every model load.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
print("Using 4-bit NF4 quantization for all models.")

# @title 4. Load Models and Tokenizer

# The base model's tokenizer is the single tokenizer used for all three models.
print("Loading Tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(
        model_base_name,
        trust_remote_code=True,
    )

    # Generation below relies on tokenizer.apply_chat_template, so warn early
    # if no template is attached. A template could be assigned here, e.g.:
    # tokenizer.chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
    if tokenizer.chat_template is None:
        print(
            "WARNING: Tokenizer does not have a chat_template defined. "
            "Using default (or potentially incorrect) formatting."
        )

    # Reuse EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"Tokenizer pad_token set to eos_token ({tokenizer.eos_token}).")

    # Pad on the left so any padding precedes the prompt tokens.
    tokenizer.padding_side = "left"
    print(f"Tokenizer padding side set to '{tokenizer.padding_side}'.")

    print("Tokenizer Loaded. Chat template likely available via tokenizer.apply_chat_template.")
except Exception as load_error:
    # Report which checkpoint failed, then let the original error propagate.
    print(f"ERROR loading tokenizer for {model_base_name}: {load_error}")
    raise

# --- Function to load model (Keep as is) ---
def load_model(model_name, config, device):
    """Load a quantized causal LM in eval mode, or return None on failure.

    Args:
        model_name: Model id or path passed to
            ``AutoModelForCausalLM.from_pretrained``.
        config: Quantization config forwarded as ``quantization_config``.
        device: Accepted but not used in this function; placement is
            delegated to ``device_map="auto"``.

    Returns:
        The loaded model after ``eval()``, or ``None`` if any step raised.
        Errors are printed rather than re-raised, so callers must check
        for ``None``.
    """
    print(f"Loading Model: {model_name}...")
    try:
        loaded = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            device_map="auto",
            quantization_config=config,
        )
        loaded.eval()
        print(f"{model_name} Loaded Successfully.")

        footprint_gb = loaded.get_memory_footprint() / 1e9
        print(f"Estimated memory footprint for {model_name}: {footprint_gb:.2f} GB")

        # Release loader temporaries before the next model is loaded.
        gc.collect()
        torch.cuda.empty_cache()
        return loaded
    except Exception as load_error:
        print(f"ERROR loading model {model_name}: {load_error}")
        # Free whatever a partially completed load may have left behind.
        gc.collect()
        torch.cuda.empty_cache()
        return None

# --- Load the three models with the shared quantization config ---
# load_model returns None on failure, so all three loads are attempted before
# the combined check below decides whether to continue.
model_base, model_expert, model_anti_expert = (
    load_model(checkpoint, bnb_config, device)
    for checkpoint in (model_base_name, model_expert_name, model_anti_expert_name)
)

if not (model_base and model_expert and model_anti_expert):
    raise RuntimeError("One or more models failed to load. Cannot proceed.")
print("\nAll models loaded successfully with 4-bit quantization.")


# @title 5. Implement Proxy-Tuning Generation (Using tokenizer.apply_chat_template)

# Chat roles used when building the message list for the chat template.
SYSTEM = "system"
USER = "user"
ASSISTANT = "assistant"


def _sample_next_token(logits, temperature, top_k, step):
    """Choose the next token id from 2-D ``logits`` (batch, vocab).

    With ``temperature > 0`` the logits are temperature-scaled, restricted to
    the ``top_k`` largest entries and sampled; otherwise the arg-max is taken.
    ``top_k`` of ``None`` or ``<= 0`` disables the top-k restriction instead
    of producing an empty candidate set.

    Returns a tensor of shape (batch, 1) holding the chosen token id(s).
    ``step`` is only used in the warning message.
    """
    if temperature > 0:
        scaled = logits / temperature
        vocab_size = scaled.size(-1)
        if top_k is None or top_k <= 0:
            k = vocab_size
        else:
            k = min(top_k, vocab_size)

        # Keep the k best logits and mask everything else out with -inf.
        top_values, top_indices = torch.topk(scaled, k)
        filtered = torch.full_like(scaled, float("-inf"))
        filtered.scatter_(-1, top_indices, top_values)
        probabilities = torch.nn.functional.softmax(filtered, dim=-1)

        if torch.isnan(probabilities).any():
            # Degenerate distribution: fall back to a uniform pick among the
            # top-k candidates rather than failing inside multinomial.
            print(f"Warning: NaN in probs at step {step}. Uniform sample from top-k.")
            pick = torch.randint(0, top_indices.shape[1], (1,)).item()
            return top_indices[:, pick].unsqueeze(-1)
        return torch.multinomial(probabilities, num_samples=1)

    # temperature <= 0 means greedy decoding.
    return torch.argmax(logits, dim=-1).unsqueeze(-1)


@torch.inference_mode()
def generate_proxy_tuned(
    prompt: str,
    max_new_tokens: int = 150,
    temperature: float = 0.6,  # DeepSeek R1 recommended default
    top_k: int = 50,
    alpha: float = 1.0,
    is_math_problem: bool = False,
    include_think_prompt: bool = True,  # Control adding <think> prompt for R1
) -> str:
    """Generate a reply by proxy-tuning the base model's next-token logits.

    At every step the three module-level models are run on the same token
    sequence and the next token is drawn from
    ``logits_base + alpha * (logits_expert - logits_anti_expert)``.

    Args:
        prompt: User message. It is wrapped in a system + user conversation
            and formatted with ``tokenizer.apply_chat_template``.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; values ``<= 0`` select greedy
            decoding.
        top_k: Number of highest-scoring tokens kept for sampling; ``None``
            or ``<= 0`` keeps the whole vocabulary.
        alpha: Weight of the expert/anti-expert logit difference.
        is_math_problem: If true, append a "reason step by step / boxed
            answer" instruction unless the prompt already asks for
            step-by-step reasoning.
        include_think_prompt: If true, append ``"<think>\\n"`` tokens after
            the assistant generation prompt.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. An empty string is returned when a model is
        missing or the prompt cannot be built. If inference fails mid-way,
        the text generated so far is returned.
    """
    if any(m is None for m in (model_base, model_expert, model_anti_expert)):
        print("Error: Models not loaded.")
        return ""

    # --- Prepare messages for the chat template ---
    system_prompt = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
    user_content = prompt
    if is_math_problem and "reason step by step" not in prompt.lower():
        user_content += "\nPlease reason step by step, and put your final answer within \\boxed{}."

    messages = [
        {"role": SYSTEM, "content": system_prompt},
        {"role": USER, "content": user_content},
    ]

    # --- Apply the chat template to obtain the input ids ---
    try:
        # add_generation_prompt=True appends the marker that opens the
        # assistant turn, so generation starts inside the reply.
        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_tensors="pt",
        ).to(device)

        if include_think_prompt:
            # NOTE(review): "<think>" is tokenized with the base model's
            # tokenizer; confirm these ids match what the expert model
            # expects for its reasoning marker.
            think_tokens = tokenizer.encode(
                "<think>\n", add_special_tokens=False, return_tensors="pt"
            ).to(device)
            input_ids = torch.cat([input_ids, think_tokens], dim=-1)

        # Length of template + optional think tokens; used to slice the
        # prompt off the final sequence before decoding.
        prompt_length = input_ids.shape[1]

        print("--- Final Input String (Decoded from Tokens) ---")
        print(tokenizer.decode(input_ids[0]))
        print("------------------------------------------------")
    except Exception as e:
        print(f"Error applying chat template or adding think prompt: {e}")
        print("Ensure the tokenizer has a valid chat_template attribute.")
        return ""

    # --- Loop invariants ---
    # Tokens that end generation: the tokenizer's EOS and Qwen's <|im_end|>.
    stop_token_ids = {
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|im_end|>"),
    }
    # torch.autocast replaces the deprecated torch.cuda.amp.autocast; it is
    # only enabled when CUDA is actually available.
    autocast_enabled = torch.cuda.is_available()

    # --- Generation loop ---
    # NOTE: no KV cache is used. Every step re-runs all three models over the
    # full sequence generated so far, so per-step cost grows with length.
    generated_ids = input_ids.clone()
    print("Starting generation...")
    for step in range(1, max_new_tokens + 1):
        try:
            with torch.autocast(
                device_type="cuda", dtype=torch.bfloat16, enabled=autocast_enabled
            ):
                logits_base = model_base(generated_ids).logits[:, -1, :]
                logits_expert = model_expert(generated_ids).logits[:, -1, :]
                logits_anti_expert = model_anti_expert(generated_ids).logits[:, -1, :]
        except torch.cuda.OutOfMemoryError:
            print(f"CUDA OOM at step {step} (SeqLen: {generated_ids.shape[1]}). Stopping.")
            gc.collect()
            torch.cuda.empty_cache()
            break
        except Exception as e:
            print(f"Inference error at step {step}: {e}")
            break

        # Proxy-tuning: shift the base logits by the expert/anti-expert gap.
        modified_logits = logits_base + alpha * (logits_expert - logits_anti_expert)

        if not torch.isfinite(modified_logits).all():
            print(f"Warning: NaN/Inf in logits at step {step}. Using base logits.")
            modified_logits = logits_base.clone()

        next_token_id = _sample_next_token(modified_logits, temperature, top_k, step)
        generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)

        token_id = next_token_id.item()
        if token_id in stop_token_ids:
            print(f"Termination token generated (ID: {token_id}) at step {step}.")
            break

    print("Generation finished.")

    # --- Decode only the newly generated part ---
    response_ids = generated_ids[:, prompt_length:]
    response_text = tokenizer.decode(response_ids[0], skip_special_tokens=True)

    # Drop a dangling "</think>" if the reply ends right after the marker.
    if response_text.strip().endswith("</think>"):
        response_text = response_text.rsplit("</think>", 1)[0].strip()

    return response_text.strip()


# @title 6. Run Example Generation
# Raw string: "\[" and "\]" are LaTeX display-math delimiters, not Python
# escape sequences. A plain literal would trigger an invalid-escape warning.
prompt = r"Determine all positive integers $n$ for which there exist positive integers $a$, $b$, and $c$ satisfying \[2a^n + 3b^n = 4c^n.\]"
print(f"Original User Prompt: {prompt}")

# --- Run Proxy-Tuned Generation ---
# Only generate when the model-loading cell has run and produced a base model.
if globals().get("model_base") is not None:
    generated_output = generate_proxy_tuned(
        prompt,
        max_new_tokens=5000,
        alpha=1.0,
        temperature=0.6,
        top_k=50,
        is_math_problem=True,
        include_think_prompt=True,  # Add <think> prompt after assistant marker
    )
    print("\n--- Proxy-Tuned Output (Using apply_chat_template + R1 Hints) ---")
    print(generated_output)
else:
    print("Models not loaded correctly. Please check the loading logs.")


# @title 7. Qwen2 / DeepSeek R1 Documentation Resources
# Pointer only: this cell prints a reminder and contains no links itself.
print("\nRelevant documentation links provided in comments and text cell above.")
# Optional cleanup: uncomment the next line to drop the references to the
# three models so that their GPU memory can be reclaimed.
# del model_base, model_expert, model_anti_expert

--- Model Configuration ---
Base (M):         Qwen/Qwen2.5-1.5B
Expert (M+):      deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
Anti-Expert (M-): Qwen/Qwen2.5-Math-1.5B
-------------------------
Using device: cuda
Using 4-bit NF4 quantization for all models.
Loading Tokenizer...
Tokenizer padding side set to 'left'.
Tokenizer Loaded. Chat template likely available via tokenizer.apply_chat_template.
Loading Model: Qwen/Qwen2.5-1.5B...
Qwen/Qwen2.5-1.5B Loaded Successfully.
Estimated memory footprint for Qwen/Qwen2.5-1.5B: 1.12 GB
Loading Model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B...
deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B Loaded Successfully.
Estimated memory footprint for deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B: 1.59 GB
Loading Model: Qwen/Qwen2.5-Math-1.5B...
Qwen/Qwen2.5-Math-1.5B Loaded Successfully.
Estimated memory footprint for Qwen/Qwen2.5-Math-1.5B: 1.12 GB

All models loaded successfully with 4-bit quantization.
Original User Prompt: Determine all positive integers $n$ f

  with torch.cuda.amp.autocast(dtype=torch.bfloat16):


**Example of one reasoning trace (raw output from the generation cell above).**

--- Proxy-Tuned Output (Using apply_chat_template + R1 Hints) ---

Alright, I need to calculate the square root of 12 and 24. Hmm, square roots can sometimes be tricky, especially when dealing with numbers that aren't perfect squares. Let me break this down step by step.

Starting with the square root of 12: I know that 12 can be factored into 4 times 3, which are both perfect squares. So, √12 can be written as √(4×3). Using the property of square roots that says √(a×b) = √a × √b, this becomes √4 × √3. I remember that √4 is 2, so multiplying that together gives 2√3. So, √12 equals 2√3. That seems straightforward.

Now, moving on to the square root of 24: Similarly, 24 can be factored into 4 times 6. Again, 4 is a perfect square, so √24 can be expressed as √(4×6). Applying the same rule as before, this becomes √4 × √6. √4 is still 2, so this simplifies to 2√6. Got it.

Wait a second, maybe I should check if these answers are correct. Let me square 2√3 to see if I get back to 12. (2√3)² = 2² × (√3)² = 4 × 3 = 12. Yep, that works. Similarly, (2√6)² = 4 × 6 = 24. Perfect, both answers are correct.

Is there a different way to approach this? Maybe. For √12, I could try dividing it by 2 first, since 2 is a perfect square. √12 = √(4×3) = √4 × √3 = 2√3. Yeah, same result. For √24, dividing by 2 gives √12, which is 2√3. Wait, so √24 = √(4×6) = √4 × √6 = 2√6. Hmm, so actually, √24 can be thought of as 2√6? Interesting.

Wait, could √24 also be expressed as √24 ÷ 2? No, that would be √12, not √24. Hmm.

I think I've covered multiple ways to calculate these square roots, both factoring them into perfect squares and using the properties of square roots. All paths lead me to the same answers: √12 = 2√3 and √24 = 2√6. That makes sense because 12 is four times 3 and 24 is four times 6, which are both perfect squares multiplied by 4.

Maybe I can visualize this. Like, on the number line, 2√3 is approximately 3.464, because √3 is about 1.732, and 2 times 1.732 is 3.464. And √24 is about 4.899, since √6 is approximately 2.449, and 2 times 2.449 is 4.898. Multiplying 3.464 by 3.464 gives 12, as I saw earlier, and 4.898 by 4.898 gives 24. So, the approximate square roots check out.

I wonder if these square roots are irrational or rational. Well, 12 isn't a perfect square, so √12 isn't rational. Similarly, 24 isn't a perfect square, so √24 isn't rational. That matches my answers earlier. So, these square roots are irrational numbers.

Is there a way to represent these square roots as decimals? Well, √3 is approximately 1.732, so 2√3 is about 3.464. √2 is approximately 1.414, so 2√6 would be? Wait, let me check, √6 is about 2.449, so 2√6 is 4.898. Yeah, these are repeating decimals, but they are irrational. So, they cannot be expressed as finite decimals or fractions.

Hmm, okay, so I think that wraps up my thought process on calculating the square roots of 12 and 24. I considered different methods, checked my answers, made sense of the results. Seems solid to me.


The square roots of 12 and 24 are calculated by factoring each number into a perfect square times another number, then applying the property of square roots that √(a×b)=√a×√b.

For √12:
- 12 factors into 4×3, which are both perfect squares.
- √12 = √(4×3) = √4 × √3 = 2√3.

For √24:
- 24 factors into 4×6, which are both perfect squares.
- √24 = √(4×6) = √4 × √6 = 2√6.

Final answers:
√12 = 2√3  
√24 = 2√6

Alright, let me tackle this physics problem step by step. So, we have a ball being thrown upwards with an initial velocity of 40 feet per second. The height of the ball at any time t is given by the equation y = 40t - 16t². We need to find the velocity when t = 2 seconds.

First off, I recall that velocity is related to the derivative of the position function. Since the height as a function of time is given by y = 40t - 16t², we can find the velocity by taking the derivative of y with respect to t.

So, let me compute the derivative. The derivative of y with respect to t, which is dy/dt, should give us the velocity at any time t.

Breaking it down, the derivative of 40t with respect to t is 40, because the slope of a linear function is just its coefficient. Then, the derivative of -16t² with respect to t is -32t. So putting it all together, dy/dt = 40 - 32t.

Now, we need to find the velocity at t = 2 seconds. That means we substitute t = 2 into the derivative equation. So, dy/dt at t = 2 is 40 - 32*(2).

Calculating that, 32 times 2 is 64, so 40 minus 64 equals... hmm, 40 - 64 is a negative number. Specifically, it's -24. So, the velocity at t = 2 is -24 feet per second.

Wait a minute, why is the velocity negative? I think that makes sense because velocity in physics can be a vector quantity, which has both magnitude and direction. If the derivative dy/dt is negative, it means the object is moving in the negative direction, which would be downwards in this case. So, at t = 2 seconds, the ball is descending, and its velocity is decreasing as time passes. Therefore, the negative velocity indicates that the ball is going up for some time, reaches a maximum height, and then starts moving downward.

Just to double-check my calculations, let me go back. Starting from the original equation, y = 40t - 16t². Taking the derivative with respect to t, y' =
