In [1]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM

model='Qwen/Qwen2.5-Math-7B'
model='Qwen/Qwen2.5-Math-1.5B-Instruct'
# Example usage
model_name = model # Replace with your model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qw

In [19]:

def min_p_sampling(logits, p_base=0.1, top_k=5):
    """Select top-k tokens based on scaled probabilities."""
    probabilities = F.softmax(logits, dim=-1)
    top_k_probs, top_k_indices = torch.topk(probabilities, top_k, dim=-1)
    return top_k_indices

def score_sequence(logits_sequence):
    """Calculate the score of a sequence based on the difference between the highest and second-highest token probabilities."""
    scores = []
    for logits in logits_sequence:
        sorted_logits = torch.sort(logits, descending=True).values
        highest = sorted_logits[0]
        second_highest = sorted_logits[1] if len(sorted_logits) > 1 else 0
        scores.append(highest - second_highest)
    return torch.tensor(scores).mean().item()

def generate_with_kv_cache(prompt, tokenizer, model, max_tokens=200, top_k=5, p_base=0.1):
    device = model.device
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    past_key_values = None  # Initialize kv_cache

    # Initial inference
    with torch.no_grad():
        outputs = model(input_ids, use_cache=True)
        logits = outputs.logits[:, -1, :]  # Logits for the last token
        past_key_values = outputs.past_key_values

    # Get top-k tokens
    top_k_indices = min_p_sampling(logits, p_base=p_base, top_k=top_k)

    # Store candidates
    candidates = []
    for token_id in top_k_indices[0]:  # Iterate over top-k tokens
        sequence = [token_id.item()]
        probs_sequence = [F.softmax(logits[0], dim=-1)]

        # Create a copy of kv_cache for each candidate
        kv_cache_copy = tuple([tuple(torch.clone(k).detach() for k in layer) for layer in past_key_values])

        # Generate tokens for each top-k starting point
        i = 0
        finish =False
        while i < max_tokens - 1:
            i+=1
            input_ids = torch.tensor([[sequence[-1]]], dtype=torch.long).to(device)
            with torch.no_grad():
                outputs = model(input_ids, use_cache=True, past_key_values=kv_cache_copy)
                logits = outputs.logits[:, -1, :]
                kv_cache_copy = outputs.past_key_values

            probabilities = F.softmax(logits[0], dim=-1)
            next_token = torch.argmax(probabilities).item()

            if next_token != tokenizer.eos_token_id:
              sequence.append(next_token)
              #probs_sequence.append(probabilities)
            if next_token == tokenizer.eos_token_id:
                next_token = ""
                input_ids = tokenizer("So the answer is", return_tensors="pt").input_ids.to(device)
                sequence.extend(input_ids[0])
                while next_token != tokenizer.eos_token_id and i < max_tokens:


                    with torch.no_grad():
                        outputs = model(input_ids, use_cache=True, past_key_values=kv_cache_copy)
                        logits = outputs.logits[:, -1, :]
                        kv_cache_copy = outputs.past_key_values
                    probabilities = F.softmax(logits[0], dim=-1)
                    next_token = torch.argmax(probabilities).item()

                    sequence.append(next_token)

                    probs_sequence.append(probabilities)
                    i+=1
                    input_ids = torch.tensor([[next_token]], dtype=torch.long).to(device)
                    finish = True
                break

        # Score the sequence
        
        score = score_sequence(probs_sequence)
        print("************************************************************************")
        print(score)
        print(tokenizer.decode(sequence, skip_special_tokens=True))
        if finish:
            candidates.append((score, sequence))
        else:
            candidates.append((-1000, sequence))
    # Select the best sequence
    
    best_sequence = max(candidates, key=lambda x: x[0])[1]
    print("**************FINAL**********************")
    print(max(candidates, key=lambda x: x[0])[0])
    result = tokenizer.decode(best_sequence, skip_special_tokens=True)

    return result


prompt = """
        <|im_start|>You are a helpful AI assistant. <|im_end|>
        <|im_start|> user
        I have 3 apples, my dad has 2
more apples than me, how many
apples do we have in total?
<|im_end|>
<|im_start|> assistant"""

result = generate_with_kv_cache(prompt, tokenizer, model, max_tokens=300, top_k=10, p_base=0.1)
print(result)


************************************************************************
0.8711363673210144

To determine the total number of apples, we need to follow these steps:

1. Identify the number of apples you have.
2. Determine how many apples your dad has.
3. Add the number of apples you have to the number of apples your dad has.

Step 1: You have 3 apples.
Step 2: Your dad has 2 more apples than you. Since you have 3 apples, your dad has \(3 + 2 = 5\) apples.
Step 3: Add the number of apples you have to the number of apples your dad has: \(3 + 5 = 8\).

Therefore, the total number of apples is \(\boxed{8}\).So the answer is \(\boxed{8}\).
************************************************************************
0.8685962557792664


To determine the total number of apples, we need to follow these steps:

1. Identify the number of apples you have.
2. Determine how many apples your dad has.
3. Add the number of apples you have to the number of apples your dad has.

Step 1: You have 3 apples.
S

In [21]:
prompt = """
        <|im_start|>You are a helpful AI assistant. <|im_end|>
        <|im_start|> user
        X was born in 1964. Was X born in an even or odd year?
<|im_end|>
<|im_start|> assistant"""
result = generate_with_kv_cache(prompt, tokenizer, model, max_tokens=300, top_k=10, p_base=0.1)
print(result)


************************************************************************
0.9632723927497864

To determine whether the year 1964 is even or odd, we can use the modulo operator. The modulo operator (%) gives the remainder of a division operation. If a number modulo 2 is 0, then the number is even. If a number modulo 2 is 1, then the number is odd.

Let's check the year 1964 using the modulo operator in Python.
```python
# Define the year
year = 1964

# Check if the year is even or odd
is_even = (year % 2 == 0)
is_odd = not is_even

print((is_even, is_odd))
```
```output
(True, False)
```
The year 1964 is even because \(1964 \mod 2 = 0\). Therefore, the answer is \(\boxed{\text{even}}\).So the answer is \(\boxed{\text{even}}\).
************************************************************************
0.9669454097747803


To determine whether the year 1964 is even or odd, we can use the modulo operator. If a year modulo 2 is 0, then the year is even; otherwise, it is odd. Let's write the Py

In [22]:
prompt = """
        <|im_start|>You are a helpful AI assistant. <|im_end|>
        <|im_start|> user
        Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4
weeks?
<|im_end|>
<|im_start|> assistant"""
result = generate_with_kv_cache(prompt, tokenizer, model, max_tokens=500, top_k=10, p_base=0.1)
print(result)


************************************************************************
0.9360935688018799

To determine how many dozens of eggs Claire will eat in 4 weeks, we can break down the problem into smaller steps:

1. Calculate the total number of eggs Claire eats in one day.
2. Calculate the total number of eggs Claire eats in one week.
3. Calculate the total number of eggs Claire eats in 4 weeks.
4. Convert the total number of eggs into dozens.

Let's go through these steps one by one using Python code.
```python
# Step 1: Calculate the total number of eggs Claire eats in one day
eggs_per_omelet = 3
omelets_per_day = 1
eggs_per_day = eggs_per_omelet * omelets_per_day

# Step 2: Calculate the total number of eggs Claire eats in one week
days_per_week = 7
eggs_per_week = eggs_per_day * days_per_week

# Step 3: Calculate the total number of eggs Claire eats in 4 weeks
weeks = 4
total_eggs_in_4_weeks = eggs_per_week * weeks

# Step 4: Convert the total number of eggs into dozens
eggs_per_dozen

In [23]:
prompt = """
        <|im_start|>You are a helpful AI assistant. <|im_end|>
        <|im_start|> user
(3 + -3 + -9 * 1) =
<|im_end|>
<|im_start|> assistant"""
result = generate_with_kv_cache(prompt, tokenizer, model, max_tokens=300, top_k=10, p_base=0.1)
print(result)


************************************************************************
0.8840261697769165

要解表达式 \(3 + (-3) + (-9 \times 1)\)，我们将遵循运算顺序，通常用首字母缩略词PEMDAS（括号，指数，乘法和除法（从左到右），加法和减法（从左到右））来记忆。

1. 首先，我们处理括号和乘法：
   \[
   3 + (-3) + (-9 \times 1)
   \]
   乘法 \(-9 \times 1\) 得到 \(-9\)：
   \[
   3 + (-3) + (-9)
   \]

2. 接下来，我们从左到右执行加法和减法：
   \[
   3 + (-3) = 0
   \]
   然后，
   \[
   0 + (-9) = -9
   \]

所以，表达式的值是 \(\boxed{-9}\)。So the answer is \(\boxed{-9}\).
************************************************************************
0.8844248652458191


要解表达式 \(3 + (-3) + (-9 \times 1)\)，我们将遵循运算顺序，通常用首字母缩略词PEMDAS（括号，指数，乘法和除法（从左到右），加法和减法（从左到右））来记忆。

1. 首先，我们处理括号和指数。在这个表达式中，我们有乘法运算 \(-9 \times 1\)，这简化为 \(-9\)。所以表达式变为：
   \[
   3 + (-3) + (-9)
   \]

2. 接下来，我们从左到右执行加法和减法。首先，我们加 \(3\) 和 \(-3\)：
   \[
   3 + (-3) = 0
   \]
   所以现在的表达式是：
   \[
   0 + (-9)
   \]

3. 最后，我们加 \(0\) 和 \(-9\)：
   \[
   0 + (-9) = -9
   \]

因此，表达式的值是 \(\boxed{-9}\)。So the answer is \(\boxed{-9}\).
**************************

In [24]:
prompt = """
        <|im_start|>You are a helpful AI assistant. <|im_end|>
        <|im_start|> user
 Kylar went to the store to buy glasses for his new apartment. One glass costs $5, but every second
glass costs only 60% of the price. Kylar wants to buy 16 glasses. How much does he need to pay for them?
<|im_end|>
<|im_start|> assistant"""
result = generate_with_kv_cache(prompt, tokenizer, model, max_tokens=300, top_k=10, p_base=0.1)
print(result)


************************************************************************
0.9316046237945557

To determine how much Kylar needs to pay for 16 glasses, we need to consider the pricing structure: one glass costs $5, but every second glass costs only 60% of $5, which is $3. We can break down the calculation as follows:

1. Calculate the cost of the glasses in pairs (since every second glass is half price).
2. Multiply the cost of one pair by the number of pairs in 16 glasses.
3. Add the cost of any remaining glasses (if the number of glasses is not a multiple of 2).

Let's implement this in Python to get the exact total cost.
```python
# Define the cost of a glass and the discounted cost
cost_per_glass = 5
discounted_cost_per_glass = cost_per_glass * 0.6

# Total number of glasses Kylar wants to buy
total_glasses = 16

# Calculate the number of pairs and the remaining glasses
number_of_pairs = total_glasses // 2
remaining_glasses = total_glasses % 2

# Calculate the total cost
total_cost =

In [25]:

prompt = """
        <|im_start|>You are a helpful AI assistant. <|im_end|>
        <|im_start|> user
Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends
every day with four. She sells the remainder at the farmers’ market daily for $2 per fresh duck egg. How much in
dollars does she make every day at the farmers’ market?
<|im_end|>
<|im_start|> assistant"""
result = generate_with_kv_cache(prompt, tokenizer, model, max_tokens=500, top_k=10, p_base=0.1)
print(result)


************************************************************************
0.9538024663925171

To determine how much Janet makes every day at the farmers' market, we need to follow these steps:

1. Calculate the total number of eggs laid by the ducks each day.
2. Determine how many eggs Janet eats for breakfast and bakes into muffins.
3. Calculate the number of eggs remaining after breakfast and muffins.
4. Calculate the revenue from selling the remaining eggs at $2 per egg.

Let's break this down step-by-step and use Python to ensure accuracy.

1. Total eggs laid per day: 16
2. Eggs eaten for breakfast: 3
3. Eggs used for muffins: 4
4. Eggs remaining for sale: \(16 - 3 - 4\)
5. Price per egg: $2

Now, let's calculate the revenue using Python.
```python
# Given data
total_eggs_per_day = 16
eggs_for_breakfast = 3
eggs_for_muffins = 4
price_per_egg = 2

# Calculate remaining eggs for sale
remaining_eggs_for_sale = total_eggs_per_day - eggs_for_breakfast - eggs_for_muffins

# Calculate reve