In [2]:
# Login to Hugging Face
from huggingface_hub import login

# Option 1: Will prompt for token
# login()

# Option 2: Use token directly (replace with your token)
# login(token="YOUR_HF_TOKEN_HERE")

# Option 3: Read from environment variable
# import os
# login(token=os.environ.get("HF_TOKEN"))


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-Math-7B-Instruct")

In [4]:
# Load model directly

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-Math-7B-Instruct")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

#outputs = model.generate(**inputs, max_new_tokens=40)
#print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.73it/s]


In [5]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model.to(device)

In [6]:
from datasets import load_dataset

ds = load_dataset("KbsdJames/Omni-MATH")

ds

DatasetDict({
    test: Dataset({
        features: ['domain', 'difficulty', 'problem', 'solution', 'answer', 'source'],
        num_rows: 4428
    })
})

In [28]:
# Load the fine-tuned model
from transformers import AutoTokenizer, AutoModelForCausalLM

# Use the correct model path - escape the colon in the path
model_path = "./models/qwen2-math-7b-instruct_finetuned_on_first_3542_transformed_omni_math_solutions_filtered_lr:2e-06_warmup_steps:300_num_epochs:3"
finetuned_tokenizer = AutoTokenizer.from_pretrained(model_path)
finetuned_model = AutoModelForCausalLM.from_pretrained(model_path)
finetuned_model.to(device)


The tokenizer you are loading from './models/qwen2-math-7b-instruct_finetuned_on_first_3542_transformed_omni_math_solutions_filtered_lr:2e-06_warmup_steps:300_num_epochs:3' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.23it/s]
The module name qwen2_hyphen_math_hyphen_7b_hyphen_instruct_finetuned_on_first_3542_transformed_omni_math_solutions_filtered_lr:2e_hyphen_06_warmup_steps:300_num_epochs:3 (originally qwen2-math-7b-instruct_finetuned_on_first_3542_transformed_omni_math_solutions_filtered_lr:2e-06_warmup_steps:300_num_epochs:3) is not a valid Python identifier. Please rename the original module to avoid import issues.


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584, padding_idx=151643)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,)

In [45]:
# Get a geometry level 5 problem from the dataset
#geometry_level5 = ds['train'].filter(lambda x: x.get('type') == 'Algebra' and x.get('level') == 'Level 4')
#problem = geometry_level5[11] #9
model.to(device)
num_problems = len(ds['test'])
problem = ds['test'].select(range(int(num_problems*0.8) + 1, len(ds['test'])))[10]
print(problem)

#problem = {}
#problem['problem'] = """
#Simplify and express with positive exponents: $\frac{3 y^{-\frac{5}{4}}}{y^{-1} \cdot 2 y^{-\frac{1}{3}}}$
#"""


# Format the problem for the model
messages = [
    #{"role": "user", "content": f"Solve this geometry problem without using any external tools. Put your solution in \\boxed{...} format.\n\n Here is the problem:\n\n{problem['problem']}"},
    {"role": "system", "content": """You are a math tutor. Give a complete solution using the environments \\begin{intermediatederivation}...\\end{intermediatederivation} and \\begin{lemmatheorembox}...\\end{lemmatheorembox}, and put the final answer in the format \\boxed{...} at the end."""},
    #{"role": "system", "content": """You are a math tutor. Give a complete solution and put the final answer in the format \\boxed{...}"""}, 
    {"role": "user", "content": f"""{problem['problem']}"""}
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(device)

inputs


{'domain': ['Mathematics -> Number Theory -> Prime Numbers', 'Mathematics -> Number Theory -> Congruences'], 'difficulty': 6.5, 'problem': 'Find all primes $p$ and $q$ such that $3p^{q-1}+1$ divides $11^p+17^p$', 'solution': "\nTo solve the problem, we need to identify all pairs of primes \\( p \\) and \\( q \\) such that the expression \\( 3p^{q-1} + 1 \\) divides \\( 11^p + 17^p \\). The reference answer indicates that the only solution is the pair \\((3, 3)\\). Let's go through the process of verifying this.\n\nGiven the division condition:\n\n\\[ \n3p^{q-1} + 1 \\mid 11^p + 17^p\n\\]\n\nwe need to explore the values of \\( p \\) and \\( q \\). First, consider small prime numbers for both \\( p \\) and \\( q \\) due to the computational feasibility and complexity considerations.\n\n### Case: \\( p = 3 \\)\n\nWhen \\( p = 3 \\), evaluate \\( 3p^{q-1} + 1 \\) and \\( 11^p + 17^p \\):\n\n- For \\( p = 3 \\), the expression becomes \\( 3 \\times 3^{q-1} + 1 \\). This simplifies to \\( 3

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,   6888,  25302,     13,
          20678,    264,   4583,   6291,   1667,    279,  21737,   1124,   7265,
             90,   2245,   4404,    657,    261,  39127,     92,  30801,    408,
             90,   2245,   4404,    657,    261,  39127,     92,    323,   1124,
           7265,     90,   3433,   8470,    383,  13173,   2011,     92,  30801,
            408,     90,   3433,   8470,    383,  13173,   2011,   2137,    323,
           2182,    279,   1590,   4226,    304,    279,   3561,   1124,  79075,
             90,   1112,     92,    518,    279,    835,     13, 151645,    198,
         151644,    872,    198,   9885,    678,  49433,    400,     79,      3,
            323,    400,     80,      3,   1741,    429,    400,     18,     79,
          47822,     80,     12,     16,     92,     10,     16,      3,  64828,
            400,     16,     16,     61,     79,     10,     16,     22,     61,
             7

In [30]:
outputs = finetuned_model.generate(**inputs, max_new_tokens=6144, temperature=0.7)
response = finetuned_tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

print("Problem:")
print(problem['problem'])
print("\n" + "="*80 + "\n")
print("Model Response:")
print(response)

Problem:

Simplify and express with positive exponents: $rac{3 y^{-rac{5}{4}}}{y^{-1} \cdot 2 y^{-rac{1}{3}}}$



Model Response:
\begin{intermediatederivation}
$3 y^{-\frac{5}{4}} \div \left(y^{-1} \cdot 2 y^{-\frac{1}{3}}\right) = 3 y^{-\frac{5}{4}} \div 2 y^{-1-\frac{1}{3}} = 3 y^{-\frac{5}{4}} \div 2 y^{-\frac{4}{3}}$
\end{intermediatederivation}
\begin{lemmatheorembox}
\textbf{Name:} Quotient of Powers Property
\textbf{Statement:} For any nonzero number $a$, and any integers $m$ and $n$, $\frac{a^{m}}{a^{n}}=a^{m-n}$.
\textbf{General Topic:} Exponentiation
\textbf{URL:} https://en.wikipedia.org/wiki/Exponentiation#Quotient_of_powers
\end{lemmatheorembox}
\begin{intermediatederivation}
$3 y^{-\frac{5}{4}} \div 2 y^{-\frac{4}{3}} = \frac{3}{2} y^{-\frac{5}{4} - \left(-\frac{4}{3}\right)} = \frac{3}{2} y^{-\frac{5}{4} + \frac{4}{3}}$
\end{intermediatederivation}
\begin{lemmatheorembox}
\textbf{Name:} Adding Fractions
\textbf{Statement:} $\frac{a}{b}+\frac{c}{d}=\frac{ad+bc}{bd}$
\

In [34]:
problem['solution']

'When we rotate images $90^{\\circ}$ the coordinates switch places, and the signs are adjusted based on whether or not an axis was crossed. In this case, rotating point $A$ $90^{\\circ}$ will bring it across the $y$-axis into Quadrant I, which means both the $x$ and $y$ will be positive. The original point $A$ was at $(-4, 1)$ so the final image will be at $(1, 4)$. We also could solve this problem by seeing that the slope of the segment from the origin to $A$ is $-1/4$. If $A$ is moving to a location that is a $90^{\\circ}$ rotation about the origin, it will move to a point on the segment perpendicular to the one that currently connects it to the origin. This will be the segment that has a slope of 4/1 or $-4/-1$ from the origin which puts us at $(1, 4)$ or $(-1, -4)$. The point $\\boxed{(1, 4)}$ is in the clockwise direction we need.'

In [26]:
import datasets
generated_dataset_level5 = datasets.load_from_disk('data/math_solutions_dataset_20000/')

In [None]:
generated_dataset_level5

[0]

In [15]:
# Reload the dataset to ensure it's loaded correctly
from datasets import load_from_disk

generated_dataset_level5 = load_from_disk('data/math_solutions_dataset_20000/')

# Check the dataset structure
print(f"Dataset type: {type(generated_dataset_level5)}")
print(f"Dataset length: {len(generated_dataset_level5)}")
if hasattr(generated_dataset_level5, 'features'):
    print(f"Dataset features: {generated_dataset_level5.features}")
    print(f"\nFirst example:")
    print(generated_dataset_level5[0])
else:
    print(f"Error: Dataset is not a proper Dataset object. It's a {type(generated_dataset_level5)}")
    print(f"First item: {generated_dataset_level5[0] if len(generated_dataset_level5) > 0 else 'empty'}")


Dataset type: <class 'datasets.arrow_dataset.Dataset'>
Dataset length: 553
Dataset features: {'problem': Value('string'), 'ground_truth': Value('string'), 'solution': Value('string')}

First example:
{'problem': 'Square ABCD has its center at $(8,-8)$ and has an area of 4 square units. The top side of the square is horizontal. The square is then dilated with the dilation center at (0,0) and a scale factor of 2. What are the coordinates of the vertex of the image of square ABCD that is farthest from the origin? Give your answer as an ordered pair.', 'ground_truth': 'With the center of dilation at the origin and a scale factor of 2, all the coordinates of square $ABCD$ are twice the coordinates of its preimage. The preimage has an area of 4 square units, so its side length is 2 units. Since the center of the preimage is at $(8, -8)$, the four vertices of the preimage are at $(7, -9), (7, -7), (9, -7)$ and $(9, -9)$. The point $(9, -9)$ is the farthest from the origin on the preimage, so 

In [None]:
generated_dataset_level5[0]

{'problem': 'Square ABCD has its center at $(8,-8)$ and has an area of 4 square units. The top side of the square is horizontal. The square is then dilated with the dilation center at (0,0) and a scale factor of 2. What are the coordinates of the vertex of the image of square ABCD that is farthest from the origin? Give your answer as an ordered pair.',
 'ground_truth': 'With the center of dilation at the origin and a scale factor of 2, all the coordinates of square $ABCD$ are twice the coordinates of its preimage. The preimage has an area of 4 square units, so its side length is 2 units. Since the center of the preimage is at $(8, -8)$, the four vertices of the preimage are at $(7, -9), (7, -7), (9, -7)$ and $(9, -9)$. The point $(9, -9)$ is the farthest from the origin on the preimage, so the point farthest from the origin on the image of square $ABCD$ is $\\boxed{(18, -18)}.$',
 'solution': "\\begin{intermediatederivation}\nThe square has area \\(4\\), so its side length is \\(s=\\sq

In [7]:
import datasets
dataset = datasets.load_dataset('KbsdJames/Omni-MATH')
#filtered_dataset = datasets.load_from_disk('newopenaioutputs/transformed_solutions_qwen2-math-7b-instruct_filtered')

In [22]:
type(dataset['difficulty'][0])

float

In [23]:
# Filter for Mathematics -> Number Theory problems with difficulty 9
filtered_dataset = dataset['test'].filter(
    lambda x: (
        x.get('difficulty') == 9 and
        any(
            isinstance(domain, str) and domain.startswith('Mathematics -> Number Theory ->')
            for domain in (x.get('domain') or [])
        )
    )
)

print(f"Total problems in test set: {len(dataset['test'])}")
print(f"Filtered problems (Number Theory, difficulty 9): {len(filtered_dataset)}")
print(f"\nFirst few domains in filtered set:")
for i in range(min(5, len(filtered_dataset))):
    print(f"  {filtered_dataset[i]['domain']}")

# Store the filtered dataset for use in other cells
number_theory_level9 = filtered_dataset

Filter: 100%|██████████| 4428/4428 [00:00<00:00, 136176.11 examples/s]

Total problems in test set: 4428
Filtered problems (Number Theory, difficulty 9): 57

First few domains in filtered set:
  ['Mathematics -> Number Theory -> Prime Numbers', 'Mathematics -> Number Theory -> Factorization']
  ['Mathematics -> Number Theory -> Divisibility -> Other', 'Mathematics -> Number Theory -> Greatest Common Divisors (GCD)']
  ['Mathematics -> Number Theory -> Prime Numbers']
  ['Mathematics -> Number Theory -> Factorization']
  ['Mathematics -> Number Theory -> Prime Numbers', 'Mathematics -> Number Theory -> Congruences']





In [None]:
hard_dataset = dataset.filter(lambda x: x.get('difficulty') == 9)
print(len(hard_dataset))



1


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,), eps=1e-06)
    (ro

In [None]:
index = 4
problem_text = number_theory_level9['problem'][index]
#problem_text = "Do you know Fermat's little theorem?"
solution_text = number_theory_level9['solution'][index]
messages = [
    #{"role": "user", "content": f"Solve this geometry problem without using any external tools. Put your solution in \\boxed{...} format.\n\n Here is the problem:\n\n{problem_text}"},
    {"role": "system", "content": """You are a math tutor. Give a complete solution using the environments \\begin{intermediatederivation}...\\end{intermediatederivation} and \\begin{lemmatheorembox}...\\end{lemmatheorembox}, and put the final answer in the format \\boxed{...} at the end."""},
    #{"role": "system", "content": """You are a math tutor. Give a complete solution and put the final answer in the format \\boxed{...}"""}, 
    {"role": "user", "content": f"""This is the problem:{problem_text}\n\nUse Fermat's little theorem to solve it."""},
    {"role": "assistant", "content": f"""{assistant_text}"""},
    {"role": "user", "content": f"""This is the correct solution{solution_text}, can you summarize what you did wrong in your previous attempt and how you can improve? Was there a specific mathematical concept or trick that you should have used, or you were unfamiliar with?\n\n"""}
]

# Make sure model is on the device
#model.to(device)
print('problem is ', problem_text)

problem is  Does there exist $ 2002$ distinct positive integers $ k_1, k_2, \cdots k_{2002}$ such that for any positive integer $ n \geq 2001$, one of $ k_12^n \plus{} 1, k_22^n \plus{} 1, \cdots, k_{2002}2^n \plus{} 1$ is prime?


In [49]:

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(device)

outputs = model.generate(**inputs, max_new_tokens=6144, temperature=0.7)
response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

print("Problem:")
print(problem_text)
print("\n" + "="*80 + "\n")
print("Model Response:")
print(response)

Problem:
Does there exist $ 2002$ distinct positive integers $ k_1, k_2, \cdots k_{2002}$ such that for any positive integer $ n \geq 2001$, one of $ k_12^n \plus{} 1, k_22^n \plus{} 1, \cdots, k_{2002}2^n \plus{} 1$ is prime?


Model Response:
I apologize for the mistake in the initial solution. Let's correct the reasoning step-by-step.

### Correct Solution

We need to determine whether there exist \( 2002 \) distinct positive integers \( k_1, k_2, \ldots, k_{2002} \) such that for any positive integer \( n \geq 2001 \), at least one of \( k_1 2^n + 1, k_2 2^n + 1, \ldots, k_{2002} 2^n + 1 \) is prime.

To address this, we generalize the problem for \( F > 2002 \). Consider choosing \( X = F^F \prod_{i=1}^{F^F} (p_i - 1) \), where \( p_i \) are distinct primes. This choice ensures that \( X \) is extremely large.

By Fermat's Little Theorem, for any prime \( p_i \) dividing \( k_x + 1 \), we have:
\[
2^X k_i + 1 \equiv k_x + 1 \equiv 0 \pmod{p_i}.
\]

Given the size of \( 2^X k_i + 1

In [47]:
assistant_text = """To determine if there exist 2002 distinct positive integers \( k_1, k_2, \ldots, k_{2002} \) such that for any positive integer \( n \geq 2001 \), one of \( k_1 2^n + 1, k_2 2^n + 1, \ldots, k_{2002} 2^n + 1 \) is prime, we can use Fermat's Little Theorem and properties of modular arithmetic.

First, recall Fermat's Little Theorem, which states that if \( p \) is a prime number and \( a \) is an integer not divisible by \( p \), then \( a^{p-1} \equiv 1 \pmod{p} \).

We will use this theorem to analyze the behavior of \( 2^n \) modulo different primes. Specifically, we will consider the sequence \( 2^n \pmod{p} \) for various primes \( p \).

### Step-by-Step Analysis

1. **Choose 2002 distinct primes \( p_1, p_2, \ldots, p_{2002} \).**

2. **Construct the sequence \( k_i 2^n + 1 \) modulo \( p_i \):**
   For each prime \( p_i \), consider the expression \( 2^n \pmod{p_i} \). By Fermat's Little Theorem, \( 2^{p_i-1} \equiv 1 \pmod{p_i} \). This implies that the sequence \( 2^n \) modulo \( p_i \) is periodic with period \( p_i-1 \).

3. **Determine the values of \( 2^n \pmod{p_i} \) for \( n \geq 2001 \):**
   Since \( 2^{p_i-1} \equiv 1 \pmod{p_i} \), for \( n \geq 2001 \), we can write \( n = k(p_i-1) + r \) where \( 0 \leq r < p_i-1 \). Therefore,
   \[
   2^n \equiv 2^{k(p_i-1) + r} \equiv (2^{p_i-1})^k \cdot 2^r \equiv 1^k \cdot 2^r \equiv 2^r \pmod{p_i}.
   \]
   This means that \( 2^n \pmod{p_i} \) cycles through the values \( 2^0, 2^1, 2^2, \ldots, 2^{p_i-2} \) as \( n \) increases.

4. **Ensure \( k_i 2^n + 1 \) is not divisible by \( p_i \):**
   For each \( i \), choose \( k_i \) such that \( k_i 2^r + 1 \not\equiv 0 \pmod{p_i} \) for any \( r \) in the range \( 0 \leq r < p_i-1 \). This is always possible because there are \( p_i-1 \) distinct values \( 2^r \) can take, and \( k_i \) can be chosen from the set of integers modulo \( p_i \) excluding the values that would make \( k_i 2^r + 1 \equiv 0 \pmod{p_i} \).

5. **Verify the construction:**
   For any \( n \geq 2001 \), \( 2^n \pmod{p_i} \) will be one of the values \( 2^0, 2^1, \ldots, 2^{p_i-2} \). Since \( k_i \) was chosen to ensure \( k_i 2^n + 1 \not\equiv 0 \pmod{p_i} \), \( k_i 2^n + 1 \) is not divisible by \( p_i \). Therefore, \( k_i 2^n + 1 \) is not a multiple of \( p_i \) and could be a prime number.

Since we can construct such \( k_i \) for each prime \( p_i \) and ensure that \( k_i 2^n + 1 \) is not divisible by \( p_i \) for any \( n \geq 2001 \), it is indeed possible to find 2002 distinct positive integers \( k_1, k_2, \ldots, k_{2002} \) such that for any positive integer \( n \geq 2001 \), one of \( k_1 2^n + 1, k_2 2^n + 1, \ldots, k_{2002} 2^n + 1 \) is prime.

Thus, the answer is:
\[
\boxed{\text{Yes}}
\]"""

  assistant_text = """To determine if there exist 2002 distinct positive integers \( k_1, k_2, \ldots, k_{2002} \) such that for any positive integer \( n \geq 2001 \), one of \( k_1 2^n + 1, k_2 2^n + 1, \ldots, k_{2002} 2^n + 1 \) is prime, we can use Fermat's Little Theorem and properties of modular arithmetic.


In [37]:
ground_truth = number_theory_level9['solution'][index]
print(ground_truth)


We need to determine whether there exist \( 2002 \) distinct positive integers \( k_1, k_2, \ldots, k_{2002} \) such that for any positive integer \( n \geq 2001 \), at least one of \( k_1 2^n + 1, k_2 2^n + 1, \ldots, k_{2002} 2^n + 1 \) is prime.

To address this, we generalize the problem for \( F > 2002 \). Consider choosing \( X = F^F \prod_{i=1}^{F^F} (p_i - 1) \), where \( p_i \) are distinct primes. This choice ensures that \( X \) is extremely large.

By Fermat's Little Theorem, for any prime \( p_i \) dividing \( k_x + 1 \), we have:
\[
2^X k_i + 1 \equiv k_x + 1 \equiv 0 \pmod{p_i}.
\]

Given the size of \( 2^X k_i + 1 \) being greater than \( p_i \), it follows that \( 2^X k_i + 1 \) is not prime. Thus, no such integers \( k_1, k_2, \ldots, k_{2002} \) exist.

Therefore, the answer is: \boxed{\text{No}}.
