# Backup of Baseline

In [1]:
import os, io
import json
from pathlib import Path
from datetime import datetime
import re, sys, subprocess, gc
import time
import multiprocessing

import pandas as pd
import numpy as np
import torch
import transformers
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from outlines import models, generate, samplers

n_questions = 3
n_samples_per_question = 3
n_reps = 2
code_timeout_secs = 5
temperature = 0.9
top_k = 20

np.random.seed(42)

def free_mem():
    if hasattr(sys, 'last_traceback'):
        sys.last_traceback.tb_next = None
    gc.collect()
    torch.cuda.empty_cache()

def print_cuda_mem():
    total_memory = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
    cached_memory = torch.cuda.memory_reserved(0) / (1024 ** 3)
    available_memory = total_memory - cached_memory 
    print(f"Available Memory (GB, approx.): {available_memory:.2f}")

ds_math_rl = "deepseek-ai/deepseek-math-7b-rl"
torch_dtype = torch.bfloat16
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

def outlines_model(model_id, quantization=True):
    config = AutoConfig.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if quantization:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True
        )
    else:
            quantization_config = None
    model = AutoModelForCausalLM.from_pretrained(model_id,
                                                torch_dtype=torch_dtype,
                                                 attn_implementation="flash_attention_2",
                                                 quantization_config=quantization_config,
                                                 device_map="auto")
    return models.Transformers(model, tokenizer), tokenizer
print_cuda_mem()

Available Memory (GB, approx.): 23.64


In [2]:
q_df = pd.read_csv('./train.csv')
print(q_df.iloc[0]['problem'])
q_df.iloc[0]['answer']
q_df[:3]

Let $k, l > 0$ be parameters. The parabola $y = kx^2 - 2kx + l$ intersects the line $y = 4$ at two points $A$ and $B$. These points are distance 6 apart. What is the sum of the squares of the distances from $A$ and $B$ to the origin?


Unnamed: 0,id,problem,answer
0,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52
1,246d26,Each of the three-digits numbers $111$ to $999...,250
2,2fc4ad,Let the `sparkle' operation on positive intege...,702


In [3]:
from MATH.math_equivalence import is_equiv

# generator = generate.choice(model, ["yes", "no"])
# for i, question in enumerate(questions['problem']):
#     tool_instruction = f"{question}\nCan this problem be expressed as a set of equations and solved by solving the equations?"
#     answers = generator([tool_instruction] * 5, max_tokens=5)
#     free_mem()
#     print(i+1, pd.Series(answers).mode().values, answers)
def read_math_data(folder_path):
    folder = Path(folder_path)
    math_data = []
    for sub_folder in os.listdir(folder):
        if not os.path.isdir(folder/sub_folder):
            continue
        for file_name in os.listdir(folder/sub_folder):
            if file_name.endswith('.json'):
                with open(folder / sub_folder / file_name, 'r') as file:
                    data = json.load(file)
                    data['id'] = file_name[:-5]
                    math_data.append(data)
    return pd.DataFrame(math_data)

math_train = read_math_data('./MATH/train')
math_test = read_math_data('./MATH/test')
print(math_train.shape, math_test.shape)
math_samp = math_test[math_test['level'].isin(['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'])].sample(n_questions)
math_samp.columns

(7500, 5) (5000, 5)


Index(['problem', 'level', 'type', 'solution', 'id'], dtype='object')

In [4]:
def extract_answer(text, boxed_in_tool_instruction=True):
    pattern = r"\\boxed\{((?:[^{}]+|\{[^{}]*\})*)\}"
    matches = re.findall(pattern, text)
    n_matches = 2 if boxed_in_tool_instruction else 1
    
    if len(matches) >= n_matches:
        # there is already an empty match in tool instruction
        return matches[-1] 

    numbers = re.findall(r'[+-]?\d+', text)
    return numbers[-1] if numbers else None

def extract_correct(text):
    return extract_answer(text, False)
        
math_samp['answer'] = math_samp['solution'].map(extract_correct)
math_samp[:3]

Unnamed: 0,problem,level,type,solution,id,answer
1501,A polynomial with integer coefficients is of t...,Level 3,Intermediate Algebra,"By the Integer Root Theorem, the possible inte...",105,"-4,-2,-1,1,2,4"
2586,"If $r$, $s$, and $t$ are constants such that $...",Level 5,Algebra,"First, we should solve for $r$, $s$, and $t$. ...",2283,\frac{1}{9}
2653,A man nearing retirement wants to invest some ...,Level 4,Algebra,"Let $x$ be the amount the man invests now, in ...",1611,74726


- 1 yes
- 2 no
- 3 no
- 4 yes
- 5 maybe
- 6 maybe
- 7 yes
- 8 maybe
- 9 yes
- 10 no

# Inference until code output

In [15]:
ds_math_model, tokenizer = outlines_model(ds_math_rl, quantization=False)
tool_instruction = '\nPlease integrate natural language reasoning with programs to solve the problem above, and put your final answer within \\boxed{}.'
python_code_prefix = "```python\n"
python_default_imports = "from sympy import *\n"
q_prompts = []
q_answers = []
tiling = np.tile(math_samp.index[:n_questions], n_reps * n_samples_per_question)
q_and_res = math_samp[['id', 'problem', 'answer']].loc[tiling]
q_and_res.columns=['problem_id', 'problem', 'correct_answer']
q_and_res['rep_idx'] = np.repeat(range(n_reps * n_samples_per_question), n_questions)
q_and_res = q_and_res.sort_values(['problem_id', 'rep_idx'])

sampler = samplers.multinomial(samples=n_samples_per_question, temperature=temperature, top_k=top_k)
generator = generate.text(ds_math_model, sampler) # generate.choice(model, ["yes", "no"])

def batch_process(in_list, batch_size):
    for i in range(0, len(in_list), batch_size):
        yield in_list[i:i + batch_size]

total_time_sec = 0
total_tokens = 0
for i in range(0, len(q_and_res), n_samples_per_question):
    prompt = q_and_res.iloc[i]['problem'] + tool_instruction
    q_prompts += [prompt] * n_samples_per_question
    messages = [{"role": "user", "content": prompt}]
    m_prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    
    start_time = time.time()
    answers = generator(m_prompt, max_tokens=1600, stop_at='```output')

    if type(answers) == str:
        q_answers.append(answers)
        n_out_tokens = len(tokenizer(answers)['input_ids'])
    else:
        q_answers += answers
        answer_token_lengths = [len(tokenizer(answ)['input_ids']) for answ in answers]
        n_out_tokens = max(answer_token_lengths)
    total_time_sec += time.time() - start_time
    total_tokens += sum(answer_token_lengths)
    print(f'Batch {i//n_samples_per_question}: Processed {n_samples_per_question} prompts with {n_out_tokens} output tokens in {round(time.time() - start_time)}s')
print('--------------------------------')
print(f'Finished in {round(total_time_sec)}s with {round(total_tokens/total_time_sec,1)} tokens/s')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Batch 0: Processed 3 prompts with 161 output tokens in 124s
Batch 3: Processed 3 prompts with 170 output tokens in 130s
Batch 6: Processed 3 prompts with 269 output tokens in 207s
Batch 9: Processed 3 prompts with 252 output tokens in 194s
Batch 12: Processed 3 prompts with 295 output tokens in 228s
Batch 15: Processed 3 prompts with 397 output tokens in 307s
----------------
Finished in 1190s with 3.6 tokens/s


In [6]:
def parse_code(output):
    if '```python\n' in output:
        try:
            output = output.split(python_code_prefix)[-1]
        except:
            return None   
    if '```' in output:
        output = output.split('```')[0]
    #output = python_default_imports + output
    return output

def run_with_timeout(code, timeout):
    with open('code.py', 'w') as fout:
        fout.write(code)

    batcmd = f'timeout {timeout} {sys.executable} code.py'
    try:
        shell_output = subprocess.check_output(batcmd, stderr=subprocess.STDOUT, shell=True).decode('utf8')
    except subprocess.CalledProcessError as e:
        shell_output = e.output.decode('utf8')  # Get the output which may include Python error messages
    try:
        code_output = round(float(eval(shell_output))) % 1000
    except Exception as e:
        code_output = shell_output
    os.remove('code.py')
    return code_output

def parse_and_run(output):
    try:
        code = parse_code(output)
    except Exception as ex:
        return str(ex)
    try:
        return run_with_timeout(code, code_timeout_secs)
    except Exception as ex:
        return str(ex)

In [10]:
code_out = []
code_res = []

for i in range(len(q_and_res)):
    code_out.append(parse_code(q_answers[i]))
    code_res.append(parse_and_run(q_answers[i]))
       
q_and_res['iteration'] = 0
q_and_res['first_prompt'] = q_prompts
q_and_res['llm_answer'] = q_answers
q_and_res['code'] = code_out
q_and_res['code_result'] = code_res
q_and_res.to_csv('q_and_res_baseline.csv', index=False)
q_and_res[:2]

Unnamed: 0,problem_id,problem,correct_answer,rep_idx,iteration,llm_answer,code,code_result,first_prompt
1501,105,A polynomial with integer coefficients is of t...,"-4,-2,-1,1,2,4",0,0,```python\nfrom sympy import divisors\n\ndef ...,from sympy import divisors\n\ndef integer_root...,"[1, 2, 4]\n",A polynomial with integer coefficients is of t...
1501,105,A polynomial with integer coefficients is of t...,"-4,-2,-1,1,2,4",1,0,```python\nfrom sympy import divisors\n\ndef ...,from sympy import divisors\n\ndef integer_root...,"[1, 2, 4]\n",A polynomial with integer coefficients is of t...


In [8]:
q_and_res_copy = q_and_res.copy()

## Continue inference after code output

In [None]:
q_ending = []
start_time = time.time()
for i in range(0, q_and_res):
    row = q_and_res.iloc[i]
    cont_prompt = row['first_prompt'] + row['llm_answer'] + '\n' + str(row['code_result']) + '\n```\n'
    it_start_time = time.time()
    answers = generator(q_prompts, max_tokens=2400)
    if type(answers) == str:
        q_ending.append(answers)
    else:
        q_ending += answers    
    print(f'Processed {len(q_prompts)} prompts in {round(time.time() - it_start_time)}s')
print(f'Finished in {round(time.time() - start_time)}s
q_and_res['q_ending'] = q_ending
q_and_res['combined'] = q_and_res['problem'] + tool_instruction + q_and_res['llm_answer'] + '\n' + q_and_res['code_result'].astype(str) + q_and_res['q_ending'] + '\n```\n'

In [9]:
def is_numeric(var):
    return np.issubdtype(type(var), np.number)

def standardize_number(number):
    if is_numeric(number):
        number = float(number)
        if number.is_integer():
            number = int(number)
    return number

In [13]:
q_and_res['llm_result'] = q_and_res['llm_answer'].map(extract_answer)
q_and_res.loc[q_and_res['code_result'].str.contains('Traceback').fillna(False), 'code_result'] = None
q_and_res.loc[q_and_res['code_result'].str.contains('code.py').fillna(False), 'code_result'] = None
q_and_res['code_result'] = q_and_res['code_result'].str.strip().replace('', np.nan)
q_and_res['combined_result'] = q_and_res['code_result'].fillna(q_and_res['llm_result'])

In [36]:
most_common = q_and_res.groupby('problem_id')['combined_result'].agg(pd.Series.mode)
most_common = most_common.apply(lambda x: x[0] if isinstance(x, np.ndarray) else x)
most_common = pd.DataFrame(most_common).reset_index()

merged = math_samp.merge(most_common, left_on='id', right_on='problem_id')
is_correct = []
for row in merged.iterrows():
    correct = standardize_number(row[1]['answer'])
    result = standardize_number(row[1]['combined_result'])
    is_correct.append(is_equiv(correct, result))
merged['is_correct'] = is_correct 

In [37]:
merged.groupby('level')['is_correct'].mean()

level
Level 4    0.148936
Level 5    0.018868
Name: is_correct, dtype: float64

In [39]:
with pd.option_context('display.max_rows', None):
    display(merged[merged['level'] == 'Level 4'][['answer', 'combined_result', 'is_correct']])

Unnamed: 0,answer,combined_result,is_correct
0,-6,396,False
3,32,18,False
4,36,24,False
7,48,8,False
8,1,1,True
10,\frac{1}{2},500,False
11,-3,0,False
14,5x - 7y + 11z + 4 = 0,1,False
17,11+x,"Mod(x + 11, 1000)",False
18,-376,32,False


Processed 2 prompts in 35s
Processed 2 prompts in 82s
Processed 2 prompts in 60s
Processed 2 prompts in 17s
Processed 2 prompts in 40s
Processed 2 prompts in 57s
Processed 2 prompts in 2s
Processed 2 prompts in 210s
Processed 2 prompts in 4s
Processed 2 prompts in 190s
Processed 2 prompts in 205s
Processed 2 prompts in 24s
Processed 2 prompts in 189s
Processed 2 prompts in 9s
Processed 2 prompts in 5s
Processed 2 prompts in 23s
Processed 2 prompts in 207s
Processed 2 prompts in 62s
Processed 2 prompts in 61s
Processed 2 prompts in 54s
Processed 2 prompts in 2s
Processed 2 prompts in 22s
Processed 2 prompts in 9s
Processed 2 prompts in 216s
Processed 2 prompts in 109s
Processed 2 prompts in 25s
Processed 2 prompts in 42s
Processed 2 prompts in 38s
Processed 2 prompts in 217s
Processed 2 prompts in 42s
Processed 2 prompts in 28s
Processed 2 prompts in 43s
Processed 2 prompts in 30s
Processed 2 prompts in 9s
Processed 2 prompts in 204s
Processed 2 prompts in 98s
Processed 2 prompts in 5s


TypeError: bad operand type for unary +: 'str'

22

In [18]:
q_and_res.to_csv('q_and_res_baseline.csv', index=False)

In [19]:
q_and_res

Unnamed: 0,question_id,problem,correct_answer,rep_idx,iteration,llm_answer,code,code_result,q_ending,combined,answer
3334,1084,"Let $(x,y)$ be an ordered pair of real numbers...",49,0,0,"\n```python\nfrom sympy import symbols, Eq, so...","from sympy import symbols, Eq, solve\n\ndef ma...",49,"The maximum value of $y$ is $49$. However, sin...","Let $(x,y)$ be an ordered pair of real numbers...",49
2841,539,Find the sum of the $x$-coordinates of all pos...,78,0,0,"\n```python\nfrom sympy import symbols, solve,...","from sympy import symbols, solve, Rational\n\n...","Traceback (most recent call last):\n File ""/h...",The solution to the equation $\frac1x+\frac1y=...,Find the sum of the $x$-coordinates of all pos...,1000
4411,1304,Let\n\[\mathbf{a} = \begin{pmatrix} 5 \\ -3 \\...,9,0,0,\n\nThe direction vector of the line containin...,from sympy import Matrix\n\ndef direction_vect...,"Matrix([[-16], [4], [32]])\n",The direction vector of the line containing $\...,Let\n\[\mathbf{a} = \begin{pmatrix} 5 \\ -3 \\...,1000
2072,977,Let $z=a+bi$ be the complex number with $\vert...,5,0,0,"\n```python\nfrom sympy import symbols, I, sqr...","from sympy import symbols, I, sqrt, simplify\n...","Traceback (most recent call last):\n File ""/h...",The `atan` function is not defined in the Symp...,Let $z=a+bi$ be the complex number with $\vert...,4
3963,936,Two jokers are added to a $52$ card deck and t...,3,0,0,\n\nSince the two jokers are added to a 52 car...,"def expected_cards():\n """"""Find the expecte...",52,The expected number of cards that will be stri...,Two jokers are added to a $52$ card deck and t...,52
...,...,...,...,...,...,...,...,...,...,...,...
2090,163,For $1 \leq i \leq 215$ let $a_i = \dfrac{1}{2...,860,2,0,\n```python\nimport numpy as np\nfrom scipy.op...,import numpy as np\nfrom scipy.optimize import...,487,The maximum possible value of $x_2$ modulo 100...,For $1 \leq i \leq 215$ let $a_i = \dfrac{1}{2...,487
1803,911,"For $0 < k < 6,$ the graphs of $\frac{(x - k)^...",5,2,0,"\n```python\nfrom sympy import symbols, solve,...","from sympy import symbols, solve, Rational\n\n...",2015/2 - 3*sqrt(281)/2\n,The value of $k$ such that the square $ABCD$ e...,"For $0 < k < 6,$ the graphs of $\frac{(x - k)^...",2
3545,765,There are 360 people in my school. 15 take ca...,110,2,0,"\n\n```python\nfrom sympy import symbols, Eq, ...","from sympy import symbols, Eq, solve\n\ndef st...",255,The number of students taking physics is $\box...,There are 360 people in my school. 15 take ca...,255
3714,396,We roll a fair 6-sided die 5 times. What is t...,648,2,0,\n\n\n```python\nfrom sympy import binomial\n\...,from sympy import binomial\n\ndef probability_...,1,The probability that we get a 6 in at most 2 o...,We roll a fair 6-sided die 5 times. What is t...,1\%


In [7]:
refine_instruction = "Please check the code above and provide a corrected version below.\n" \
                     + "If the output above shows an error improve the code below to fix the error.\n" \
                     + python_code_prefix + python_default_imports
refine_template = f"""{{question}}
{tool_instruction}
{{answer}}
```output
{{code_results}}
```

{refine_instruction}
"""
print(refine_template)

{question}

Please only provide Python code to solve the problem. The correct answer is an integer.
Unless the problem is explicity in complex numbers create all SymPy symbols with parameter real=True.


{answer}
```output
{code_results}
```

Please check the code above and provide a corrected version below.
If the output above shows an error improve the code below to fix the error.
```python
from sympy import *




In [19]:
refine_model = outlines_model(llama_3)
generator = generate.text(refine_model)

def new_res_row(old_row, refined_code, itr):
    new_row = old_row.copy()
    new_row['iteration'] = itr + 1
    new_row['code'] = parse_code(refined_code)
    new_row['code_result'] = parse_and_run(refined_code)
    return new_row

for itr in range(1):
    prev_results = q_and_res[q_and_res['iteration'] == itr]
    new_results = []
    for batch in batch_process(prev_results, batch_size):
        prompts = []
        for res in batch.iterrows():
            refine_prompt = refine_template.format(question=res[1]['problem'], answer=res[1]['code'], code_results=res[1]['code_result'])
            # print(refine_prompt)
            prompts.append(refine_prompt)
        start_time = time.time()
        refined_res = generator(prompts, max_tokens=3000, stop_at='```output')
        if type(refined_res) == str:
            new_rows = pd.DataFrame([new_res_row(batch.iloc[0])], refined_res, itr)
        else:  
            new_rows = []
            for i, refined_code in enumerate(refined_res):
                new_rows.append(new_res_row(batch.iloc[i], refined_code, itr))
            new_rows = pd.DataFrame(new_rows)  
        q_and_res = pd.concat([q_and_res, new_rows])
        print(f'Processed {len(prompts)} prompts in {round(time.time() - start_time)}s')

del[refine_model]
del[generator]
free_mem()
print_cuda_mem()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Processed 3 prompts in 258s
Processed 3 prompts in 120s
Processed 3 prompts in 157s
Processed 3 prompts in 45s
Available Memory (GB, approx.): 23.62


In [20]:
df_res = pd.DataFrame(q_and_res).sort_values(['question_id', 'rep_idx', 'iteration']).reset_index(drop=True)
df_res

Unnamed: 0,question_id,problem,correct_answer,rep_idx,iteration,code,code_result
0,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,0,0,from sympy import *\n\ndef sum_of_squares_dist...,"Traceback (most recent call last):\n File ""/h..."
1,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,0,1,from sympy import *\ndef sum_of_squares_distan...,"File ""/home/daniel/code/math-ai/code.py"", li..."
2,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,1,0,from sympy import *\n\ndef sum_of_squares_dist...,2*(18*k - l + 4)/k\n
3,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,1,1,from sympy import *\n DEV solution training tu...,"File ""/home/daniel/code/math-ai/code.py"", li..."
4,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,2,0,from sympy import *\nfrom sympy.solvers import...,-(-(k - sqrt(k*(k - l + 4)))/k + (k + sqrt(k*(...
5,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,2,1,"from sympy import *\nx, k, l = symbols('x k l'...",-(-(k - sqrt(k*(k - l + 4)))/k + (k + sqrt(k*(...
6,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,3,0,"from sympy import *\n\nk, l, x, y, a, b = symb...",32 + (k - sqrt(k*(k - l + 4)))**2/k**2 + (k + ...
7,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,3,1,"from sympy import *\nk, l, x, y, a, b = symbol...",32 + (k - sqrt(k*(k - l + 4)))**2/k**2 + (k + ...
8,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,4,0,"from sympy import *\n\nk, l, x = symbols('k l ...","Traceback (most recent call last):\n File ""/h..."
9,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,4,1,from sympy import *\n BLAH BLAH\n\n,"File ""/home/daniel/code/math-ai/code.py"", li..."


In [None]:
print(df_res['code'][4])

In [None]:
print(df_res['code'][2])

In [None]:
print(df_res['code'][3])

In [None]:
print(df_res['code'][4])

In [None]:
questions = pd.read_csv('./test.csv')

In [None]:
def clean_latex(text):
    """
    convert r"[math]\vec{x} + \vec{y}" to English
    """
    # edge case
    text = re.sub(r'\[math\]', ' LaTex math ', text)
    text = re.sub(r'\[\/math\]', ' LaTex math ', text)
    text = re.sub(r'\\', ' LaTex ', text)

    pattern_to_sub = {
        r'$': '', #Add replace $?
        r'\\mathrm': ' LaTex math mode ',
        r'\\mathbb': ' LaTex math mode ',
        r'\\boxed': ' LaTex equation ',
        r'\\begin': ' LaTex equation ', #should it be parenthesis?
        r'\\end': ' LaTex equation ', #should it be parenthesis?
        r'\\left': ' LaTex equation ', #should it be parenthesis?
        r'\\right': ' LaTex equation ', #should it be parenthesis?
        r'\\(over|under)brace': ' LaTex equation ',
        r'\\text': ' LaTex equation ',
        r'\\vec': ' vector ',
        r'\\var': ' variable ',
        r'\\theta': ' theta ',
        r'\\mu': ' average ',
        r'\\min': ' minimum ',
        r'\\max': ' maximum ',
        r'\\sum': ' + ',
        r'\\times': ' * ',
        r'\\cdot': ' * ',
        r'\\hat': ' ^ ',
        r'\\frac': ' / ',
        r'\\div': ' / ',
        r'\\sin': ' Sine ',
        r'\\cos': ' Cosine ',
        r'\\tan': ' Tangent ',
        r'\\infty': ' infinity ',
        r'\\int': ' integer ',
        r'\\in': ' in ',
    }
    # post process for look up
    pattern_dict = {k.strip('\\'): v for k, v in pattern_to_sub.items()}
    # init re
    patterns = pattern_to_sub.keys()
    pattern_re = re.compile('(%s)' % '|'.join(patterns))

    def _replace(match):
        try:
            word = pattern_dict.get(match.group(0).strip('\\'))
        except KeyError:
            word = match.group(0)
            print('!!Error: Could Not Find Key: {}'.format(word))
        return word
    return pattern_re.sub(_replace, text)

In [None]:
def run_python_code(code):
    try:
        with open('code.py', 'w') as fout:
            fout.write('from sympy import * \n')
            fout.write(code)
        batcmd = 'timeout 7 ' + sys.executable + ' code.py'
        return subprocess.check_output(batcmd, stderr=subprocess.STDOUT, shell=True).decode('utf8')
    except subprocess.CalledProcessError as e:
        return e.output.decode('utf8')  # Return the error output
    except Exception as ex:
        return str(ex)


def process_output(output):
    result = output
    try:
        shell_output = run_python_code(output.split('```')[1][7:])
        shell_number = re.findall(r'(-?\d+)(?!.*\d)', shell_output)[-1]
        code_output = round(float(eval(shell_number))) % 1000
    except:
        code_output = -1
    try:
        result_output = re.findall(r'\\boxed\{(.*)\}', result)
        if not len(result_output):
            result_output = re.findall(r'(-?\d+)(?!.*\d)', result)[-1]
        else:
            result_output = result_output[-1]
        if not len(result_output):
            result_output = -1
        else:
            result_output = round(float(eval(result_output))) % 1000
    except:
        result_output = -1
    return result_output, code_output

In [None]:
questions = math_train[math_train['level'] == 'Level 5'].sample(2)
print('\n\n\n'.join(questions['problem'].values))

In [None]:
print('\n-----------------------------------------\n'.join(questions['solution'].values))

In [None]:
question = questions['problem'].iloc[0]
question

In [None]:
!pip install outlines

In [None]:
# tool_instruction = " The answer should be given as a non-negative modulo 1000."

n_repetitions = 3
q_ics = []
raw_results = []
total_results = []
total_answers = []

for q_idx in range(len(questions)):
    print(f"\n\n{q_idx}: {questions['problem'].iloc[q_idx]}")
    results = []
    answers = []
    try:
        combined_messages = None
        for rep_idx in range(n_repetitions):
            print(datetime.now().strftime('%H:%M:%S'))
            if rep_idx > (n_repetitions + 1) / 2:
                problem = clean_latex(questions['problem'].iloc[q_idx])
            else:
                problem = questions['problem'].iloc[q_idx] 
            messages = [{
                "role": "user", 
                "content": 'Problem: ' + problem + "\nGenerate Python code to solve the above problem." +
                '\nUnless complex numbers are mentioned in the problem create SymPy symbols with parameter real=True' +
                    '\n```python\n'
            }]
            query_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
            first_output = pipeline(query_prompt, 
                                    max_new_tokens=1024,
                                    do_sample=True,
                                    temperature=0.9,
                                    return_full_text=False,
                                    n_return_sequences=5
                            )
            first_output = '```python\n' + first_output[0]['generated_text']
            # print('Shell input: \n' + first_output.split('```')[1][7:] + '\n')
            shell_output = run_python_code(first_output.split('```')[1][7:])
            print(f'{shell_output=}\n')
            with_output = first_output.split('```output')[0] + f'```output\n{shell_output}\n```'
            message = {
                "role": "assistant",
                "content": f'Answer {rep_idx}:\n```python\n' + with_output
            }
            messages.append(message)
            #torch.cuda.empty_cache()
            #gc.collect()
            if combined_messages is None:
                combined_messages = messages
            else: 
                combined_messages.append(message)
                
        combined_messages.append({
            "role": "user",
            "content":  "Which is the best of the solutions above?"
                    
        })
        second_prompt = tokenizer.apply_chat_template(combined_messages, tokenize=False).replace('<｜end▁of▁sentence｜>', '')
        # print(f'{second_prompt=}\n')
        second_output = pipeline(second_prompt, max_new_tokens=1024, do_sample=True, temperature=0.1, return_full_text=False)
        print(f'{second_output=}\n')
        
    except Exception as ex:
        print(ex)
        result_output, code_output = -1, -1 
    raw_results.append(second_output)
    result_output, code_output = process_output(second_output)


    results.append(result_output)
    answers.append(code_output)
    q_ics.append(q_idx)
total_results.append(results)
total_answers.append(answers)

In [None]:
from sympy import *

a = symbols('a')


In [None]:
# pd.DataFrame({
#     'question': np.repeat(questions.values, n_repetitions),
#     'sympy_answers': total_answers,
#     'llm_answers': total_results,
#     'raw_answers': raw_results
# })

In [None]:
print(len(np.repeat(questions['problem'].values, n_repetitions)))
print(len(raw_results))
print(len(total_results))
print(len(total_answers))

In [None]:
questions['problem'].iloc[7]

In [None]:
total_results

In [None]:
total_answers

In [None]:
questions['solution'].values

In [None]:
propm_prefix = '''Problem:
Find the domain of the expression $\frac{\sqrt{x-2}}{\sqrt{5-x}}$.}

Solution:
The expressions inside each square root must be non-negative. Therefore,
$x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator
cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of
the expression is $\boxed{[2,5)}$.
Final Answer: The final answer is $[2,5)$. I hope it is correct.

Problem:
If $\det \mathbf{A} = 2$ and $\det \mathbf{B} = 12,$ then find
$\det (\mathbf{A} \mathbf{B}).$
Solution:
We have that $\det (\mathbf{A} \mathbf{B}) = (\det \mathbf{A})(\det \mathbf{B})
= (2)(12) = \boxed{24}.$
Final Answer: The final answer is $24$. I hope it is correct.
Problem:
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound
weights instead, how many times must Terrell lift them in order to lift the
same total weight?

Solution:
If Terrell lifts two 20-pound weights 12 times, he lifts a total of
$2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound
weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$
pounds of weight. Equating this to 480 pounds, we can solve for $n$:
\begin{align*}
30n&=480\\
\Rightarrow\qquad n&=480/30=\boxed{16}
\end{align*}
Final Answer: The final answer is $16$. I hope it is correct.

Problem:
If the system of equations
\begin{align*}
6x-4y&=a,\\
6y-9x &=b.
\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero,
find $\frac{a}{b},$ assuming $b$ is nonzero.

Solution:
If we multiply the first equation by $-\frac{3}{2}$, we obtain
$$6y-9x=-\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have
$$-\frac{3}{2}a=b\Rightarrow\frac{a}{b}=\boxed{-\frac{2}{3}}.$$
Final Answer: The final answer is $-\frac{2}{3}$. I hope it is correct.

Problem:
'''

question_easy = 'Beth bakes 4, 2 dozen batches of cokies in a week. If these cookies are shared amongst 16 people equally, how many cookies does each person consume?'
question_hard = 'Each of the three-digits numbers $111$ to $999$ is coloured blue or yellow in such a way that the sum of any two (not necessarily different) yellow numbers is equal to a blue number. What is the maximum possible number of yellow numbers there can be?'
postfix = '\nPlease reason step by step, and put your final answer within \boxed{}.'
postfix_2 =  '\nPlease convert this problem into a set of equations'
tokens = tokenizer.encode(question_hard + postfix_2, return_tensors='pt').to('cuda')
out_tokens = model.generate(tokens, max_length=500)
print(tokenizer.decode(out_tokens[0]))

In [None]:
from sympy import symbols, Rational, floor, sqrt
from sympy import solve, And, Le, Ge
def solve_for_prob():
    x = symbols('x')
    # define the polynomial P(x)
    P = x**2 - 3*x - 9
    # define the condition for the problem
    condition = floor(sqrt(P.subs(x, x))) == sqrt(P.subs(x, floor(x)))
    # define the interval
    interval = And(5 <= x, x <= 15)
    # calculate the number of values of x that satisfy the condition in the interval
    valid_count = 0    
    for i in range(5, 16):
        if condition.subs(x, i): 
            valid_count += 1
        # calculate the total number of values of x in the interval
        total_count = 15 - 5 + 1
        # calculate the probability
        probability = Rational(valid_count, total_count)
        # calculate a, b, c, d, e for the fraction form of the probability
        a = 1
        b = 1
        c = 1
        d = -1
        e = 1
        
solve_for_prob()

In [None]:
˘..