# DeepSeek Baseline
- vLLM in fp16
- only one inference round with multiple self-consistency answers per question

In [4]:
import os, io
import json
from pathlib import Path
from datetime import datetime
import re, sys, subprocess, gc
import time
import multiprocessing

import pandas as pd
import numpy as np
import torch
from vllm import LLM, SamplingParams

n_questions = 250
n_reps = 5
code_timeout_secs = 5
temperature = 0.9

np.random.seed(42)

def free_mem():
    if hasattr(sys, 'last_traceback'):
        sys.last_traceback.tb_next = None
    gc.collect()
    torch.cuda.empty_cache()

def print_cuda_mem():
    total_memory = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
    cached_memory = torch.cuda.memory_reserved(0) / (1024 ** 3)
    available_memory = total_memory - cached_memory 
    print(f"Available Memory (GB, approx.): {available_memory:.2f}")

ds_math_rl = "deepseek-ai/deepseek-math-7b-rl"
torch_dtype = torch.bfloat16
# to avoid warning when spawing processes to evaluate code later
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
print_cuda_mem()


vllm = LLM(model=ds_math_rl,
          dtype='half',
          enforce_eager=True,
          gpu_memory_utilization=0.99,
          swap_space=4,
          max_model_len=2048,
          kv_cache_dtype="fp8_e5m2",
          tensor_parallel_size=1)
vtokenizer = vllm.get_tokenizer()

Available Memory (GB, approx.): 23.64
INFO 04-23 23:37:15 config.py:381] Using fp8_e5m2 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. But it may cause slight accuracy drop. Currently we only support fp8 without scaling factors and make e5m2 as a default format.
INFO 04-23 23:37:15 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='deepseek-ai/deepseek-math-7b-rl', tokenizer='deepseek-ai/deepseek-math-7b-rl', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=True, kv_cache_dtype=fp8_e5m2, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 04-23 23:37:16 selector.py:51] Cannot use FlashAttention because the package is not found. Please install it for better performance.
INFO 04-23 23:37:16 selector.py:25] Using XFormers backend.


RuntimeError: CUDA error: out of memory
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [2]:
q_df = pd.read_csv('./train.csv')
print(q_df.iloc[0]['problem'])
q_df.iloc[0]['answer']
q_df[:3]

Let $k, l > 0$ be parameters. The parabola $y = kx^2 - 2kx + l$ intersects the line $y = 4$ at two points $A$ and $B$. These points are distance 6 apart. What is the sum of the squares of the distances from $A$ and $B$ to the origin?


Unnamed: 0,id,problem,answer
0,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52
1,246d26,Each of the three-digits numbers $111$ to $999...,250
2,2fc4ad,Let the `sparkle' operation on positive intege...,702


In [8]:
from MATH.math_equivalence import is_equiv
def extract_answer(text, boxed_in_tool_instruction=True):
    # designed to allow two-level deep recursion for curly braces inside of curly braces. with unbounded recursing there was catastrophic backtracking.
    pattern = r"\\boxed\{([^{}]*(?:\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}[^{}]*)*)\}"
    matches = re.findall(pattern, text)
    n_matches = 2 if boxed_in_tool_instruction else 1
    
    if len(matches) >= n_matches:
        # there is already an empty match in tool instruction
        return matches[-1] 
    numbers = re.findall(r'[+-]?\d+', text)
    return numbers[-1] if numbers else None

def extract_correct(text):
    return extract_answer(text, False)

def read_math_data(folder_path):
    folder = Path(folder_path)
    math_data = []
    for sub_folder in os.listdir(folder):
        if not os.path.isdir(folder/sub_folder):
            continue
        for file_name in os.listdir(folder/sub_folder):
            if file_name.endswith('.json'):
                with open(folder / sub_folder / file_name, 'r') as file:
                    data = json.load(file)
                    data['id'] = file_name[:-5]
                    math_data.append(data)
    return pd.DataFrame(math_data)

math_train = read_math_data('./MATH/train')
math_train['answer'] = math_train['solution'].map(extract_correct)
math_test = read_math_data('./MATH/test')
math_test['answer'] = math_test['solution'].map(extract_correct)
print(math_train.shape, math_test.shape)

(7500, 6) (5000, 6)


- 1 yes
- 2 no
- 3 no
- 4 yes
- 5 maybe
- 6 maybe
- 7 yes
- 8 maybe
- 9 yes
- 10 no

In [9]:
math_test_int.shape

NameError: name 'math_test_int' is not defined

In [10]:
math_test_int = math_test[math_test['answer'].apply(lambda x: x.isdigit())].copy()
math_test_int['answer'] = math_test_int['answer'].astype(int) % 1000

math_samp = math_test_int[math_test_int['level'].isin(['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'])].sample(n_questions)
math_samp.columns

Index(['problem', 'level', 'type', 'solution', 'id', 'answer'], dtype='object')

In [11]:
# with pd.option_context('display.max_rows', None):
display(math_samp)

Unnamed: 0,problem,level,type,solution,id,answer
696,Elodie is putting on a fashion show and has fi...,Level 5,Prealgebra,"Since two of the outfits are ruined, we only h...",1183,60
2228,Consider the graph of $$\frac{x^3-x^2+x}{6x^2-...,Level 3,Intermediate Algebra,We can factor the numerator and denominator to...,962,7
3720,"An ant is walking on a hexagon. At each step, ...",Level 4,Counting & Probability,There are two ways in which the ant can return...,41,0
2497,A standard deck of playing cards with $26$ red...,Level 5,Algebra,Let pile $A$ have $r_A$ red cards and $b_A$ bl...,1834,22
1499,"A cubic polynomial $f$ satisfies $f(0)=0, f(1)...",Level 4,Intermediate Algebra,Let $g(x) = f(x) - x.$ Then $g(x)$ is a cubic...,570,15
...,...,...,...,...,...,...
2525,"On the Cartesian plane, the midpoint between t...",Level 5,Algebra,"Before moving, the midpoint (in terms of $a$, ...",2714,10
816,How many sides would there be in a convex poly...,Level 5,Prealgebra,The sum of the interior angles in any $n$-side...,1251,8
1802,"Let $x,$ $y,$ $z$ be positive real numbers. F...",Level 5,Intermediate Algebra,We write\n\begin{align*}\n\frac{(1 + 5z)(4z + ...,1146,120
2807,"You connect the points (0, 0) and (9, 6) with ...",Level 3,Algebra,Since we move $\frac{1}{3}$ of the way along t...,1586,5


# Batched Inference

In [None]:
tool_instruction = " The answer should be given as a non-negative modulo 1000."
tool_instruction += '\nPlease integrate natural language reasoning with programs to solve the problem above, and put your final answer within \\boxed{}.'
python_code_prefix = "```python\n"
python_default_imports = "from sympy import *\n"

tool_sampling_params = SamplingParams(n=n_reps,
                                      temperature=0.9,
                                      max_tokens=2048)
                                      #stop='output')
total_time_sec = 0
total_tokens = 0
out_rows = []  
                                      
for i, problem in enumerate(math_samp['problem']):
    prompt = problem + tool_instruction
    messages = [{"role": "user", "content": prompt}]
    m_prompt = vtokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    
    start_time = time.time()
    llm_out = vllm.generate(m_prompt, tool_sampling_params)
    
    n_out_tokens = 0
    for output in llm_out[0].outputs:
        out_rows.append({
            'iteration': 0,
            'problem_id': math_samp.iloc[i]['id'],
            'problem': problem,
            'correct_answer': math_samp.iloc[i]['answer'],
            'prompt': m_prompt,
            'llm_answer': output.text,
            'n_answer_tokens': len(output.token_ids)
        })
        n_out_tokens += len(output.token_ids)                                                                                             
    total_time_sec += time.time() - start_time
    print(f'Batch {i}: Processed {n_reps} reps of prompt with a total of {n_out_tokens} output tokens in {round(time.time() - start_time)}s')
print('--------------------------------')
print(f'Finished in {round(total_time_sec)}s with {round(total_tokens/total_time_sec,1)} tokens/s')

Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.56s/it]


Batch 0: Processed 5 reps of prompt with a total of 1057 output tokens in 5s


Processed prompts: 100%|██████████| 1/1 [00:35<00:00, 35.93s/it]


Batch 1: Processed 5 reps of prompt with a total of 3362 output tokens in 36s


Processed prompts: 100%|██████████| 1/1 [00:08<00:00,  8.90s/it]


Batch 2: Processed 5 reps of prompt with a total of 2255 output tokens in 9s


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.20s/it]


Batch 3: Processed 5 reps of prompt with a total of 782 output tokens in 3s


Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.03s/it]


Batch 4: Processed 5 reps of prompt with a total of 1393 output tokens in 6s


Processed prompts: 100%|██████████| 1/1 [00:33<00:00, 33.72s/it]


Batch 5: Processed 5 reps of prompt with a total of 3883 output tokens in 34s


Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.02s/it]


Batch 6: Processed 5 reps of prompt with a total of 1234 output tokens in 5s


Processed prompts: 100%|██████████| 1/1 [00:19<00:00, 19.58s/it]


Batch 7: Processed 5 reps of prompt with a total of 2656 output tokens in 20s


Processed prompts: 100%|██████████| 1/1 [00:31<00:00, 31.51s/it]


Batch 8: Processed 5 reps of prompt with a total of 3825 output tokens in 32s


Processed prompts: 100%|██████████| 1/1 [00:17<00:00, 17.72s/it]


Batch 9: Processed 5 reps of prompt with a total of 2385 output tokens in 18s


Processed prompts: 100%|██████████| 1/1 [00:33<00:00, 33.11s/it]


Batch 10: Processed 5 reps of prompt with a total of 4642 output tokens in 33s


Processed prompts: 100%|██████████| 1/1 [00:36<00:00, 36.13s/it]


Batch 11: Processed 5 reps of prompt with a total of 3494 output tokens in 36s


Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it]


Batch 12: Processed 5 reps of prompt with a total of 1039 output tokens in 7s


Processed prompts: 100%|██████████| 1/1 [00:12<00:00, 12.97s/it]


Batch 13: Processed 5 reps of prompt with a total of 1765 output tokens in 13s


Processed prompts: 100%|██████████| 1/1 [00:36<00:00, 36.24s/it]


Batch 14: Processed 5 reps of prompt with a total of 3348 output tokens in 36s


Processed prompts: 100%|██████████| 1/1 [00:26<00:00, 26.12s/it]


Batch 15: Processed 5 reps of prompt with a total of 3595 output tokens in 26s


Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.28s/it]


Batch 16: Processed 5 reps of prompt with a total of 1224 output tokens in 5s


Processed prompts: 100%|██████████| 1/1 [00:11<00:00, 11.78s/it]


Batch 17: Processed 5 reps of prompt with a total of 1454 output tokens in 12s


Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.25s/it]


Batch 18: Processed 5 reps of prompt with a total of 1216 output tokens in 7s


Processed prompts: 100%|██████████| 1/1 [00:23<00:00, 23.72s/it]


Batch 19: Processed 5 reps of prompt with a total of 3990 output tokens in 24s


Processed prompts: 100%|██████████| 1/1 [00:36<00:00, 36.99s/it]


Batch 20: Processed 5 reps of prompt with a total of 4261 output tokens in 37s


Processed prompts: 100%|██████████| 1/1 [00:36<00:00, 36.32s/it]


Batch 21: Processed 5 reps of prompt with a total of 2881 output tokens in 36s


Processed prompts: 100%|██████████| 1/1 [00:10<00:00, 10.88s/it]


Batch 22: Processed 5 reps of prompt with a total of 1430 output tokens in 11s


Processed prompts: 100%|██████████| 1/1 [00:35<00:00, 35.38s/it]


Batch 23: Processed 5 reps of prompt with a total of 3297 output tokens in 35s


Processed prompts: 100%|██████████| 1/1 [00:16<00:00, 16.10s/it]


Batch 24: Processed 5 reps of prompt with a total of 2479 output tokens in 16s


Processed prompts: 100%|██████████| 1/1 [00:14<00:00, 14.62s/it]


Batch 25: Processed 5 reps of prompt with a total of 2843 output tokens in 15s


Processed prompts: 100%|██████████| 1/1 [00:10<00:00, 10.68s/it]


Batch 26: Processed 5 reps of prompt with a total of 2266 output tokens in 11s


Processed prompts: 100%|██████████| 1/1 [00:10<00:00, 10.87s/it]


Batch 27: Processed 5 reps of prompt with a total of 2305 output tokens in 11s


Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.98s/it]


Batch 28: Processed 5 reps of prompt with a total of 1421 output tokens in 6s


Processed prompts: 100%|██████████| 1/1 [00:09<00:00,  9.22s/it]


Batch 29: Processed 5 reps of prompt with a total of 2043 output tokens in 9s


Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.45s/it]


Batch 30: Processed 5 reps of prompt with a total of 1221 output tokens in 6s


Processed prompts: 100%|██████████| 1/1 [00:37<00:00, 37.23s/it]


Batch 31: Processed 5 reps of prompt with a total of 4982 output tokens in 37s


Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.17s/it]


Batch 32: Processed 5 reps of prompt with a total of 1581 output tokens in 6s


Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.26s/it]


Batch 33: Processed 5 reps of prompt with a total of 1181 output tokens in 5s


Processed prompts: 100%|██████████| 1/1 [00:32<00:00, 32.55s/it]


Batch 34: Processed 5 reps of prompt with a total of 5014 output tokens in 33s


Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.37s/it]


Batch 35: Processed 5 reps of prompt with a total of 1786 output tokens in 7s


Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.20s/it]


Batch 36: Processed 5 reps of prompt with a total of 1206 output tokens in 5s


Processed prompts: 100%|██████████| 1/1 [00:08<00:00,  8.98s/it]


Batch 37: Processed 5 reps of prompt with a total of 1681 output tokens in 9s


Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.64s/it]


Batch 38: Processed 5 reps of prompt with a total of 1736 output tokens in 7s


Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.19s/it]


Batch 39: Processed 5 reps of prompt with a total of 1288 output tokens in 5s


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.86s/it]


Batch 40: Processed 5 reps of prompt with a total of 942 output tokens in 4s


Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.52s/it]


Batch 41: Processed 5 reps of prompt with a total of 1696 output tokens in 8s


Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.94s/it]


Batch 42: Processed 5 reps of prompt with a total of 1743 output tokens in 8s


Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.67s/it]


Batch 43: Processed 5 reps of prompt with a total of 1435 output tokens in 6s


Processed prompts: 100%|██████████| 1/1 [00:37<00:00, 37.14s/it]


Batch 44: Processed 5 reps of prompt with a total of 6925 output tokens in 37s


Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
def parse_code(output):
    if '```python\n' in output:
        try:
            output = output.split(python_code_prefix)[-1]
        except:
            return None   
    if '```' in output:
        output = output.split('```')[0]
    #output = python_default_imports + output
    return output

def run_with_timeout(code, timeout):
    with open('code.py', 'w') as fout:
        fout.write(code)

    batcmd = f'timeout {timeout} {sys.executable} code.py'
    try:
        shell_output = subprocess.check_output(batcmd, stderr=subprocess.STDOUT, shell=True).decode('utf8')
    except subprocess.CalledProcessError as e:
        shell_output = e.output.decode('utf8')  # Get the output which may include Python error messages
    try:
        code_output = round(float(eval(shell_output))) % 1000  # <----------   !!!!!!!!!!!!   modulo important for competition
    except Exception as e:
        code_output = shell_output
    os.remove('code.py')
    return code_output

def parse_and_run(output):
    try:
        code = parse_code(output)
    except Exception as ex:
        return str(ex)
    try:
        return run_with_timeout(code, code_timeout_secs)
    except Exception as ex:
        return str(ex)

In [None]:
for row in out_rows:
    row['code'] = parse_code(row['llm_answer'])
    row['code_result'] = parse_and_run(row['llm_answer'])

In [32]:
df_res = pd.DataFrame(out_rows)
df_res

Unnamed: 0,iteration,problem_id,problem,correct_answer,prompt,llm_answer,n_answer_tokens,code,code_result
0,0,64,What is the smallest positive integer that sat...,39,<｜begin▁of▁sentence｜>User: What is the smalles...,"```python\nfrom sympy import symbols, mod_inv...",207,"from sympy import symbols, mod_inverse\n\ndef ...",39
1,0,64,What is the smallest positive integer that sat...,39,<｜begin▁of▁sentence｜>User: What is the smalles...,"```python\ndef solve_congruence():\n """"""Wh...",187,"def solve_congruence():\n """"""What is the sm...",39
2,0,64,What is the smallest positive integer that sat...,39,<｜begin▁of▁sentence｜>User: What is the smalles...,"```python\nfrom sympy import symbols, solve, ...",205,"from sympy import symbols, solve, Mod\n\ndef s...",39
3,0,64,What is the smallest positive integer that sat...,39,<｜begin▁of▁sentence｜>User: What is the smalles...,```python\nfrom sympy import mod_inverse\n\nd...,211,from sympy import mod_inverse\n\ndef solve_con...,39
4,0,64,What is the smallest positive integer that sat...,39,<｜begin▁of▁sentence｜>User: What is the smalles...,"```python\nfrom sympy import symbols, solve, ...",247,"from sympy import symbols, solve, Mod\n\ndef s...",39
...,...,...,...,...,...,...,...,...,...
1245,0,1571,Two right triangles have equal areas. The firs...,2,<｜begin▁of▁sentence｜>User: Two right triangles...,"```python\nfrom sympy import symbols, Eq, sol...",257,"from sympy import symbols, Eq, solve\n\ndef ot...",2
1246,0,1571,Two right triangles have equal areas. The firs...,2,<｜begin▁of▁sentence｜>User: Two right triangles...,"```python\nfrom sympy import symbols, solve, ...",295,"from sympy import symbols, solve, sqrt\n\ndef ...",2
1247,0,1571,Two right triangles have equal areas. The firs...,2,<｜begin▁of▁sentence｜>User: Two right triangles...,"```python\nfrom sympy import symbols, solve, ...",354,"from sympy import symbols, solve, sqrt\n\ndef ...",20
1248,0,1571,Two right triangles have equal areas. The firs...,2,<｜begin▁of▁sentence｜>User: Two right triangles...,"```python\nfrom sympy import symbols, Eq, sol...",331,"from sympy import symbols, Eq, solve, Rational...",2


In [97]:
# df_res.to_csv('baseline_results_april_23.csv', index=False)
df_res = pd.read_csv('baseline_results_april_23.csv')
df_res['problem_id'] = df_res['problem_id'].astype(str)

In [98]:
def is_numeric(var):
    return np.issubdtype(type(var), np.number)

def standardize_number(number):
    # ToDo: does not work for strings that are numbers
    if is_numeric(number):
        number = float(number)
        if number.is_integer():
            number = int(number)
    return number

In [106]:
df_res['llm_result'] = df_res['llm_answer'].map(extract_answer)
df_res.loc[df_res['code_result'].str.contains('Traceback').fillna(False), 'code_result'] = None
df_res.loc[df_res['code_result'].str.contains('code.py').fillna(False), 'code_result'] = None
df_res['code_result'] = df_res['code_result'].replace('', np.nan)
df_res['code_result'] = df_res['code_result'].map(standardize_number)
df_res['combined_result'] = df_res['code_result'].fillna(df_res['llm_result'])

In [107]:
most_common = df_res.groupby('problem_id')[['llm_result', 'code_result', 'combined_result']].agg(pd.Series.mode)
for col in ['llm_result', 'code_result', 'combined_result']:
    most_common[col] = most_common[col].apply(lambda x: x[0] if isinstance(x, np.ndarray) and len(x) >= 1 else x)
most_common = pd.DataFrame(most_common).reset_index()

merged = math_samp.merge(most_common, left_on='id', right_on='problem_id')
is_correct = []
for row in merged.iterrows():
    correct = standardize_number(row[1]['answer'])
    result = standardize_number(row[1]['combined_result'])
    is_correct.append(is_equiv(correct, result))
merged['is_correct'] = is_correct 

In [108]:
print(merged['is_correct'].mean())
merged.groupby('level')['is_correct'].agg(['size', 'mean'])

0.0


Unnamed: 0_level_0,size,mean
level,Unnamed: 1_level_1,Unnamed: 2_level_1
Level 1,10,0.0
Level 2,6,0.0
Level 3,11,0.0
Level 4,9,0.0
Level 5,11,0.0


In [None]:
print(is_equiv('x^9', 'x**9'))
print(is_equiv('-11 + 27i', '-11 + 27*I'))
print(is_equiv('\pi', '3.14159265358979'))
print(is_equiv('3 - 2f(x)', '3 - 2*f(x)'))
print(is_equiv('\text{neither}', 'neither'))
print(is_equiv('\sqrt{53}', 'sqrt(53)'))

In [69]:
with pd.option_context('display.max_rows', None):
    display(merged[['level', 'type', 'answer', 'llm_result', 'code_result', 'combined_result', 'is_correct']])

Unnamed: 0,level,type,answer,llm_result,code_result,combined_result,is_correct
0,Level 5,Prealgebra,60,1,1,1,False
1,Level 3,Intermediate Algebra,7,3,5,5,False
2,Level 4,Counting & Probability,0,750,8,8,False
3,Level 3,Intermediate Algebra,10,750,8,8,False
4,Level 5,Algebra,208,160,160,160,False
5,Level 1,Prealgebra,160,160,160,160,False
6,Level 4,Geometry,864,160,160,160,False
7,Level 4,Prealgebra,9,113,9,9,False
8,Level 2,Intermediate Algebra,101,6,100,100,False
9,Level 3,Algebra,8,4,4,4,False


In [12]:
q_ending = []
for batch in batch_process(q_and_res, batch_size // 2):
    q_prompts = []
    for row in batch.iterrows():
        one_q = row[1]['problem'] + tool_instruction + row[1]['llm_answer'] + '\n' + str(row[1]['code_result']) + '\n```\n'
        q_prompts.append(one_q)
    start_time = time.time()
    answers = generator(q_prompts, max_tokens=2400, stop_at='```output')
    if type(answers) == str:
        q_ending.append(answers)
    else:
        q_ending += answers    
    print(f'Processed {len(q_prompts)} prompts in {round(time.time() - start_time)}s')
    
q_and_res['q_ending'] = q_ending
q_and_res['combined'] = q_and_res['problem'] + tool_instruction + q_and_res['llm_answer'] + '\n' + q_and_res['code_result'].astype(str) + q_and_res['q_ending'] + '\n```\n'

Processed 2 prompts in 35s
Processed 2 prompts in 82s
Processed 2 prompts in 60s
Processed 2 prompts in 17s
Processed 2 prompts in 40s
Processed 2 prompts in 57s
Processed 2 prompts in 2s
Processed 2 prompts in 210s
Processed 2 prompts in 4s
Processed 2 prompts in 190s
Processed 2 prompts in 205s
Processed 2 prompts in 24s
Processed 2 prompts in 189s
Processed 2 prompts in 9s
Processed 2 prompts in 5s
Processed 2 prompts in 23s
Processed 2 prompts in 207s
Processed 2 prompts in 62s
Processed 2 prompts in 61s
Processed 2 prompts in 54s
Processed 2 prompts in 2s
Processed 2 prompts in 22s
Processed 2 prompts in 9s
Processed 2 prompts in 216s
Processed 2 prompts in 109s
Processed 2 prompts in 25s
Processed 2 prompts in 42s
Processed 2 prompts in 38s
Processed 2 prompts in 217s
Processed 2 prompts in 42s
Processed 2 prompts in 28s
Processed 2 prompts in 43s
Processed 2 prompts in 30s
Processed 2 prompts in 9s
Processed 2 prompts in 204s
Processed 2 prompts in 98s
Processed 2 prompts in 5s


TypeError: bad operand type for unary +: 'str'

22

In [18]:
q_and_res.to_csv('q_and_res_baseline.csv', index=False)

In [19]:
q_and_res

Unnamed: 0,question_id,problem,correct_answer,rep_idx,iteration,llm_answer,code,code_result,q_ending,combined,answer
3334,1084,"Let $(x,y)$ be an ordered pair of real numbers...",49,0,0,"\n```python\nfrom sympy import symbols, Eq, so...","from sympy import symbols, Eq, solve\n\ndef ma...",49,"The maximum value of $y$ is $49$. However, sin...","Let $(x,y)$ be an ordered pair of real numbers...",49
2841,539,Find the sum of the $x$-coordinates of all pos...,78,0,0,"\n```python\nfrom sympy import symbols, solve,...","from sympy import symbols, solve, Rational\n\n...","Traceback (most recent call last):\n File ""/h...",The solution to the equation $\frac1x+\frac1y=...,Find the sum of the $x$-coordinates of all pos...,1000
4411,1304,Let\n\[\mathbf{a} = \begin{pmatrix} 5 \\ -3 \\...,9,0,0,\n\nThe direction vector of the line containin...,from sympy import Matrix\n\ndef direction_vect...,"Matrix([[-16], [4], [32]])\n",The direction vector of the line containing $\...,Let\n\[\mathbf{a} = \begin{pmatrix} 5 \\ -3 \\...,1000
2072,977,Let $z=a+bi$ be the complex number with $\vert...,5,0,0,"\n```python\nfrom sympy import symbols, I, sqr...","from sympy import symbols, I, sqrt, simplify\n...","Traceback (most recent call last):\n File ""/h...",The `atan` function is not defined in the Symp...,Let $z=a+bi$ be the complex number with $\vert...,4
3963,936,Two jokers are added to a $52$ card deck and t...,3,0,0,\n\nSince the two jokers are added to a 52 car...,"def expected_cards():\n """"""Find the expecte...",52,The expected number of cards that will be stri...,Two jokers are added to a $52$ card deck and t...,52
...,...,...,...,...,...,...,...,...,...,...,...
2090,163,For $1 \leq i \leq 215$ let $a_i = \dfrac{1}{2...,860,2,0,\n```python\nimport numpy as np\nfrom scipy.op...,import numpy as np\nfrom scipy.optimize import...,487,The maximum possible value of $x_2$ modulo 100...,For $1 \leq i \leq 215$ let $a_i = \dfrac{1}{2...,487
1803,911,"For $0 < k < 6,$ the graphs of $\frac{(x - k)^...",5,2,0,"\n```python\nfrom sympy import symbols, solve,...","from sympy import symbols, solve, Rational\n\n...",2015/2 - 3*sqrt(281)/2\n,The value of $k$ such that the square $ABCD$ e...,"For $0 < k < 6,$ the graphs of $\frac{(x - k)^...",2
3545,765,There are 360 people in my school. 15 take ca...,110,2,0,"\n\n```python\nfrom sympy import symbols, Eq, ...","from sympy import symbols, Eq, solve\n\ndef st...",255,The number of students taking physics is $\box...,There are 360 people in my school. 15 take ca...,255
3714,396,We roll a fair 6-sided die 5 times. What is t...,648,2,0,\n\n\n```python\nfrom sympy import binomial\n\...,from sympy import binomial\n\ndef probability_...,1,The probability that we get a 6 in at most 2 o...,We roll a fair 6-sided die 5 times. What is t...,1\%


In [7]:
refine_instruction = "Please check the code above and provide a corrected version below.\n" \
                     + "If the output above shows an error improve the code below to fix the error.\n" \
                     + python_code_prefix + python_default_imports
refine_template = f"""{{question}}
{tool_instruction}
{{answer}}
```output
{{code_results}}
```

{refine_instruction}
"""
print(refine_template)

{question}

Please only provide Python code to solve the problem. The correct answer is an integer.
Unless the problem is explicity in complex numbers create all SymPy symbols with parameter real=True.


{answer}
```output
{code_results}
```

Please check the code above and provide a corrected version below.
If the output above shows an error improve the code below to fix the error.
```python
from sympy import *




In [19]:
refine_model = outlines_model(llama_3)
generator = generate.text(refine_model)

def new_res_row(old_row, refined_code, itr):
    new_row = old_row.copy()
    new_row['iteration'] = itr + 1
    new_row['code'] = parse_code(refined_code)
    new_row['code_result'] = parse_and_run(refined_code)
    return new_row

for itr in range(1):
    prev_results = q_and_res[q_and_res['iteration'] == itr]
    new_results = []
    for batch in batch_process(prev_results, batch_size):
        prompts = []
        for res in batch.iterrows():
            refine_prompt = refine_template.format(question=res[1]['problem'], answer=res[1]['code'], code_results=res[1]['code_result'])
            # print(refine_prompt)
            prompts.append(refine_prompt)
        start_time = time.time()
        refined_res = generator(prompts, max_tokens=3000, stop_at='```output')
        if type(refined_res) == str:
            new_rows = pd.DataFrame([new_res_row(batch.iloc[0])], refined_res, itr)
        else:  
            new_rows = []
            for i, refined_code in enumerate(refined_res):
                new_rows.append(new_res_row(batch.iloc[i], refined_code, itr))
            new_rows = pd.DataFrame(new_rows)  
        q_and_res = pd.concat([q_and_res, new_rows])
        print(f'Processed {len(prompts)} prompts in {round(time.time() - start_time)}s')

del[refine_model]
del[generator]
free_mem()
print_cuda_mem()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Processed 3 prompts in 258s
Processed 3 prompts in 120s
Processed 3 prompts in 157s
Processed 3 prompts in 45s
Available Memory (GB, approx.): 23.62


In [20]:
df_res = pd.DataFrame(q_and_res).sort_values(['question_id', 'rep_idx', 'iteration']).reset_index(drop=True)
df_res

Unnamed: 0,question_id,problem,correct_answer,rep_idx,iteration,code,code_result
0,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,0,0,from sympy import *\n\ndef sum_of_squares_dist...,"Traceback (most recent call last):\n File ""/h..."
1,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,0,1,from sympy import *\ndef sum_of_squares_distan...,"File ""/home/daniel/code/math-ai/code.py"", li..."
2,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,1,0,from sympy import *\n\ndef sum_of_squares_dist...,2*(18*k - l + 4)/k\n
3,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,1,1,from sympy import *\n DEV solution training tu...,"File ""/home/daniel/code/math-ai/code.py"", li..."
4,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,2,0,from sympy import *\nfrom sympy.solvers import...,-(-(k - sqrt(k*(k - l + 4)))/k + (k + sqrt(k*(...
5,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,2,1,"from sympy import *\nx, k, l = symbols('x k l'...",-(-(k - sqrt(k*(k - l + 4)))/k + (k + sqrt(k*(...
6,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,3,0,"from sympy import *\n\nk, l, x, y, a, b = symb...",32 + (k - sqrt(k*(k - l + 4)))**2/k**2 + (k + ...
7,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,3,1,"from sympy import *\nk, l, x, y, a, b = symbol...",32 + (k - sqrt(k*(k - l + 4)))**2/k**2 + (k + ...
8,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,4,0,"from sympy import *\n\nk, l, x = symbols('k l ...","Traceback (most recent call last):\n File ""/h..."
9,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52,4,1,from sympy import *\n BLAH BLAH\n\n,"File ""/home/daniel/code/math-ai/code.py"", li..."


In [None]:
print(df_res['code'][4])

In [None]:
print(df_res['code'][2])

In [None]:
print(df_res['code'][3])

In [None]:
print(df_res['code'][4])

In [None]:
questions = pd.read_csv('./test.csv')

In [None]:
def clean_latex(text):
    """
    convert r"[math]\vec{x} + \vec{y}" to English
    """
    # edge case
    text = re.sub(r'\[math\]', ' LaTex math ', text)
    text = re.sub(r'\[\/math\]', ' LaTex math ', text)
    text = re.sub(r'\\', ' LaTex ', text)

    pattern_to_sub = {
        r'$': '', #Add replace $?
        r'\\mathrm': ' LaTex math mode ',
        r'\\mathbb': ' LaTex math mode ',
        r'\\boxed': ' LaTex equation ',
        r'\\begin': ' LaTex equation ', #should it be parenthesis?
        r'\\end': ' LaTex equation ', #should it be parenthesis?
        r'\\left': ' LaTex equation ', #should it be parenthesis?
        r'\\right': ' LaTex equation ', #should it be parenthesis?
        r'\\(over|under)brace': ' LaTex equation ',
        r'\\text': ' LaTex equation ',
        r'\\vec': ' vector ',
        r'\\var': ' variable ',
        r'\\theta': ' theta ',
        r'\\mu': ' average ',
        r'\\min': ' minimum ',
        r'\\max': ' maximum ',
        r'\\sum': ' + ',
        r'\\times': ' * ',
        r'\\cdot': ' * ',
        r'\\hat': ' ^ ',
        r'\\frac': ' / ',
        r'\\div': ' / ',
        r'\\sin': ' Sine ',
        r'\\cos': ' Cosine ',
        r'\\tan': ' Tangent ',
        r'\\infty': ' infinity ',
        r'\\int': ' integer ',
        r'\\in': ' in ',
    }
    # post process for look up
    pattern_dict = {k.strip('\\'): v for k, v in pattern_to_sub.items()}
    # init re
    patterns = pattern_to_sub.keys()
    pattern_re = re.compile('(%s)' % '|'.join(patterns))

    def _replace(match):
        try:
            word = pattern_dict.get(match.group(0).strip('\\'))
        except KeyError:
            word = match.group(0)
            print('!!Error: Could Not Find Key: {}'.format(word))
        return word
    return pattern_re.sub(_replace, text)

In [None]:
def run_python_code(code):
    try:
        with open('code.py', 'w') as fout:
            fout.write('from sympy import * \n')
            fout.write(code)
        batcmd = 'timeout 7 ' + sys.executable + ' code.py'
        return subprocess.check_output(batcmd, stderr=subprocess.STDOUT, shell=True).decode('utf8')
    except subprocess.CalledProcessError as e:
        return e.output.decode('utf8')  # Return the error output
    except Exception as ex:
        return str(ex)


def process_output(output):
    result = output
    try:
        shell_output = run_python_code(output.split('```')[1][7:])
        shell_number = re.findall(r'(-?\d+)(?!.*\d)', shell_output)[-1]
        code_output = round(float(eval(shell_number))) % 1000
    except:
        code_output = -1
    try:
        result_output = re.findall(r'\\boxed\{(.*)\}', result)
        if not len(result_output):
            result_output = re.findall(r'(-?\d+)(?!.*\d)', result)[-1]
        else:
            result_output = result_output[-1]
        if not len(result_output):
            result_output = -1
        else:
            result_output = round(float(eval(result_output))) % 1000
    except:
        result_output = -1
    return result_output, code_output

In [None]:
questions = math_train[math_train['level'] == 'Level 5'].sample(2)
print('\n\n\n'.join(questions['problem'].values))

In [None]:
print('\n-----------------------------------------\n'.join(questions['solution'].values))

In [None]:
question = questions['problem'].iloc[0]
question

In [None]:
!pip install outlines

In [None]:
# tool_instruction = " The answer should be given as a non-negative modulo 1000."

n_repetitions = 3
q_ics = []
raw_results = []
total_results = []
total_answers = []

for q_idx in range(len(questions)):
    print(f"\n\n{q_idx}: {questions['problem'].iloc[q_idx]}")
    results = []
    answers = []
    try:
        combined_messages = None
        for rep_idx in range(n_repetitions):
            print(datetime.now().strftime('%H:%M:%S'))
            if rep_idx > (n_repetitions + 1) / 2:
                problem = clean_latex(questions['problem'].iloc[q_idx])
            else:
                problem = questions['problem'].iloc[q_idx] 
            messages = [{
                "role": "user", 
                "content": 'Problem: ' + problem + "\nGenerate Python code to solve the above problem." +
                '\nUnless complex numbers are mentioned in the problem create SymPy symbols with parameter real=True' +
                    '\n```python\n'
            }]
            query_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
            first_output = pipeline(query_prompt, 
                                    max_new_tokens=1024,
                                    do_sample=True,
                                    temperature=0.9,
                                    return_full_text=False,
                                    n_return_sequences=5
                            )
            first_output = '```python\n' + first_output[0]['generated_text']
            # print('Shell input: \n' + first_output.split('```')[1][7:] + '\n')
            shell_output = run_python_code(first_output.split('```')[1][7:])
            print(f'{shell_output=}\n')
            with_output = first_output.split('```output')[0] + f'```output\n{shell_output}\n```'
            message = {
                "role": "assistant",
                "content": f'Answer {rep_idx}:\n```python\n' + with_output
            }
            messages.append(message)
            #torch.cuda.empty_cache()
            #gc.collect()
            if combined_messages is None:
                combined_messages = messages
            else: 
                combined_messages.append(message)
                
        combined_messages.append({
            "role": "user",
            "content":  "Which is the best of the solutions above?"
                    
        })
        second_prompt = tokenizer.apply_chat_template(combined_messages, tokenize=False).replace('<｜end▁of▁sentence｜>', '')
        # print(f'{second_prompt=}\n')
        second_output = pipeline(second_prompt, max_new_tokens=1024, do_sample=True, temperature=0.1, return_full_text=False)
        print(f'{second_output=}\n')
        
    except Exception as ex:
        print(ex)
        result_output, code_output = -1, -1 
    raw_results.append(second_output)
    result_output, code_output = process_output(second_output)


    results.append(result_output)
    answers.append(code_output)
    q_ics.append(q_idx)
total_results.append(results)
total_answers.append(answers)

In [None]:
from sympy import *

a = symbols('a')


In [None]:
# pd.DataFrame({
#     'question': np.repeat(questions.values, n_repetitions),
#     'sympy_answers': total_answers,
#     'llm_answers': total_results,
#     'raw_answers': raw_results
# })

In [None]:
print(len(np.repeat(questions['problem'].values, n_repetitions)))
print(len(raw_results))
print(len(total_results))
print(len(total_answers))

In [None]:
questions['problem'].iloc[7]

In [None]:
total_results

In [None]:
total_answers

In [None]:
questions['solution'].values

In [None]:
propm_prefix = '''Problem:
Find the domain of the expression $\frac{\sqrt{x-2}}{\sqrt{5-x}}$.}

Solution:
The expressions inside each square root must be non-negative. Therefore,
$x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator
cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of
the expression is $\boxed{[2,5)}$.
Final Answer: The final answer is $[2,5)$. I hope it is correct.

Problem:
If $\det \mathbf{A} = 2$ and $\det \mathbf{B} = 12,$ then find
$\det (\mathbf{A} \mathbf{B}).$
Solution:
We have that $\det (\mathbf{A} \mathbf{B}) = (\det \mathbf{A})(\det \mathbf{B})
= (2)(12) = \boxed{24}.$
Final Answer: The final answer is $24$. I hope it is correct.
Problem:
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound
weights instead, how many times must Terrell lift them in order to lift the
same total weight?

Solution:
If Terrell lifts two 20-pound weights 12 times, he lifts a total of
$2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound
weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$
pounds of weight. Equating this to 480 pounds, we can solve for $n$:
\begin{align*}
30n&=480\\
\Rightarrow\qquad n&=480/30=\boxed{16}
\end{align*}
Final Answer: The final answer is $16$. I hope it is correct.

Problem:
If the system of equations
\begin{align*}
6x-4y&=a,\\
6y-9x &=b.
\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero,
find $\frac{a}{b},$ assuming $b$ is nonzero.

Solution:
If we multiply the first equation by $-\frac{3}{2}$, we obtain
$$6y-9x=-\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have
$$-\frac{3}{2}a=b\Rightarrow\frac{a}{b}=\boxed{-\frac{2}{3}}.$$
Final Answer: The final answer is $-\frac{2}{3}$. I hope it is correct.

Problem:
'''

question_easy = 'Beth bakes 4, 2 dozen batches of cokies in a week. If these cookies are shared amongst 16 people equally, how many cookies does each person consume?'
question_hard = 'Each of the three-digits numbers $111$ to $999$ is coloured blue or yellow in such a way that the sum of any two (not necessarily different) yellow numbers is equal to a blue number. What is the maximum possible number of yellow numbers there can be?'
postfix = '\nPlease reason step by step, and put your final answer within \boxed{}.'
postfix_2 =  '\nPlease convert this problem into a set of equations'
tokens = tokenizer.encode(question_hard + postfix_2, return_tensors='pt').to('cuda')
out_tokens = model.generate(tokens, max_length=500)
print(tokenizer.decode(out_tokens[0]))

In [None]:
from sympy import symbols, Rational, floor, sqrt
from sympy import solve, And, Le, Ge
def solve_for_prob():
    x = symbols('x')
    # define the polynomial P(x)
    P = x**2 - 3*x - 9
    # define the condition for the problem
    condition = floor(sqrt(P.subs(x, x))) == sqrt(P.subs(x, floor(x)))
    # define the interval
    interval = And(5 <= x, x <= 15)
    # calculate the number of values of x that satisfy the condition in the interval
    valid_count = 0    
    for i in range(5, 16):
        if condition.subs(x, i): 
            valid_count += 1
        # calculate the total number of values of x in the interval
        total_count = 15 - 5 + 1
        # calculate the probability
        probability = Rational(valid_count, total_count)
        # calculate a, b, c, d, e for the fraction form of the probability
        a = 1
        b = 1
        c = 1
        d = -1
        e = 1
        
solve_for_prob()

In [None]:
˘..