# Inject Code Results DeepSeek
- vLLM in fp16

In [1]:
import os, io
import json
from pathlib import Path
from datetime import datetime
import re, sys, subprocess, gc
import time
import multiprocessing

import pandas as pd
import numpy as np
import torch
from vllm import LLM, SamplingParams

n_questions = 250
n_reps = 5
code_timeout_secs = 2
temperature = 0.9

np.random.seed(42)

def free_mem():
    if hasattr(sys, 'last_traceback'):
        sys.last_traceback.tb_next = None
    gc.collect()
    torch.cuda.empty_cache()

def print_cuda_mem():
    total_memory = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
    cached_memory = torch.cuda.memory_reserved(0) / (1024 ** 3)
    available_memory = total_memory - cached_memory 
    print(f"Available Memory (GB, approx.): {available_memory:.2f}")

ds_math_rl = "deepseek-ai/deepseek-math-7b-rl"
torch_dtype = torch.bfloat16
# to avoid warning when spawing processes to evaluate code later
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
print_cuda_mem()

if not 'vllm' in globals():
    vllm = LLM(model=ds_math_rl,
              dtype='half',
              enforce_eager=True,
              gpu_memory_utilization=0.99,
              swap_space=4,
              max_model_len=2048,
              kv_cache_dtype="fp8_e5m2",
              tensor_parallel_size=1)
vtokenizer = vllm.get_tokenizer()

Available Memory (GB, approx.): 23.64
INFO 04-25 12:29:26 config.py:381] Using fp8_e5m2 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. But it may cause slight accuracy drop. Currently we only support fp8 without scaling factors and make e5m2 as a default format.
INFO 04-25 12:29:26 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='deepseek-ai/deepseek-math-7b-rl', tokenizer='deepseek-ai/deepseek-math-7b-rl', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=True, kv_cache_dtype=fp8_e5m2, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 04-25 12:29:27 selector.py:51] Cannot use FlashAttention because the package is not found. Please install it for better performance.
INFO 04-25 12:29:27 selector.py:25] Using XFormers backend.
INFO 04-25 12:29:27 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 04-25 12:29:31 model_runner.py:104] Loading model weights took 12.8725 GB
INFO 04-25 12:29:31 gpu_executor.py:94] # GPU blocks: 2569, # CPU blocks: 1092


In [2]:
q_df = pd.read_csv('./train.csv')
print(q_df.iloc[0]['problem'])
q_df.iloc[0]['answer']
q_df[:3]

Let $k, l > 0$ be parameters. The parabola $y = kx^2 - 2kx + l$ intersects the line $y = 4$ at two points $A$ and $B$. These points are distance 6 apart. What is the sum of the squares of the distances from $A$ and $B$ to the origin?


Unnamed: 0,id,problem,answer
0,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52
1,246d26,Each of the three-digits numbers $111$ to $999...,250
2,2fc4ad,Let the `sparkle' operation on positive intege...,702


In [3]:
from MATH.math_equivalence import is_equiv
def extract_answer(text, boxed_in_tool_instruction=True):
    pattern = r"\\boxed\{([^}]*)\}"
    matches = re.findall(pattern, text)
    n_matches = 2 if boxed_in_tool_instruction else 1
    
    if len(matches) >= n_matches:
        try:
            return int(matches[-1]) % 1000
        except ValueError:
            return None

    numbers = re.findall(r'[+-]?\d+', text)
    return int(numbers[-1]) % 1000 if numbers else -1

def extract_correct(text):
    return extract_answer(text, False)

def read_math_data(folder_path):
    folder = Path(folder_path)
    math_data = []
    for sub_folder in os.listdir(folder):
        if not os.path.isdir(folder/sub_folder):
            continue
        for file_name in os.listdir(folder/sub_folder):
            if file_name.endswith('.json'):
                with open(folder / sub_folder / file_name, 'r') as file:
                    data = json.load(file)
                    data['id'] = file_name[:-5]
                    math_data.append(data)
    return pd.DataFrame(math_data)

math_train = read_math_data('./MATH/train')
math_train['answer'] = math_train['solution'].map(extract_correct)
math_train = math_train[math_train['answer'].notnull()]
math_train['answer'] = math_train['answer'].astype(int)
math_test = read_math_data('./MATH/test')
math_test['answer'] = math_test['solution'].map(extract_correct)
math_test = math_test[math_test['answer'].notnull()]
math_test['answer'] = math_test['answer'].astype(int)
print(math_train.shape, math_test.shape)

(4748, 6) (3119, 6)


- 1 yes
- 2 no
- 3 no
- 4 yes
- 5 maybe
- 6 maybe
- 7 yes
- 8 maybe
- 9 yes
- 10 no

In [4]:
math_samp = math_test[math_test['level'].isin(['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'])].sample(n_questions)
math_samp.columns

Index(['problem', 'level', 'type', 'solution', 'id', 'answer'], dtype='object')

In [5]:
# with pd.option_context('display.max_rows', None):
display(math_samp)

Unnamed: 0,problem,level,type,solution,id,answer
2857,Given $2^a = 32$ and $a^b = 125$ find $b^a$.,Level 1,Algebra,We note that $32 = 2 \cdot 2\cdot 2\cdot 2\cdo...,756,243
3540,How many distinct three-letter sequences with ...,Level 5,Counting & Probability,We solve by casework.\n\n$\bullet$ Case I: Exa...,381,48
3975,Compute $\cos 180^\circ$.,Level 1,Precalculus,"Rotating the point $(1,0)$ about the origin by...",1282,999
3264,"In a $100$ meter track event, Alice runs at a ...",Level 1,Algebra,Alice took $25 - 5 = 20$ seconds to complete t...,623,10
533,Points earned on equally weighted tests are 70...,Level 1,Prealgebra,We take the average of these six scores: \begi...,1083,80
...,...,...,...,...,...,...
2044,"Let $p(x)$ be a monic, quartic polynomial, suc...",Level 5,Intermediate Algebra,Let $q(x) = p(x) - (x^2 + 2).$ Then $q(1) = q...,1877,112
761,If a stack of eight quarters is exactly half a...,Level 3,Prealgebra,"There are $12$ inches in a foot, so we can con...",1800,192
4650,"Consider the rectangle with vertices at $(5,4)...",Level 4,Geometry,Points with integer coordinates are called lat...,1097,63
1940,"At a competition with $N$ players, the number ...",Level 3,Intermediate Algebra,We start with $ 2^{1+\lfloor\log_{2}(N-1)\rflo...,1669,154


# Batched Inference

In [6]:
tool_instruction = " The answer should be given as a non-negative modulo 1000."
tool_instruction += '\nPlease integrate natural language reasoning with programs to solve the problem above, and put your final answer within \\boxed{}.'
python_code_prefix = "```python\n"
python_default_imports = "from sympy import *\n"


batched_prompts = []
out_rows = []
for rep in range(n_reps):
    for i, problem in enumerate(math_samp['problem']):
        prompt = problem + tool_instruction
        messages = [{"role": "user", "content": prompt}]
        m_prompt = vtokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        batched_prompts.append(m_prompt)
        out_rows.append({
            'problem_id': math_samp.iloc[i]['id'],
            'problem': math_samp.iloc[i]['problem'],
            'correct_answer': math_samp.iloc[i]['answer'],
            'iteration': 0,
            'rep': rep,
            'level': math_samp.iloc[i]['level'],
            'type': math_samp.iloc[i]['type'],
            'prompt': m_prompt
        })

tool_sampling_params = SamplingParams(n=1,
                                      temperature=temperature,
                                      max_tokens=1400,
                                      stop='output')
before_output = vllm.generate(batched_prompts, tool_sampling_params)

n_out_tokens = 0
for i, output in enumerate(before_output):
    out_rows[i]['before_output'] = output.outputs[0].text
    out_rows[i]['n_before_output'] = len(output.outputs[0].token_ids)

df_res = pd.DataFrame(out_rows)
df_res[:2]

Processed prompts: 100%|██████████| 1250/1250 [02:04<00:00, 10.01it/s]


Unnamed: 0,problem_id,problem,correct_answer,iteration,rep,level,type,prompt,out_before_code,n_before_code_tokens
0,756,Given $2^a = 32$ and $a^b = 125$ find $b^a$.,243,0,0,Level 1,Algebra,<｜begin▁of▁sentence｜>User: Given $2^a = 32$ an...,"```python\nfrom sympy import symbols, Eq, sol...",208
1,381,How many distinct three-letter sequences with ...,48,0,0,Level 5,Counting & Probability,<｜begin▁of▁sentence｜>User: How many distinct t...,```python\nfrom sympy import factorial\n\ndef...,248


In [8]:
def parse_code(output):
    if '```python\n' in output:
        try:
            output = output.split(python_code_prefix)[-1]
        except:
            return None   
    if '```' in output:
        output = output.split('```')[0]
    #output = python_default_imports + output
    return output

def run_with_timeout(code, timeout):
    def repl(match):
        if "real" not in match.group():
            return "{}{}".format(match.group()[:-1], ', real=True)')
        else:
            return "{}{}".format(match.group()[:-1], ')')
    code = re.sub(r"symbols\([^)]+\)", repl, code)
    code = 'from sympy import *\n' + code
    
    #ToDo: check whether sometimes there is a print of the corect result before an exception
    
    with open('code.py', 'w') as fout:
        fout.write(code)

    batcmd = f'timeout {timeout} {sys.executable} code.py'
    try:
        shell_output = subprocess.check_output(batcmd, stderr=subprocess.STDOUT, shell=True).decode('utf8')
    except subprocess.CalledProcessError as e:
        shell_output = e.output.decode('utf8')  # Get the output which may include Python error messages
    try:
        code_output = round(float(eval(shell_output))) % 1000  # <----------   !!!!!!!!!!!!   modulo important for competition
    except Exception as e:
        code_output = shell_output
    os.remove('code.py')
    return code_output

def parse_and_run(output):
    try:
        code = parse_code(output)
    except Exception as ex:
        return str(ex)
    try:
        return run_with_timeout(code, code_timeout_secs)
    except Exception as ex:
        return str(ex)

In [None]:
df_res

Unnamed: 0,problem_id,problem,correct_answer,iteration,rep,level,type,prompt,out_before_code,n_before_code_tokens,before_output
0,756,Given $2^a = 32$ and $a^b = 125$ find $b^a$.,243,0,0,Level 1,Algebra,<｜begin▁of▁sentence｜>User: Given $2^a = 32$ an...,"```python\nfrom sympy import symbols, Eq, sol...",208,"```python\nfrom sympy import symbols, Eq, sol..."
1,381,How many distinct three-letter sequences with ...,48,0,0,Level 5,Counting & Probability,<｜begin▁of▁sentence｜>User: How many distinct t...,```python\nfrom sympy import factorial\n\ndef...,248,```python\nfrom sympy import factorial\n\ndef...
2,1282,Compute $\cos 180^\circ$.,999,0,0,Level 1,Precalculus,<｜begin▁of▁sentence｜>User: Compute $\cos 180^\...,"```python\nfrom sympy import cos, rad\n\ndef ...",73,"```python\nfrom sympy import cos, rad\n\ndef ..."
3,623,"In a $100$ meter track event, Alice runs at a ...",10,0,0,Level 1,Algebra,<｜begin▁of▁sentence｜>User: In a $100$ meter tr...,```python\nfrom sympy import Rational\n\ndef ...,222,```python\nfrom sympy import Rational\n\ndef ...
4,1083,Points earned on equally weighted tests are 70...,80,0,0,Level 1,Prealgebra,<｜begin▁of▁sentence｜>User: Points earned on eq...,```python\nfrom statistics import mean\n\ndef...,126,```python\nfrom statistics import mean\n\ndef...
...,...,...,...,...,...,...,...,...,...,...,...
1245,1877,"Let $p(x)$ be a monic, quartic polynomial, suc...",112,0,4,Level 5,Intermediate Algebra,<｜begin▁of▁sentence｜>User: Let $p(x)$ be a mon...,"```python\nfrom sympy import symbols, solve\n...",371,"```python\nfrom sympy import symbols, solve\n..."
1246,1800,If a stack of eight quarters is exactly half a...,192,0,4,Level 3,Prealgebra,<｜begin▁of▁sentence｜>User: If a stack of eight...,"```python\ndef number_of_quarters():\n """"""...",205,"```python\ndef number_of_quarters():\n """"""..."
1247,1097,"Consider the rectangle with vertices at $(5,4)...",63,0,4,Level 4,Geometry,<｜begin▁of▁sentence｜>User: Consider the rectan...,```python\ndef count_interior_points():\n ...,246,```python\ndef count_interior_points():\n ...
1248,1669,"At a competition with $N$ players, the number ...",154,0,4,Level 3,Intermediate Algebra,<｜begin▁of▁sentence｜>User: At a competition wi...,"```python\nfrom sympy import symbols, Eq, log...",288,"```python\nfrom sympy import symbols, Eq, log..."


In [14]:
parsed_code = []
code_results = []
for i in range(len(df_res)):
    parsed_code.append(parse_code(df_res.iloc[i]['before_output']))
    code_results.append(parse_and_run(df_res.iloc[i]['before_output']))
df_res['code'] = parsed_code
df_res['code_result'] = code_results
df_res[:3]

Unnamed: 0,problem_id,problem,correct_answer,iteration,rep,level,type,prompt,out_before_code,n_before_code_tokens,before_output,code,code_result
0,756,Given $2^a = 32$ and $a^b = 125$ find $b^a$.,243,0,0,Level 1,Algebra,<｜begin▁of▁sentence｜>User: Given $2^a = 32$ an...,"```python\nfrom sympy import symbols, Eq, sol...",208,"```python\nfrom sympy import symbols, Eq, sol...","from sympy import symbols, Eq, solve, log\n\nd...","Mod(log(125)**5/log(a)**5, 1000)\n"
1,381,How many distinct three-letter sequences with ...,48,0,0,Level 5,Counting & Probability,<｜begin▁of▁sentence｜>User: How many distinct t...,```python\nfrom sympy import factorial\n\ndef...,248,```python\nfrom sympy import factorial\n\ndef...,from sympy import factorial\n\ndef distinct_se...,60
2,1282,Compute $\cos 180^\circ$.,999,0,0,Level 1,Precalculus,<｜begin▁of▁sentence｜>User: Compute $\cos 180^\...,"```python\nfrom sympy import cos, rad\n\ndef ...",73,"```python\nfrom sympy import cos, rad\n\ndef ...","from sympy import cos, rad\n\ndef compute_cos_...",999


In [15]:
df_res['continue_prompt'] = df_res['prompt'] + df_res['before_output'] + 'output\n' + df_res['code_result'].astype(str) + '\n```\n'
tool_sampling_params = SamplingParams(n=1,
                                      temperature=0.9,
                                      max_tokens=2048)

llm_out = vllm.generate(df_res['continue_prompt'], tool_sampling_params)

continuations = []
for llm_output in llm_out:
    continuations.append(llm_output.outputs[0].text)
    
df_res['continuations'] = continuations
df_res['combined'] = df_res['continue_prompt'] + df_res['continuations']

Token indices sequence length is longer than the specified maximum sequence length for this model (47136 > 4096). Running this sequence through the model will result in indexing errors
Processed prompts:   0%|          | 0/1250 [00:00<?, ?it/s]



Processed prompts:  20%|█▉        | 248/1250 [00:31<03:42,  4.49it/s]



Processed prompts:  39%|███▉      | 489/1250 [01:32<01:41,  7.48it/s]



Processed prompts:  60%|██████    | 751/1250 [02:52<01:45,  4.74it/s]



Processed prompts:  80%|████████  | 1002/1250 [04:08<00:32,  7.65it/s]



Processed prompts: 100%|██████████| 1250/1250 [05:22<00:00,  3.88it/s]


In [16]:
# df_res.to_csv('baseline_results_april_23.csv', index=False)
# df_res = pd.read_csv('baseline_results_april_23.csv')
# df_res['problem_id'] = df_res['problem_id'].astype(str)

In [17]:
def is_numeric(var):
    return np.issubdtype(type(var), np.number)

def standardize_number(number):
    try:
        return int(number)
    except:pass
    try:
        return float(number)
    except: pass
    return -1

In [156]:
df_res['llm_result'] = df_res['combined'].map(extract_answer)
df_res['code_result'] = pd.to_numeric(df_res['code_result'], errors='coerce')
df_res['code_result'] = df_res['llm_result'].replace(-1, None)
df_res['combined_result'] = df_res['code_result'].fillna(df_res['llm_result'])

In [164]:
def first_mode(series):
    return series.mode().iloc[0] if not series.mode().empty else None

most_common = df_res.groupby('problem_id')[['llm_result', 'code_result', 'combined_result']].agg(first_mode)
for col in ['llm_result', 'code_result', 'combined_result']:
    most_common[col] = most_common[col].apply(lambda x: x[0] if isinstance(x, np.ndarray) and len(x) >= 1 else x)
most_common = pd.DataFrame(most_common).reset_index()

merged = math_samp.merge(most_common, left_on='id', right_on='problem_id')
is_correct = []
for row in merged.iterrows():
    correct = standardize_number(row[1]['answer'])
    result = standardize_number(row[1]['llm_result'])
    is_correct.append(is_equiv(correct, result))
merged['is_correct'] = is_correct

In [165]:
print(merged['is_correct'].mean())
merged.groupby('level')['is_correct'].agg(['size', 'mean'])

0.636


Unnamed: 0_level_0,size,mean
level,Unnamed: 1_level_1,Unnamed: 2_level_1
Level 1,25,0.88
Level 2,47,0.744681
Level 3,65,0.769231
Level 4,60,0.55
Level 5,53,0.358491


In [166]:
df_res.to_csv('inject_code_results_april_25.csv', index=False)

In [None]:
with pd.option_context('display.max_rows', None):
    display(merged[['level', 'type', 'answer', 'llm_result', 'code_result', 'combined_result', 'is_correct']])