# Performance Test - Deep Seek

In [1]:
import os
import json
import time
from pathlib import Path
from datetime import datetime
import re, sys, subprocess, gc

import pandas as pd
import numpy as np
import torch
import transformers
from transformers import AutoConfig, GenerationConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "deepseek-ai/deepseek-math-7b-rl"
config = AutoConfig.from_pretrained(model_name)
gen_config = GenerationConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

torch_dtype = torch.bfloat16
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
import sys, gc
import torch

def free_mem():
    if hasattr(sys, 'last_traceback'):
        sys.last_traceback.tb_next = None
    gc.collect()
    torch.cuda.empty_cache()
free_mem()

def print_cuda_memory():
    total_memory = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
    cached_memory = torch.cuda.memory_reserved(0) / (1024 ** 3)
    available_memory = total_memory - cached_memory 
    print(f"Available Memory (GB, approx.): {available_memory:.2f}")
print_cuda_memory()

Available Memory (GB, approx.): 23.65


In [3]:
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             torch_dtype=torch_dtype,
                                             # quantization_config=quantization_config,
                                             device_map="auto")
print_cuda_memory()
print(model.dtype)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Available Memory (GB, approx.): 10.72
torch.bfloat16


In [4]:
questions = pd.read_csv('./train.csv')

In [12]:
tokenizer_max_length = 512
max_length = 1600


tool_instruction = " The answer should be given as a non-negative modulo 1000."
tool_instruction += '\nPlease integrate natural language reasoning with programs to solve the problem above, and put your final answer within \\boxed{}.'

messages = [{
   "role": "user", 
   "content": questions.iloc[0]['problem'] + tool_instruction
}]
query_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False
)
start_time = time.time()
tokenized = tokenizer(query_prompt, 
                      return_tensors="pt",
                      truncation=True,
                      max_length=tokenizer_max_length)
input_length = len(tokenized['input_ids'][0])
print(f'{input_length=}')
tokenized = {key: value.to('cuda') for key, value in tokenized.items()}
raw_output = model.generate(**tokenized, 
                            max_length=max_length,
                            do_sample=True,
                            temperature=0.9,
                            num_return_sequences=8,
                            generation_config=gen_config)
out_list = []
for i in range(len(raw_output)):
    print(f'{len(raw_output[i][input_length:])=}')
    out_list.append(tokenizer.decode(raw_output[i][input_length:], skip_special_tokens=True))
print(time.time() - start_time)

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


input_length=125


OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 23.65 GiB of which 49.56 MiB is free. Including non-PyTorch memory, this process has 23.56 GiB memory in use. Of the allocated memory 22.74 GiB is allocated by PyTorch, and 385.29 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [10]:
print(out_list)

['```python\nfrom sympy import symbols, Eq, solve, sqrt\n\ndef distance_to_origin():\n    """Let $k, l > 0$ be parameters. The parabola $y = kx^2 - 2kx + l$ intersects the line $y = 4$ at two points $A$ and $B$. These points are distance 6 apart. What is the sum of the squares of the distances from $A$ and $B$ to the origin? The answer should be given as a non-negative modulo 1000."""\n    # Define symbols\n    x, k, l = symbols(\'x k l\')\n\n    # Set up the equation of the parabola and the line\n    eq1 = Eq(k*x**2 - 2*k*x + l, 4)\n\n    # Solve the equation for x\n    solutions = solve(eq1, x)\n\n    # Calculate the distance from each solution to the origin\n    distance_A = sqrt(solutions[0]**2)\n    distance_B = sqrt(solutions[1]**2)\n\n    # Calculate the sum of the squares of the distances\n    result = (distance_A**2 + distance_B**2) % 1000\n\n    return result\n\nresult = distance_to_origin()\nprint(result)\n```\n```output\nMod(k**2*(2*l/k - 1)**2 + k**2*(2*l/k + 1)**2, 1000)\

### Test Results
| n Input tokens | n Output tokens  | dtype  | quantization | double quant | eyeball quality | batch size | time | token/s | 
|----------------|------------------|--------|--------------|--------------|-----------------|------------|------|---------|
| 125            | 1155             |bfloat16| 4-bit        | yes          | good            | 8          |99    | 129     |
| 125            | 1155             |bfloat16| 4-bit        | yes          | good            | 1          |36    | 32      |
| 125            | 1155             |bfloat16| 4-bit        | no           | good            | 1          |36    | 39      |
| 125            | 1155             |bfloat16| 8-bit        | yes          | good            | 1          |102   | 11      |
| 125            | 1155             |bfloat16| 8-bit        | no           | good            | 1          |102   | 11      |
| 125            | OOM              |bfloat16| no           | no           | good            | 8          |OOM   | OOM     |
| 125            | 1475             |bfloat16| no           | no           | good            | 4          |53    | 111     |
| 125            | 1475             |bfloat16| no           | no           | good            | 2          |48    | 61      |
| 125            | 1155             |bfloat16| no           | no           | good            | 1          |36    | 32      |
| 125            | 908              | float16| no           | no           | good            | 1          |446   | 2       |
| 125            | 552              | float32| no           | no           | good            | 1          |      |         |

111.32075471698113

In [60]:
# tool_instruction = " The answer should be given as a non-negative modulo 1000."

n_repetitions = 3
q_ics = []
raw_results = []
total_results = []
total_answers = []

for q_idx in range(len(questions)):
    print(f"\n\n{q_idx}: {questions['problem'].iloc[q_idx]}")
    results = []
    answers = []
    try:
        combined_messages = None
        for rep_idx in range(n_repetitions):
            print(datetime.now().strftime('%H:%M:%S'))
            if rep_idx > (n_repetitions + 1) / 2:
                problem = clean_latex(questions['problem'].iloc[q_idx])
            else:
                problem = questions['problem'].iloc[q_idx] 
            messages = [{
                "role": "user", 
                "content": 'Problem: ' + problem + "\nGenerate Python code to solve the above problem." +
                '\nUnless complex numbers are mentioned in the problem create SymPy symbols with parameter real=True' +
                    '\n```python\n'
            }]
            query_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
            first_output = pipeline(query_prompt, max_new_tokens=1024, do_sample=True, temperature=0.9, return_full_text=False)
            first_output = '```python\n' + first_output[0]['generated_text']
            # print('Shell input: \n' + first_output.split('```')[1][7:] + '\n')
            shell_output = run_python_code(first_output.split('```')[1][7:])
            print(f'{shell_output=}\n')
            with_output = first_output.split('```output')[0] + f'```output\n{shell_output}\n```'
            message = {
                "role": "assistant",
                "content": f'Answer {rep_idx}:\n```python\n' + with_output
            }
            messages.append(message)
            #torch.cuda.empty_cache()
            #gc.collect()
            if combined_messages is None:
                combined_messages = messages
            else: 
                combined_messages.append(message)
                
        combined_messages.append({
            "role": "user",
            "content": ""
                    
        })
        second_prompt = tokenizer.apply_chat_template(combined_messages, tokenize=False).replace('<｜end▁of▁sentence｜>', '')
        # print(f'{second_prompt=}\n')
        second_output = pipeline(second_prompt, max_new_tokens=1024, do_sample=True, temperature=0.1, return_full_text=False)
        print(f'{second_output=}\n')
        
    except Exception as ex:
        print(ex)
        result_output, code_output = -1, -1 
    raw_results.append(second_output)
    result_output, code_output = process_output(second_output)


    results.append(result_output)
    answers.append(code_output)
    q_ics.append(q_idx)
total_results.append(results)
total_answers.append(answers)

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




0: The graph of the equation $9x+223y=2007$ is drawn on graph paper with each square representing one unit in each direction. How many of the $1$ by $1$ graph paper squares have interiors lying entirely below the graph and entirely in the first quadrant?

18:19:20


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


shell_output='Traceback (most recent call last):\n  File "/home/daniel/code/math-ai/code.py", line 21, in <module>\n    result = count_squares()\n             ^^^^^^^^^^^^^^^\n  File "/home/daniel/code/math-ai/code.py", line 11, in count_squares\n    y_intercept = solve(x, y)[0].evalf()\n                  ~~~~~~~~~~~^^^\nIndexError: list index out of range\n'

18:19:54


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


shell_output='888\n'

18:20:03


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


shell_output='0\n'



Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


second_output=[{'generated_text': '```python\ndef graph_paper_squares():\n    """The graph of the equation $9x+223y=2007$ is drawn on graph paper with each square representing one unit in each direction. How many of the $1$ by $1$ graph paper squares have interiors lying entirely below the graph and entirely in the first quadrant?\n"""\n    # Define the variables\n    x, y = symbols(\'x y\', real=True)\n    \n    # Solve the equation for y\n    y_expr = solve(9*x + 223*y - 2007, y)[0]\n\n    # Find the integer values of x for which y is a positive integer\n    count = 0\n    for i in range(1, 2017):\n        if i % 9 == 0:\n            x_val = i / 9\n            y_val = y_expr.subs(x, x_val)\n            if y_val.is_positive and y_val == int(y_val):\n                count += 1\n\n    return count\n\nnumber_of_squares = graph_paper_squares()\nprint(number_of_squares)\n```\n```output\n0\n```\nThe number of 1 by 1 graph paper squares that have interiors lying entirely below the graph and 

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


shell_output='2*sqrt(5) + 13*sqrt(170) + 28*sqrt(785)\n'

18:21:03


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


shell_output='  File "/home/daniel/code/math-ai/code.py", line 2\n    Find the distance of the four points to the focus of the parabola y = x^2.\n         ^^^\nSyntaxError: invalid syntax\n'

18:21:17


--- Logging error ---
Traceback (most recent call last):
  File "/home/daniel/mambaforge/envs/pytorch/lib/python3.11/logging/__init__.py", line 1110, in emit
    msg = self.format(record)
          ^^^^^^^^^^^^^^^^^^^
  File "/home/daniel/mambaforge/envs/pytorch/lib/python3.11/logging/__init__.py", line 953, in format
    return fmt.format(record)
           ^^^^^^^^^^^^^^^^^^
  File "/home/daniel/mambaforge/envs/pytorch/lib/python3.11/logging/__init__.py", line 687, in format
    record.message = record.getMessage()
                     ^^^^^^^^^^^^^^^^^^^
  File "/home/daniel/mambaforge/envs/pytorch/lib/python3.11/logging/__init__.py", line 377, in getMessage
    msg = msg % self.args
          ~~~~^~~~~~~~~~~
TypeError: not all arguments converted during string formatting
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/daniel/mambaforge/envs/pytorch/lib/python3.11/site-packages/ipykernel_launcher.py",

shell_output='957.750000000000\n'

second_output=[{'generated_text': '```python\nfrom sympy import symbols, solve, N\n\ndef sum_of_distances_intersection_points():\n    """Problem: Let $\\mathcal{P}$ be the parabola in the plane determined by the equation $y = x^2.$  Suppose a circle $\\mathcal{C}$ intersects $\\mathcal{P}$ at four distinct points.  If three of these points are $(-28,784),$ $(-2,4),$ and $(13,169),$ find the sum of the distances from the focus of $\\mathcal{P}$ to all four of the intersection points.\n"""\n    x, y = symbols(\'x y\', real=True)\n\n    # The focus of the parabola y = x^2 is at (0, 1/4)\n    focus_x, focus_y = 0, 1/4\n\n    # The given intersection points\n    points = [(-28, 784), (-2, 4), (13, 169)]\n\n    # The sum of the distances from the focus to the intersection points\n    sum_of_distances = 0\n    for point in points:\n        # Calculate the distance using the distance formula\n        sum_of_distances += ((focus_x - point[0])**2 + (focus_y - p

In [13]:
from sympy import symbols, solve, Eq
def solve_triangle():
    # Define the variables
    a, b, c, pa, cp = symbols('a b c pa cp')
    # From the angle bisector theorem, we have: a/b = (c+a)/(2c)
    # Simplify this to get the ratio of a to b\n 
    ratio = solve(Eq(a/b, (c+a)/(2*c)), a/b)[0]
    # Given that M is the midpoint of AD, we have: pa = 2*cp
    # Substitute this into the ratio to get the ratio of cp to pa
    ratio = ratio.subs(a, 2*cp)
    # Simplify the ratio
    ratio = ratio.simplify()
    # The problem asks for the sum of m and n in the fraction m,
    # where m and n are relatively prime positive integers.
    # The ratio we found is in the form m/n. So, m = cp and n = pa.
    m, n = ratio.as_numer_denom()
    # Calculate m+n\n 
    sum_mn = m + n
    return sum_mn
result = solve_triangle()
print(result)

3*c + 2*cp


In [None]:
# pd.DataFrame({
#     'question': np.repeat(questions.values, n_repetitions),
#     'sympy_answers': total_answers,
#     'llm_answers': total_results,
#     'raw_answers': raw_results
# })

In [54]:
print(len(np.repeat(questions['problem'].values, n_repetitions)))
print(len(raw_results))
print(len(total_results))
print(len(total_answers))

75
25
8
8


In [65]:
questions['problem'].iloc[7]

'When the expression $4(x^2-2x+2)-7(x^3-3x+1)$ is fully simplified, what is the sum of the squares of the coefficients of the terms?'

In [19]:
total_results

[[-1, -1, -1], [-1, 2, -1], [-1, 1, 1]]

In [20]:
total_answers

[[-1, 995, 995], [2, 2, -1], [-1, 2, 2]]

In [22]:
questions['solution'].values

array(['In general, $\\mathbf{M} \\begin{pmatrix} 1 \\\\ 0 \\end{pmatrix}$ is the first column of $\\mathbf{M}$, and $\\mathbf{M} \\begin{pmatrix} 0 \\\\ 1 \\end{pmatrix}$ is the second column of $\\mathbf{M}.$\n\nTaking $\\mathbf{v} = \\begin{pmatrix} 1 \\\\ 0 \\end{pmatrix},$ we get\n\\[-5 \\begin{pmatrix} 1 \\\\ 0 \\end{pmatrix} = \\begin{pmatrix} -5 \\\\ 0 \\end{pmatrix}.\\]Taking $\\mathbf{v} = \\begin{pmatrix} 0 \\\\ 1 \\end{pmatrix},$ we get\n\\[-5 \\begin{pmatrix} 0 \\\\ 1 \\end{pmatrix} = \\begin{pmatrix} 0 \\\\ -5 \\end{pmatrix}.\\]Therefore,\n\\[\\mathbf{M} = \\boxed{\\begin{pmatrix} -5 & 0 \\\\ 0 & -5 \\end{pmatrix}}.\\]',
       'We have that\n\\begin{align*}\nak^3 + bk^2 + ck + d &= 0, \\\\\nbk^3 + ck^2 + dk + a &= 0.\n\\end{align*}Multiplying the first equation by $k,$ we get\n\\[ak^4 + bk^3 + ck^2 + dk = 0.\\]Subtracting the equation $bk^3 + ck^2 + dk + a = 0,$ we get $ak^4 = a.$  Since $a$ is nonzero, $k^4 = 1.$  Then $k^4 - 1 = 0,$ which factors as\n\\[(k - 1)(k + 1)(

In [7]:
propm_prefix = '''Problem:
Find the domain of the expression $\frac{\sqrt{x-2}}{\sqrt{5-x}}$.}

Solution:
The expressions inside each square root must be non-negative. Therefore,
$x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator
cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of
the expression is $\boxed{[2,5)}$.
Final Answer: The final answer is $[2,5)$. I hope it is correct.

Problem:
If $\det \mathbf{A} = 2$ and $\det \mathbf{B} = 12,$ then find
$\det (\mathbf{A} \mathbf{B}).$
Solution:
We have that $\det (\mathbf{A} \mathbf{B}) = (\det \mathbf{A})(\det \mathbf{B})
= (2)(12) = \boxed{24}.$
Final Answer: The final answer is $24$. I hope it is correct.
Problem:
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound
weights instead, how many times must Terrell lift them in order to lift the
same total weight?

Solution:
If Terrell lifts two 20-pound weights 12 times, he lifts a total of
$2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound
weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$
pounds of weight. Equating this to 480 pounds, we can solve for $n$:
\begin{align*}
30n&=480\\
\Rightarrow\qquad n&=480/30=\boxed{16}
\end{align*}
Final Answer: The final answer is $16$. I hope it is correct.

Problem:
If the system of equations
\begin{align*}
6x-4y&=a,\\
6y-9x &=b.
\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero,
find $\frac{a}{b},$ assuming $b$ is nonzero.

Solution:
If we multiply the first equation by $-\frac{3}{2}$, we obtain
$$6y-9x=-\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have
$$-\frac{3}{2}a=b\Rightarrow\frac{a}{b}=\boxed{-\frac{2}{3}}.$$
Final Answer: The final answer is $-\frac{2}{3}$. I hope it is correct.

Problem:
'''

question_easy = 'Beth bakes 4, 2 dozen batches of cokies in a week. If these cookies are shared amongst 16 people equally, how many cookies does each person consume?'
question_hard = 'Each of the three-digits numbers $111$ to $999$ is coloured blue or yellow in such a way that the sum of any two (not necessarily different) yellow numbers is equal to a blue number. What is the maximum possible number of yellow numbers there can be?'
postfix = '\nPlease reason step by step, and put your final answer within \boxed{}.'
postfix_2 =  '\nPlease convert this problem into a set of equations'
tokens = tokenizer.encode(question_hard + postfix_2, return_tensors='pt').to('cuda')
out_tokens = model.generate(tokens, max_length=500)
print(tokenizer.decode(out_tokens[0]))

In [34]:
from sympy import symbols, Rational, floor, sqrt
from sympy import solve, And, Le, Ge
def solve_for_prob():
    x = symbols('x')
    # define the polynomial P(x)
    P = x**2 - 3*x - 9
    # define the condition for the problem
    condition = floor(sqrt(P.subs(x, x))) == sqrt(P.subs(x, floor(x)))
    # define the interval
    interval = And(5 <= x, x <= 15)
    # calculate the number of values of x that satisfy the condition in the interval
    valid_count = 0    
    for i in range(5, 16):
        if condition.subs(x, i): 
            valid_count += 1
        # calculate the total number of values of x in the interval
        total_count = 15 - 5 + 1
        # calculate the probability
        probability = Rational(valid_count, total_count)
        # calculate a, b, c, d, e for the fraction form of the probability
        a = 1
        b = 1
        c = 1
        d = -1
        e = 1
        
solve_for_prob()

AttributeError: 'bool' object has no attribute 'subs'

In [None]:
˘..