In [1]:
!pip install -r requirements.txt

### Imports

In [38]:
import torch
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from datasets import load_dataset

import numpy as np
import pandas as pd
from tqdm import tqdm
import re, sys, subprocess, gc
from collections import Counter, defaultdict

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import transformers

### Quantization

In [3]:
# https://huggingface.co/docs/peft/main/en/developer_guides/quantization

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [72]:
# Load in the model
model_id = "nvidia/OpenMath-Mistral-7B-v0.1-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
if torch.cuda.is_available():
    print("cuda")
    model = AutoModelForCausalLM.from_pretrained(model_id, 
                                                device_map="cuda:0", 
                                                quantization_config=quantization_config, 
                                                pad_token_id=tokenizer.eos_token_id)
elif torch.backends.mps.is_available():
    print("mps")
    model = AutoModelForCausalLM.from_pretrained(model_id,
                                                 device_map="mps",
                                                 pad_token_id=tokenizer.eos_token_id)
else:
    print("cpu")
    model = AutoModelForCausalLM.from_pretrained(model_id,
                                                 pad_token_id=tokenizer.eos_token_id)

mps


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [76]:
model.eval()
print(tokenizer.eos_token_id)

2


In [77]:
pipeline = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype='auto', device_map="auto")

In [7]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token
tokenizer.padding_side = "left"

In [8]:
#model = prepare_model_for_kbit_training(quantized_model)

In [9]:
# lora_config = LoraConfig(
#     r=16,
#     lora_alpha=8,
#     target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

In [10]:
# model = get_peft_model(model, lora_config)

### Datahandling

In [125]:
from datasets import load_dataset
import random
from format_data import get_data_SFTTrainer
from rag import get_RAG_context
# from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from sklearn.feature_extraction.text import TfidfVectorizer

In [126]:
train_dataset = get_data_SFTTrainer('data/math/merged_math_problems_train_clean.json')
random.shuffle(train_dataset)

split = 0.8
train_dataset = train_dataset[:int(split*len(train_dataset))]
val_dataset = train_dataset[int(split*len(train_dataset)):]
test_dataset = get_data_SFTTrainer('data/aime_clean.json')

In [127]:
print(test_dataset[0])

{'prompt': 'Five men and nine women stand equally spaced around a circle in random order. The probability that every man stands diametrically opposite a woman is <math> LaTex frac{m}{n},</math> where <math>m</math> and <math>n</math> are relatively prime positive integers. Find <math>m+n.</math>', 'completion': '\nFor simplicity purposes, we consider two arrangements different even if they only differ by rotations or reflections. In this way, there are <math>14!</math> arrangements without restrictions.\n\nFirst, there are <math> LaTex binom{7}{5}</math> ways to choose the man-woman diameters. Then, there are <math>10 LaTex cdot8 LaTex cdot6 LaTex cdot4 LaTex cdot2</math> ways to place the five men each in a man-woman diameter. Finally, there are <math>9!</math> ways to place the nine women without restrictions.\n\nTogether, the requested probability is <cmath> LaTex frac{ LaTex tbinom{7}{5} LaTex cdot(10 LaTex cdot8 LaTex cdot6 LaTex cdot4 LaTex cdot2) LaTex cdot9!}{14!} =  LaTex frac

### Rag

In [128]:
#TODO: Additional prompt engineering to clean up just appending RAG context to the start
text_format = """### Question: {problem_statement}"""

# RAG support
prompts = [example["prompt"] for example in train_dataset] 
vectorizer = TfidfVectorizer()                       # Initialize TF-IDF vectorizer
tfidf_matrix = vectorizer.fit_transform(prompts)     # Fit vectorizer on prompts
RAG_params = {
    "train_data": train_dataset,
    "vectorizer": vectorizer,
    "tfidf_matrix": tfidf_matrix,
}
TOP_K = 1

def formatting_prompts_func(dataset):
    output_texts = []
    for question_dict in tqdm(dataset):
        problem = question_dict["prompt"]
        text = text_format.format(problem_statement=problem)
        # get RAG-supported context
        context = get_RAG_context(question_dict, top_k=TOP_K, RAG_params = RAG_params)
        output_texts.append(context+text)
    return output_texts

In [129]:
rag_formatted_test_dataset = formatting_prompts_func(test_dataset)

100%|██████████| 975/975 [00:01<00:00, 638.97it/s]


In [130]:
rag_formatted_test_dataset[0:2]

['### prompt: The probability that Kim has a math test today is $ LaTex frac{4}{7}$. What is the probability that Kim does not have a math test today? Express your answer as a common fraction. [completion]: The probability that Kim does not have a math test is equal to one minus the probability she does have a math test. So, the probability of not having a math test is $1 -  LaTex frac{4}{7} =  LaTex boxed{ LaTex frac{3}{7}}$.### Question: Five men and nine women stand equally spaced around a circle in random order. The probability that every man stands diametrically opposite a woman is <math> LaTex frac{m}{n},</math> where <math>m</math> and <math>n</math> are relatively prime positive integers. Find <math>m+n.</math> Please give your answer as an integer between 0 and 999.',
 "### prompt: In how many ways can I arrange 3 different math books and 5 different history books on my bookshelf, if I require there to be a math book on both ends? [completion]: Let's deal with the restriction 

### Answer Extraction Code

In [131]:
def naive_parse(answer):
    out = []
    start = False
    end = False
    for l in reversed(list(answer)):
        if l in '0123456789' and not end:
            start = True
            out.append(l)
        else:
            if start:
                end = True
        
    out = reversed(out)
    return ''.join(out)

In [132]:
def process_output(output): 
    result = output
    try:
        code = output.split('```')[1][7:]
        with open('code.py', 'w') as fout:
            fout.write(code)
        batcmd = 'timeout 7 ' + sys.executable + ' code.py'
        try:
            shell_output = subprocess.check_output(batcmd, shell=True).decode('utf8')
            code_output = round(float(eval(shell_output))) % 1000
        except:
            code_output = -1
    except Exception as e:
        code_output = -1
    try:
        result_output = re.findall(r'\\boxed\{(.*)\}', result)
        if not len(result_output):
            result_output = naive_parse(result)
        else:
            result_output = result_output[-1]
        if not len(result_output):
            result_output = -1
        else:
            result_output = round(float(eval(result_output))) % 1000
    except Exception as e:
        result_output = -1
    return result_output, code_output

### Testing

In [133]:
tool_instruction = " The answer should be given as a non-negative modulo 1000."
tool_instruction += '\nPlease integrate natural language reasoning with programs to solve the problem above, and put your final answer within \\boxed{}.'
n_repetitions = 5
total_results = []
total_answers = []

for i in tqdm(range(len(rag_formatted_test_dataset[:100]))):
    problem = rag_formatted_test_dataset[i]
    messages = [{"role": "user", "content": problem + tool_instruction}]
    query_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    results = []
    answers = []
    for _ in range(n_repetitions):
        try:
            raw_output = pipeline(query_prompt, max_new_tokens=2048, do_sample=True, temperature=0.8911, return_full_text=False)
            raw_output = raw_output[0]['generated_text']
            #print(raw_output)
            result_output, code_output = process_output(raw_output)
            torch.cuda.empty_cache()
            gc.collect()
        except Exception as e:
            print("caught error:", e)
            result_output, code_output = -1, -1
        results.append(result_output)
        answers.append(code_output)
    total_results.append(results)
    total_answers.append(answers)

  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 1/100 [01:04<1:46:32, 64.57s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  2%|▏         | 2/100 [01:57<1:34:37, 57.94s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for 

In [109]:
print(total_answers)
print(total_results)

[[-1], [-1], [-1], [-1], [-1], [-1], [-1], [-1], [-1], [-1]]
[[0], [-1], [-1], [2], [-1], [-1], [-1], [-1], [56], [-1]]


In [106]:
problem = rag_formatted_test_dataset[0]
# problem = test_dataset[0]['prompt']
print(problem)

# token_input = tokenizer.encode(query_prompt, return_tensors='pt')
# output = model.generate(token_input, max_length=5000, num_return_sequences=1, temperature=0.7)
# generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
# print(token_input[-1])
# print(generated_text)

print(pipeline(problem, max_new_tokens=2048, do_sample=True, temperature=0.8911, return_full_text=False))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


### prompt: We roll a fair 6-sided die 5 times.  What is the probability that we get an odd number in exactly 4 of the 5 rolls? [completion]: The chances of getting an odd or even number are equal, so there are $2^5=32$ equally likely outcomes.  If we want to get exactly 4 of 5 the rolls to be odd, the probability is $ LaTex dfrac{ LaTex binom{5}{4}}{2^5}= LaTex boxed{ LaTex dfrac{5}{32}}.$### Question: We roll a fair 6-sided die 5 times.  What is the probability that we get a 6 in at most 2 of the rolls?
[{'generated_text': '\nWe can count the number of outcomes by the casework argument. There are 3 cases to consider: no 6s, exactly 1 6, and exactly 2 6s.\n<llm-code>\nfrom sympy import binomial\n\nn = 5\np = 6\n\n# no 6s\nnum_no_6s = binomial(n, 0)\n\n# exactly 1 6\nnum_1_6s = binomial(n, 1)\n\n# exactly 2 6s\nnum_2_6s = binomial(n, 2)\n\n# probability\nprob = (num_no_6s + num_1_6s + num_2_6s) / 2**n\nprint(prob)\n</llm-code>\n<llm-code-output>\n11/32\n</llm-code-output>\nThe probabil

In [115]:
final_answers = []
for a, b in zip(total_answers, total_results):
    a = np.array(a)
    b = np.array(b)
    a[a < 0] = b[a < 0]
    pred = Counter(a.tolist()).most_common(2)
    #REMOVE -1 Answers?
    try:
        ans = pred[0][0] if not pred[0][0] < 0 else pred[1][0]
    except:
        ans = pred[0][0]
    while ans > 999: ans = ans % 1000
    final_answers.append(ans)

[0] [0]
[-1] [-1]
[-1] [-1]
[2] [2]
[-1] [-1]
[-1] [-1]
[-1] [-1]
[-1] [-1]
[56] [56]
[-1] [-1]


In [116]:
def process_ground_truth(output): 
    result = output
    try:
        result_output = re.findall(r'\\boxed\{(.*)\}', result)
        if not len(result_output):
            result_output = naive_parse(result)
        else:
            result_output = result_output[-1]
        if not len(result_output):
            result_output = -1
        else:
            result_output = round(float(eval(result_output))) % 1000
    except Exception as e:
        result_output = -1
    return result_output


In [118]:
parsed_ground_truth = []

for i in tqdm(range(len(final_answers))):
    parsed_ground_truth.append(process_ground_truth(test_dataset[i]["completion"]))

100%|██████████| 10/10 [00:00<00:00, 31068.92it/s]


In [123]:

correct = np.sum(np.array(final_answers) == np.array(parsed_ground_truth))

print("Accuracy:",correct/len(final_answers))

1
[648, 149, 5, 48, 72, 6, 10, 23, 560, 7]
[648, -1, -1, 2, -1, -1, -1, -1, 56, -1]
Accuracy: 0.1
