In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM
from datasets import Dataset, load_dataset
from tqdm.notebook import tqdm

In [3]:
prompt_path = "prompts/costar_cot_1shot.txt"
checkpoint_path = "meta-llama/Meta-Llama-3-8B-Instruct"

prompt = open(prompt_path, "r").read()
prompt = 'Submit your answer with the format: "Result = 72 <submit>"'

FileNotFoundError: [Errno 2] No such file or directory: 'prompts/costar_cot_1shot.txt'

In [None]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="bfloat16",
)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint_path, quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.pad_token = tokenizer.eos_token

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def generate(model, input_texts: list[str]):
    inputs = tokenizer(
        input_texts, return_tensors="pt", padding=True, truncation=False
    ).to(model.device)
    outputs = model.generate(
        **inputs, max_new_tokens=512, pad_token_id=tokenizer.eos_token_id
    )
    output_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return [output[len(input) :] for input, output in zip(input_texts, output_texts)]

In [None]:
import numpy as np
import re


def _exact_match_reward(responses, answers):
    """Reward if generated response contains correct answer."""
    rewards = []
    for response, answer in zip(responses, answers):
        reward = 0.0
        predicted_number = _get_answer(response)
        if predicted_number is not None:
            if np.abs(predicted_number - float(answer)) < 0.1:
                reward += 1.0
        else:
            reward = 0.0
        rewards.append(reward)
    return rewards


def _get_answer(response):
    try:
        pattern = r"Result\s*=\s*(-?\d+(?:\.\d+)?)\s*<submit>"
        match_pattern = re.findall(pattern, response)
        if match_pattern:
            return float(match_pattern[0])
        else:
            return None
    except Exception:
        return None

In [None]:
def get_MATH_test_dataset():
    dataset = load_dataset("json", data_dir="data/MATH")

    def is_real_number(text):
        try:
            float(text)
            return True
        except Exception:
            return False

    def extract_answer(text):
        try:
            match = re.search(r"\\boxed{(.+?)}", text)
            return match.group(1)
        except Exception:
            return None

    dataset_with_answer = dataset.map(
        lambda x: {"problem": x["problem"], "answer": extract_answer(x["solution"])}
    )
    dataset_with_answer = dataset_with_answer.filter(
        lambda x: is_real_number(x["answer"])
    )
    dataset_with_answer = dataset_with_answer.filter(lambda x: len(x["problem"]) < 500)
    dataset_with_answer = dataset_with_answer.rename_column("problem", "query")
    return dataset_with_answer["test"]


def get_aimo_test_dataset():
    test_dataset = Dataset.from_csv("data/val.csv")
    test_dataset = test_dataset.rename_column("problem", "query")
    test_dataset = test_dataset.remove_columns(["id"])
    return test_dataset

In [None]:
def evaluate(test_dataset):
    batch_size = len(test_dataset)
    responses = []

    for i in tqdm(range(0, len(test_dataset), batch_size)):
        batch = test_dataset[i : i + batch_size]
        batch_queries = [prompt + row for row in batch]
        batch_responses = generate(model, batch_queries)
        responses.extend(batch_responses)

    answers = test_dataset["answer"]
    rewards = _exact_match_reward(responses, answers)
    print(f"Exact match reward: {np.mean(rewards)}")
    return responses

In [None]:
dataset = get_MATH_test_dataset()
responses = evaluate(dataset)

Resolving data files:   0%|          | 0/7500 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Exact match reward: 0.0


In [None]:
check_index = 1

print(f"Query: {dataset[check_index]['query']}")
print(f"Predicted: {responses[check_index]}")
print(f"Answer: {dataset[check_index]['answer']}")

Query: What is the positive difference between $120\%$ of 30 and $130\%$ of 20?
Predicted:  1: 72
level 2: 72
level 3: 72
level 4: 72
level 5: 72
level 6: 72
level 7: 72
level 8: 72
level 9: 72
level 10: 72
level 11: 72
level 12: 72
level 13: 72
level 14: 72
level 15: 72
level 16: 72
level 17: 72
level 18: 72
level 19: 72
level 20: 72
level 21: 72
level 22: 72
level 23: 72
level 24: 72
level 25: 72
level 26: 72
level 27: 72
level 28: 72
level 29: 72
level 30: 72
level 31: 72
level 32: 72
level 33: 72
level 34: 72
level 35: 72
level 36: 72
level 37: 72
level 38: 72
level 39: 72
level 40: 72
level 41: 72
level 42: 72
level 43: 72
level 44: 72
level 45: 72
level 46: 72
level 47: 72
level 48: 72
level 49: 72
level 50: 72
level 51: 72
level 52: 72
level 53: 72
level 54: 72
level 55: 72
level 56: 72
level 57: 72
level 58: 72
level 59: 72
level 60: 72
level 61: 72
level 62: 72
level 63: 72
level 64: 72
level 65: 72
level 66: 72
level 67: 72
level 68: 72
level 69: 72
level 70: 72
level 71: 72
