In [1]:
!pip install -q peft math_verify evaluate flash-attn rouge_score sentencepiece sacrebleu

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!cp -r "/content/drive/MyDrive/Graduate Project/MATH_train_staging.csv" .
!cp -r "/content/drive/MyDrive/Graduate Project/MATH_test_staging.csv" .
!cp -r "/content/drive/MyDrive/Graduate Project/Llama-3.2-3B-Instruct-Reasoning" ./

In [4]:
import pandas as pd
import transformers
import evaluate
import random
import torch
import re

from IPython.display import display, Markdown
from peft import PeftConfig, PeftModel
from math_verify import parse, verify
from pprint import pprint
from tqdm import tqdm

In [5]:
SEED = 42
random.seed(SEED)
_ = torch.manual_seed(SEED)
DEVICE = (
    torch.device("cuda")
    if torch.cuda.is_available()
    else (
        torch.device("mps")
        if torch.backends.mps.is_available()
        else torch.device("cpu")
    )
)
QUANTIZATION = None
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
MAX_NEW_TOKENS = 10000
ROUGE = evaluate.load("rouge")
BLEU = evaluate.load("bleu")
METEOR = evaluate.load("meteor")
CHRF = evaluate.load("chrf")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [6]:
train_data = pd.read_csv("MATH_train_staging.csv")
test_data = pd.read_csv("MATH_test_staging.csv")

In [7]:
train_data["extracted_answer"] = train_data["reasoning"].map(lambda x: parse(x)[1])
test_data["extracted_answer"] = test_data["reasoning"].map(lambda x: parse(x)[1])
for split_str in [
    "-OR-",
    " OR ",
    "OR\n",
    "\\$\\$OR \\$\\$",
    "\\\\text\\{OR\\}",
    "\\\\textbf\\{OR\\}",
    "\\\\centerline\\{\\\\bf \\{OR\\}\\}",
    "\\\\centerline\\{\\{\\\\bf OR\\}\\}",
]:
    train_data.loc[
        train_data["reasoning"].str.split(split_str).str.len() > 1, "reasons"
    ] = train_data[train_data["reasoning"].str.split(split_str).str.len() > 1][
        "reasoning"
    ].str.split(
        split_str
    )
    test_data.loc[
        test_data["reasoning"].str.split(split_str).str.len() > 1, "reasons"
    ] = test_data[test_data["reasoning"].str.split(split_str).str.len() > 1][
        "reasoning"
    ].str.split(
        split_str
    )
train_data.loc[train_data["reasons"].isna(), "reasons"] = train_data.loc[
    train_data["reasons"].isna()
]["reasoning"].str.split("-OR-")
test_data.loc[test_data["reasons"].isna(), "reasons"] = test_data.loc[
    test_data["reasons"].isna()
]["reasoning"].str.split("-OR-")

In [8]:
SYSTEM_PROMPT = (
    "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
    "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
    "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
    "<think> reasoning process here </think><answer> answer here </answer>"
)

In [9]:
llm = transformers.AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map=DEVICE,
    quantization_config=QUANTIZATION,
    torch_dtype="auto",
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    MODEL_NAME, trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
model = PeftModel.from_pretrained(llm, "Llama-3.2-3B-Instruct-Reasoning")
pipe = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map=DEVICE,
    trust_remote_code=True,
)
generation_args = {
    "max_new_tokens": MAX_NEW_TOKENS,
}
model.eval()

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cuda
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapa

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
              (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
              (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
              (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
            )
            (mlp): LlamaMLP(
              (gate_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=8192, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
  

In [10]:
def get_prompt(text):
    message = []
    message.append({"role": "system", "content": SYSTEM_PROMPT})
    message.append({"role": "user", "content": text})
    return message

In [11]:
val_results = {
    "dataset_id": [],
    "question_id": [],
    "bleu": [],
    "rouge1": [],
    "rouge2": [],
    "rougeL": [],
    "meteor": [],
    "chrf": [],
    "correct": [],
    "response": [],
}

In [12]:
for idx in tqdm(range(100), desc="evaluating"):
    row = test_data.loc[idx]
    val_results["dataset_id"].append(row["dataset_id"])
    val_results["question_id"].append(row["question_id"])
    prompt = get_prompt(row["question_text"])
    response = pipe(prompt, **generation_args)[0]["generated_text"][-1][
        "content"
    ].strip()
    val_results["response"].append(response)
    # bleu
    bleu = BLEU.compute(predictions=[response], references=[row["reasons"]])
    val_results["bleu"].append(bleu["bleu"])
    # rouge
    rouge = ROUGE.compute(predictions=[response], references=[row["reasons"]], tokenizer=lambda x: x.split())
    val_results["rouge1"].append(rouge["rouge1"])
    val_results["rouge2"].append(rouge["rouge2"])
    val_results["rougeL"].append(rouge["rougeL"])
    # meteor
    meteor = METEOR.compute(predictions=[response], references=[row["reasons"]])
    val_results["meteor"].append(float(meteor["meteor"]))
    # chrf
    chrf = CHRF.compute(predictions=[response], references=[row["reasons"]])
    val_results["chrf"].append(chrf["score"])
    # correctness
    correct = verify(row["extracted_answer"], parse(response))
    val_results["correct"].append(correct)

evaluating:  10%|█         | 10/100 [02:11<20:56, 13.97s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
evaluating: 100%|██████████| 100/100 [45:23<00:00, 27.23s/it]


In [13]:
pd.DataFrame(val_results).to_csv(
    f"{MODEL_NAME.split('/')[1]}_MATH_reasoning_test_results.csv", index=False
)

In [14]:
display(Markdown("Question ---> " + test_data.loc[0]["question_text"]))
display(Markdown("Reasoning ---> " + test_data.loc[0]["reasons"][0]))
display(Markdown("Answer ---> $" + test_data.loc[0]["extracted_answer"] + "$"))

Question ---> How many vertical asymptotes does the graph of $y=\frac{2}{x^2+x-6}$ have?

Reasoning ---> The denominator of the rational function factors into $x^2+x-6=(x-2)(x+3)$. Since the numerator is always nonzero, there is a vertical asymptote whenever the denominator is $0$, which occurs for $x = 2$ and $x = -3$.  Therefore, the graph has $\boxed{2}$ vertical asymptotes.

Answer ---> $2$

In [15]:
pprint(val_results["response"][0].replace("\n", "\n\n"))

('<think>First, we need to factorize the denominator to find the values of x '
 'for which the function becomes undefined. We can rewrite the denominator as '
 '$x^2+x-6 = (x+3)(x-2)$. So, the function becomes $y = '
 '\\frac{2}{(x+3)(x-2)}$. Now, we need to find the values of x for which the '
 'denominator becomes zero. This happens when $x+3=0$ or $x-2=0$, i.e., when '
 '$x=-3$ or $x=2$. These are the potential vertical asymptotes. However, we '
 'also need to check if there are any factors in the numerator that cancel out '
 'the factors in the denominator. In this case, there are no such factors. '
 'Therefore, the function will have vertical asymptotes at $x=-3$ and '
 '$x=2$.</think><answer> The graph of $y=\\frac{2}{x^2+x-6}$ has 2 vertical '
 'asymptotes at $x=-3$ and $x=2.</answer>')


In [16]:
display(Markdown("Question ---> " + test_data.loc[7]["question_text"]))
display(Markdown("Reasoning ---> " + test_data.loc[7]["reasons"][0]))
display(Markdown("Answer ---> $" + test_data.loc[7]["extracted_answer"] + "$"))

Question ---> Mr. Madoff invests 1000 dollars in a fund that compounds annually at a constant interest rate.  After three years, his investment has grown to 1225 dollars.  What is the annual interest rate, as a percentage?  (Round your answer to the nearest integer.)

Reasoning ---> Let $r$ be the annual interest rate.  Then after three years, Mr. Madoff's investment is $1000 \cdot \left( 1 + \frac{r}{100} \right)^3$, so \[1000 \cdot \left( 1 + \frac{r}{100} \right)^3 = 1225.\]Then \[\left( 1 + \frac{r}{100} \right)^3 = 1.225,\]so \[1 + \frac{r}{100} = \sqrt[3]{1.225} = 1.069987 \dots,\]which means $r = \boxed{7}$, to the nearest integer.

Answer ---> $7$

In [17]:
pprint(val_results["response"][7].replace("\n", "\n\n"))

("<think>First, let's recall the formula for compound interest, which is A = "
 'P(1 + r)^n, where A is the amount of money accumulated after n years, '
 'including interest, P is the principal amount (initial investment), r is the '
 'annual interest rate (in decimal), and n is the number of years the money is '
 'invested. In this case, we are given A = 1225, P = 1000, and n = 3. We need '
 'to solve for r.</think><answer> To solve for r, we can rearrange the formula '
 'to isolate r: r = (A/P)^(1/n) - 1. Plugging in the given values, we get r = '
 '(1225/1000)^(1/3) - 1 ≈ 0.075 or 7.5%.</answer>')
