In [1]:
from datasets import load_dataset, concatenate_datasets
import argparse
import torch
import sys
import os
import json

# Evaluation data configuration: task sub-directories to load, dataset root, and split.
sub_task = ['metamath']
data_path = '/TCotMechanism/RealisticDataVerification/dataset'
dataset_split = "test"

if sub_task is not None:
    # Load every task directory on its own, preview its first record, then merge them all.
    all_test_dataset = []
    for task in sub_task:
        ds = load_dataset(data_path, data_dir=task, split=dataset_split)
        print(f"{data_path}/{task}/{dataset_split}")
        for field, value in ds[0].items():
            print("-" * 100)
            print(field, value, sep=':\t')
        print("+" * 100)
        all_test_dataset.append(ds)
    dataset = concatenate_datasets(all_test_dataset)
else:
    # No task list given: load the split straight from the dataset root.
    dataset = load_dataset(data_path, split=dataset_split)

In [2]:
# Keep only the examples whose "type" field is "gsm8k".
dataset = dataset.filter(lambda example: example["type"] == "gsm8k")

In [3]:
def modify_output(example):
    """Swap the chain-of-thought cue in a prompt for a direct-answer cue.

    Every occurrence of "Let's think step by step." in ``example['instruction']``
    is replaced with "The answer is: ". The example is modified in place and
    also returned, so the function can be passed to ``dataset.map``.
    """
    cot_cue = "Let's think step by step."
    direct_cue = "The answer is: "
    prompt = example['instruction']
    example['instruction'] = prompt.replace(cot_cue, direct_cue)
    return example
# Optional: uncomment the next line to apply modify_output to every example.
# dataset = dataset.map(modify_output)
# Display the first example of the (filtered) dataset.
dataset[0]

{'output': '18',
 'instruction': "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nJanet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\n\n### Response: Let's think step by step.",
 'type': 'gsm8k',
 'input': ''}

In [4]:
def batch_data(data_list, batch_size=1):
    """Split ``data_list`` into consecutive batches.

    The number of batches is ``len(data_list) // batch_size`` (at least one for
    non-empty input). Every batch except the last holds exactly ``batch_size``
    items; the last batch additionally absorbs the remainder, so it can hold up
    to ``2 * batch_size - 1`` items. Callers must therefore iterate over the
    actual length of each batch rather than over ``range(batch_size)``.

    Args:
        data_list: Sliceable sequence of items (anything supporting ``len`` and
            slice indexing).
        batch_size: Target number of items per batch; must be >= 1.

    Returns:
        A list of slices of ``data_list``. Empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total = len(data_list)
    if total == 0:
        # Nothing to batch; avoid handing a single empty batch to the caller.
        return []

    # Fewer items than one full batch still produce exactly one batch.
    n = max(total // batch_size, 1)
    batches = [data_list[i * batch_size:(i + 1) * batch_size] for i in range(n - 1)]
    # The final batch runs to the end of the data so the remainder is not lost.
    batches.append(data_list[(n - 1) * batch_size:])
    return batches

batch_size = 16
# Batch the prompt, reference-answer and task-type columns with the same batch size
# so that the i-th batch of each list refers to the same examples.
batch_dataset_query, batch_dataset_answer, batch_dataset_task = (
    batch_data(dataset[column], batch_size=batch_size)
    for column in ("instruction", "output", "type")
)

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from tqdm import tqdm
from peft import PeftModel
from transformers import BitsAndBytesConfig


# Checkpoint to evaluate. Only the second assignment takes effect; the first is
# an alternative that can be selected by swapping the two lines.
model_path = '/.../Meta-Llama-3.1-8B'
model_path = '/metamath-LoRA-Qwen2.5-3B/checkpoint-1000'

# Precision / quantization settings.
bits = 16
quant_type = 'nf4'
double_quant = True
bf16 = True
if bf16:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float32

# A bitsandbytes config is only built for 4-bit or 8-bit loading; any other
# value of `bits` loads the model without quantization.
if bits in [4, 8]:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=(bits == 4),
        load_in_8bit=(bits == 8),
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=double_quant,
        bnb_4bit_quant_type=quant_type,
    )
else:
    quantization_config = None

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=quantization_config,
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map='auto',
)

# Pad on the left so that, in a padded batch, every prompt ends at the last position.
tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side='left')

In [6]:
MAX_LENGTH = 512  # max prompt length in tokens; the tokenizer call truncates longer prompts
MAX_NEW_TOKENS = 1024  # max number of tokens generated per prompt (passed as max_new_tokens)

In [3]:
# Use the end-of-sequence token as the padding token, then switch the model to evaluation mode.
tokenizer.pad_token = tokenizer.eos_token
model.eval()

def batch_generate(texts):
    """Greedily generate a completion for every prompt in every batch.

    Relies on the notebook-level ``model``, ``tokenizer``, ``MAX_LENGTH`` and
    ``MAX_NEW_TOKENS``.

    Args:
        texts: Sequence of batches, each batch being a list of prompt strings.

    Returns:
        A list with one entry per batch. Each entry is the list of decoded
        sequences for that batch, in prompt order. The sequences returned by
        ``model.generate`` are decoded whole, so each string contains the
        prompt followed by the generated continuation.
    """
    # Built once: the configuration does not change between batches.
    # `temperature` is deliberately not set: it is only used when sampling and
    # is ignored (with a validation warning) when do_sample=False.
    gen_config = GenerationConfig(
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,  # deterministic greedy decoding for math evaluation
        pad_token_id=tokenizer.pad_token_id,  # explicit, since inputs are padded batches
    )

    predictions = []
    for batch in tqdm(texts):
        # Pad to the longest prompt in the batch and truncate at MAX_LENGTH tokens.
        inputs = tokenizer(
            batch,
            max_length=MAX_LENGTH,
            padding=True,
            truncation=True,
            return_tensors="pt"
        ).to(model.device)

        with torch.no_grad(), torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16):
            outputs = model.generate(
                **inputs,
                generation_config=gen_config
            )

        preds = tokenizer.batch_decode(
            outputs,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )
        predictions.append(preds)

    return predictions

# Execute the test: generate predictions for the last `num` batches of prompts.
num = 82
predictions = batch_generate(batch_dataset_query[-num:])

In [9]:
output_file = r'/.../metamath_response.jsonl'
# Append one JSON record per example to the output file.
# The file is opened once (still in append mode, so earlier runs are kept).
# Iteration follows the actual length of each batch: batch_data() folds the
# remainder into the last batch, so a batch may hold more than `batch_size`
# items, and looping over range(batch_size) would silently drop the extras.
with open(output_file, 'a') as f:
    for batch_q, batch_p, batch_a, batch_t in zip(
        batch_dataset_query[-num:],
        predictions,
        batch_dataset_answer[-num:],
        batch_dataset_task[-num:],
    ):
        for query, pre, answer, task in zip(batch_q, batch_p, batch_a, batch_t):
            json.dump({'type': task, 'query': query, 'output': pre, 'answer': answer}, f)
            f.write('\n')

In [10]:
# Single-prompt sanity check on the last prompt of the first selected batch.
# To probe direct answering instead, append
# .replace("Let's think step by step.","The Answer is ") to the expression below.
input_text = batch_dataset_query[-num:][0][-1]
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# Greedy decoding, capped at 1024 newly generated tokens.
outputs = model.generate(**inputs, do_sample=False, max_new_tokens=1024)

# decode output
# {"type": "gsm8k", "query": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nThere are some oranges in a basket. Ana spends 3 minutes peeling an orange and Jane spends 4 minutes doing the same. If Ana and Jane start picking oranges from this basket to peel at the same time, how many more oranges will Ana have peeled than Jane after an hour?\n\n### Response: Let's think step by step.", "output": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nThere are some oranges in a basket. Ana spends 3 minutes peeling an orange and Jane spends 4 minutes doing the same. If Ana and Jane start picking oranges from this basket to peel at the same time, how many more oranges will Ana have peeled than Jane after an hour?\n\n### Response: Let's think step by step. First of all we need to find out what number represents one minute for both girls.\nFor example if there were only two oranges then it would be easy because each girl could pick up her own fruit without any problems but since they're not alone anymore so let me explain my reasoning behind these numbers (I hope I'm right). So first thing comes into mind when thinking about something like \"how much does someone spend per unit\" - well obviously its just their speed divided over total amount spent which gives us $latex \\frac{1}{2}$ or half as fast compared with other person who has no problem whatsoever getting things done quickly enough even though he might take longer breaks between tasks etc... But anyway back on track here goes nothing! We know now exactly where our starting point lies therefore next logical question becomes obvious too namely 'what happens during given period?' 
# And again answer seems pretty straightforward considering previous information provided earlier today; every single second counts towards final result thus making sure everything gets completed within specified timeframe regardless whether you do better job overall due lack experience handling certain situations differently depending upon personal preferences regarding work ethic itself rather than actual skill level required completing particular assignment correctly once assigned properly beforehand naturally speaking otherwise wouldn't make sense having said previously mentioned factoid being true throughout entire process leading eventually", "answer": "5"}
# Print the first returned sequence (prompt followed by the generated continuation), special tokens removed.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
A merchant wants to make a choice of purchase between 2 purchase plans: jewelry worth $5,000 or electronic gadgets worth $8,000. His financial advisor speculates that the jewelry market will go up 2.5% while the electronic gadgets market will rise 1.2% within the same month. If the merchant is looking to maximize profit at the end of this month by making a choice, how much profit would this be?

### Response: Let's think step by step. The jewelry market is expected to go up by 2.5%, so the value of the jewelry will increase by 2.5% of $5,000, which is 0.025 * $5,000 = ...
The electronic gadgets market is expected to go up by 1.2%, so the value of the electronic gadgets will increase by 1.2% of $8,000, which is 0.012 * $8,000 = ...
If the merchant chooses to purchase the jewelry, he will make a profit of $125 - $5,000 = ...
If the merchant chooses to purchase the e