In [1]:
from datasets import load_dataset
from dataset_generator import (
    generate_completion_dataset,
    generate_corrective_dataset,
    generate_kto_dataset,
    generate_copy_dataset,
)
from transformers import (
    pipeline,
    BitsAndBytesConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
)
import re

# Load the model in 4-bit NF4 with double quantization; matmuls are computed in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="bfloat16",
)
model_path = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_path, quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Reuse EOS as the padding token so that batched prompts can be padded to a common length.
tokenizer.pad_token_id = tokenizer.eos_token_id
# Generation is batched (see `batch_size` in the generation kwargs). A decoder-only
# model continues from the last position of each row, so padding must go on the left;
# right padding would make the model continue from pad tokens for shorter prompts.
tokenizer.padding_side = "left"
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generation settings; handed to `generate_completion_dataset` as `generate_kwargs`
# (presumably forwarded to the pipeline call -- confirm in dataset_generator).
generate_kargs = dict(max_new_tokens=3000, do_sample=True, batch_size=64)

# Prompt template with one `problem` placeholder. If rendered with `str.format`,
# the doubled braces come out as a literal `{}` after `\boxed`.
prompt = "{problem} \nPlease reason step by step, and put your final answer within \\boxed{{}}"

# GSM8K, "main" configuration.
dataset = load_dataset("openai/gsm8k", "main")
# Take the test split and expose its `question` column under the name `problem`,
# matching the placeholder used in the prompt template.
testing_set = dataset["test"].rename_column("question", "problem")

# Matches `\boxed{<number>}` where <number> may carry a sign, thousands separators
# and a decimal part, optionally preceded by `$` / `\$` and surrounded by whitespace.
_BOXED_NUMBER_RE = re.compile(
    r"\\boxed\{\s*(?:\\?\$)?\s*"
    r"([-+]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?)"
    r"\s*\}"
)


def get_answer_from_output(text):
    """Extract the first numeric ``\\boxed{...}`` answer from a model completion.

    Args:
        text: The generated completion. Anything that is not a ``str``
            is treated as "no answer".

    Returns:
        The boxed number as a ``float`` (thousands separators removed), or
        ``None`` when ``text`` is not a string or contains no boxed number.
        Boxed contents that are not a plain number are skipped, so the first
        *numeric* box wins.
    """
    if not isinstance(text, str):
        return None
    match = _BOXED_NUMBER_RE.search(text)
    if match is None:
        return None
    # Strip grouping commas ("1,250" -> "1250") before converting.
    return float(match.group(1).replace(",", ""))

2024-07-07 04:37:20.636675: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-07 04:37:20.678768: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# Generate one completion per test problem; answers are parsed with
# `get_answer_from_output`.
completion_dataset = generate_completion_dataset(
    pipe,
    testing_set,
    prompt,
    get_answer_from_output,
    generate_kwargs=generate_kargs,
    generate_count_per_problem=1,
)

# Calculate the accuracy of the model.
# Each row's `correct` field is read for truthiness, as in a plain `if row["correct"]`.
correct_flags = [bool(row["correct"]) for row in completion_dataset]
total = len(correct_flags)
correct = sum(correct_flags)

# Guard against an empty result set instead of failing with ZeroDivisionError.
if total:
    print(f"Accuracy: {correct/total}")
else:
    print("Accuracy: n/a (no completions were generated)")

Generating completions:   0%|          | 0/1319 [00:00<?, ?it/s]