In [None]:
!pip install transformers datasets
!pip install -U bitsandbytes
!pip install peft
!pip install datasets

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch
from datasets import load_dataset, concatenate_datasets
from getpass import getpass
import random
import os
from peft import PeftModel, PeftConfig
from dotenv import load_dotenv
# Populate the process environment from a .env file, then read the API keys.
load_dotenv()
# Each key is None when its environment variable is not defined.
hf_api_key = os.environ.get('HUGGINGFACE_API_KEY')
openai_api_key = os.environ.get('OPENAI_API_KEY')

In [None]:
# Use the key loaded from the environment; if it is missing, ask for it
# interactively. Assigning None into os.environ would raise a TypeError.
token = hf_api_key or getpass("Hugging Face access token: ")

# Point the Hugging Face caches at /content/cache.
# NOTE(review): these variables are set after transformers/datasets were
# imported in an earlier cell; libraries that resolve their cache paths at
# import time may not pick them up -- confirm, or move this cell above the
# imports.
os.environ['HF_HOME'] = '/content/cache'
os.environ['TRANSFORMERS_CACHE'] = '/content/cache/transformers'
os.environ['HF_DATASETS_CACHE'] = '/content/cache/datasets'
os.environ['HF_METRICS_CACHE'] = '/content/cache/metrics'
os.environ['HF_MODULES_CACHE'] = '/content/cache/modules'
os.environ['HF_TOKEN'] = token

In [None]:
# Load the base model and merge the MATH fine-tuned adapter into it.
model_name = "mistralai/Mistral-7B-v0.3"
adapters_name = "emirkocer/mistral-7b-v03-finetuned-math-v2"
stop_token_ids = [0]

# From here on, newly created tensors default to the GPU.
torch.set_default_device('cuda')

print(f"Starting to load the model {model_name} into memory")

# Base weights in bfloat16, with the whole model mapped to device 0.
m = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": 0},
    torch_dtype=torch.bfloat16,
)
# Attach the adapter, then merge it into the base weights and drop the wrapper.
m = PeftModel.from_pretrained(m, adapters_name).merge_and_unload()

# The tokenizer comes from the same base checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.bos_token_id = 1

print(f"Successfully loaded the model {model_name} into memory")

In [None]:
def build_prompt(prompt: str, input_vars: dict) -> str:
    """
    Fill a template prompt with the given input variables.

    Args:
        prompt: template string containing ``{name}`` placeholders.
        input_vars: mapping from placeholder name to the value to substitute.

    Returns:
        The template with its placeholders substituted via ``format_map``.

    Raises:
        KeyError: if the template uses a placeholder missing from a plain
            ``dict`` passed as ``input_vars``.
    """
    filled_prompt = prompt.format_map(input_vars)
    return filled_prompt


# MATH baseline prompt template. `{problem}` is the only placeholder and is
# filled with the problem statement before the prompt is sent to the model.
math_baseline_prompt_template = (
    "\n"
    "Below is a math problem. Solve the problem and give an answer.\n"
    "Problem:\n\n"
    "{problem}\n\n"
    "Answer:\n"
)

# MATH answer verifier prompt template.
# Placeholders: `{real_answer}` and `{model_answer}`.
# The text is built from adjacent string literals with explicit spaces, so that
# sentences are no longer fused together ("mathematics.You", "of themodel-...")
# as happened with bare backslash line continuations. Grammar slips in the
# instructions are fixed as well.
# NOTE(review): the prompt says a math problem is provided, but the template
# has no `{problem}` placeholder -- confirm whether callers are expected to
# embed the problem text inside `model_answer`.
math_verifier_template = (
    "\n"
    "You are the wise answer verifier who is specialized in mathematics. "
    "You will be provided a math problem, the real answer of this problem, and the "
    "predicted answer from a generation model. You should understand the problem and "
    "validate the correctness of the model-generated answer in the context of the "
    "provided math problem and the real answer. "
    "You should not solve the problem by yourself, your only job is to act as a verifier. "
    "Your logic and reasoning should be rigorous and intelligent. "
    "The model-generated answer can potentially be in various formats, including plain text, "
    "LaTeX-formatted text, or multiple-choice options. "
    "These options may involve single or multiple selections, a numeric value, or a "
    "numerical value accompanied by units. "
    "Both the 'Real Answer' and the 'Model-generated Answer' may correspond to any of "
    "these response types. "
    "Exact string matching is not required; what matters is that the mathematical meaning "
    "or the options are consistent. "
    "In the case of multiple-choice questions, different orders are also acceptable. "
    "Your output is limited to 'correct' or 'incorrect'. You should only respond with "
    "'correct' or 'incorrect' after verifying the answer.\n"
    "Real answer: {real_answer}\n"
    "Model-generated answer: {model_answer}\n"
    "Your output:\n"
)


In [None]:
# MATH inference and evaluation with the fine-tuned model
!pip install openai
from openai import OpenAI
import json
from datasets import DatasetDict


def load_jsonl_data(file_path):
    """
    Load a JSON Lines file and return a list of parsed records.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of being passed to the JSON parser.

    Args:
        file_path: path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # An empty line carries no record; json.loads("") would raise.
                continue
            records.append(json.loads(line))
    return records

def load_math_dataset(test_data_path='/content/test.jsonl'):
    """
    Load the MATH test set.

    Args:
        test_data_path: path of the JSON Lines file with the test problems.
            Defaults to '/content/test.jsonl' (MATH test data from PRM800K),
            the location used previously, so existing calls are unaffected.

    Returns:
        The list of records produced by ``load_jsonl_data`` for that file.
    """
    return load_jsonl_data(test_data_path)

def get_baseline_prompt(problem):
    """
    Build the MATH baseline prompt for a single problem.

    Args:
        problem: value substituted for the ``{problem}`` placeholder of
            ``math_baseline_prompt_template``.

    Returns:
        The formatted baseline prompt.
    """
    template_values = {"problem": problem}
    return build_prompt(math_baseline_prompt_template, template_values)

def get_model_response(model, prompt, max_tokens):
    """
    Call the fine-tuned model and return the decoded generation.

    Args:
        model: object exposing ``generate`` (here, the merged causal LM).
        prompt: prompt text; encoded with the module-level ``tokenizer``.
        max_tokens: maximum number of new tokens to generate.

    Returns:
        The first returned sequence decoded with special tokens skipped.
        NOTE(review): for causal LMs ``generate`` normally returns the prompt
        followed by the continuation, so this string presumably begins with
        the prompt text. The downstream verifier may rely on that to see the
        problem -- confirm before stripping the prompt here.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # Pass only `max_new_tokens`. The earlier code also passed a `max_length`
    # computed as prompt length + max_tokens, which expresses the same limit
    # and conflicts with `max_new_tokens` when both are given.
    outputs = model.generate(input_ids, max_new_tokens=max_tokens)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def call_gpt4_verifier(client, template_prompt, model_answer, real_answer, model="gpt-4o-mini"):
    """
    Ask an OpenAI chat model to judge a model-generated answer.

    Args:
        client: OpenAI client exposing ``chat.completions.create``.
        template_prompt: verifier template with ``{model_answer}`` and
            ``{real_answer}`` placeholders.
        model_answer: answer produced by the model under evaluation.
        real_answer: reference answer for the problem.
        model: name of the OpenAI chat model used as judge. Defaults to
            "gpt-4o-mini", the model used previously.

    Returns:
        The judge's full reply, stripped of surrounding whitespace, quotes and
        periods and lower-cased (e.g. "correct" or "incorrect"). An empty
        string is returned when the reply has no content.
    """
    verifier_prompt = build_prompt(
        prompt=template_prompt,
        input_vars={"model_answer": model_answer, "real_answer": real_answer},
    )
    # A non-streaming request returns the whole reply at once. The previous
    # streaming code returned only the second chunk, which could be None, a
    # partial word, or raise IndexError when fewer than two chunks arrived.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": verifier_prompt}],
    )
    verdict = response.choices[0].message.content or ""
    return verdict.strip().strip("'\".").strip().lower()


def evaluate():
    """
    Evaluate the fine-tuned model on the MATH test set with an LLM judge.

    For every test record, the baseline prompt is built from its 'problem'
    field, the model `m` generates up to 512 new tokens, and the OpenAI
    verifier compares that output with the record's 'answer' field. The
    running accuracy is printed after each question.

    Returns:
        The fraction of questions judged 'correct', or 0.0 when the test set
        is empty.
    """
    # Read the module-level `openai_api_key` directly. Assigning to a local of
    # the same name would make it local to this function and raise
    # UnboundLocalError before it is ever read.
    openai_client = OpenAI(api_key=openai_api_key)
    math_test = load_math_dataset()
    total_questions = len(math_test)  # total number of test questions
    if total_questions == 0:
        # Avoid dividing by zero when there is nothing to evaluate.
        print("No test questions found")
        return 0.0

    correct_answers = 0
    for questions_seen, example in enumerate(math_test, start=1):
        prompt = get_baseline_prompt(example['problem'])
        model_answer = get_model_response(m, prompt, 512)
        gpt_response = call_gpt4_verifier(
            openai_client,
            math_verifier_template,
            model_answer,
            example['answer'],
        )
        # Count the answer as correct only on an explicit 'correct' verdict;
        # tolerate surrounding whitespace and case, and a missing (None) reply.
        if isinstance(gpt_response, str) and gpt_response.strip().lower() == 'correct':
            correct_answers += 1
        print(correct_answers / questions_seen)

    accuracy = correct_answers / total_questions
    print(accuracy)
    return accuracy

# Run the full evaluation and print the accuracy it returns.
acc = evaluate()
print(acc)

