In [None]:
!pip install transformers datasets
!pip install -U bitsandbytes
!pip install peft
!pip install datasets

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch
from datasets import load_dataset, concatenate_datasets
from getpass import getpass
import random
import os
from peft import PeftModel, PeftConfig
from dotenv import load_dotenv
# Read variables from a local .env file (if any) into the process environment,
# then pick up the two API keys; each is None when its variable is not set.
load_dotenv()
hf_api_key = os.environ.get('HUGGINGFACE_API_KEY')
openai_api_key = os.environ.get('OPENAI_API_KEY')

In [None]:
# Hugging Face access token. `hf_api_key` is None when HUGGINGFACE_API_KEY is
# not set, and assigning None into os.environ raises TypeError, so fall back to
# an interactive prompt instead of failing on the HF_TOKEN line below.
token = hf_api_key or getpass("Hugging Face access token: ")

# Cache locations for Hugging Face artifacts.
# NOTE(review): transformers/datasets are imported in an earlier cell, and these
# libraries appear to resolve their cache paths at import time — confirm whether
# this cell needs to run before the imports for the paths to take effect.
os.environ['HF_HOME'] = '/content/cache'
os.environ['TRANSFORMERS_CACHE'] = '/content/cache/transformers'
os.environ['HF_DATASETS_CACHE'] = '/content/cache/datasets'
os.environ['HF_METRICS_CACHE'] = '/content/cache/metrics'
os.environ['HF_MODULES_CACHE'] = '/content/cache/modules'
os.environ['HF_TOKEN'] = token

In [None]:
# Load the MMLU fine-tuned model: base checkpoint + PEFT adapter, merged into
# a single model object (`m`) that the inference cells use.
torch.set_default_device('cuda')

model_name = "mistralai/Mistral-7B-v0.3"
adapters_name = "emirkocer/mistral-7b-v03-finetuned-mmlu"

print(f"Starting to load the model {model_name} into memory")

# Base weights are loaded in bfloat16 with every module mapped to device 0
# (4-bit loading stays disabled), wrapped with the adapter, then merged.
m = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map={"": 0},
    ),
    adapters_name,
).merge_and_unload()

# The tokenizer comes from the same base checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.bos_token_id = 1

# NOTE(review): not referenced by any cell visible in this notebook.
stop_token_ids = [0]

print(f"Successfully loaded the model {model_name} into memory")

In [None]:
"""
Builds prompts by formatting template prompts
using given input variables
"""
def build_prompt(prompt, input_vars: dict):
    """Fill a prompt template's ``{name}`` placeholders.

    Args:
        prompt: Template string using ``str.format``-style placeholders.
        input_vars: Mapping from placeholder name to the value to insert.

    Returns:
        The template with every placeholder substituted.

    Raises:
        KeyError: If the template names a placeholder that a plain ``dict``
            ``input_vars`` does not contain.
    """
    filled_prompt = prompt.format_map(input_vars)
    return filled_prompt


"""
MMLU Baseline prompt template
"""
# Prompt sent to the fine-tuned model for each MMLU question.
# Placeholders: {question} and {formatted_options}, filled via build_prompt.
# This is a regular (non-raw) triple-quoted string, so each "\n" escape is a
# newline that sits directly before a real line break: those spots render as a
# blank line in the final prompt.
mmlu_baseline_prompt_template = """
Answer the following multiple-choice question about chemistry or physics or biology
by selecting the correct option: 'A', 'B', 'C', or 'D'. Only give the
correct option as the answer without reasoning.\n
Question:\n
{question}\n
{formatted_options}\n
Answer:
"""

"""
MMLU answer verifier prompt template
"""
# Prompt sent to the verifier model to grade one answer.
# Placeholders: {real_answer} (the dataset's answer index, 0-3) and
# {model_answer} (the raw text produced by the fine-tuned model).
# Built from adjacent string literals so every sentence keeps an explicit
# separating space; the earlier backslash-continued literal glued sentences
# together and began with a stray `return f"` fragment that was sent verbatim.
mmlu_verifier_template = (
    "You are the wise answer verifier who is specialized in high school chemistry, biology and physics problems. "
    "You will be provided a problem in one of these three fields, the real answer for that problem, and the "
    "predicted answer from a generation model. You should understand the problem and validate the correctness of the "
    "generated answer in the context of the provided chemistry, biology or physics problem and the real answer. "
    "You should not solve the problem by yourself, your only job is to act as a verifier. "
    "You should only extract the model answer from the generated model response and compare it with the real answer. "
    "Questions are multiple-choice questions with four options that are A, B, C or D. "
    "The real answer will be 0 or 1 or 2 or 3. These correspond to A or B or C or D, respectively. "
    "Model generated answer can be in various formats, including plain text or LaTeX-formatted text. "
    "Your job is to extract the model-generated answer from the given response text "
    "and verify it with the real answer. "
    "For example, if the model-generated answer is A and the real answer is 0, then the answer will be correct. "
    "If the model-generated answer is C and the real answer is 3, then the answer will be incorrect. It should have been D. "
    "Your output is limited to 'correct' or 'incorrect'. You should only respond 'correct' or 'incorrect' after verifying "
    "the answer.\n\n"
    "Real answer: {real_answer}\n\n"
    "Model-generated answer: {model_answer}\n\n"
    "Your output:\n"
)

In [None]:
# MMLU Inference with the fine-tuned model
!pip install openai
from openai import OpenAI
from datasets import DatasetDict

""" Loads MMLU dataset """
def load_mmlu_dataset(subjects=("high_school_chemistry", "high_school_biology", "high_school_physics")):
    """Load several MMLU subjects and merge them split by split.

    Args:
        subjects: Names of the ``cais/mmlu`` configurations to load. Defaults
            to the three high-school science subjects used by this notebook.

    Returns:
        A ``DatasetDict`` with the keys ``'test'``, ``'validation'`` and
        ``'dev'``; each value is the concatenation of that split across
        ``subjects``, in the order the subjects were given.

    Raises:
        ValueError: If ``subjects`` is empty.
    """
    if not subjects:
        raise ValueError("subjects must contain at least one MMLU configuration name")

    per_subject = [load_dataset("cais/mmlu", subject) for subject in subjects]

    # Combine them into a single DatasetDict, one concatenated dataset per split.
    return DatasetDict({
        split: concatenate_datasets([subject_ds[split] for subject_ds in per_subject])
        for split in ("test", "validation", "dev")
    })

""" Returns the MMLU baseline prompt """
def get_baseline_prompt(question, options):
    """Build the baseline MMLU prompt for one question.

    Args:
        question: The question text.
        options: Iterable of answer option texts; they are labelled with
            consecutive letters starting at ``A`` in iteration order.

    Returns:
        ``mmlu_baseline_prompt_template`` with the question and the lettered,
        newline-separated options filled in.
    """
    option_lines = []
    for position, option_text in enumerate(options):
        letter = chr(65 + position)  # 65 is ord('A')
        option_lines.append(f"{letter}: {option_text}")

    template_vars = {
        "question": question,
        "formatted_options": "\n".join(option_lines),
    }
    return build_prompt(prompt=mmlu_baseline_prompt_template, input_vars=template_vars)

""" Calls the fine-tuned model and returns the model-generated answer """
def get_model_response(model, prompt, max_tokens):
    """Run ``model`` on ``prompt`` and return the decoded output text.

    Uses the module-level ``tokenizer`` for encoding and decoding.

    Args:
        model: Model exposing ``generate`` and a ``device`` attribute.
        prompt: Prompt text to feed to the model.
        max_tokens: Forwarded to ``generate`` as ``max_length``.
            NOTE(review): in transformers, ``max_length`` bounds the total
            sequence (prompt plus generated tokens), not only the new tokens —
            confirm this is the intended budget.

    Returns:
        The first output sequence decoded with special tokens removed.
        NOTE(review): for causal LMs the output sequence normally begins with
        the prompt tokens, so this text likely contains the prompt as well —
        confirm against the verifier's expectations.
    """
    # Encode with an explicit attention mask and move the tensors to the
    # model's device, so the call does not rely on a global default device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_length=max_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


""" Calls GPT-4 via OpenAI API and uses it as an evaluator for Mistral answers """
def call_gpt4_verifier(client, template_prompt, model_answer, real_answer):
    """Ask the OpenAI verifier model whether ``model_answer`` matches ``real_answer``.

    Args:
        client: OpenAI client used to issue the chat completion request.
        template_prompt: Verifier template containing ``{model_answer}`` and
            ``{real_answer}`` placeholders.
        model_answer: Text produced by the model under evaluation.
        real_answer: Ground-truth answer inserted into the template.

    Returns:
        The verifier's complete reply, lower-cased, with surrounding
        whitespace, quotes and full stops removed (e.g. ``'correct'``).
        An empty string is returned if the stream carried no text.
    """
    verifier_prompt = build_prompt(
        prompt=template_prompt,
        input_vars={"model_answer": model_answer, "real_answer": real_answer},
    )
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": verifier_prompt}],
        stream=True,
    )

    # Collect every text fragment of the streamed reply. Returning a single
    # fixed chunk would truncate multi-token replies and fail on short streams.
    pieces = []
    for chunk in stream:
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:  # skip chunks whose delta carries no text (None or "")
            pieces.append(piece)

    return "".join(pieces).strip().strip(".'\"").lower()


""" Evaluation """
def evaluate():
    """Evaluate the fine-tuned model on the combined MMLU test split.

    For each test question the baseline prompt is built, the model ``m``
    generates a response, and the OpenAI verifier labels that response. The
    running accuracy is printed after every question and the final accuracy
    is printed once at the end.

    Returns:
        Fraction of test questions the verifier labelled ``'correct'``.
    """
    # Read the module-level key directly: assigning to `openai_api_key` inside
    # this function would make it a local name and raise UnboundLocalError.
    openai_client = OpenAI(api_key=openai_api_key)
    test_data = load_mmlu_dataset()["test"]
    total_questions = len(test_data)
    correct_answers = 0

    # Iterate over rows instead of indexing whole columns on every step.
    for i, example in enumerate(test_data):
        prompt = get_baseline_prompt(example['question'], example['choices'])

        model_answer = get_model_response(m, prompt, 512)
        gpt_response = call_gpt4_verifier(
            openai_client,
            mmlu_verifier_template,
            model_answer,
            example['answer'],
        )
        # Check if the predicted answer is correct; tolerate case/whitespace
        # differences and a missing (None) reply. Exact match is required
        # because 'correct' is a substring of 'incorrect'.
        verdict = (gpt_response or "").strip().lower()
        if verdict == 'correct':
            correct_answers += 1
        print(correct_answers / (i + 1))

    accuracy = correct_answers / total_questions
    print(accuracy)
    return accuracy

acc = evaluate()
print(acc)




