In [1]:
!pip install transformers datasets huggingface_hub
from huggingface_hub import login
import time
# Log in to Hugging Face with your token
login("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")
from datasets import load_dataset

# Load the MMLU College Mathematics dataset
dataset = load_dataset("cais/mmlu", "college_mathematics")

# Show an example question
print(dataset['validation'][0])
# Zero-shot prompt
def zero_shot_prompt(question, options):
    """Build a plain multiple-choice prompt with no reasoning instruction.

    Args:
        question: Question text to embed in the prompt.
        options: Iterable of answer choices; they are numbered from 1.

    Returns:
        The prompt string: an instruction line, the question line, and one
        ``Option N: ...`` line per choice, each terminated by a newline.
    """
    lines = [
        "Choose the answer to the given question from below options.",
        f"Question: {question}",
    ]
    lines.extend(
        f"Option {number}: {choice}"
        for number, choice in enumerate(options, start=1)
    )
    # Every line, including the last option, ends with a newline.
    return "\n".join(lines) + "\n"

# Chain of Thought (Zero-shot) prompt
def chain_of_thought_prompt(question, options):
    """Build a multiple-choice prompt that asks for step-by-step reasoning.

    Args:
        question: Question text to embed in the prompt.
        options: Iterable of answer choices; they are numbered from 1.

    Returns:
        The prompt string: instruction, question, numbered options, and a
        closing "think step by step" instruction with no trailing newline.
    """
    option_lines = "".join(
        f"Option {position + 1}: {choice}\n"
        for position, choice in enumerate(options)
    )
    return (
        "Choose the answer to the given question from below options.\n"
        + f"Question: {question}\n"
        + option_lines
        + "Think step by step before choosing the correct answer."
    )

# ReAct prompt
def react_prompt(question, options):
    """Build a ReAct-style prompt: reason first, then act by picking an option.

    Args:
        question: Question text to embed in the prompt.
        options: Iterable of answer choices; they are numbered from 1.

    Returns:
        The prompt string: instruction, question, numbered options, and a
        closing reason-then-act instruction with no trailing newline.
    """
    parts = [
        "Reason and choose the correct answer to the question.\n",
        f"Question: {question}\n",
    ]
    number = 1
    for choice in options:
        parts.append(f"Option {number}: {choice}\n")
        number += 1
    parts.append(
        "First, reason through the problem, then take an action to select the correct answer."
    )
    return "".join(parts)
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the Gemma-2B instruction-tuned tokenizer and model.
# `token=True` makes the Hub client use the locally stored credential; it is
# the current spelling of the deprecated `use_auth_token=True` argument.
tokenizer_gemma = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=True)
model_gemma = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token=True)

def run_inference(model, tokenizer, prompt, max_new_tokens=50, return_full_text=True):
    """Generate a completion for ``prompt`` and time the generation call.

    Args:
        model: Object exposing ``generate(**encoded, max_new_tokens=...)``;
            if it has a ``device`` attribute the encoded inputs are moved there.
        tokenizer: Callable that encodes ``prompt`` (called with
            ``return_tensors="pt"``) and exposes ``decode``.
        prompt: Prompt text to feed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        return_full_text: When True (default, the historical behaviour) the
            decoded text covers everything ``generate`` returned, which in the
            recorded runs of this notebook starts with the prompt itself. Pass
            False to decode only the tokens that follow the prompt.

    Returns:
        Tuple ``(response, inference_time)``: the decoded text and the number
        of seconds spent inside ``model.generate``.
    """
    inputs = tokenizer(prompt, return_tensors="pt")

    # Keep inputs on the same device as the model so generation also works
    # when the model has been moved off the CPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    # Pass the whole encoding (not just input_ids) so the attention mask
    # produced by the tokenizer reaches generate().
    start_time = time.perf_counter()
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    inference_time = time.perf_counter() - start_time

    sequence = outputs[0]
    if not return_full_text:
        # Drop the leading prompt tokens, keeping only the continuation.
        prompt_length = len(inputs["input_ids"][0])
        sequence = sequence[prompt_length:]

    response = tokenizer.decode(sequence, skip_special_tokens=True)
    return response, inference_time
# Smoke test: run every prompt style once on the first validation example
sample = dataset['validation'][0]
question, options, correct_answer = (
    sample['question'],
    sample['choices'],
    sample['answer'],
)

# Build one prompt per prompting strategy
zero_shot = zero_shot_prompt(question, options)
chain_of_thought = chain_of_thought_prompt(question, options)
react = react_prompt(question, options)

# (display name, model, tokenizer) triples and (display name, prompt) pairs
models = [("Gemma-2B", model_gemma, tokenizer_gemma)]
prompts = [
    ("Zero-shot", zero_shot),
    ("Chain of Thought", chain_of_thought),
    ("ReAct", react),
]

# Generate with each model/prompt combination and report text plus latency
for model_name, model, tokenizer in models:
    print(f"Evaluating {model_name}:")
    for prompt_name, prompt in prompts:
        response, inference_time = run_inference(model, tokenizer, prompt)
        print(
            f"Prompt Type: {prompt_name}",
            f"Response: {response}",
            f"Inference Time: {inference_time:.2f} seconds\n",
            sep="\n",
        )


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'question': 'If a polynomial f(x) over the real numbers has the complex numbers 2 + i and 1 - i as roots, then f(x) could be', 'subject': 'college_mathematics', 'choices': ['x^3 + 5x^2 + 4x + 1', 'x^4 - 6x^3 + 15x^2 - 18x + 10', 'x^3 - x^2 + 4x + 1', 'x^4 + 7x^2 + 10'], 'answer': 1}


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating Gemma-2B:
Prompt Type: Zero-shot
Response: Choose the answer to the given question from below options.
Question: If a polynomial f(x) over the real numbers has the complex numbers 2 + i and 1 - i as roots, then f(x) could be
Option 1: x^3 + 5x^2 + 4x + 1
Option 2: x^4 - 6x^3 + 15x^2 - 18x + 10
Option 3: x^3 - x^2 + 4x + 1
Option 4: x^4 + 7x^2 + 10
Correct answer: Option 2: x^4 - 6x^3 + 15x^2 - 18x + 10

Explanation: The complex numbers 2 + i and 1 - i are roots of
Inference Time: 54.61 seconds

Prompt Type: Chain of Thought
Response: Choose the answer to the given question from below options.
Question: If a polynomial f(x) over the real numbers has the complex numbers 2 + i and 1 - i as roots, then f(x) could be
Option 1: x^3 + 5x^2 + 4x + 1
Option 2: x^4 - 6x^3 + 15x^2 - 18x + 10
Option 3: x^3 - x^2 + 4x + 1
Option 4: x^4 + 7x^2 + 10
Think step by step before choosing the correct answer.

Answer: Option 1: x^3 + 5x^2 + 4x + 1

Explanation: The complex numbers 2 + i and 1 - 

In [2]:
# Function to evaluate model accuracy based on the output
def evaluate_output(model_output, correct_answer_index, options, prompt=None):
    """Report whether the model output contains the correct option's text.

    Args:
        model_output: Text produced for one question.
        correct_answer_index: Index into ``options`` of the correct choice.
        options: Sequence of answer-choice strings.
        prompt: Optional prompt that produced ``model_output``. When given and
            ``model_output`` starts with it, that echoed prefix is ignored
            during matching. The prompts built in this notebook list every
            option, so an echoed prompt always contains the correct answer and
            would make the check pass unconditionally. With the default
            ``None`` the whole output is searched (historical behaviour).

    Returns:
        True if the correct option text occurs in the searched text, else False.
    """
    # Retrieve the correct answer based on the index
    correct_answer = options[correct_answer_index]

    # Only the part after an echoed prompt reflects the model's own answer.
    searched_text = model_output
    if prompt and model_output.startswith(prompt):
        searched_text = model_output[len(prompt):]

    print(f"Model Output: {model_output}")
    print(f"Correct Answer: {correct_answer}")
    print(f"Options: {options}")

    # Plain substring check: the option text must appear verbatim.
    if correct_answer in searched_text:
        print(f"Match Found: {correct_answer} in {model_output}")
        return True

    print("No match found in model output.")
    return False

# Function to run evaluation on multiple questions
def evaluate_multiple_samples(model, tokenizer, dataset, num_samples=10, max_new_tokens=50):
    """Score each prompt style on the first validation samples.

    For every sample, builds the zero-shot, chain-of-thought and ReAct
    prompts, runs ``run_inference`` on each, and checks the result with
    ``evaluate_output``.

    Args:
        model: Model handed to ``run_inference``.
        tokenizer: Tokenizer handed to ``run_inference``.
        dataset: Mapping with a ``'validation'`` split whose items provide
            ``'question'``, ``'choices'`` and ``'answer'``.
        num_samples: Maximum number of validation samples to evaluate.
        max_new_tokens: Generation budget forwarded to ``run_inference``.

    Returns:
        Tuple ``(avg_accuracy, avg_inference_time)`` of dicts keyed by prompt
        type. Both hold 0.0 for every prompt type when no sample is evaluated.
    """
    prompt_names = ("Zero-shot", "Chain of Thought", "ReAct")
    total_correct = {name: 0 for name in prompt_names}
    total_inference_time = {name: 0.0 for name in prompt_names}

    validation = dataset['validation']
    # Limit to the available validation samples; never go below zero.
    sample_count = max(0, min(num_samples, len(validation)))

    # Nothing to average: return zeros instead of dividing by zero.
    if sample_count == 0:
        return (
            {name: 0.0 for name in prompt_names},
            {name: 0.0 for name in prompt_names},
        )

    for i in range(sample_count):
        sample = validation[i]
        question = sample['question']
        options = sample['choices']
        correct_answer_index = sample['answer']

        print(f"Sample {i+1}/{sample_count} - Question: {question}")
        print(f"Correct Answer Index: {correct_answer_index}, Options: {options}\n")

        # Generate prompts for the current question
        prompts = [
            ("Zero-shot", zero_shot_prompt(question, options)),
            ("Chain of Thought", chain_of_thought_prompt(question, options)),
            ("ReAct", react_prompt(question, options)),
        ]

        for prompt_name, prompt in prompts:
            # Run inference and get response and inference time
            response, inference_time = run_inference(model, tokenizer, prompt, max_new_tokens=max_new_tokens)

            # Log prompt and model response for debugging
            print(f"Prompt Type: {prompt_name}")
            print(f"Prompt: {prompt}")
            print(f"Model Response: {response}")
            print(f"Inference Time: {inference_time:.2f} seconds\n")

            # The response may begin with the prompt, and the prompt lists
            # every option, so scoring the raw response would always find the
            # correct answer. Score only what follows the echoed prompt; if
            # the response does not start with the prompt, score it unchanged.
            answer_text = response[len(prompt):] if response.startswith(prompt) else response

            # Evaluate correctness of the answer using the index
            is_correct = evaluate_output(answer_text, correct_answer_index, options)

            # Accumulate results for the specific prompt type
            total_correct[prompt_name] += int(is_correct)
            total_inference_time[prompt_name] += inference_time

    # Calculate average accuracy and inference time for each prompt type
    avg_accuracy = {name: total_correct[name] / sample_count for name in prompt_names}
    avg_inference_time = {name: total_inference_time[name] / sample_count for name in prompt_names}

    return avg_accuracy, avg_inference_time

# Evaluate the gemma-2b-it model on the first validation samples
num_samples = 10  # Number of validation questions to score
models = [("gemma-2b-it", model_gemma, tokenizer_gemma)]

for model_name, model, tokenizer in models:
    print(f"Evaluating {model_name} over {num_samples} samples:")

    avg_accuracy, avg_inference_time = evaluate_multiple_samples(
        model, tokenizer, dataset, num_samples=num_samples
    )

    # Report the per-prompt-type averages returned by the evaluation
    for prompt in avg_accuracy:
        print(
            f"Prompt Type: {prompt}",
            f"Average Accuracy: {avg_accuracy[prompt] * 100:.2f}%",
            f"Average Inference Time: {avg_inference_time[prompt]:.2f} seconds\n",
            sep="\n",
        )


Evaluating gemma-2b-it over 10 samples:
Sample 1/10 - Question: If a polynomial f(x) over the real numbers has the complex numbers 2 + i and 1 - i as roots, then f(x) could be
Correct Answer Index: 1, Options: ['x^3 + 5x^2 + 4x + 1', 'x^4 - 6x^3 + 15x^2 - 18x + 10', 'x^3 - x^2 + 4x + 1', 'x^4 + 7x^2 + 10']

Prompt Type: Zero-shot
Prompt: Choose the answer to the given question from below options.
Question: If a polynomial f(x) over the real numbers has the complex numbers 2 + i and 1 - i as roots, then f(x) could be
Option 1: x^3 + 5x^2 + 4x + 1
Option 2: x^4 - 6x^3 + 15x^2 - 18x + 10
Option 3: x^3 - x^2 + 4x + 1
Option 4: x^4 + 7x^2 + 10

Model Response: Choose the answer to the given question from below options.
Question: If a polynomial f(x) over the real numbers has the complex numbers 2 + i and 1 - i as roots, then f(x) could be
Option 1: x^3 + 5x^2 + 4x + 1
Option 2: x^4 - 6x^3 + 15x^2 - 18x + 10
Option 3: x^3 - x^2 + 4x + 1
Option 4: x^4 + 7x^2 + 10
Correct answer: Option 2: x^4