In [1]:
!pip install transformers datasets huggingface_hub
from datasets import load_dataset
!pip install bitsandbytes

# Load the MMLU College Mathematics dataset
dataset = load_dataset("cais/mmlu", "college_mathematics")

import os

from huggingface_hub import login
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
import torch

# SECURITY: never embed a Hugging Face access token in the notebook source.
# The token is read from the HF_TOKEN environment variable; when it is not
# set, `login` receives None and prompts for the token interactively.
# NOTE(review): a literal token used to be hard-coded on this line. It has been
# shared with everyone who can read this file and should be revoked at
# https://huggingface.co/settings/tokens.
login(token=os.environ.get("HF_TOKEN"))
# Zero-shot prompt
def zero_shot_prompt(question, options):
    """Build a plain multiple-choice prompt with no reasoning instruction.

    Args:
        question: Question text inserted after ``"Question: "``.
        options: Iterable of answer choices; they are listed one per line and
            numbered from 1 (``"Option 1: ..."``).

    Returns:
        The prompt string: instruction line, question line, then one line per
        option, each terminated by a newline.
    """
    header = "Choose the answer to the given question from below options.\n"
    numbered_options = "".join(
        f"Option {number}: {choice}\n"
        for number, choice in enumerate(options, start=1)
    )
    return header + f"Question: {question}\n" + numbered_options

# Chain of Thought (Zero-shot) prompt
def chain_of_thought_prompt(question, options):
    """Build a multiple-choice prompt that asks for step-by-step reasoning.

    Args:
        question: Question text inserted after ``"Question: "``.
        options: Iterable of answer choices, listed one per line and numbered
            from 1.

    Returns:
        The prompt string. It ends with the "Think step by step" instruction
        and has no trailing newline.
    """
    parts = [
        "Choose the answer to the given question from below options.\n",
        f"Question: {question}\n",
    ]
    for number, choice in enumerate(options, start=1):
        parts.append(f"Option {number}: {choice}\n")
    parts.append("Think step by step before choosing the correct answer.")
    return "".join(parts)

# ReAct prompt
def react_prompt(question, options):
    """Build a multiple-choice prompt that asks the model to reason, then act.

    Args:
        question: Question text inserted after ``"Question: "``.
        options: Iterable of answer choices, listed one per line and numbered
            from 1.

    Returns:
        The prompt string. It ends with the reason-then-act instruction and
        has no trailing newline.
    """
    segments = ["Reason and choose the correct answer to the question.\n"]
    segments.append(f"Question: {question}\n")
    segments += [
        f"Option {position + 1}: {text}\n"
        for position, text in enumerate(options)
    ]
    segments.append(
        "First, reason through the problem, then take an action to select the correct answer."
    )
    return "".join(segments)
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Pick CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set up 8-bit quantization to reduce memory usage
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the Phi model with the 8-bit quantization config.
# NOTE(review): nothing here moves the model to `device` explicitly; where the
# weights end up is decided by `from_pretrained` — confirm if it matters.
model_phi = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3.5-mini-instruct",
    quantization_config=bnb_config,
    token=True  # Use `token` instead of deprecated `use_auth_token`
)

# Load the tokenizer
tokenizer_phi = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct", token=True)


# Fall back to the EOS token for padding only when the tokenizer does not
# define a pad token of its own, so an existing pad token is not overwritten.
if tokenizer_phi.pad_token is None:
    tokenizer_phi.pad_token = tokenizer_phi.eos_token

def run_inference(model, tokenizer, prompt, max_new_tokens=50):
    """Generate text for ``prompt`` and measure how long ``generate`` takes.

    Args:
        model: Object exposing ``generate(...)`` and a ``device`` attribute
            (as Hugging Face causal LMs do).
        tokenizer: Callable tokenizer that also provides ``decode`` and
            ``eos_token_id``.
        prompt: Prompt text to tokenize and feed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A ``(response, inference_time)`` tuple. ``response`` is the first
        output sequence decoded in full with special tokens skipped; in this
        notebook's output that text starts with an echo of the prompt.
        ``inference_time`` is the duration of the ``generate`` call in
        seconds (tokenization and decoding are not timed).
    """
    # Imported locally so the function does not rely on a module-level
    # `import time` that appears later in the file than this definition.
    import time

    # Tokenize the input; the encoding also carries the attention mask.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    # Follow the model's own device instead of a module-level global, so the
    # inputs always land where the weights are.
    inputs = inputs.to(model.device)

    # perf_counter is monotonic, which makes it the right clock for intervals.
    start_time = time.perf_counter()
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # Passed explicitly to avoid warnings
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,  # Pad with EOS during generation
    )
    inference_time = time.perf_counter() - start_time

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response, inference_time
import time

# Use the first validation example as a quick smoke test.
sample = dataset['validation'][0]
question, options, correct_answer = (
    sample['question'],
    sample['choices'],
    sample['answer'],
)

# One prompt per prompting strategy.
zero_shot = zero_shot_prompt(question, options)
chain_of_thought = chain_of_thought_prompt(question, options)
react = react_prompt(question, options)

# (display name, model, tokenizer) triples to evaluate.
models = [("Phi-3.5-mini-instruct", model_phi, tokenizer_phi)]

# (display name, prompt text) pairs, in the order they are run.
prompts = [
    ("Zero-shot", zero_shot),
    ("Chain of Thought", chain_of_thought),
    ("ReAct", react),
]

# Run every prompt through every model and report the response and timing.
for model_name, model, tokenizer in models:
    print(f"Evaluating {model_name}:")
    for prompt_name, prompt in prompts:
        response, inference_time = run_inference(model, tokenizer, prompt)
        print(
            f"Prompt Type: {prompt_name}",
            f"Response: {response}",
            f"Inference Time: {inference_time:.2f} seconds\n",
            sep="\n",
        )



Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 474.3/474.3 kB 24.2 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 11.4 MB/s eta 0:00:00
Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/53.2k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/138k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/16.6k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/5.00k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Evaluating Phi-3.5-mini-instruct:


You are not running the flash-attention implementation, expect numerical differences.


Prompt Type: Zero-shot
Response: Choose the answer to the given question from below options.
Question: If a polynomial f(x) over the real numbers has the complex numbers 2 + i and 1 - i as roots, then f(x) could be
Option 1: x^3 + 5x^2 + 4x + 1
Option 2: x^4 - 6x^3 + 15x^2 - 18x + 10
Option 3: x^3 - x^2 + 4x + 1
Option 4: x^4 + 7x^2 + 10
Answer: Option 2

Explanation:
If a polynomial has complex roots, then its conjugate must also be a root. This is because the coefficients of the polynomial are real numbers, and complex roots always come in conjugate pairs
Inference Time: 16.52 seconds

Prompt Type: Chain of Thought
Response: Choose the answer to the given question from below options.
Question: If a polynomial f(x) over the real numbers has the complex numbers 2 + i and 1 - i as roots, then f(x) could be
Option 1: x^3 + 5x^2 + 4x + 1
Option 2: x^4 - 6x^3 + 15x^2 - 18x + 10
Option 3: x^3 - x^2 + 4x + 1
Option 4: x^4 + 7x^2 + 10
Think step by step before choosing the correct answer.

An

In [8]:
# Display the first validation record. Its `answer` field is used elsewhere in
# this notebook as an index into `choices`.
dataset['validation'][0]

{'question': 'If a polynomial f(x) over the real numbers has the complex numbers 2 + i and 1 - i as roots, then f(x) could be',
 'subject': 'college_mathematics',
 'choices': ['x^3 + 5x^2 + 4x + 1',
  'x^4 - 6x^3 + 15x^2 - 18x + 10',
  'x^3 - x^2 + 4x + 1',
  'x^4 + 7x^2 + 10'],
 'answer': 1}

In [10]:
# Function to evaluate model accuracy based on the output
def evaluate_output(model_output, correct_answer_index, options):
    """Decide whether ``model_output`` names the correct option.

    The decoded model output in this notebook starts with an echo of the
    prompt, and the prompt lists every option verbatim. Searching the whole
    output for the correct answer text would therefore always succeed. To
    avoid that, everything up to and including the echoed option listing
    (``"Option 1: ...\\n"`` ... ``"Option N: ...\\n"``, the format produced by
    the prompt builders) is discarded before matching. If no such listing is
    present, the whole output is inspected.

    The remaining text counts as correct when it contains either the correct
    answer text or its 1-based label (``"Option <index + 1>"``).

    NOTE: this is still a lenient heuristic. A response that mentions the
    correct option while ultimately choosing another one is counted as correct.

    Args:
        model_output: Decoded text returned by the model.
        correct_answer_index: 0-based index of the correct entry in ``options``.
        options: Sequence of answer choice strings.

    Returns:
        ``True`` if the correct option is named in the model's own text,
        ``False`` otherwise.

    Raises:
        IndexError: If ``correct_answer_index`` is out of range for ``options``.
    """
    import re

    # Retrieve the correct answer based on the index
    correct_answer = options[correct_answer_index]

    print(f"Model Output: {model_output}")
    print(f"Correct Answer: {correct_answer}")
    print(f"Options: {options}")

    # Drop the echoed prompt: keep only what follows the full option listing.
    echoed_listing = "".join(
        f"Option {number}: {option}\n"
        for number, option in enumerate(options, start=1)
    )
    _, listing_found, continuation = model_output.partition(echoed_listing)
    answer_text = continuation if listing_found else model_output

    # Match the 1-based label as a whole number so "Option 1" != "Option 12".
    label_pattern = re.compile(
        rf"\bOption\s+{correct_answer_index + 1}\b", re.IGNORECASE
    )

    if correct_answer in answer_text or label_pattern.search(answer_text):
        print(f"Match Found: {correct_answer} in {model_output}")
        return True

    print("No match found in model output.")
    return False

# Function to run evaluation on multiple questions
def evaluate_multiple_samples(model, tokenizer, dataset, num_samples=10, max_new_tokens=50, split="validation"):
    """Evaluate the three prompt styles on the first examples of a split.

    For each example, a zero-shot, a chain-of-thought and a ReAct prompt are
    built, run through ``run_inference`` and scored with ``evaluate_output``.

    Args:
        model: Model passed through to ``run_inference``.
        tokenizer: Tokenizer passed through to ``run_inference``.
        dataset: Mapping from split name to an indexable collection of
            examples with ``'question'``, ``'choices'`` and ``'answer'`` keys.
        num_samples: Maximum number of examples to evaluate; capped at the
            size of the split.
        max_new_tokens: Generation budget forwarded to ``run_inference``.
        split: Name of the split to read examples from.

    Returns:
        ``(avg_accuracy, avg_inference_time)``: two dicts keyed by prompt type
        (``"Zero-shot"``, ``"Chain of Thought"``, ``"ReAct"``). When no example
        is evaluated (empty split or ``num_samples <= 0``) every value is
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    prompt_types = ("Zero-shot", "Chain of Thought", "ReAct")
    total_correct = dict.fromkeys(prompt_types, 0)
    total_inference_time = dict.fromkeys(prompt_types, 0.0)

    examples = dataset[split]
    # Limit to the available samples; never go below zero.
    sample_count = max(0, min(num_samples, len(examples)))

    if sample_count == 0:
        # Nothing to average over: report zeros rather than dividing by zero.
        return dict.fromkeys(prompt_types, 0.0), dict.fromkeys(prompt_types, 0.0)

    for i in range(sample_count):
        sample = examples[i]
        question = sample['question']
        options = sample['choices']
        correct_answer_index = sample['answer']

        print(f"Sample {i+1}/{sample_count} - Question: {question}")
        print(f"Correct Answer Index: {correct_answer_index}, Options: {options}\n")

        # Generate prompts for the current question
        prompts = [
            ("Zero-shot", zero_shot_prompt(question, options)),
            ("Chain of Thought", chain_of_thought_prompt(question, options)),
            ("ReAct", react_prompt(question, options)),
        ]

        for prompt_name, prompt in prompts:
            # Run inference and get response and inference time
            response, inference_time = run_inference(model, tokenizer, prompt, max_new_tokens=max_new_tokens)

            # Log prompt and model response for debugging
            print(f"Prompt Type: {prompt_name}")
            print(f"Prompt: {prompt}")
            print(f"Model Response: {response}")
            print(f"Inference Time: {inference_time:.2f} seconds\n")

            # Evaluate correctness of the response using the index
            is_correct = evaluate_output(response, correct_answer_index, options)

            # Accumulate results for the specific prompt type
            # (a bool adds as 0 or 1).
            total_correct[prompt_name] += is_correct
            total_inference_time[prompt_name] += inference_time

    # Calculate average accuracy and inference time for each prompt type
    avg_accuracy = {name: total_correct[name] / sample_count for name in prompt_types}
    avg_inference_time = {name: total_inference_time[name] / sample_count for name in prompt_types}

    return avg_accuracy, avg_inference_time

# Run evaluation on up to `num_samples` validation samples for the
# Phi-3.5-mini-instruct model
models = [("Phi-3.5-mini-instruct", model_phi, tokenizer_phi)]
num_samples = 10  # Set the number of samples to evaluate

# evaluate_multiple_samples caps the request at the size of the validation
# split, so report the number that will actually be evaluated rather than the
# number requested.
evaluated_samples = min(num_samples, len(dataset['validation']))

for model_name, model, tokenizer in models:
    print(f"Evaluating {model_name} over {evaluated_samples} samples:")

    avg_accuracy, avg_inference_time = evaluate_multiple_samples(model, tokenizer, dataset, num_samples=num_samples)

    # Print average accuracy and inference time for each prompt type
    for prompt in avg_accuracy:
        print(f"Prompt Type: {prompt}")
        print(f"Average Accuracy: {avg_accuracy[prompt] * 100:.2f}%")
        print(f"Average Inference Time: {avg_inference_time[prompt]:.2f} seconds\n")


Evaluating Phi-3.5-mini-instruct over 20 samples:
Sample 1/11 - Question: If a polynomial f(x) over the real numbers has the complex numbers 2 + i and 1 - i as roots, then f(x) could be
Correct Answer Index: 1, Options: ['x^3 + 5x^2 + 4x + 1', 'x^4 - 6x^3 + 15x^2 - 18x + 10', 'x^3 - x^2 + 4x + 1', 'x^4 + 7x^2 + 10']

Prompt Type: Zero-shot
Prompt: Choose the answer to the given question from below options.
Question: If a polynomial f(x) over the real numbers has the complex numbers 2 + i and 1 - i as roots, then f(x) could be
Option 1: x^3 + 5x^2 + 4x + 1
Option 2: x^4 - 6x^3 + 15x^2 - 18x + 10
Option 3: x^3 - x^2 + 4x + 1
Option 4: x^4 + 7x^2 + 10

Model Response: Choose the answer to the given question from below options.
Question: If a polynomial f(x) over the real numbers has the complex numbers 2 + i and 1 - i as roots, then f(x) could be
Option 1: x^3 + 5x^2 + 4x + 1
Option 2: x^4 - 6x^3 + 15x^2 - 18x + 10
Option 3: x^3 - x^2 + 4x + 1
Option 4: x^4 + 7x^2 + 10
Answer: Option 2

E