In [1]:
elementary_qa = [
    {
        "question": "What is 5 plus 3?",
        "choices": ["6", "7", "8", "9"],
        "answer": 2,
        "reference_answer": "The sum of 5 and 3 is 8."
    },
    {
        "question": "What color do you get when you mix red and yellow?",
        "choices": ["Orange", "Purple", "Green", "Pink"],
        "answer": 0,
        "reference_answer": "Mixing red and yellow gives the color orange."
    },
    {
        "question": "Which planet do we live on?",
        "choices": ["Mars", "Venus", "Earth", "Jupiter"],
        "answer": 2,
        "reference_answer": "We live on the planet Earth."
    },
    {
        "question": "How many legs does a spider have?",
        "choices": ["Six", "Eight", "Ten", "Four"],
        "answer": 1,
        "reference_answer": "A spider has eight legs."
    },
    {
        "question": "What do bees make?",
        "choices": ["Milk", "Honey", "Wax", "Butter"],
        "answer": 1,
        "reference_answer": "Bees make honey."
    },
    {
        "question": "Which animal is known as the King of the Jungle?",
        "choices": ["Tiger", "Elephant", "Lion", "Bear"],
        "answer": 2,
        "reference_answer": "The lion is known as the King of the Jungle."
    },
    {
        "question": "What is the opposite of hot?",
        "choices": ["Warm", "Cold", "Cool", "Burning"],
        "answer": 1,
        "reference_answer": "The opposite of hot is cold."
    },
    {
        "question": "How many days are there in a week?",
        "choices": ["5", "6", "7", "8"],
        "answer": 2,
        "reference_answer": "There are seven days in a week."
    },
    {
        "question": "What do you call a baby dog?",
        "choices": ["Kitten", "Cub", "Puppy", "Calf"],
        "answer": 2,
        "reference_answer": "A baby dog is called a puppy."
    },
    {
        "question": "Which shape has 3 sides?",
        "choices": ["Square", "Rectangle", "Circle", "Triangle"],
        "answer": 3,
        "reference_answer": "A triangle is a shape with three sides."
    },
    {
        "question": "Which season comes after winter?",
        "choices": ["Spring", "Summer", "Fall", "Monsoon"],
        "answer": 0,
        "reference_answer": "The season that comes after winter is spring."
    },
    {
        "question": "What do we use to write on a blackboard?",
        "choices": ["Pencil", "Pen", "Chalk", "Crayon"],
        "answer": 2,
        "reference_answer": "We use chalk to write on a blackboard."
    },
    {
        "question": "What is H2O commonly known as?",
        "choices": ["Salt", "Water", "Oxygen", "Ice"],
        "answer": 1,
        "reference_answer": "H2O is commonly known as water."
    },
    {
        "question": "How many hours are there in a day?",
        "choices": ["12", "24", "30", "36"],
        "answer": 1,
        "reference_answer": "There are 24 hours in a day."
    },
    {
        "question": "Which direction does the sun rise from?",
        "choices": ["West", "North", "East", "South"],
        "answer": 2,
        "reference_answer": "The sun rises in the east."
    },
    {
        "question": "What part of the body helps us see?",
        "choices": ["Ears", "Eyes", "Nose", "Mouth"],
        "answer": 1,
        "reference_answer": "The eyes help us to see."
    },
    {
        "question": "How many fingers do most people have?",
        "choices": ["Eight", "Ten", "Twelve", "Nine"],
        "answer": 1,
        "reference_answer": "Most people have ten fingers."
    },
    {
        "question": "Which of these is a fruit?",
        "choices": ["Potato", "Carrot", "Apple", "Onion"],
        "answer": 2,
        "reference_answer": "An apple is a fruit."
    },
    {
        "question": "What is the capital of India?",
        "choices": ["Mumbai", "Delhi", "Kolkata", "Chennai"],
        "answer": 1,
        "reference_answer": "The capital of India is New Delhi."
    },
    {
        "question": "What gas do humans need to breathe?",
        "choices": ["Carbon dioxide", "Oxygen", "Hydrogen", "Nitrogen"],
        "answer": 1,
        "reference_answer": "Humans need oxygen to breathe."
    }
]

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import string

# ----- CONFIGURATION -----
GEN_MODEL = "deepseek-ai/deepseek-coder-1.3b-instruct"   # For generation
MATCH_MODEL = "Qwen/Qwen1.5-0.5B"    # For answer matching
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ----- MODEL LOADING -----
def load_model(name):
    tokenizer = AutoTokenizer.from_pretrained(name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(name).to(DEVICE)
    return tokenizer, model

tokenizer_gen, model_gen = load_model(GEN_MODEL)
tokenizer_match, model_match = load_model(MATCH_MODEL)

# ----- PROMPTS -----
def mcq_prompt(q, opts):
    opts_text = "\n".join([f"{string.ascii_uppercase[i]}. {c}" for i, c in enumerate(opts)])
    return f"Q: {q}\nOptions:\n{opts_text}\nAnswer:"

def gen_prompt(q):
    return f"Q: {q}\nAnswer:"

# ----- GENERATION FUNCTION -----
def generate(model, tokenizer, prompt, max_new=1):
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    outputs = model.generate(**inputs, max_new_tokens=max_new, do_sample=False)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text.split("Answer:")[-1].split("\n")[0].strip()

# ----- MCQ Label Extraction -----
def mcq_label(pred, opts):
    p = pred.strip()
    if p and p[0] in string.ascii_uppercase:
        idx = ord(p[0]) - ord("A")
        if 0 <= idx < len(opts):
            return idx
    for i, o in enumerate(opts):
        if o.lower() in p.lower():
            return i
    return -1

# ----- LLM Matcher -----
def match_llm(q, ref, ans):
    prompt = (
        f"You are a strict grader. Determine if the candidate answer is semantically equivalent to the correct answer.\n\n"
        f"Question: {q}\n"
        f"Correct Answer: {ref}\n"
        f"Candidate Answer: {ans}\n\n"
        f"Is the candidate answer correct? Answer only with 'Yes' or 'No'.\n"
        f"Answer:"
    )
    out = generate(model_match, tokenizer_match, prompt, max_new=4).strip().lower()
    return out.startswith("yes")

# ----- EVALUATION -----
results = {
    "mcq_acc": 0,
    "match_acc": 0
}

for ex in tqdm(elementary_qa):
    q = ex["question"]
    opts = ex["choices"]
    correct = ex["answer"]
    ref = ex["reference_answer"]

    # MCQ - generate only 1 token (letter or short)
    mcq_out = generate(model_gen, tokenizer_gen, mcq_prompt(q, opts), max_new=1)

    # Answer Matching - full sentence
    gen_out = generate(model_gen, tokenizer_gen, gen_prompt(q), max_new=64)

    print(f"\nQ: {q}")
    print(f"MCQ model output: {mcq_out} | Correct: {opts[correct]}")
    print(f"Generated answer: {gen_out}")
    print(f"Reference answer: {ref}")

    if mcq_label(mcq_out, opts) == correct:
        results["mcq_acc"] += 1

    if match_llm(q, ref, gen_out):
        results["match_acc"] += 1

results["mcq_acc"] /= len(elementary_qa)
results["match_acc"] /= len(elementary_qa)

print("\nFinal Results:")
print(f"MCQ Accuracy: {results['mcq_acc']*100:.1f}%")
print(f"Answer Matching Accuracy: {results['match_acc']*100:.1f}%")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  0%|          | 0/20 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: What is 5 plus 3?
MCQ model output:  | Correct: 8
Generated answer: 5 + 3 = 8
Reference answer: The sum of 5 and 3 is 8.


  5%|▌         | 1/20 [00:47<14:55, 47.12s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: What color do you get when you mix red and yellow?
MCQ model output:  | Correct: Orange
Generated answer: The color you get when you mix red and yellow is orange.
Reference answer: Mixing red and yellow gives the color orange.


 10%|█         | 2/20 [01:42<15:35, 51.97s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: Which planet do we live on?
MCQ model output: C | Correct: Earth
Generated answer: We live on the Earth.
Reference answer: We live on the planet Earth.


 15%|█▌        | 3/20 [02:28<14:00, 49.45s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: How many legs does a spider have?
MCQ model output:  | Correct: Eight
Generated answer: A spider has 8 legs.
Reference answer: A spider has eight legs.


 20%|██        | 4/20 [03:15<12:55, 48.44s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: What do bees make?
MCQ model output:  | Correct: Honey
Generated answer: Bees do not make food. They are insects that eat other insects.
Reference answer: Bees make honey.


 25%|██▌       | 5/20 [03:44<10:21, 41.41s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: Which animal is known as the King of the Jungle?
MCQ model output: C | Correct: Lion
Generated answer: Elephant
Reference answer: The lion is known as the King of the Jungle.


 30%|███       | 6/20 [04:31<10:03, 43.12s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: What is the opposite of hot?
MCQ model output:  | Correct: Cold
Generated answer: cold
Reference answer: The opposite of hot is cold.


 35%|███▌      | 7/20 [05:22<09:56, 45.87s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: How many days are there in a week?
MCQ model output:  | Correct: 7
Generated answer: There are 7 days in a week.
Reference answer: There are seven days in a week.


 40%|████      | 8/20 [05:48<07:54, 39.52s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: What do you call a baby dog?
MCQ model output: A | Correct: Puppy
Generated answer: Baby Dog
Reference answer: A baby dog is called a puppy.


 45%|████▌     | 9/20 [06:33<07:32, 41.11s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: Which shape has 3 sides?
MCQ model output: B | Correct: Triangle
Generated answer: The shape with 3 sides is a triangle.
Reference answer: A triangle is a shape with three sides.


 50%|█████     | 10/20 [07:01<06:12, 37.21s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: Which season comes after winter?
MCQ model output:  | Correct: Spring
Generated answer: The season that comes after winter is Autumn.
Reference answer: The season that comes after winter is spring.


 55%|█████▌    | 11/20 [07:31<05:14, 34.98s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: What do we use to write on a blackboard?
MCQ model output: C | Correct: Chalk
Generated answer: Blackboard
Reference answer: We use chalk to write on a blackboard.


 60%|██████    | 12/20 [08:16<05:04, 38.11s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: What is H2O commonly known as?
MCQ model output: C | Correct: Water
Generated answer: H2O is often referred to as "Hydrogen" or "Water" in the scientific community. It is a chemical compound that is the primary constituent of water.
Reference answer: H2O is commonly known as water.


 65%|██████▌   | 13/20 [09:01<04:41, 40.19s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: How many hours are there in a day?
MCQ model output: C | Correct: 24
Generated answer: There are 24 hours in a day.
Reference answer: There are 24 hours in a day.


 70%|███████   | 14/20 [09:32<03:44, 37.38s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: Which direction does the sun rise from?
MCQ model output: C | Correct: East
Generated answer: The sun rises from the west
Reference answer: The sun rises in the east.


 75%|███████▌  | 15/20 [10:18<03:18, 39.77s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: What part of the body helps us see?
MCQ model output: C | Correct: Eyes
Generated answer: The brain helps us see.
Reference answer: The eyes help us to see.


 80%|████████  | 16/20 [11:04<02:46, 41.65s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: How many fingers do most people have?
MCQ model output: C | Correct: Ten
Generated answer: As an AI, I don't have personal experiences or emotions, so I can't provide a specific number. However, it's generally considered that most people have around 5-6 fingers.
Reference answer: Most people have ten fingers.


 85%|████████▌ | 17/20 [11:47<02:06, 42.27s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: Which of these is a fruit?
MCQ model output: C | Correct: Apple
Generated answer: Neither of these is a fruit.
Reference answer: An apple is a fruit.


 90%|█████████ | 18/20 [12:29<01:24, 42.21s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: What is the capital of India?
MCQ model output: C | Correct: Delhi
Generated answer: New Delhi
Reference answer: The capital of India is New Delhi.


 95%|█████████▌| 19/20 [13:13<00:42, 42.55s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Q: What gas do humans need to breathe?
MCQ model output: C | Correct: Oxygen
Generated answer: A human
Reference answer: Humans need oxygen to breathe.


100%|██████████| 20/20 [13:57<00:00, 41.85s/it]


Final Results:
MCQ Accuracy: 25.0%
Answer Matching Accuracy: 45.0%



