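# Evaluate Models

Compares a fine-tuned TinyLlama, the baseline TinyLlama chat model, and MedAlpaca 7B on the multiple-choice test set. Each model generates a completion for every prompt, and accuracy is computed from the first `<answer>X</answer>` tag (X in A-D) found in the generated text.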

In [1]:
# Mount Google Drive at /content/drive. Later cells read the test set and the
# fine-tuned checkpoint from paths under /content/drive/MyDrive, so this cell
# must run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import json
# Load the JSONL test set (one JSON object per line). The context manager closes
# the file handle deterministically instead of leaking it, and blank lines (for
# example a trailing newline at end of file) are skipped so they cannot crash
# json.loads.
with open("/content/drive/MyDrive/final_mcqs_test.jsonl", encoding="utf-8") as test_file:
    test_data = [json.loads(line) for line in test_file if line.strip()]

In [4]:
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re
from tqdm import tqdm

def evaluate_model_batched(model_name_or_path, test_data, subset_size=50, batch_size=4, model_description=None):
    """Evaluate a causal LM on multiple-choice prompts and print a summary.

    Every example is a mapping with a ``"prompt"`` string and a ``"completion"``
    string; the gold label is the letter inside the first
    ``<answer>X</answer>`` tag (X in A-D) of the completion. The model greedily
    generates up to 128 new tokens per prompt, and the first
    ``<answer>X</answer>`` tag in the *generated* text is its prediction.
    Generations without such a tag are counted as skipped and as incorrect
    (they still contribute to ``total``).

    Args:
        model_name_or_path: Passed to ``from_pretrained`` for both the
            tokenizer and the model.
        test_data: Sliceable sequence of example mappings as described above.
        subset_size: Only the first ``subset_size`` examples are evaluated.
        batch_size: Number of prompts generated together per step.
        model_description: Optional label shown in the header line instead of
            ``model_name_or_path``.

    Returns:
        dict with keys ``"accuracy"`` (float or 0 when nothing was evaluated),
        ``"correct"``, ``"total"``, ``"skipped"`` (ints) and ``"distribution"``
        (dict mapping each of "A"-"D" to its predicted count).
    """
    answer_pattern = re.compile(r"<answer>([A-D])</answer>")

    print(f"\nEvaluating model: {model_description or model_name_or_path}")

    # Load tokenizer and model. Left padding keeps every prompt flush against
    # the position where generation starts, which batched generation with a
    # decoder-only model requires.
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map="auto", torch_dtype=torch.float16)
    model.eval()

    # Slice test data
    data = test_data[:subset_size]

    correct = 0
    total = 0
    num_skipped = 0
    distribution = {letter: 0 for letter in "ABCD"}

    # Process in batches
    for start in tqdm(range(0, len(data), batch_size)):
        batch = data[start:start + batch_size]
        prompts = [example["prompt"].strip() for example in batch]

        true_answers = []
        for example in batch:
            gold_match = answer_pattern.search(example["completion"])
            true_answers.append(gold_match.group(1) if gold_match else None)

        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
        # With left padding every row of the batch has the same prompt width,
        # so everything past this column is newly generated.
        prompt_len = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode only the generated tokens. Cutting the decoded string at
        # len(prompt) characters is unreliable: the decoded prompt need not
        # have the same length as the raw prompt (tokenizer normalisation,
        # truncation to max_length), which would clip the answer or let prompt
        # text leak into the answer search.
        generations = tokenizer.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True)

        for true_answer, generated in zip(true_answers, generations):
            total += 1

            pred_match = answer_pattern.search(generated.strip())
            if pred_match is None:
                num_skipped += 1
                continue

            pred_answer = pred_match.group(1)
            distribution[pred_answer] += 1

            if pred_answer == true_answer:
                correct += 1

    accuracy = correct / total if total > 0 else 0

    print(f"\nAccuracy on {total} samples: {accuracy:.2%} ({correct}/{total})")
    print(f"Skipped due to no <answer>: {num_skipped}")
    print("Answer Distribution:")
    for letter in "ABCD":
        print(f"  {letter}: {distribution[letter]}")

    return {
        "accuracy": accuracy,
        "correct": correct,
        "total": total,
        "skipped": num_skipped,
        "distribution": distribution,
    }

## Fine-Tuned TinyLlama

In [5]:
# Fine-tuned TinyLlama checkpoint stored on Drive, scored on the full test set.
model_path = "/content/drive/MyDrive/tinyllama-mcq-lora"
evaluate_model_batched(
    model_path,
    test_data,
    subset_size=len(test_data),
    batch_size=32,
    model_description='Finetuned TinyLlama',
)


Evaluating model: Finetuned TinyLlama


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

100%|██████████| 4/4 [00:26<00:00,  6.71s/it]


Accuracy on 103 samples: 33.98% (35/103)
Skipped due to no <answer>: 0
Answer Distribution:
  A: 3
  B: 0
  C: 78
  D: 22





## Baseline TinyLlama

In [6]:
# Baseline TinyLlama chat model, scored on the full test set with the same
# batch size as the fine-tuned run.
model_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
evaluate_model_batched(
    model_path,
    test_data,
    subset_size=len(test_data),
    batch_size=32,
    model_description='Baseline TinyLlama',
)


Evaluating model: Baseline TinyLlama


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

100%|██████████| 4/4 [00:21<00:00,  5.33s/it]


Accuracy on 103 samples: 14.56% (15/103)
Skipped due to no <answer>: 45
Answer Distribution:
  A: 0
  B: 58
  C: 0
  D: 0





## MedAlpaca

In [7]:
# MedAlpaca 7B, scored on the full test set. The batch size is smaller than in
# the TinyLlama runs (4 vs 32), presumably so the larger model fits in GPU
# memory (TODO confirm).
model_path = "medalpaca/medalpaca-7b"
evaluate_model_batched(
    model_path,
    test_data,
    subset_size=len(test_data),
    batch_size=4,
    model_description='MedAlpaca 7B',
)


Evaluating model: MedAlpaca 7B


tokenizer_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggin

config.json:   0%|          | 0.00/542 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.88G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/7.18G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.89G [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

100%|██████████| 26/26 [02:32<00:00,  5.87s/it]


Accuracy on 103 samples: 18.45% (19/103)
Skipped due to no <answer>: 62
Answer Distribution:
  A: 9
  B: 24
  C: 5
  D: 3



