In [2]:
# Import Libraries
import os
import json
import yaml
import logging
from tqdm import tqdm
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

In [3]:
# Root logger: let only warnings and more severe records through.
logging.getLogger().setLevel(logging.WARNING)

# The Azure SDK loggers are raised to ERROR so that their per-request
# output does not drown the notebook's own output.
for azure_logger_name in (
    "azure",
    "azure.identity",
    "azure.core.pipeline.policies.http_logging_policy",
):
    logging.getLogger(azure_logger_name).setLevel(logging.ERROR)

In [None]:
# Read the Azure ML workspace configuration from config.yml.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open("config.yml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Azure ML workspace configuration
subscription_id = config["subscription_id"]
resource_group = config["resource_group"]
workspace_name = config["workspace_name"]

# Baseline endpoint configuration
baseline_endpoint_name = config["baseline_endpoint_name"]

# Fine-tuned model real-time endpoint configuration
endpoint_name = config["endpoint_name"]

In [None]:
# Uncomment the following line to log in to Azure
#!az login

In [6]:
# Build the MLClient for the configured workspace, authenticating with
# DefaultAzureCredential. Arguments are passed by keyword so each value is
# visibly bound to the parameter it is meant for.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)

In [7]:
# Load the test set: one JSON object per line (JSON Lines).
# Whitespace-only lines (e.g. an empty trailing line at the end of the file)
# are skipped, because json.loads raises JSONDecodeError on them.
with open("./data/test.jsonl", "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]

##### Baseline Model Evaluation

In [28]:
# Baseline Model Evaluation
# Tallies for this run: `total` starts at the full test-set size and is
# decremented by the loop below for every failed request; `correct` counts
# the answers that match the gold label.
total = len(test_data)
correct = 0

def formatting_func(example: dict) -> tuple:
    """Turn one test-set record into the prompt pieces used for evaluation.

    Args:
        example: Record with the keys ``"question"``, ``"options"`` (a mapping
            from option key to option text) and ``"answer_idx"``.

    Returns:
        A ``(system_prompt, user_content, answer_idx)`` tuple, where
        ``user_content`` holds the question followed by the options, one per
        line as ``"<key>. <text>"``, ordered by option key.
    """
    question = example["question"]
    options = example["options"]
    answer_idx = example["answer_idx"]

    # Format options as "A. Option text", sorted by key so the order is stable.
    formatted_options = "\n".join(f"{key}. {val}" for key, val in sorted(options.items()))

    user_content = f"Question:\n{question}\n\nOptions:\n{formatted_options}"

    # Plain string literal: there is nothing to interpolate, so no f-prefix.
    system_prompt = "You are a medical expert. Read the following USMLE question and choose the best answer. Give me the answer as A/B/C/D/E."

    return system_prompt, user_content, answer_idx


# Query the baseline endpoint once per test record and score the reply.
for item in tqdm(test_data, desc="Evaluating test dataset"):

    system_prompt, user_content, gold_answer = formatting_func(item)
    try:
        # Request payload for the baseline endpoint: the chat messages go under
        # input_data.input_string and the generation settings under
        # input_data.parameters.
        sample = {
            "input_data": {
                "input_string":
                [
                    {
                        "role": "system",
                        "content": system_prompt
                    },
                    {
                        "role": "user",
                        "content": user_content
                    }
                ],
                "parameters": {
                    "temperature": 0.0,
                    "max_new_tokens": 10,
                    "do_sample": False
                }
            }
        }
        # invoke() is given a file path, so the payload is written to disk
        # first (the same file is overwritten on every iteration).
        with open("request.json", "w") as f:
            json.dump(sample, f, indent=4)

        response = ml_client.online_endpoints.invoke(
            endpoint_name=baseline_endpoint_name,
            request_file="request.json"
        )

        # The response is parsed as JSON and the answer text read from "output".
        response_json = json.loads(response)
        model_answer = response_json["output"].strip()

        # A prediction counts as correct when it begins with the gold key.
        if model_answer.startswith(gold_answer):
            correct += 1

    except Exception as e:
        # Any failure (request, parsing, missing key) is reported and the
        # record is excluded from the denominator.
        print(f"Error: {e}")
        total -= 1  # Skip from total if failed

# Final accuracy (0 when every request failed, to avoid dividing by zero)
accuracy = correct / total if total > 0 else 0
# "\n" replaces the invalid "\B" escape, which raised a SyntaxWarning and
# printed a literal backslash in front of the message.
print(f"\nBaseline Accuracy on MedQA test set: {accuracy:.2%}")

  print(f"\Baseline Accuracy on MedQA test set: {accuracy:.2%}")
Evaluating test dataset: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1273/1273 [20:07<00:00,  1.05it/s]

\Baseline Accuracy on MedQA test set: 44.85%





##### Finetuned Model Evaluation

In [12]:
# Finetuned Model Evaluation
# Tallies for this run: `total` starts at the full test-set size and is
# decremented by the loop below for every failed request; `correct` counts
# the answers that match the gold label.
total = len(test_data)
correct = 0

def formatting_func(example: dict) -> tuple:
    """Turn one test-set record into the prompt pieces used for evaluation.

    Same logic as the helper of the same name in the baseline-evaluation
    cell; it is defined again here, so this cell also runs by itself.

    Args:
        example: Record with the keys ``"question"``, ``"options"`` (a mapping
            from option key to option text) and ``"answer_idx"``.

    Returns:
        A ``(system_prompt, user_content, answer_idx)`` tuple, where
        ``user_content`` holds the question followed by the options, one per
        line as ``"<key>. <text>"``, ordered by option key.
    """
    question = example["question"]
    options = example["options"]
    answer_idx = example["answer_idx"]

    # Format options as "A. Option text", sorted by key so the order is stable.
    formatted_options = "\n".join(f"{key}. {val}" for key, val in sorted(options.items()))

    user_content = f"Question:\n{question}\n\nOptions:\n{formatted_options}"

    # Plain string literal: there is nothing to interpolate, so no f-prefix.
    system_prompt = "You are a medical expert. Read the following USMLE question and choose the best answer. Give me the answer as A/B/C/D/E."

    return system_prompt, user_content, answer_idx


# Query the fine-tuned endpoint once per test record and score the reply.
for item in tqdm(test_data, desc="Evaluating test dataset"):

    system_prompt, user_content, gold_answer = formatting_func(item)
    try:
        # Chat-style payload: the messages and the generation settings all sit
        # at the top level of the request body.
        # NOTE(review): the baseline payload nests its settings under
        # input_data.parameters instead - confirm this endpoint reads
        # max_new_tokens / do_sample from the top level.
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ]
        sample = {
            "messages": messages,
            "temperature": 0.0,
            "max_new_tokens": 10,
            "do_sample": False,
        }

        # invoke() is given a file path, so the payload is written to disk
        # first (the same file is overwritten on every iteration).
        with open("request.json", "w") as request_fh:
            json.dump(sample, request_fh, indent=4)

        response = ml_client.online_endpoints.invoke(
            endpoint_name=endpoint_name,
            request_file="request.json"
        )

        # Trim surrounding whitespace first, then any enclosing double quotes.
        model_answer = response.strip()
        model_answer = model_answer.strip('"')

        # A prediction counts as correct when it begins with the gold key.
        if model_answer.startswith(gold_answer):
            correct += 1

    except Exception as err:
        # Any failure is reported and the record is excluded from the
        # denominator.
        print(f"Error: {err}")
        total -= 1  # Skip from total if failed

# Final accuracy (0 when every request failed, to avoid dividing by zero)
if total > 0:
    accuracy = correct / total
else:
    accuracy = 0
print(f"\nFinetuned Accuracy on MedQA test set: {accuracy:.2%}")

Evaluating test dataset: 100%|██████████| 1273/1273 [16:45<00:00,  1.27it/s]


Finetuned Accuracy on MedQA test set: 46.27%



