In [1]:
import os
import pandas as pd
from typing import List, Dict, Tuple
import json
from pathlib import Path
import numpy as np
from tqdm import tqdm
import time # Chong Ming. For Exercise task. To measure Inference Latency
import re

from datasets import load_dataset

# Load the Nebius API key from a local file so it never appears in the notebook,
# then expose it through the environment for the client below.
nebius_api_key = Path("nebius_api_key").read_text().strip()
os.environ["NEBIUS_API_KEY"] = nebius_api_key

from openai import OpenAI

# Nebius is reached through the regular OpenAI() client class; only the
# endpoint URL and the API key differ.
nebius_client = OpenAI(
    api_key=os.environ.get("NEBIUS_API_KEY"),
    base_url="https://api.studio.nebius.ai/v1/",
)


In [2]:
def prettify_string(text, max_line_length=80):
    """Return ``text`` re-wrapped at spaces to prevent horizontal scrolling.

    Existing line breaks in ``text`` are preserved (including blank lines);
    within each line, runs of whitespace are collapsed to single spaces and
    the words are greedily packed into lines of at most ``max_line_length``
    characters. A single word longer than the limit is never split: it is
    placed on a line of its own.

    Args:
        text: The string to wrap.
        max_line_length: The maximum length of each output line.

    Returns:
        The wrapped text, with lines joined by "\\n".
    """
    wrapped_lines = []
    for line in text.split("\n"):
        current_words = []
        current_length = 0  # length of " ".join(current_words)
        for word in line.split():
            # Length the line would have once this word is added; the
            # separating space is only needed when the line is non-empty.
            if current_words:
                candidate_length = current_length + 1 + len(word)
            else:
                candidate_length = len(word)

            # Only wrap when there is something to flush: an over-long first
            # word must not produce an empty line ahead of it.
            if current_words and candidate_length > max_line_length:
                wrapped_lines.append(" ".join(current_words))
                current_words = [word]
                current_length = len(word)
            else:
                current_words.append(word)
                current_length = candidate_length
        # Flush the last (possibly empty) line so blank lines survive.
        wrapped_lines.append(" ".join(current_words))
    return "\n".join(wrapped_lines)

In [3]:
def answer_with_llm(prompt: str,
                    system_prompt,
                    max_tokens=2056,
                    client=nebius_client,
                    model="",
                    prettify=False,
                    temperature=0.7) -> str:
    """Send one chat-completion request and return the text of the first choice.

    Args:
        prompt: Content of the user message.
        system_prompt: Content of the system message; when falsy, no system
            message is sent.
        max_tokens: Forwarded to the completion call as ``max_tokens``.
        client: Client object whose ``chat.completions.create`` is called.
        model: Model identifier forwarded to the completion call.
        prettify: When true, the reply is passed through ``prettify_string``
            before being returned.
        temperature: Forwarded to the completion call as ``temperature``.

    Returns:
        The ``message.content`` of the first returned choice, optionally
        re-wrapped.
    """
    # The system turn is optional; the user turn is always present and last.
    system_turn = [{"role": "system", "content": system_prompt}] if system_prompt else []
    conversation = system_turn + [{"role": "user", "content": prompt}]

    response = client.chat.completions.create(
        model=model,
        messages=conversation,
        max_tokens=max_tokens,
        temperature=temperature,
    )

    reply = response.choices[0].message.content
    return prettify_string(reply) if prettify else reply

In [4]:


'''
This code builds an LLM evaluation tool on top of the MMLU dataset, a benchmark of multiple-choice questions loaded from Hugging Face. The goal is to:
ask the model questions, evaluate its answers, and measure how accurate the model is on a specific topic.
The MMLUEvaluator class below runs the whole evaluation process.
'''
class MMLUEvaluator:
    """Evaluate a chat model on one topic of the MMLU multiple-choice benchmark.

    The evaluator loads the test split of the chosen topic, asks the model
    each question through the module-level ``answer_with_llm`` helper, parses
    the letter the model wrote after ``#ANSWER:`` and reports accuracy and
    average inference latency.
    """

    # Template used when the caller does not supply one. It is filled with
    # str.format using the fields topic_prettified, question, A, B, C and D.
    DEFAULT_PROMPT = """You are given a question in {topic_prettified} with four answer options labeled by A, B, C, and D.
You need to ponder the question and justify the choice of one of the options A, B, C, or D.
At the end, do write the chosen answer option A, B, C, D after #ANSWER:
Now, take a deep breath and work out this problem step by step. If you do well, I'll tip you 200$.

QUESTION: {question}

ANSWER OPTIONS:
A: {A}
B: {B}
C: {C}
D: {D}
"""

    def __init__(self, system_prompt: str = None, prompt: str = None,
                 topic: str = "high_school_mathematics"):
        """
        Initialize the MMLU evaluator and load the questions for ``topic``.

        Args:
            system_prompt: Optional system prompt for the model. Defaults to
                "You are an expert in <topic>."
            prompt: Optional user-prompt template. It may use the format
                fields {topic_prettified}, {question}, {A}, {B}, {C}, {D}.
                Defaults to ``DEFAULT_PROMPT``.
            topic: Which MMLU topic (dataset configuration name) to load.
        """

        self.topic = topic
        self.topic_prettified = topic.replace("_", " ")
        self.system_prompt = system_prompt or f"You are an expert in {self.topic_prettified}."

        # Honour a caller-supplied template instead of silently discarding it.
        self.prompt = prompt or self.DEFAULT_PROMPT

        self.questions, self.choices, self.answers = self.load_mmlu_data(topic=self.topic)

    def load_mmlu_data(self, topic: str) -> Tuple[pd.Series, pd.DataFrame, pd.Series]:
        """
        Load MMLU test data on a given topic.

        Args:
            topic: Which topic to choose

        Returns:
            A tuple ``(questions, choices, answers)``: the question texts, a
            DataFrame with one column per option ("A".."D"), and the correct
            option letters.
        """

        dataset = pd.DataFrame(load_dataset("cais/mmlu", topic, split="test"))

        # Load questions and choices separately
        questions = dataset["question"]
        choices = pd.DataFrame(
            data=dataset["choices"].tolist(), columns=["A", "B", "C", "D"]
        )
        # In the dataset, true answer labels are in 0-3 format;
        # We convert it to A-D
        index_to_letter = {0: "A", 1: "B", 2: "C", 3: "D"}
        answers = dataset["answer"].map(lambda ans: index_to_letter[ans])

        return questions, choices, answers

    def extract_answer(self, solution: str) -> str:
        """
        Extract the letter answer from the model's response.

        Example: if the model writes "#ANSWER: C", this returns "C".

        Args:
            solution: Raw model response

        Returns:
            Extracted answer letter (A, B, C or D), or "Failed to parse" when
            the response is empty or contains no "#ANSWER: <letter>" marker.
        """
        if not solution:
            return "Failed to parse"

        # The prompt asks for the final choice at the END of the response, and
        # the model may mention the marker earlier while reasoning, so the
        # last well-formed marker is the one that counts.
        letters = re.findall(r"#ANSWER:\s*([A-D])", solution)
        return letters[-1] if letters else "Failed to parse"

    def evaluate_single_question(self, question: str, choices: Dict[str, str],
                                 correct_answer: str,
                                 client, model) -> Tuple[bool, str, str, float]:
        """
        Send one question to the model and grade the reply.

        Args:
            question: Question text
            choices: Mapping with the option texts under keys "A".."D"
            correct_answer: Correct answer letter
            client: Client passed on to ``answer_with_llm``
            model: Model identifier passed on to ``answer_with_llm``

        Returns:
            Tuple of (is_correct, extracted_answer, model_response,
            inference_time). ``inference_time`` is the wall-clock duration of
            the model call in seconds. If anything fails, the error is printed
            and (False, None, None, None) is returned.
        """

        try:
            prompt = self.prompt.format(
                topic_prettified=self.topic_prettified,
                question=question,
                A=choices['A'], B=choices['B'], C=choices['C'], D=choices['D']
            )

            # perf_counter is a monotonic clock meant for measuring intervals;
            # only the model call itself is timed.
            start_time = time.perf_counter()
            model_response = answer_with_llm(
                prompt=prompt,
                system_prompt=self.system_prompt,
                client=client,  # use the client the caller asked for
                model=model,
                prettify=False
            )
            inference_time = time.perf_counter() - start_time

            answer = self.extract_answer(model_response)
            is_correct = (answer.upper() == correct_answer.upper())

            return is_correct, answer, model_response, inference_time
        except Exception as e:
            print(f"Error evaluating question: {e}")
            # Same arity as the success path so the caller can always unpack.
            return False, None, None, None

    def run_evaluation(self, client=nebius_client, model="none",
                       n_questions=50) -> Dict:
        """
        Run the evaluation on the first ``n_questions`` questions.

        Loops through each question, checks whether the model's answer is
        right, and computes accuracy (correct answers / evaluated questions)
        together with the average inference latency.

        Args:
            client: Which client to use (OpenAI or Nebius)
            model: Which model to use
            n_questions: How many first questions to take; a falsy value
                (None or 0) means all questions

        Returns:
            Dictionary with:
                'accuracy': fraction of evaluated questions answered correctly
                    (0.0 when nothing was evaluated),
                'evaluation_log': one dict per question with the keys
                    'answer', 'model_response', 'is_correct', 'inference_time',
                'avg_inference_time': mean latency in seconds over the calls
                    that completed (0.0 when none did),
                'avg_inference_time_Method2': same value, kept so existing
                    readers of this key keep working.
        """

        evaluation_log = []
        correct_count = 0
        inference_times = []  # latencies of the calls that completed

        if n_questions:
            n_questions = min(n_questions, len(self.questions))
        else:
            n_questions = len(self.questions)

        for i in tqdm(range(n_questions)):
            is_correct, answer, model_response, inference_time = self.evaluate_single_question(
                question=self.questions.iloc[i],
                choices=self.choices.iloc[i],
                correct_answer=self.answers.iloc[i],
                client=client,
                model=model,
            )

            if is_correct:
                correct_count += 1

            # Failed calls report no latency and are left out of the average.
            if inference_time is not None:
                inference_times.append(inference_time)

            evaluation_log.append({
                'answer': answer,
                'model_response': model_response,
                'is_correct': is_correct,
                'inference_time': inference_time
            })

        n_evaluated = len(evaluation_log)
        accuracy = correct_count / n_evaluated if n_evaluated else 0.0
        avg_inference_time = (
            sum(inference_times) / len(inference_times) if inference_times else 0.0
        )

        evaluation_results = {
            'accuracy': accuracy,
            'evaluation_log': evaluation_log,
            'avg_inference_time': avg_inference_time,
            'avg_inference_time_Method2': avg_inference_time,
        }

        return evaluation_results

In [5]:
# Build an evaluator for the MMLU "global_facts" topic (the constructor loads
# the topic's test questions), then run a one-question smoke test against
# DeepSeek-R1. `results` is read by the cells that follow.
evaluator = MMLUEvaluator(topic="global_facts")
results = evaluator.run_evaluation(model="deepseek-ai/DeepSeek-R1-0528",
                         n_questions=1)
print(f'\nAccuracy: {results["accuracy"]}')

100%|██████████| 1/1 [00:44<00:00, 44.42s/it]


Accuracy: 1.0





In [6]:
# Show the per-question log: parsed answer, raw model response, correctness
# flag and inference time for every evaluated question.
print(f'\nEvaluation Log: {results["evaluation_log"]}')


Evaluation Log: [{'answer': 'C', 'model_response': '<think>\nFirst, the question is: "As of 2016, about what percentage of adults aged 18 years or older were overweight?" The options are A: 10%, B: 20%, C: 40%, D: 80%.\n\nI need to recall global facts about overweight statistics. I remember that the World Health Organization (WHO) and other health organizations provide data on this. The keyword is "as of 2016," so I should think about data from around that year.\n\nFrom my knowledge, overweight and obesity are major global health issues. According to WHO, in 2016, more than 1.9 billion adults were overweight, and of these, over 650 million were obese. But I need the percentage of adults who are overweight, which includes both overweight and obese individuals? No, overweight is a category that often includes pre-obesity, but let\'s clarify the definitions.\n\nTypically, overweight is defined as having a Body Mass Index (BMI) of 25 or higher, and obesity is a subset with BMI 30 or highe

In [7]:
# Report the 'avg_inference_time' metric returned by run_evaluation,
# formatted to two decimals.
print(f'\nAvg_Inference_Time: {results["avg_inference_time"]:.2f}secs')


Avg_Inference_Time: 44.42secs


In [8]:
# Report the 'avg_inference_time_Method2' metric returned by run_evaluation,
# formatted to two decimals.
print(f'\nAvg_Inference_Time Method 2: {results["avg_inference_time_Method2"]:.2f}secs')


Avg_Inference_Time Method 2: 44.42secs
