In [20]:
import os
import pandas as pd
from typing import List, Dict, Tuple
import json
from pathlib import Path
import numpy as np
from tqdm import tqdm
import time # Chong Ming. For Exercise task. To measure Inference Latency
import re
from openai import OpenAI
from pydantic import BaseModel 

from datasets import load_dataset

# Read the API key from the local file "nebius_api_key" (surrounding whitespace
# removed) and expose it through the environment for the client below.
nebius_api_key = Path("nebius_api_key").read_text().strip()
os.environ["NEBIUS_API_KEY"] = nebius_api_key

from openai import OpenAI

# The Nebius endpoint is accessed through the regular OpenAI client class;
# only the base URL and the API key differ.
nebius_client = OpenAI(
    api_key=os.environ.get("NEBIUS_API_KEY"),
    base_url="https://api.studio.nebius.ai/v1/",
)

In [21]:
def prettify_string(text, max_line_length=80):
    """Return `text` re-wrapped at spaces to prevent horizontal scrolling.

    Existing line breaks are kept (blank lines stay blank). Within each input
    line, runs of whitespace are collapsed to single spaces and the words are
    greedily packed into output lines of at most `max_line_length` characters.
    A single word longer than the limit is never split; it is placed on a line
    of its own.

    Args:
        text: The string to wrap.
        max_line_length: The maximum length of each output line.

    Returns:
        The wrapped text as a single string joined with newlines.
    """
    output_lines = []
    for line in text.split("\n"):  # Respect the line breaks already in the text
        current_words = []
        current_length = 0  # Length of " ".join(current_words)
        for word in line.split():
            if current_words:
                # One separating space plus the word itself.
                needed = current_length + 1 + len(word)
            else:
                needed = len(word)

            # Only wrap when there is something to flush; otherwise an
            # over-long first word would produce a spurious empty line.
            if current_words and needed > max_line_length:
                output_lines.append(" ".join(current_words))
                current_words = [word]
                current_length = len(word)
            else:
                current_words.append(word)
                current_length = needed
        output_lines.append(" ".join(current_words))  # Flush the last (or empty) line
    return "\n".join(output_lines)

In [22]:
def answer_with_llm(prompt: str,
                    system_prompt,
                    max_tokens=2056,
                    client=nebius_client,
                    model="",
                    prettify=False,
                    temperature=0.7) -> str:
    """Send a single-turn chat request and return the text of the first choice.

    Args:
        prompt: Content of the user message.
        system_prompt: Content of the system message; when falsy, no system
            message is sent.
        max_tokens: Forwarded to the chat completion call.
        client: Client object exposing `chat.completions.create`.
        model: Model identifier forwarded to the chat completion call.
        prettify: When true, the reply is passed through `prettify_string`.
        temperature: Forwarded to the chat completion call.

    Returns:
        The message content of the first completion choice, optionally wrapped.
    """
    user_message = {"role": "user", "content": prompt}
    if system_prompt:
        messages = [{"role": "system", "content": system_prompt}, user_message]
    else:
        messages = [user_message]

    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature
    )

    reply = completion.choices[0].message.content
    return prettify_string(reply) if prettify else reply

In [29]:


# --- New BLOCK HERE for translation---
# Structured representation of one multiple-choice item. Its JSON schema is
# passed to the translation request in translate_mmlu_sample, and the reply is
# validated back into this model.
# NOTE(review): documented with comments rather than a class docstring on
# purpose; pydantic presumably copies a docstring into the generated schema,
# which would change what is sent to the API. Confirm before adding one.
class MMLUSample(BaseModel):
    question: str  # Question text
    A: str  # Text of answer option A
    B: str  # Text of answer option B
    C: str  # Text of answer option C
    D: str  # Text of answer option D
    correct_answer: str  # Letter of the correct option ("A".."D" as built in load_mmlu_data)

def translate_mmlu_sample(sample: MMLUSample, target_language: str,
                          client=None,
                          model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct") -> MMLUSample:
    """Translate the question and answer options of one MMLU sample.

    The sample is rendered into a single user message and sent as a chat
    completion request. The JSON schema of `MMLUSample` is attached through
    `extra_body["guided_json"]` (presumably so the server constrains the reply
    to that structure), and the reply is validated back into an `MMLUSample`.

    Args:
        sample: The sample to translate.
        target_language: Name of the language to translate into.
        client: Client exposing `chat.completions.create`. Defaults to the
            module-level `nebius_client` when omitted.
        model: Model identifier used for the translation request.

    Returns:
        A new `MMLUSample` with translated text and the original
        `correct_answer`.

    Raises:
        pydantic.ValidationError: If the reply is not valid JSON matching
            `MMLUSample` (raised by `model_validate_json`).
    """
    if client is None:
        # Resolved at call time so a re-created module-level client is picked up.
        client = nebius_client

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": f"""Translate this MMLU sample into {target_language}:\n
Question: {sample.question}\n
A: {sample.A}\n
B: {sample.B}\n
C: {sample.C}\n
D: {sample.D}\n
Correct answer: {sample.correct_answer}\n
Output the result as a JSON object with the same structure."""
            }
        ],
        extra_body={"guided_json": MMLUSample.model_json_schema()}
    )
    translated = MMLUSample.model_validate_json(completion.choices[0].message.content)
    # The answer key is a letter, not text: never trust the model's copy of it.
    translated.correct_answer = sample.correct_answer
    return translated

# --- END OF BLOCK ---

'''
This code builds an LLM evaluation tool using the MMLU dataset, a benchmark of multiple-choice questions fetched from Hugging Face. The goal is to:
ask the model questions, evaluate its answers, and measure how accurate the model is on a specific topic.
The class below runs the whole evaluation process.
'''
class MMLUEvaluator:
    """Evaluate a chat model on one topic of the MMLU multiple-choice benchmark.

    On construction the test split of the chosen topic is loaded (and, for any
    language other than "English", translated item by item). `run_evaluation`
    then asks the model each question, extracts the chosen letter from the
    reply and reports accuracy together with latency statistics.
    """

    def __init__(self, system_prompt: str = None, prompt: str = None,
                 topic: str = "high_school_mathematics", language: str = "English"):
        """
        Initialize the MMLU evaluator.

        Args:
            system_prompt: Optional system prompt for the model. Defaults to
                "You are an expert in <topic>."
            prompt: Optional custom user-prompt template. It is filled with
                `str.format` using the fields `topic_prettified`, `question`,
                `A`, `B`, `C` and `D`. Defaults to the built-in template.
            topic: Which MMLU topic (dataset configuration) to load.
            language: Language of the questions; anything other than
                "English" triggers translation of the whole test split.
        """
        self.language = language
        self.topic = topic
        self.topic_prettified = topic.replace("_", " ")
        self.system_prompt = system_prompt or f"You are an expert in {self.topic_prettified}."

        default_prompt = """You are given a question in {topic_prettified} with four answer options labeled by A, B, C, and D.
You need to ponder the question and justify the choice of one of the options A, B, C, or D.
At the end, do write the chosen answer option A, B, C, D after #ANSWER:
Now, take a deep breath and work out this problem step by step. If you do well, I'll tip you 200$.

QUESTION: {question}

ANSWER OPTIONS:
A: {A}
B: {B}
C: {C}
D: {D}
"""
        # Honour a caller-supplied template; fall back to the default otherwise.
        self.prompt = prompt or default_prompt

        self.questions, self.choices, self.answers, self.translated_q = self.load_mmlu_data(topic=self.topic)

    def load_mmlu_data(self, topic: str):
        """Load the MMLU test split for `topic`, translating it if required.

        Args:
            topic: MMLU topic (dataset configuration name).

        Returns:
            A tuple `(questions, choices, answers, translated_questions)`:
            questions is a Series (English) or list (translated) of question
            texts; choices is a DataFrame with columns "A", "B", "C", "D";
            answers is a Series of correct letters; translated_questions is a
            list of the question texts (a plain copy of the originals for
            English).
        """
        # "cais/mmlu" is the dataset repository name on Hugging Face.
        dataset = pd.DataFrame(load_dataset("cais/mmlu", topic, split="test"))

        questions = dataset["question"]

        # Each entry of dataset["choices"] is a list of four options; spread
        # them into columns labelled like the multiple-choice letters.
        choices = pd.DataFrame(dataset["choices"].tolist(), columns=["A", "B", "C", "D"])

        # The dataset stores the correct answer as an integer index 0..3.
        answers = dataset["answer"].map(lambda ans: {0: "A", 1: "B", 2: "C", 3: "D"}[ans])

        if self.language == "English":
            # Nothing to translate; the last element mirrors the originals.
            return questions, choices, answers, list(questions)

        # Translate one sample at a time. Note that this covers the whole test
        # split (one request per item), regardless of how many questions are
        # evaluated later.
        translated_questions = []
        translated_choices = []
        for i in range(len(dataset)):
            sample = MMLUSample(
                question=questions[i],
                A=choices.iloc[i]["A"],
                B=choices.iloc[i]["B"],
                C=choices.iloc[i]["C"],
                D=choices.iloc[i]["D"],
                correct_answer=answers[i]
            )

            translated = translate_mmlu_sample(sample, self.language)
            translated_questions.append(translated.question)
            translated_choices.append({
                "A": translated.A,
                "B": translated.B,
                "C": translated.C,
                "D": translated.D
            })

        # The translated texts replace the originals; the answer letters are
        # language independent and stay as loaded.
        return translated_questions, pd.DataFrame(translated_choices), answers, translated_questions

    def extract_answer(self, solution: str) -> str:
        """
        Extract the letter answer from the model's response.
        Example: if the model writes "#ANSWER: C", this returns "C".

        The first occurrence of "#ANSWER:" followed by optional whitespace and
        a capital letter A-D is used.

        Args:
            solution: Raw model response

        Returns:
            Extracted answer letter (A, B, C, D) or "Failed to parse"
        """
        try:
            match = re.search(r"#ANSWER:\s*([A-D])", solution)
        except Exception as e:
            # E.g. `solution` is None because the model returned no content.
            print(f"Error extracting answer: {e}")
            return "Failed to parse"
        return match.group(1) if match else "Failed to parse"

    def evaluate_single_question(self, question: str, choices: Dict[str, str],
                                 correct_answer: str,
                                 client, model) -> Tuple[bool, str, str, float]:
        """
        Evaluate a single question: send it to the model, extract the chosen
        letter and compare it with the correct one.

        Args:
            question: Question text
            choices: Mapping with keys "A", "B", "C", "D" holding the options
            correct_answer: Correct answer letter
            client: Client forwarded to `answer_with_llm`
            model: Model identifier forwarded to `answer_with_llm`

        Returns:
            Tuple of (is_correct, extracted_answer, model_response,
            inference_time). If the request fails, the error is printed and
            (False, None, None, elapsed_seconds) is returned so the caller can
            always unpack four values.
        """
        start_time = time.perf_counter()  # Monotonic clock for latency measurement
        try:
            model_response = answer_with_llm(
                prompt=self.prompt.format(
                    topic_prettified=self.topic_prettified,
                    question=question,
                    A=choices['A'], B=choices['B'], C=choices['C'], D=choices['D']
                ),
                system_prompt=self.system_prompt,
                client=client,  # Forward the requested client instead of silently using the default
                model=model,
                prettify=False
            )
            inference_time = time.perf_counter() - start_time

            answer = self.extract_answer(model_response)
            is_correct = (answer.upper() == correct_answer.upper())
            return is_correct, answer, model_response, inference_time
        except Exception as e:
            print(f"Error evaluating question: {e}")
            return False, None, None, time.perf_counter() - start_time

    def run_evaluation(self, client=nebius_client, model="none",
                       n_questions=50) -> Dict:
        """
        Run the evaluation on the first `n_questions` questions (default: 50).

        Args:
            client: Which client to use (OpenAI or Nebius)
            model: Which model to use
            n_questions: How many first questions to take; a falsy value
                means "all questions"

        Returns:
            Dictionary with the keys 'accuracy' (correct / evaluated),
            'evaluation_log' (one dict per question with 'answer',
            'model_response', 'is_correct', 'inference_time'),
            'avg_inference_time' and 'avg_inference_time_Method2' (both the
            mean per-question latency in seconds; kept as two keys for
            backward compatibility). All metrics are 0.0 when no question was
            evaluated.
        """
        evaluation_log = []
        correct_count = 0
        inference_times = []

        if n_questions:
            n_questions = min(n_questions, len(self.questions))
        else:
            n_questions = len(self.questions)

        for i in tqdm(range(n_questions)):
            is_correct, answer, model_response, inference_time = self.evaluate_single_question(
                question=self.questions[i],
                choices=self.choices.iloc[i],
                correct_answer=self.answers[i],
                client=client,
                model=model,
            )

            if is_correct:
                correct_count += 1

            # Collected inside the loop so every question contributes.
            inference_times.append(inference_time)

            evaluation_log.append({
                'answer': answer,
                'model_response': model_response,
                'is_correct': is_correct,
                'inference_time': inference_time
            })

        evaluated = len(evaluation_log)
        total_inference_time = sum(inference_times)

        # Guard against an empty question set instead of dividing by zero.
        accuracy = correct_count / evaluated if evaluated else 0.0
        avg_inference_time = total_inference_time / evaluated if evaluated else 0.0  # Method 1: running total
        avg_inference_time2 = sum(inference_times) / len(inference_times) if inference_times else 0.0  # Method 2: list

        evaluation_results = {
            'accuracy': accuracy,
            'evaluation_log': evaluation_log,
            'avg_inference_time': avg_inference_time,
            'avg_inference_time_Method2': avg_inference_time2,
        }

        return evaluation_results

In [30]:
# Baseline run: load the "global_facts" topic in English (no translation) and
# evaluate the first 5 questions with Llama 3.1 8B Instruct.
evaluator = MMLUEvaluator(topic="global_facts", language="English")
results = evaluator.run_evaluation(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
                         n_questions=5)
print(f'\nAccuracy: {results["accuracy"]}')

100%|██████████| 5/5 [00:29<00:00,  5.95s/it]


Accuracy: 0.2





In [31]:
# Dump the per-question log (extracted answer, raw model response, correctness
# flag, inference time) of the run currently stored in `results`.
print(f'\nEvaluation Log: {results["evaluation_log"]}')


Evaluation Log: [{'answer': 'C', 'model_response': "To answer this question, I'll consider the global trend of obesity and overweight rates. According to various reports and studies from reputable organizations such as the World Health Organization (WHO), there has been a steady increase in overweight and obesity rates globally over the past few decades.\n\nAs of 2016, the WHO reported that approximately 39% of adults aged 18 years or older worldwide were overweight, and about 13% were obese. This suggests that the majority of adults were either overweight or obese, with a significant proportion being overweight.\n\nConsidering the options provided, I'll evaluate them as follows:\n\nA: 10% - This is an underestimation, given the reported global rates.\n\nB: 20% - This is still lower than the reported global average.\n\nC: 40% - This aligns closely with the reported global rate of overweight adults as of 2016.\n\nD: 80% - This is an overestimation, as the global average is not this hig

In [32]:
# Same topic in French: constructing the evaluator with a non-English language
# translates every item of the test split before the first 5 are evaluated.
# This rebinds `evaluator` and `results` from the English run.
evaluator = MMLUEvaluator(topic="global_facts", language="French")
results = evaluator.run_evaluation(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
                         n_questions=5)
print(f'\nAccuracy: {results["accuracy"]}')

100%|██████████| 5/5 [00:21<00:00,  4.39s/it]


Accuracy: 0.6





In [33]:
# Inspect the first five translated questions and their answer options
# (5 matches the n_questions used in the evaluation run).
for i in range(5):
    print(f"\nQuestion {i+1}: {evaluator.translated_q[i]}")
    print(f"A: {evaluator.choices.iloc[i]['A']}")
    print(f"B: {evaluator.choices.iloc[i]['B']}")
    print(f"C: {evaluator.choices.iloc[i]['C']}")
    print(f"D: {evaluator.choices.iloc[i]['D']}")


Question 1: À partir de 2016, quel pourcentage environ des adultes âgés de 18 ans ou plus étaient en surpoids ?
A: 10 %
B: 20 %
C: 40 %
D: 80 %

Question 2: Quel est le PIB par habitant aux Etats-Unis en 1850 en tenant compte de l'inflation et du PPA en 2011 ?
A: Environ 300$
B: Environ 3 000$
C: Environ 8 000$
D: Environ 15 000$

Question 3: À compter de 2019, quelle est la pourcentage d'environ les gens des États-Unis pouvant dire que l'homosexualité doit être acceptée de la part de la société ?
A: 52%
B: 62%
C: 72%
D: 82%

Question 4: Quel pays a généré le plus de l'énergie totale à partir de sources solaires en 2019?
A: Chine
B: États-Unis
C: Allemagne
D: Japon

Question 5: À quoi s'est élevé le PIB par tête au Japon, ajusté par l'inflation et les PPP, entre 1950 et 2016 ?
A: de 5 fois
B: de 10 fois
C: de 15 fois
D: de 20 fois


In [34]:
# Dump the per-question log (extracted answer, raw model response, correctness
# flag, inference time) of the run currently stored in `results`.
print(f'\nEvaluation Log: {results["evaluation_log"]}')


Evaluation Log: [{'answer': 'C', 'model_response': "Prenons le temps de réfléchir à cette question !\n\nSelon les données de la FAO (Nations Unies pour l'alimentation et l'agriculture), les taux de surpoids et d'obésité ont considérablement augmenté dans le monde depuis les années 1980.\n\nEn 2016, la FAO a publié un rapport sur la situation nutritionnelle dans le monde, qui indique que plus de 1 milliard de personnes dans le monde souffraient de surpoids ou d'obésité en 2016.\n\nEn ce qui concerne les adultes âgés de 18 ans ou plus, la proportion de personnes en surpoids varie considérablement d'un pays à l'autre en fonction de facteurs tels que le niveau de développement économique, les habitudes alimentaires, les niveaux de vie et les conditions de vie.\n\nCependant, selon les données de la OMS (Organisation mondiale de la santé), en 2016, environ 39 % des adultes âgés de 18 ans ou plus dans le monde étaient en surpoids ou obèses.\n\nEn tenant compte de ces informations, je pense q