# Load Nebius API key

In [1]:
import os
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm
import json
import time
import re

from datasets import load_dataset
from openai import OpenAI

# --- SETUP ---

# Load Nebius API key
with open("nebius_api_key", "r") as file:
    nebius_api_key = file.read().strip()
os.environ["NEBIUS_API_KEY"] = nebius_api_key

# Create Nebius client
nebius_client = OpenAI(
    base_url="https://api.studio.nebius.ai/v1/",
    api_key=os.environ.get("NEBIUS_API_KEY"),
)

# Prompt variants

In [2]:
prompt_cot_suppression = """You are given a question in {topic_prettified} with four answer options labeled by A, B, C, and D.
Pick the best answer, and write only the answer letter after #ANSWER:.
QUESTION: {question}
ANSWER OPTIONS:
A: {A}
B: {B}
C: {C}
D: {D}
#ANSWER:"""

prompt_basic_cot = """You are given a question in {topic_prettified} with four answer options labeled by A, B, C, and D.
Think step by step and explain your reasoning. Then, write the chosen answer letter A, B, C, or D after #ANSWER:.
QUESTION: {question}
ANSWER OPTIONS:
A: {A}
B: {B}
C: {C}
D: {D}
"""

# Cost settings per 1M tokens (Nebius, sample values, adjust if needed)

In [3]:
costs = {
    '70B': {'input': 0.4, 'output': 1.2},
    '8B': {'input': 0.02, 'output': 0.06}
}
def compute_cost(model_name, input_tokens, output_tokens):
    key = '70B' if '70B' in model_name else '8B'
    input_cost = input_tokens / 1_000_000 * costs[key]['input']
    output_cost = output_tokens / 1_000_000 * costs[key]['output']
    return input_cost + output_cost


# --- MMLU Evaluator

In [4]:
class MMLUEvaluator:
    def __init__(self, system_prompt=None, prompt=None, topic="high_school_mathematics"):
        self.topic = topic
        self.topic_prettified = topic.replace("_", " ")
        self.system_prompt = system_prompt or f"You are an expert in {self.topic_prettified}."
        self.prompt = prompt or """You are given a question in {topic_prettified} with four answer options labeled by A, B, C, and D.
                                    You need to ponder the question and justify the choice of one of the options A, B, C, or D.
                                    At the end, do write the chosen answer option A, B, C, D after #ANSWER:"""
        self.questions, self.choices, self.answers = self.load_mmlu_data(topic=self.topic)

    def load_mmlu_data(self, topic: str):
        dataset = load_dataset("cais/mmlu", topic, split="test")
        dataset = pd.DataFrame(dataset)
        questions = dataset["question"]
        choices = pd.DataFrame(dataset["choices"].tolist(), columns=["A", "B", "C", "D"])
        answers = dataset["answer"].map(lambda ans: {0: "A", 1: "B", 2: "C", 3: "D"}[ans])
        return questions, choices, answers

    def extract_answer(self, solution: str) -> str:
        try:
            match = re.search(r"#ANSWER:\s*([A-D])", solution)
            if match:
                return match.group(1)
            else:
                return "Failed to parse"
        except Exception as e:
            print(f"Error extracting answer: {e}")
            return "Failed to parse"

    def evaluate_single_question(self, question, choices, correct_answer, client, model):
        try:
            start_time = time.time()
            formatted_prompt = self.prompt.format(
                topic_prettified=self.topic_prettified,
                question=question,
                A=choices['A'], B=choices['B'], C=choices['C'], D=choices['D']
            )
            completion = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": formatted_prompt}
                ],
                max_tokens=2056,
                temperature=0.7  # Default, can override
            )
            end_time = time.time()
            model_response = completion.choices[0].message.content
            answer = self.extract_answer(model_response)
            is_correct = (answer.upper() == correct_answer.upper())
            usage = completion.usage
            inference_time = end_time - start_time
            return is_correct, answer, model_response, inference_time, usage
        except Exception as e:
            print(f"Error evaluating question: {e}")
            return False, None, None, 0, None

    def run_evaluation(self, client, model, n_questions=50):
        evaluation_log = []
        correct_count = 0
        total_inference_time = 0.0
        total_input_tokens = 0
        total_output_tokens = 0

        for i in tqdm(range(n_questions)):
            is_correct, answer, model_response, inference_time, usage = self.evaluate_single_question(
                question=self.questions[i],
                choices=self.choices.iloc[i],
                correct_answer=self.answers[i],
                client=client,
                model=model
            )
            if is_correct:
                correct_count += 1
            if usage:
                total_input_tokens += usage.prompt_tokens
                total_output_tokens += usage.completion_tokens
            total_inference_time += inference_time
            evaluation_log.append({
                'question': self.questions[i],
                'answer': answer,
                'model_response': model_response,
                'is_correct': is_correct,
                'inference_time': inference_time,
                'input_tokens': usage.prompt_tokens if usage else 0,
                'output_tokens': usage.completion_tokens if usage else 0,
                'correct_answer': self.answers[i]
            })
        accuracy = correct_count / n_questions
        avg_inference_time = total_inference_time / n_questions
        return {
            'accuracy': accuracy,
            'evaluation_log': evaluation_log,
            'avg_inference_time': avg_inference_time,
            'total_input_tokens': total_input_tokens,
            'total_output_tokens': total_output_tokens
        }

# --------- EXPERIMENT SETUPS ------------

In [5]:
topic = "high_school_mathematics"  # or your chosen math topic
n_questions = 50

models = {
    '8B': "meta-llama/Meta-Llama-3.1-8B-Instruct",
    '70B': "meta-llama/Meta-Llama-3.1-70B-Instruct" 
}
results_summary = []

# --- 1. Llama-3.1-70B, CoT Suppression ---

In [6]:
evaluator = MMLUEvaluator(system_prompt=None, prompt=prompt_cot_suppression, topic=topic)
results_70b_cots = evaluator.run_evaluation(
    client=nebius_client, model=models['70B'], n_questions=n_questions)
results_summary.append({
    "model": "Llama-3.1-70B", "strategy": "CoT Suppression",
    "accuracy": results_70b_cots["accuracy"],
    "cost": compute_cost(models['70B'], results_70b_cots["total_input_tokens"], results_70b_cots["total_output_tokens"])
})
with open('results_70b_cotSuppress.json', 'w', encoding='utf-8') as f:
    json.dump(results_70b_cots, f, indent=2, ensure_ascii=False)

100%|██████████| 50/50 [01:04<00:00,  1.28s/it]


# --- 2. Llama-3.1-8B, CoT Suppression ---

In [7]:
evaluator = MMLUEvaluator(system_prompt=None, prompt=prompt_cot_suppression, topic=topic)
results_8b_cots = evaluator.run_evaluation(
    client=nebius_client, model=models['8B'], n_questions=n_questions)
results_summary.append({
    "model": "Llama-3.1-8B", "strategy": "CoT Suppression",
    "accuracy": results_8b_cots["accuracy"],
    "cost": compute_cost(models['8B'], results_8b_cots["total_input_tokens"], results_8b_cots["total_output_tokens"])
})
with open('results_8b_cotSuppress.json', 'w', encoding='utf-8') as f:
    json.dump(results_8b_cots, f, indent=2, ensure_ascii=False)

100%|██████████| 50/50 [00:15<00:00,  3.14it/s]


# --- 3. Llama-3.1-70B, Basic CoT ---

In [8]:
evaluator = MMLUEvaluator(system_prompt=None, prompt=prompt_basic_cot, topic=topic)
results_70b_cot = evaluator.run_evaluation(
    client=nebius_client, model=models['70B'], n_questions=n_questions)
results_summary.append({
    "model": "Llama-3.1-70B", "strategy": "Basic CoT",
    "accuracy": results_70b_cot["accuracy"],
    "cost": compute_cost(models['70B'], results_70b_cot["total_input_tokens"], results_70b_cot["total_output_tokens"])
})
with open('results_70b_cot.json', 'w', encoding='utf-8') as f:
    json.dump(results_70b_cot, f, indent=2, ensure_ascii=False)

100%|██████████| 50/50 [12:18<00:00, 14.77s/it]


# --- 4. Llama-3.1-8B, Basic CoT ---

In [9]:
evaluator = MMLUEvaluator(system_prompt=None, prompt=prompt_basic_cot, topic=topic)
results_8b_cot = evaluator.run_evaluation(
    client=nebius_client, model=models['8B'], n_questions=n_questions)
results_summary.append({
    "model": "Llama-3.1-8B", "strategy": "Basic CoT",
    "accuracy": results_8b_cot["accuracy"],
    "cost": compute_cost(models['8B'], results_8b_cot["total_input_tokens"], results_8b_cot["total_output_tokens"])
})
with open('results_8b_cot.json', 'w', encoding='utf-8') as f:
    json.dump(results_8b_cot, f, indent=2, ensure_ascii=False)

100%|██████████| 50/50 [08:40<00:00, 10.40s/it]


# --- 5. Llama-3.1-8B, Self-Consistency (majority vote, 5 runs per Q) ---

In [10]:
# --- 5. Llama-3.1-8B, Self-Consistency (majority vote, 5 runs per Q) ---
evaluator = MMLUEvaluator(system_prompt=None, prompt=prompt_basic_cot, topic=topic)
all_results_sc = []
correct_count = 0
total_input_tokens = 0
total_output_tokens = 0
total_inference_time = 0.0
for i in tqdm(range(n_questions)):
    answers = []
    usages = []
    model_responses = []
    times = []
    for _ in range(5):
        is_correct, answer, model_response, inference_time, usage = evaluator.evaluate_single_question(
            evaluator.questions[i], evaluator.choices.iloc[i], evaluator.answers[i],
            nebius_client, models['8B'])
        answers.append(answer)
        model_responses.append(model_response)
        times.append(inference_time)
        if usage:
            usages.append(usage)
    # Majority vote (excluding Failed to parse)
    valid_answers = [a for a in answers if a and a in "ABCD"]
    majority = Counter(valid_answers).most_common(1)[0][0] if valid_answers else "Failed to parse"
    is_correct = (majority == evaluator.answers[i])
    if is_correct:
        correct_count += 1
    total_input_tokens += sum(u.prompt_tokens for u in usages)
    total_output_tokens += sum(u.completion_tokens for u in usages)
    total_inference_time += np.mean(times)
    all_results_sc.append({
        'question': evaluator.questions[i],
        'answers': answers,
        'model_responses': model_responses,
        'chosen_answer': majority,
        'is_correct': is_correct,
        'inference_time': np.mean(times),
        'input_tokens': sum(u.prompt_tokens for u in usages),
        'output_tokens': sum(u.completion_tokens for u in usages),
        'correct_answer': evaluator.answers[i]
    })
accuracy_sc = correct_count / n_questions
avg_inference_time_sc = total_inference_time / n_questions
results_summary.append({
    "model": "Llama-3.1-8B", "strategy": "Self-Consistency (5x, Basic CoT)",
    "accuracy": accuracy_sc,
    "cost": compute_cost(models['8B'], total_input_tokens, total_output_tokens)
})
with open('results_8b_selfconsistency.json', 'w', encoding='utf-8') as f:
    json.dump(all_results_sc, f, indent=2, ensure_ascii=False)

100%|██████████| 50/50 [35:23<00:00, 42.48s/it]


# --- SUMMARY TABLE ---

In [11]:
import pandas as pd
df_summary = pd.DataFrame(results_summary)
print("\n===== Results Summary =====\n")
print(df_summary[["model", "strategy", "accuracy", "cost"]])
df_summary.to_csv("experiment_results_summary.csv", index=False)


===== Results Summary =====

           model                          strategy  accuracy      cost
0  Llama-3.1-70B                   CoT Suppression      0.54  0.004493
1   Llama-3.1-8B                   CoT Suppression      0.02  0.000149
2  Llama-3.1-70B                         Basic CoT      0.82  0.030715
3   Llama-3.1-8B                         Basic CoT      0.62  0.002193
4   Llama-3.1-8B  Self-Consistency (5x, Basic CoT)      0.76  0.008778
