# GSM8K

In [None]:
import json
import pandas as pd
import os
import re
from mistralai import Mistral
from datasets import load_dataset
from tqdm import tqdm

# Build the Mistral client; the API key is read from the environment
api_key = os.getenv("MISTRAL_API_KEY")
client = Mistral(api_key=api_key)

# Test split of the GSM8K dataset ("main" configuration)
dataset = load_dataset("gsm8k", "main", split="test")

# Labels identifying this run; they are combined into the result file names
benchmark, certainty, model = "gsm8k", "control", "mistral-large"

# Per-question records go to a CSV file, the running totals to a JSON file
output_file_path = f"{benchmark}_{certainty}_{model}.csv"
summary_file_path = f"{benchmark}_{certainty}_{model}_summary.json"

# Start from empty results, then pick up whatever an earlier run left on disk
outputs = []
if os.path.exists(output_file_path):
    output_df = pd.read_csv(output_file_path)
    outputs = output_df.to_dict("records")

summary = {"correct": 0, "wrong": 0, "inconclusive": 0, "total": 0}
if os.path.exists(summary_file_path):
    with open(summary_file_path, 'r', encoding="utf-8") as file:
        summary = json.load(file)

INVALID_ANS = "[invalid]"

# A number with an optional leading minus sign. Commas are accepted as
# thousands separators; a decimal part is matched only when digits follow the
# point, so a sentence-ending period is never treated as part of the number.
_NUMBER_RE = re.compile(r"-?\d[\d,]*(?:\.\d+)?")


def extract_answer(response):
    """Extract the final numerical answer from a piece of text.

    The first number after the last '####' marker is used when there is one;
    otherwise the last number anywhere in the text is used.

    Args:
        response: Text to search (a model reply or a ground-truth solution).
            May be empty or None.

    Returns:
        The number as a string with thousands separators removed and a
        redundant fractional part dropped ("1,234" -> "1234",
        "18.00" -> "18", "3.50" -> "3.5"), or INVALID_ANS when the text
        contains no number.
    """
    if not response:
        return INVALID_ANS

    # rpartition returns an empty separator when '####' is absent.
    _, marker, tail = response.rpartition("####")
    candidates = _NUMBER_RE.findall(tail) if marker else []
    if candidates:
        answer = candidates[0]
    else:
        candidates = _NUMBER_RE.findall(response)
        if not candidates:
            return INVALID_ANS
        answer = candidates[-1]

    answer = answer.replace(",", "")
    if "." in answer:
        # Keep the decimal point (deleting it would turn 3.5 into 35) but
        # normalise trailing zeros so "18.00" compares equal to "18".
        answer = answer.rstrip("0").rstrip(".")
    return answer

def call_mistral(question, max_retries=5, backoff_seconds=1.0):
    """Send a question to the Mistral model and return the text of its reply.

    The question is followed by an instruction to finish with the answer in
    the form '#### <number>'. A failed request is retried with exponential
    backoff so that one transient API error does not abort the whole run.

    Args:
        question: The problem statement to send.
        max_retries: Maximum number of attempts (at least one is made).
        backoff_seconds: Delay before the second attempt; it doubles after
            every further failure.

    Returns:
        The content of the first choice in the chat completion response.

    Raises:
        Exception: Whatever the client raised on the final failed attempt.
    """
    import time  # local import: `time` is not among the file's top-level imports

    prompt = f"{question}\nPlease end your final response with the answer in the format '#### <number>'."
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            chat_response = client.chat.complete(
                model="mistral-large-latest",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
            )
        except Exception:
            # Broad on purpose: the client's exception types are not visible
            # in this file. The error is re-raised once attempts run out.
            if attempt == attempts:
                raise
            time.sleep(backoff_seconds * 2 ** (attempt - 1))
        else:
            return chat_response.choices[0].message.content

# Main execution loop
# Questions answered by an earlier run. A set makes the resume check O(1) per
# question instead of rescanning every stored record.
processed_questions = {record["question"] for record in outputs}

for instance in tqdm(dataset):
    question = instance["question"]
    if question in processed_questions:
        continue  # Skip questions that already have a stored result

    solution = instance["answer"]  # Ground truth answer

    # Call the model to get its answer
    model_response = call_mistral(question)

    model_answer = extract_answer(model_response)
    actual_answer = extract_answer(solution)

    # Compare the model's response with the actual answer
    is_correct = model_answer != INVALID_ANS and model_answer == actual_answer
    if model_answer == INVALID_ANS:
        summary["inconclusive"] += 1
    elif is_correct:
        summary["correct"] += 1
    else:
        summary["wrong"] += 1
    summary["total"] += 1

    # Store the question and answer
    outputs.append({
        "question": question,
        "model_response": model_response,
        "actual_answer": actual_answer,
        "model_answer": model_answer,
        # Uses the same rule as the summary, so an unparseable answer is
        # never recorded as correct.
        "correct": is_correct,
    })
    processed_questions.add(question)

    # Checkpoint after every question so an interrupted run can be resumed
    pd.DataFrame(outputs).to_csv(output_file_path, index=False, encoding="utf-8")
    with open(summary_file_path, 'w', encoding="utf-8") as file:
        json.dump(summary, file, ensure_ascii=False, indent=4)

print(f"Results saved to {output_file_path} and {summary_file_path}")

# MATH (Level 5)

In [None]:
import json
import pandas as pd
import os
import re
from mistralai import Mistral
from datasets import load_dataset
from tqdm import tqdm

# Build the Mistral client; the API key is read from the environment
api_key = os.getenv("MISTRAL_API_KEY")
client = Mistral(api_key=api_key)

# Test split of the MATH dataset, restricted to the "Level 5" problems
dataset = load_dataset("hendrycks/competition_math", split="test").filter(
    lambda example: example["level"] == "Level 5"
)

# Labels identifying this run; they are combined into the result file names
benchmark, level = "math", "level-5"
certainty, model = "control", "mistral-large"

# Per-question records go to a CSV file, the running totals to a JSON file
output_file_path = f"{benchmark}_{level}_{certainty}_{model}.csv"
summary_file_path = f"{benchmark}_{level}_{certainty}_{model}_summary.json"

# Start from empty results, then pick up whatever an earlier run left on disk
outputs = []
if os.path.exists(output_file_path):
    output_df = pd.read_csv(output_file_path)
    outputs = output_df.to_dict("records")

summary = {"correct": 0, "wrong": 0, "inconclusive": 0, "total": 0}
if os.path.exists(summary_file_path):
    with open(summary_file_path, 'r', encoding="utf-8") as file:
        summary = json.load(file)

def extract_answer(actual_solution, model_solution, max_retries=5, backoff_seconds=1.0):
    """Ask the Mistral model whether two solutions reach equivalent answers.

    Despite its name, this function does not parse anything: it sends both
    solutions to the model, asks for a 'true' or 'false' verdict, and returns
    the reply unchanged. A failed request is retried with exponential backoff
    so that one transient API error does not abort the whole run.

    Args:
        actual_solution: The ground-truth solution text.
        model_solution: The solution text produced by the model under test.
        max_retries: Maximum number of attempts (at least one is made).
        backoff_seconds: Delay before the second attempt; it doubles after
            every further failure.

    Returns:
        The content of the first choice in the chat completion response.

    Raises:
        Exception: Whatever the client raised on the final failed attempt.
    """
    import time  # local import: `time` is not among the file's top-level imports

    prompt = f"Compare the following two mathematical solutions:\nSolution 1:\n\"{actual_solution}\"\nSolution 2:\n\"{model_solution}\"\nDo these two solutions yield mathematically equivalent answers? Consider all standard mathematical rules and provide only a 'true' or 'false' output"
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            chat_response = client.chat.complete(
                model="mistral-large-latest",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
            )
        except Exception:
            # Broad on purpose: the client's exception types are not visible
            # in this file. The error is re-raised once attempts run out.
            if attempt == attempts:
                raise
            time.sleep(backoff_seconds * 2 ** (attempt - 1))
        else:
            return chat_response.choices[0].message.content

def call_mistral(question, max_retries=5, backoff_seconds=1.0):
    """Send a question to the Mistral model and return the text of its reply.

    The question is followed by an instruction to finish with the answer in
    the form '#### <number>'. A failed request is retried with exponential
    backoff so that one transient API error does not abort the whole run.

    Args:
        question: The problem statement to send.
        max_retries: Maximum number of attempts (at least one is made).
        backoff_seconds: Delay before the second attempt; it doubles after
            every further failure.

    Returns:
        The content of the first choice in the chat completion response.

    Raises:
        Exception: Whatever the client raised on the final failed attempt.
    """
    import time  # local import: `time` is not among the file's top-level imports

    prompt = f"{question}\nPlease end your final response with the answer in the format '#### <number>'."
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            chat_response = client.chat.complete(
                model="mistral-large-latest",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
            )
        except Exception:
            # Broad on purpose: the client's exception types are not visible
            # in this file. The error is re-raised once attempts run out.
            if attempt == attempts:
                raise
            time.sleep(backoff_seconds * 2 ** (attempt - 1))
        else:
            return chat_response.choices[0].message.content

# Main execution loop
# Problems answered by an earlier run. A set makes the resume check O(1) per
# problem instead of rescanning every stored record.
processed_questions = {record["question"] for record in outputs}

for instance in tqdm(dataset):
    question = instance["problem"]
    if question in processed_questions:
        continue  # Skip problems that already have a stored result

    solution = instance["solution"]  # Ground truth solution

    # Call the model to get its answer
    model_response = call_mistral(question)

    # Ask the model to judge equivalence. The reply is normalised before
    # matching so that "True.", " true\n" or "'false'" still count as a
    # verdict; an empty or missing reply becomes "" and is inconclusive.
    raw_verdict = extract_answer(solution, model_response) or ""
    compare = raw_verdict.strip().lower().strip(" \t\n.!'\"`*")

    # Compare the model's answer with the actual answer
    if compare == "true":
        summary["correct"] += 1
    elif compare == "false":
        summary["wrong"] += 1
    else:
        summary["inconclusive"] += 1
    summary["total"] += 1

    # Store the question and answer
    outputs.append({
        "question": question,
        "solution": solution,
        "model_response": model_response,
        "comparison": compare,
    })
    processed_questions.add(question)

    # Checkpoint after every problem so an interrupted run can be resumed
    pd.DataFrame(outputs).to_csv(output_file_path, index=False, encoding="utf-8")
    with open(summary_file_path, 'w', encoding="utf-8") as file:
        json.dump(summary, file, ensure_ascii=False, indent=4)

print(f"Results saved to {output_file_path} and {summary_file_path}")

  from .autonotebook import tqdm as notebook_tqdm
Filter: 100%|██████████| 5000/5000 [00:00<00:00, 41524.06 examples/s]
 13%|█▎        | 172/1324 [29:06<3:14:59, 10.16s/it]


ValueError: invalid literal for int() with base 10: '[invalid]'