# GSM8K

In [None]:
import json
import pandas as pd
import os
import re
from openai import OpenAI
from datasets import load_dataset
from tqdm import tqdm

# Retrieve the API key from environment variables.
# Fail fast when it is missing: passing api_key=None lets the OpenAI client
# fall back to its own default credential lookup, which could send a key meant
# for a different provider to the DeepSeek endpoint and only fail later with
# an opaque 401.
api_key = os.getenv("DEEPSEEK_API_KEY")
if not api_key:
    raise RuntimeError(
        "DEEPSEEK_API_KEY environment variable is not set; "
        "export it before running this evaluation."
    )
client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

# Load the GSM8K dataset (test split of the 'main' configuration)
dataset = load_dataset('gsm8k', 'main', split='test')

# Labels used only to build the result file names below
benchmark = "gsm8k"
certainty = "control"
model = "deepseek-v3"

# Per-question results (CSV) and running tallies (JSON) for this run
output_file_path = f"{benchmark}_{certainty}_{model}.csv"
summary_file_path = f"{benchmark}_{certainty}_{model}_summary.json"

# Resume support: pick up per-question rows and running tallies that an
# earlier run saved to disk; otherwise start from an empty state.
outputs = []
if os.path.exists(output_file_path):
    output_df = pd.read_csv(output_file_path)
    outputs = output_df.to_dict('records')

if not os.path.exists(summary_file_path):
    # Fresh run: every counter starts at zero
    summary = dict.fromkeys(('correct', 'wrong', 'inconclusive', 'total'), 0)
else:
    with open(summary_file_path, 'r', encoding='utf-8') as file:
        summary = json.load(file)

def extract_answer(response):
    """Extract the numerical answer by finding the last number that occurs.

    The last number-like token in ``response`` is normalized so that equal
    values compare equal as strings: thousands separators are removed,
    punctuation trailing the number (e.g. a sentence-ending period) is
    stripped, and insignificant fractional zeros are dropped. A genuine
    decimal point is preserved, so "3.5" stays "3.5" rather than "35".

    Args:
        response: Text to search. May be None or empty.

    Returns:
        The normalized number as a string, or "[invalid]" when ``response``
        is empty or contains no number.
    """
    if not response:
        return "[invalid]"

    matches = re.findall(r"-?\d[\d.,]*", response)
    if not matches:
        return "[invalid]"

    # The pattern also captures punctuation glued to the end of the number
    # ("18." at the end of a sentence, "18," in a list); drop it first, then
    # remove thousands separators.
    answer = matches[-1].rstrip(".,").replace(",", "")

    # Canonicalize a plain decimal so "18.00" == "18" and "3.50" == "3.5".
    # Tokens with several dots are left untouched because their meaning is
    # ambiguous.
    if answer.count(".") == 1:
        answer = answer.rstrip("0").rstrip(".")
    return answer

def call_deepseek(question):
    """Call model with a question and return the response.

    Sends a single chat-completion request with temperature 0. Any exception
    raised by the client propagates to the caller.

    Args:
        question: Problem text. An instruction asking the model to finish
            with '#### <number>' is appended before sending.

    Returns:
        The text of the first completion choice, or "" when the API returns
        no text content for that choice.
    """
    prompt = f"{question}\nPlease end your final response with the answer in the format '#### <number>'."
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
        stream=False,
        temperature=0,
    )
    # `content` is optional in the response schema; normalize a missing value
    # to an empty string so downstream text parsing never receives None.
    return response.choices[0].message.content or ""

# Main execution loop
# Questions already present in the saved results. Kept as a set so the resume
# check is a constant-time lookup instead of a rescan of every stored row for
# each dataset instance.
processed_questions = {output['question'] for output in outputs}

for instance in tqdm(dataset):
    question = instance['question']
    if question in processed_questions:
        continue  # Skip already processed questions

    solution = instance['answer']  # Ground truth answer

    # Call the model to get its answer
    model_response = call_deepseek(question)

    model_answer = extract_answer(model_response)
    actual_answer = extract_answer(solution)

    # Compare the model's response with the actual answer
    if model_answer == "[invalid]":
        summary["inconclusive"] += 1
    elif model_answer == actual_answer:
        summary["correct"] += 1
    else:
        summary["wrong"] += 1
    summary["total"] += 1

    # Store the question and answer
    output_record = {
        "question": question,
        "model_response": model_response,
        "actual_answer": actual_answer,
        "model_answer": model_answer,
        "correct": actual_answer == model_answer
    }
    outputs.append(output_record)
    processed_questions.add(question)

    # Checkpoint after every question so an interrupted run can be resumed
    pd.DataFrame(outputs).to_csv(output_file_path, index=False, encoding='utf-8')
    with open(summary_file_path, 'w', encoding='utf-8') as file:
        json.dump(summary, file, ensure_ascii=False, indent=4)

print(f"Results saved to {output_file_path} and {summary_file_path}")

  0%|          | 0/1319 [00:00<?, ?it/s]


AuthenticationError: Error code: 401 - {'error': {'message': 'Authentication Fails (no such user)', 'type': 'authentication_error', 'param': None, 'code': 'invalid_request_error'}}