In [34]:
# Start with imports - ask ChatGPT to explain any package that you don't know

import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from anthropic import Anthropic
from IPython.display import Markdown, display

In [35]:
# Load variables from the .env file into the process environment before any key is read.
# override=True asks python-dotenv to let values from .env replace variables that are
# already set in the environment.
load_dotenv(override=True)

True

In [37]:
# Read every provider key from the environment, then report which ones are present.
# Only a short prefix of each key is echoed, to help with debugging.

openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')
deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')
groq_api_key = os.getenv('GROQ_API_KEY')

# One row per provider: (label, key value, prefix length to show, message when missing)
_key_report = (
    ("OpenAI", openai_api_key, 8, "OpenAI API Key not set"),
    ("Anthropic", anthropic_api_key, 7, "Anthropic API Key not set (and this is optional)"),
    ("Google", google_api_key, 2, "Google API Key not set (and this is optional)"),
    ("DeepSeek", deepseek_api_key, 3, "DeepSeek API Key not set (and this is optional)"),
    ("Groq", groq_api_key, 4, "Groq API Key not set (and this is optional)"),
)

for _label, _key, _shown, _missing in _key_report:
    # A missing or empty key is falsy and gets the "not set" message.
    print(f"{_label} API Key exists and begins {_key[:_shown]}" if _key else _missing)

OpenAI API Key exists and begins sk-proj-
Anthropic API Key exists and begins sk-ant-
Google API Key exists and begins AI
DeepSeek API Key not set (and this is optional)
Groq API Key exists and begins gsk_


In [38]:
# One client per provider. Groq (Llama, Mixtral) and Google Gemini are reached through
# the OpenAI SDK by pointing base_url at their OpenAI-compatible endpoints.
# NOTE(review): if a key is None the OpenAI SDK may fall back to OPENAI_API_KEY from the
# environment -- confirm before relying on the Groq/Gemini clients with unset keys.
GROQ_BASE_URL = "https://api.groq.com/openai/v1"
GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"

openai_client = OpenAI(api_key=openai_api_key)

groq_client = OpenAI(api_key=groq_api_key, base_url=GROQ_BASE_URL)

# Anthropic (Claude) uses its own SDK rather than the OpenAI-compatible route.
anthropic_client = Anthropic(api_key=anthropic_api_key)

google_client = OpenAI(api_key=google_api_key, base_url=GEMINI_BASE_URL)



In [39]:
# Prompt asking a model to invent the single question that every competitor will answer.
# The second sentence constrains the reply to the bare question text, because the reply
# is passed on verbatim as the question.
request = (
    "Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. "
    "Answer only with the question, with no explanation; keep it crisp and phrase it as a question."
)
messages = [{"role": "user", "content": request}]

In [40]:
# Aliases of the competing models, in the order they are queried.
# Each alias is looked up in MODEL_REGISTRY to find its client and provider model id.
COMPETITOR_MODELS = ["gpt-4.1-mini", "llama-3.1-8b", "gemini-2.0-flash"]

In [41]:
# Ask the first competitor to write the question everyone will answer.
# COMPETITOR_MODELS[0] is passed directly as the API model id here.
openai = OpenAI()  # this name is reused by the judging cell further down
completion = openai.chat.completions.create(messages=messages, model=COMPETITOR_MODELS[0])
question = completion.choices[0].message.content
print(question)


How can the concept of free will be reconciled with deterministic physical laws without resorting to dualism or randomness?


In [42]:
# Alias -> how to reach the model: the client to call and the model id that provider
# expects (the Groq id differs from its alias).
_REGISTRY_ROWS = (
    # (alias, client, provider model id)
    ("gpt-4.1-mini", openai_client, "gpt-4.1-mini"),
    ("gpt-4.1", openai_client, "gpt-4.1"),
    ("llama-3.1-8b", groq_client, "llama-3.1-8b-instant"),
    ("gemini-2.0-flash", google_client, "gemini-2.0-flash"),
)
MODEL_REGISTRY = {
    alias: {"client": api_client, "model_name": api_model}
    for alias, api_client, api_model in _REGISTRY_ROWS
}

In [43]:
def get_model_answer(
    client,
    model_name,
    question,
    system_prompt="You are a helpful assistant that can answer questions.",
):
    """Send one question to a chat model and return the text of its reply.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``,
            such as the OpenAI-compatible clients built earlier in this notebook.
        model_name: Model identifier passed to the provider as ``model``.
        question: Text sent as the user message.
        system_prompt: Text sent as the system message. Defaults to the generic
            assistant prompt, so existing three-argument calls behave as before.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]
    response = client.chat.completions.create(model=model_name, messages=conversation)
    return response.choices[0].message.content


In [48]:
# Put the same question to every competitor. answers[i] is the reply from competitors[i],
# where competitors holds the provider model ids (not the registry aliases).
answers, competitors = [], []
for alias in COMPETITOR_MODELS:
    print(alias)  # progress marker: shows which model is being queried
    entry = MODEL_REGISTRY[alias]
    competitors.append(entry["model_name"])
    answers.append(get_model_answer(entry["client"], entry["model_name"], question))






gpt-4.1-mini
llama-3.1-8b
gemini-2.0-flash


In [49]:
# Dump the raw replies, then the model ids they came from, one list per line.
for collected in (answers, competitors):
    print(collected)

['Reconciling free will with deterministic physical laws, without invoking dualism (a separate non-physical realm) or randomness, is a classic and nuanced philosophical challenge. Here are some approaches and concepts that attempt to resolve this tension within a monistic, deterministic framework:\n\n### 1. Compatibilism (Soft Determinism)\nCompatibilism is the most prominent strategy. It redefines free will so that **free will and determinism are not mutually exclusive**. The key idea is that free will does not require indeterminism or the absence of causality, but rather the ability to act according to one’s desires, intentions, and rational deliberations—even if these are causally determined.\n\n- **Freedom as Volitional Control:** Free will is understood as acting in alignment with one’s internal states (beliefs, desires, reasons) without external coercion or compulsion.\n- **Example:** If you decide to have tea instead of coffee because you prefer tea, and that preference is cause

In [57]:
# Pair each answer with its position and the model that produced it, so the judge
# can refer to an answer by its 'idx'.
answers_json = [
    {"idx": position, "model": model, "answer": text}
    for position, (model, text) in enumerate(zip(competitors, answers))
]

print(answers_json)


[{'idx': 0, 'model': 'gpt-4.1-mini', 'answer': 'Reconciling free will with deterministic physical laws, without invoking dualism (a separate non-physical realm) or randomness, is a classic and nuanced philosophical challenge. Here are some approaches and concepts that attempt to resolve this tension within a monistic, deterministic framework:\n\n### 1. Compatibilism (Soft Determinism)\nCompatibilism is the most prominent strategy. It redefines free will so that **free will and determinism are not mutually exclusive**. The key idea is that free will does not require indeterminism or the absence of causality, but rather the ability to act according to one’s desires, intentions, and rational deliberations—even if these are causally determined.\n\n- **Freedom as Volitional Control:** Free will is understood as acting in alignment with one’s internal states (beliefs, desires, reasons) without external coercion or compulsion.\n- **Example:** If you decide to have tea instead of coffee becaus

In [58]:
# System prompt for the judge. The field names listed here must match the keys of the
# entries in answers_json ('idx', 'model', 'answer'), since the judge ranks answers by 'idx'.
judge_prompt = (
    "You are judging a competition between multiple models.\n"
    "You will receive a JSON object with:\n"
    "- 'question': the question text\n"
    "- 'answers': a list of objects with 'idx', 'model', 'answer'\n\n"
    "Your job:\n"
    "1. Read all answers carefully.\n"
    "2. Rank them from best to worst based on:\n"
    "   - correctness\n"
    "   - clarity\n"
    "   - depth of reasoning\n\n"
    "Return ONLY valid JSON in this format:\n"
    "{ \"results\": [best_idx, second_idx, third_idx, ...] }\n"
    "Do NOT add explanation, comments, or extra text."
)

# Payload sent as the user message: the question plus every competitor's answer.
judge_input = {
    "question": question,
    "answers": answers_json,
}

judge_response = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": judge_prompt},
        {"role": "user", "content": json.dumps(judge_input)},
    ],
    # JSON mode: without it the reply may be wrapped in markdown fences or prose,
    # which makes json.loads below raise.
    response_format={"type": "json_object"},
)

judge_output = json.loads(judge_response.choices[0].message.content)

# The ranking is used as list indices later, so it must name every answer exactly once.
ranking = judge_output.get("results")
expected_indices = set(range(len(answers_json)))
assert (
    isinstance(ranking, list)
    and len(ranking) == len(expected_indices)
    and set(ranking) == expected_indices
), f"Judge returned an invalid ranking: {judge_output}"

print(judge_output)


{'results': [0, 2, 1]}


In [60]:
# Translate the judge's ranked answer positions back into model ids, best first.
ranking_models = [competitors[position] for position in judge_output["results"]]

print(ranking_models)



['gpt-4.1-mini', 'gemini-2.0-flash', 'llama-3.1-8b-instant']
