In [30]:
# Import the packages needed
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from anthropic import Anthropic
# from google import genai
from IPython.display import Markdown, display

In [5]:
# Load variables from a .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

True

In [61]:
# Read every provider's API key from the environment.
# os.getenv returns None for a variable that is not defined.
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')
deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')
groq_api_key = os.getenv('GROQ_API_KEY')
ollama_api_key = os.getenv('OLLAMA_API_KEY')

# One row per reported provider:
# (label, key value, number of leading characters to echo, whether the key is optional).
# Only a short prefix is printed so the full key never appears in the cell output.
key_report = [
    ("OpenAI", openai_api_key, 8, False),
    ("Anthropic", anthropic_api_key, 7, True),
    ("Google", google_api_key, 2, True),
    ("DeepSeek", deepseek_api_key, 3, True),
    ("Groq", groq_api_key, 4, True),
]

for provider, key, prefix_len, optional in key_report:
    if key:
        print(f"{provider} API Key exists and begins with {key[:prefix_len]}")
    elif optional:
        print(f"{provider} API Key not set (and this is optional)")
    else:
        print(f"{provider} API Key not set")


OpenAI API Key exists and begins sk-proj-
Anthropic API Key exists and begins with sk-ant-
Google API Key exists and begins with AI
DeepSeek API Key exists and begins with sk-
Groq API Key exists and begins with gsk_


In [6]:
# Prompt asking a model to invent the benchmark question; adjacent literals are
# concatenated into a single string.
request = (
    "Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. "
    "Answer only with the question, no explanation."
)

# Single-turn chat payload carrying the prompt as the user message.
messages = [dict(role="user", content=request)]

print(f"The question is: {messages[0]['content']}")

The question is: Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. Answer only with the question, no explanation.


In [None]:
openai_client = OpenAI()

# Ask gpt-4o-mini to produce the question that every competitor will answer.
response = openai_client.chat.completions.create(messages=messages, model="gpt-4o-mini")
question = response.choices[0].message.content

# Show the question twice: rendered as Markdown, then as plain text.
display(Markdown(question))
print(question)

If you were tasked with designing an entirely new ethical framework to govern the development and implementation of artificial intelligence, what core principles would you prioritize, and how would you address potential conflicts between those principles in practical scenarios?

If you were tasked with designing an entirely new ethical framework to govern the development and implementation of artificial intelligence, what core principles would you prioritize, and how would you address potential conflicts between those principles in practical scenarios?


In [10]:
# Parallel lists: competitors[i] is the model name that produced answers[i].
competitors, answers = [], []

# From here on, `messages` carries the generated question instead of the original request.
messages = [dict(role="user", content=question)]

In [None]:
# GPT 4o mini
model_name = "gpt-4o-mini"

response = openai_client.chat.completions.create(
    model=model_name,
    messages=messages
)

answer = response.choices[0].message.content
display(Markdown(answer))

# Record the answer idempotently: running this cell again replaces the earlier
# entry for this model instead of appending a duplicate competitor.
if model_name in competitors:
    answers[competitors.index(model_name)] = answer
else:
    competitors.append(model_name)
    answers.append(answer)



In [None]:
# Anthropic Claude 3.7 Sonnet
anthropic_client = Anthropic()
model_name = "claude-3-7-sonnet-20250219"

# NOTE(review): max_tokens=1000 caps this answer, while the other calls in this
# notebook set no explicit limit; confirm the cap does not truncate the response.
response = anthropic_client.messages.create(
    model=model_name,
    messages=messages,
    max_tokens=1000
)

# The reply text is read from the first content block of the response.
answer = response.content[0].text
display(Markdown(answer))

# Record the answer idempotently: running this cell again replaces the earlier
# entry for this model instead of appending a duplicate competitor.
if model_name in competitors:
    answers[competitors.index(model_name)] = answer
else:
    competitors.append(model_name)
    answers.append(answer)

In [None]:
# I ran a cell twice, which appended a duplicate answer, so I had to delete the second entry.
# The commented-out code below cleaned it up without rerunning the entire notebook.
#

# print(competitors)

# from IPython.display import Markdown, display

# for idx, answer in enumerate(answers, 1):
#     display(Markdown(f"**Answer {idx}:**\n\n{answer}\n\n---"))

# del competitors[1]
# del answers[1]

# print(competitors)
# for idx, answer in enumerate(answers, 1):
#     display(Markdown(f"**Answer {idx}:**\n\n{answer}\n\n---"))


In [None]:
# This is roughly how the native Google client would be called, but google.generativeai
# cannot be imported because it is not installed in this environment.
#
# gemini_client = GoogleGenerativeAI()
# model_name = "gemini-2.0-flash-exp"

# response = gemini_client.generate_content(
#     model=model_name,
#     contents=messages
# )

# answer = response.text
# display(Markdown(answer))

# competitors.append(model_name)
# answers.append(answer)

# Gemini through Google's OpenAI-compatible endpoint
model_name = "gemini-2.0-flash"

if google_api_key:
    gemini_client = OpenAI(api_key=google_api_key, base_url="https://generativelanguage.googleapis.com/v1beta/openai/")

    response = gemini_client.chat.completions.create(
        model=model_name,
        messages=messages
    )
    answer = response.choices[0].message.content

    display(Markdown(answer))

    # Record the answer idempotently: running this cell again replaces the earlier
    # entry for this model instead of appending a duplicate competitor.
    if model_name in competitors:
        answers[competitors.index(model_name)] = answer
    else:
        competitors.append(model_name)
        answers.append(answer)
else:
    # With api_key=None the OpenAI client falls back to OPENAI_API_KEY, which would
    # send the OpenAI key to this third-party endpoint, so skip the call instead.
    print(f"Skipping {model_name}: GOOGLE_API_KEY is not set")

In [None]:
# DeepSeek Chat
model_name = "deepseek-chat"

if deepseek_api_key:
    deepseek_client = OpenAI(api_key=deepseek_api_key, base_url="https://api.deepseek.com/v1")

    response = deepseek_client.chat.completions.create(
        model=model_name,
        messages=messages
    )
    answer = response.choices[0].message.content

    display(Markdown(answer))

    # Record the answer idempotently: running this cell again replaces the earlier
    # entry for this model instead of appending a duplicate competitor.
    if model_name in competitors:
        answers[competitors.index(model_name)] = answer
    else:
        competitors.append(model_name)
        answers.append(answer)
else:
    # With api_key=None the OpenAI client falls back to OPENAI_API_KEY, which would
    # send the OpenAI key to this third-party endpoint, so skip the call instead.
    print(f"Skipping {model_name}: DEEPSEEK_API_KEY is not set")

In [None]:
# Groq
model_name = "llama-3.3-70b-versatile"

if groq_api_key:
    groq_client = OpenAI(api_key=groq_api_key, base_url="https://api.groq.com/openai/v1")

    response = groq_client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=0.7
    )
    answer = response.choices[0].message.content

    display(Markdown(answer))

    # Record the answer idempotently: running this cell again replaces the earlier
    # entry for this model instead of appending a duplicate competitor.
    if model_name in competitors:
        answers[competitors.index(model_name)] = answer
    else:
        competitors.append(model_name)
        answers.append(answer)
else:
    # With api_key=None the OpenAI client falls back to OPENAI_API_KEY, which would
    # send the OpenAI key to this third-party endpoint, so skip the call instead.
    print(f"Skipping {model_name}: GROQ_API_KEY is not set")

In [None]:
# Ollama from local machine
# Currently llama3.2:latest is being used.
# We can also pull other models such as DeepSeek, Gemma, Phi, or Qwen

# The OpenAI client needs some api_key value: with None it falls back to
# OPENAI_API_KEY and raises if that is unset too. Use a placeholder when
# OLLAMA_API_KEY is not defined so this cell does not depend on the OpenAI key.
ollama_client = OpenAI(api_key=ollama_api_key or "ollama", base_url="http://localhost:11434/v1")
model_name = "llama3.2:latest"

response = ollama_client.chat.completions.create(
    model=model_name,
    messages=messages,
    temperature=0.7
)
answer = response.choices[0].message.content

display(Markdown(answer))

# Record the answer idempotently: running this cell again replaces the earlier
# entry for this model instead of appending a duplicate competitor.
if model_name in competitors:
    answers[competitors.index(model_name)] = answer
else:
    competitors.append(model_name)
    answers.append(answer)


In [48]:
# List every competitor recorded so far. The list order matters: position i+1 is the
# "competitor number" used in the judging prompt and when decoding the ranking.
print(competitors)


['gpt-4o-mini', 'claude-3-7-sonnet-20250219', 'gemini-2.0-flash', 'deepseek-chat', 'llama-3.3-70b-versatile', 'llama3.2:latest']


In [None]:
# Pair each model name with the answer it produced, in collection order.
competitors_answers = [(name, reply) for name, reply in zip(competitors, answers)]

# Print one "model: answer" entry per competitor.
for competitor, answer in competitors_answers:
    print(f"{competitor}: {answer}")

In [None]:
# Show the raw list of (competitor, answer) tuples.
print(competitors_answers)

In [55]:
# Concatenate all answers into one document: a numbered heading naming the model,
# followed by that model's answer, each separated by a blank line.
together = "".join(
    f"# Response from competitor {index+1} - {competitors[index]}\n\n" + answer + "\n\n"
    for index, answer in enumerate(answers)
)

# display(Markdown(together))

In [56]:
# Build the judging prompt: it states how many competitors there are, repeats the
# question, embeds every collected response, and asks for a JSON ranking of competitor
# numbers. The doubled braces {{ }} render as literal braces inside the f-string.
judge = f"""You are judging a competition between {len(competitors)} competitors.
Each model has been given this question:

{question}

Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.
Respond with JSON, and only JSON, with the following format:
{{"results": ["best competitor number", "second best competitor number", "third best competitor number", ...]}}

Here are the responses from each competitor:

{together}

Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks."""

In [None]:
# Inspect the full judging prompt before it is sent.
print(judge)

In [None]:
# Wrap the judging prompt as a single user message for the judge model.
judge_messages = [dict(role="user", content=judge)]

In [None]:
# Let o3-mini act as the judge and rank the collected responses.
judge_response = openai_client.chat.completions.create(messages=judge_messages, model="o3-mini")

# Keep the judge's reply text (the prompt asks for JSON) and show it.
results = judge_response.choices[0].message.content
print(results)

{"results": ["3", "1", "5", "4", "2", "6"]}


In [None]:
# Convert the judge's JSON reply into a ranked list of model names.

# Despite the prompt, a model may wrap its JSON in a ```json ... ``` fence, which
# json.loads cannot parse; strip such a fence before decoding.
cleaned = results.strip()
if cleaned.startswith("```"):
    cleaned = cleaned.strip("`").strip()
    if cleaned.lower().startswith("json"):
        cleaned = cleaned[4:]

results_dict = json.loads(cleaned)
ranks = results_dict["results"]
for index, result in enumerate(ranks):
    # Competitor numbers in the prompt are 1-based; convert to a list position.
    position = int(result) - 1
    if not 0 <= position < len(competitors):
        # A number of 0 or below would wrap around via negative indexing and report
        # the wrong model; a number that is too large would raise IndexError.
        print(f"Rank {index+1}: unknown competitor number {result!r}")
        continue
    competitor = competitors[position]
    print(f"Rank {index+1}: {competitor}")

Rank 1: gemini-2.0-flash
Rank 2: gpt-4o-mini
Rank 3: llama-3.3-70b-versatile
Rank 4: deepseek-chat
Rank 5: claude-3-7-sonnet-20250219
Rank 6: llama3.2:latest
