Using and comparing different models

In [1]:
import json
import os

from dotenv import load_dotenv
from IPython.display import Markdown, display
from openai import OpenAI
from tabulate import tabulate

In [None]:
# Load variables from the .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

In [None]:
# Read each provider's API key from the environment.
openai_api_key = os.getenv("OPENAI_API_KEY")
google_api_key = os.getenv("GOOGLE_API_KEY")
gemini_api_key = os.getenv("GEMINI_API_KEY")
ollama_api_key = os.getenv("OLLAMA_API_KEY")

# Report which keys were found; only the first 8 characters of a key are echoed.
key_report = (
    ("OPENAI_API_KEY", openai_api_key),
    ("GOOGLE_API_KEY", google_api_key),
    ("GEMINI_API_KEY", gemini_api_key),
    ("OLLAMA_API_KEY", ollama_api_key),
)
for env_name, key_value in key_report:
    status = f"{env_name} is set {key_value[:8]}" if key_value else f"{env_name} is not set"
    print(status)

In [4]:
# Model identifiers used for each provider.
openai_model_name = "gpt-4o-mini"
google_model_name = "gemini-2.0-flash"
gemini_model_name = "gemini-2.0-flash"
ollama_model_name = "llama3.2"

# Prompt asking a model to invent the question that every competitor will answer.
request = (
    "I want to evaluate the performance of multiple LLMs. "
    "I need you to give me a challenging question which can be used "
    "to evaluate the performance of multiple LLMs. "
    "Provide only Question, no answer."
)
messages = [dict(role="user", content=request)]

In [None]:
# Display the chat payload that will be sent to the model.
messages 

In [None]:
# OpenAI() is created without arguments, so the client picks up its own credentials.
openai_client = OpenAI()

# Send the current `messages` payload and keep the reply as the competition question.
response = openai_client.chat.completions.create(
    model=openai_model_name,
    messages=messages,
)
question = response.choices[0].message.content
display(Markdown(question))

In [7]:
# Parallel lists: competitors[i] is the model that produced answers[i].
competitors, answers = [], []

# From here on, every model is sent the generated question itself.
messages = [dict(role="user", content=question)]

In [None]:
# First competitor: the OpenAI model answers the question.
response = openai_client.chat.completions.create(
    model=openai_model_name,
    messages=messages,
)
openai_model_answer = response.choices[0].message.content
display(Markdown(openai_model_answer))

# Record the model name and its answer at the same position in the two lists.
competitors.append(openai_model_name)
answers.append(openai_model_answer)

In [None]:
# Second competitor: Gemini, reached through Google's OpenAI-compatible endpoint,
# so the regular OpenAI client is reused with a different base_url.
#
# Accept either GEMINI_API_KEY or GOOGLE_API_KEY. When api_key is None the OpenAI
# client falls back to OPENAI_API_KEY, which would send the OpenAI secret to Google.
gemini_key = gemini_api_key or google_api_key
if not gemini_key:
    raise ValueError("Set GEMINI_API_KEY or GOOGLE_API_KEY before calling Gemini")

gemini_openai_client = OpenAI(api_key=gemini_key, base_url="https://generativelanguage.googleapis.com/v1beta/openai/")

response = gemini_openai_client.chat.completions.create(model=gemini_model_name, messages=messages)
gemini_model_answer = response.choices[0].message.content

display(Markdown(gemini_model_answer))

# Record the model name and its answer at the same position in the two lists.
competitors.append(gemini_model_name)
answers.append(gemini_model_answer)

#### No need to install a model using `!ollama pull llama3.2` because I am running this in a docker container

Step 1: Pull the Image

> docker pull alpine/ollama

Step 2: Run the Container

Publish port 11434 so that the notebook can reach Ollama at `http://localhost:11434`.

> docker run -d \
>   --name ollama \
>   -p 11434:11434 \
>   -v ollama_data:/root/.ollama \
>   alpine/ollama

Step 3: Pull the Model

> docker exec -it ollama ollama pull llama3.2

Step 4: Run the Model

> docker exec -it ollama ollama run llama3.2

In [None]:
# Third competitor: a local Ollama model, served through Ollama's OpenAI-compatible API.
#
# Ollama does not check the key, but the OpenAI client needs a non-empty value.
# With api_key=None it would fall back to OPENAI_API_KEY or raise, so a
# placeholder is used when OLLAMA_API_KEY is unset.
ollama_openai_client = OpenAI(api_key=ollama_api_key or "ollama", base_url="http://localhost:11434/v1")

response = ollama_openai_client.chat.completions.create(model=ollama_model_name, messages=messages)

ollama_model_answer = response.choices[0].message.content
display(Markdown(ollama_model_answer))

# Record the model name and its answer at the same position in the two lists.
competitors.append(ollama_model_name)
answers.append(ollama_model_answer)

In [None]:
# Pair each competitor with its answer and print them as a plain-text grid.
table_data = list(zip(competitors, answers))

table = tabulate(table_data, headers=["Competitor", "Answer"], tablefmt="grid")

print(table)


In [None]:
# Concatenate all answers into one text block, each prefixed with its model's name.
together = "".join(
    f"{competitors[position]}: {reply}\n\n" for position, reply in enumerate(answers)
)

display(Markdown(together))

In [22]:
# Prompt for the judging step. Responses in `together` are labelled with each
# competitor's model name, so the requested JSON ranks competitors by name,
# which is also how the parsed result is printed later.
judge = f"""You are judging a competition between {len(competitors)} competitors.
Each model has been given this question:

{question}

Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.
Respond with JSON, and only JSON, with the following format:
{{"results": ["best competitor name", "second best competitor name", "third best competitor name", ...]}}
Use each competitor's name exactly as it appears before its response below.

Here are the responses from each competitor:

{together}

Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks."""

In [None]:
# Print the fully rendered judge prompt so it can be checked before it is sent.
print(judge)


In [26]:
# Wrap the judge prompt as a single user message; each judging model is sent this same payload.
judge_messages = [{"role": "user", "content": judge}]

In [None]:
# Have the OpenAI model act as judge and keep its raw reply text.
response = openai_client.chat.completions.create(
    model=openai_model_name,
    messages=judge_messages,
)
openai_judge_result = response.choices[0].message.content
print(openai_judge_result)

In [None]:
# Have the Gemini model act as judge and keep its raw reply text.
response = gemini_openai_client.chat.completions.create(
    model=gemini_model_name,
    messages=judge_messages,
)
gemini_judge_result = response.choices[0].message.content
print(gemini_judge_result)

In [None]:
# Have the local Ollama model act as judge and keep its raw reply text.
response = ollama_openai_client.chat.completions.create(
    model=ollama_model_name,
    messages=judge_messages,
)
ollama_judge_result = response.choices[0].message.content
print(ollama_judge_result)

In [None]:
def parse_judge_json(raw_text):
    """Decode a judge reply as JSON, tolerating a surrounding Markdown code fence.

    Models sometimes wrap their JSON in a ``` fence even when told not to,
    which makes a plain json.loads call fail.

    Args:
        raw_text: The judge model's reply text.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the text is not valid JSON after fence removal.
    """
    text = raw_text.strip()
    if text.startswith("```"):
        # Drop the opening fence line (it may carry a language tag such as ```json),
        # then everything from the closing fence onwards.
        text = text.partition("\n")[2].rsplit("```", 1)[0]
    return json.loads(text)


# Expected reply shape, e.g. {"results": ["gemini-2.0-flash", "gpt-4o-mini", "llama3.2"]}
results_dict = parse_judge_json(openai_judge_result)

print(results_dict)

ranks_list = results_dict["results"]

print(ranks_list)

# Ranks are reported 1-based, best first.
for rank, competitor in enumerate(ranks_list, start=1):
    print(f"Rank {rank}: {competitor}")





# Exercise 

Which pattern(s) did this use? 
1. Prompt Chaining
2. Parallelization (the notebook code does the orchestration itself, acting as coordinator and aggregator)
3. Evaluator-Optimizer (Validator)

Try updating this to add another Agentic design pattern. 

# Commercial implications

Patterns like these, where a task is sent to multiple models and the results are evaluated, are common when you need to improve the quality of your LLM responses.

This approach can be universally applied to business projects where accuracy is critical. 