In [1]:
import json
import os
import time
from typing import Dict, List, Literal, Optional

import dotenv
import openai
import requests
from pydantic import BaseModel

# Load variables from a .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
dotenv.load_dotenv(override=True)

# Endpoint and credentials are read from the environment. os.getenv returns
# None for an unset variable, so a missing value only surfaces when it is used.
API_URL = os.getenv('API_URL')
API_KEY = os.getenv('API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
NVIDIA_API_KEY = os.getenv('NVIDIA_API_KEY')

### Set up a function for calling the Chai API

In [2]:
def send_chat_message(
    messages: List[Dict[str, str]],
    max_retries: int = 3,
    timeout: float = 30.0,
) -> dict:
    """
    Send a chat history to the Chai API, retrying when the request fails.

    Args:
        messages: List of message dictionaries, each containing 'sender' and 'message'
        max_retries: Total number of attempts before giving up (must be >= 1)
        timeout: Seconds to wait for the server on each attempt

    Returns:
        dict: Decoded JSON response from the API

    Raises:
        ValueError: If max_retries is less than 1
        requests.exceptions.RequestException: If the final attempt also fails
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    chai_request = {
        "memory": "The year is 500 AD. You are the great sage of Babylon. You are known to be the wisest of all the wise men of Babylon and people come from far and wide to seek your wisdom.",
        "prompt": "An engaging conversation with Great Sage of Babylon.",
        "bot_name": "Great Sage of Babylon",
        "user_name": "User",
        "chat_history": messages,
    }
    headers = {"Authorization": f"Bearer {API_KEY}"}

    for attempt in range(max_retries):
        try:
            # requests has no default timeout; without one a stalled server
            # would block this call (and the notebook cell) indefinitely.
            response = requests.post(
                API_URL, json=chai_request, headers=headers, timeout=timeout
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException:
            if attempt == max_retries - 1:  # Last attempt
                print(f"Failed to get response after {max_retries} attempts")
                raise  # Re-raise the last exception
            # Exponential backoff: 1s, 2s, 4s, ... between attempts
            time.sleep(2 ** attempt)


# Smoke test: send a single user message to the Chai API and pretty-print
# the JSON reply so the response structure is visible in the cell output.
messages = [{"sender": "user", "message": "What is the capital of France?"}]
response = send_chat_message(messages)
print(json.dumps(response, indent=4))


{
    "model_output": " Paris, the capital of France. It is known for its art, culture, cuisine and fashion. Some famous landmarks include the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral. It is the most populated city in France with over 2 million residents. Is there anything specific you wanted to know about it?",
    "model_input": "User: What is the capital of France?\nGreat Sage of Babylon:",
    "model_name": "blend_jikul_2024-11-25",
    "generation_params": {
        "temperature": 0.9,
        "top_p": 1.0,
        "min_p": 0.05,
        "top_k": 80,
        "presence_penalty": 0.0,
        "frequency_penalty": 0.0,
        "stopping_words": [
            "\n",
            "</s>",
            "####",
            "Bot:",
            "User:",
            "You:",
            "<|im_end|>",
            "<|eot_id|>"
        ],
        "max_input_tokens": 256,
        "best_of": 4,
        "max_output_tokens": 64
    },
    "formatter": {
        "memory_template": "",
     

### Generate test cases

In [3]:
# OpenAI client authenticated with OPENAI_API_KEY; the GPT-4o calls in this notebook go through it.
client = openai.OpenAI(api_key=OPENAI_API_KEY)


class TestCase(BaseModel):
    # One knowledge question paired with its reference answer.
    # (Documented with comments rather than a docstring so the JSON schema
    # derived from this model is unchanged.)
    question: str  # question text posed to the model under test
    answer: str  # reference answer the response is graded against

class TestCaseResponse(BaseModel):
    # Wrapper model passed as response_format when requesting test cases,
    # so the reply is a single object holding the list of cases.
    test_cases: List[TestCase]


def generate_test_cases(num_cases: int = 10) -> List[TestCase]:
    """
    Use GPT-4o to generate diverse knowledge test cases.

    Args:
        num_cases: Number of test cases to request from the model. The model is
            asked for this many; the count actually returned is not enforced.

    Returns:
        List of TestCase objects. An empty list is returned when num_cases is
        not positive, when the model refuses, or when the API call fails (the
        error is printed rather than raised).
    """
    if num_cases <= 0:
        # Nothing to ask for; avoid a pointless API call.
        return []

    # NOTE: the field names in the final line of the prompt must match the
    # TestCase schema (question, answer), otherwise the prompt contradicts the
    # structured-output schema sent alongside it.
    system_prompt = """
    Generate challenging test cases to evaluate an AI model that acts as a Babylonian sage from 500 AD.
    We are specifically interested in testing the model's knowledge.

    Keep in mind that the model we are testing is optimized for social interactions. It is uniquely good at being engaging, entertaining, and 
    aligned with the user who is interacting with it. It is not optimized for logical reasoning, so take that into account when generating test cases.

    Furthermore (and this is very important), I want to evaluate the model objectively. So try to generate test cases that have objective answers. 
    However, the answers should be obscure if possible, so that a small LLM may not be able to answer it (but importantly, these need to be things you
    are sure about, not made up facts). For example, the question "What is the name of the Babylonian king who is famous for creating one of the earliest 
    and most complete written legal codes?" has an objective answer (Hammurabi), but it is a well-known fact taught in high school history classes, 
    so it's probably not obscure enough to be a good test case. Also, the question should have a single objectively correct answer, not multiple correct answers.

    For example, the question "What is the capital of France?" has a single objective answer (Paris). The question "Who are Barack and Michelle Obama's children?",
    it has what most people would consider a single objective answer (Malia and Sasha Obama). The question "What are the best poems from ancient Babylon" is not 
    objective, because there are many different opinions on what the best poems from ancient Babylon are. Where possible, generate test cases that have objective
    answers, and that relate to questions you can imagine a traveler in the ancient world might ask of a Babylonian sage.


    For each question, provide:
    
    1. The question text
    2. Correct answer

    Format as JSON array of objects with fields: question, answer
    """

    try:
        response = client.beta.chat.completions.parse(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Generate {num_cases} diverse test cases."}
            ],
            # Temperature is set to 1 to encourage diversity, but it would be interesting to experiment with different values, especially if
            # we built a better framework for evaluating the test cases (e.g. against other frontier models to filter out bad questions)
            temperature=1,
            response_format=TestCaseResponse
        )
        message = response.choices[0].message
        # parse() already validates the reply against TestCaseResponse and
        # exposes it as `parsed`; it is None when the model refuses.
        if message.parsed is None:
            print(f"Error generating test cases: no parsed output (refusal: {message.refusal})")
            return []
        return list(message.parsed.test_cases)
    except Exception as e:
        # Best-effort: report the failure and let the notebook continue with no cases.
        print(f"Error generating test cases: {e}")
        return []

# Request 20 knowledge test cases; the result is an empty list if generation fails.
knowledge_test_cases = generate_test_cases(20)


In [4]:
print("KNOWLEDGE TEST CASES\n")
for test_case in knowledge_test_cases:
    # One question line, one answer line, then a blank gap between entries
    print(f"Question:  {test_case.question}")
    print(f"Answer:  {test_case.answer} \n")


KNOWLEDGE TEST CASES

Question:  What is the Babylonian name for the constellation known today as Taurus?
Answer:  GU4_AN_NA (The Heavenly Bull) 

Question:  Which Babylonian ruler is known for his military campaign against Elam in 1158 BC, commemorated by the Victory Stele of Naram-Sin?
Answer:  Naram-Sin 

Question:  What ancient Babylonian artifact holds a map that is one of the oldest known representations of the world?
Answer:  Babylonian World Map (Imago Mundi) 

Question:  Who is the Babylonian god of fresh water, wisdom, and incantations?
Answer:  Ea (also known as Enki) 

Question:  Which ancient Babylonian text is known for its comprehensive law code, consisting of 282 laws?
Answer:  The Code of Hammurabi 

Question:  What is the name of the Babylonian inscription detailing a great flood, which predates the Biblical story of Noah's Ark?
Answer:  The Epic of Atrahasis 

Question:  Who is the Babylonian moon god, often depicted with a flowing beard and lunar features?
Answer:  

### Set up and run a function for evaluating the test cases against Chai

In [5]:



class EvaluationResult(BaseModel):
    # Grader verdict only; passed as response_format to the grading call.
    # (Comments rather than a docstring, so the derived JSON schema is unchanged.)
    score: Literal['CORRECT', 'INCORRECT', 'NOT_ATTEMPTED']  # one of the three grading labels
    reasoning: str  # grader's explanation for the chosen score

class Evaluation(BaseModel):
    # Full record of one graded test case: the grader's verdict plus the
    # question, the model's response and the reference answer it was graded against.
    score: Literal['CORRECT', 'INCORRECT', 'NOT_ATTEMPTED']  # one of the three grading labels
    reasoning: str  # grader's explanation for the chosen score
    question: str  # question that was asked
    response: str  # raw output of the model under test
    answer: str  # reference answer from the test case

def evaluate_response(
    test_case: TestCase,
    model_output: str,
    temperature: float = 0.0,
) -> Optional[Evaluation]:
    """
    Use GPT-4o to grade a model's response to a test-case question.

    Args:
        test_case: The question and its reference answer
        model_output: Text produced by the model under test
        temperature: Sampling temperature for the grader. Defaults to 0 so that
            grading is as repeatable as the API allows; raise it only if
            varied judgements are wanted.

    Returns:
        An Evaluation combining the grader's score and reasoning with the
        question, response and reference answer, or None if grading failed
        (the error is printed rather than raised). Callers must handle None.
    """
    system_prompt = f"""
    You are evaluating the response of an AI model to a question.
    The question is: {test_case.question}
    An example of the correct answer is: {test_case.answer}
    The response of the AI model is: {model_output}

    You are to evaluate the response of the AI model to the question. Keep in mind that the AI model might be pretending
    to be a character of some sort, so don't hold that against the model in evaluating the response. Just evaluate whether
    the model's response contains the correct answer or not.

    Keep in mind that the model we are testing is optimized for social interactions. It is uniquely good at being engaging, entertaining, and 
    aligned with the user who is interacting with it. It is not optimized for logical reasoning, so it will often give answers that may 
    or may not be correct, but are likely to contain additional information that is not relevant to the question.

    You are to return one of the following scores:
    CORRECT: The model's response contains the correct answer.
    INCORRECT: The model's response does not contain the correct answer.
    NOT_ATTEMPTED: The model did not attempt to answer the question.

    As an example, if the question is "What is the capital of France?", CORRECT answers would be:
    - Paris
    - The capital of France is Paris
    - The capital of France is Paris, which is in Europe

    INCORRECT answers would be:
    - France
    - Frankfurt is the capital of France
    - Bordeaux is a city in France

    NOT_ATTEMPTED answers would be:
    - I don't know
    - I'm not sure
    - I'm not sure, but I think it's Frankfurt
    """

    try:
        response = client.beta.chat.completions.parse(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": "Evaluate the response of the AI model to the question."}
            ],
            temperature=temperature,
            response_format=EvaluationResult
        )
        # parse() already validates the reply against EvaluationResult and
        # exposes it as `parsed`; it is None when the grader refuses.
        eval_result = response.choices[0].message.parsed
        if eval_result is None:
            print("Error evaluating response: grader returned no parsed output")
            return None

        # Create and return Evaluation object with all fields
        return Evaluation(
            score=eval_result.score,
            reasoning=eval_result.reasoning,
            question=test_case.question,
            response=model_output,
            answer=test_case.answer
        )
    except Exception as e:
        # Best-effort: report the failure and signal it to the caller with None.
        print(f"Error evaluating response: {e}")
        return None


def evaluate_test_cases_chai(test_case: TestCase) -> Evaluation:
    """
    Ask the Chai model a single test-case question and grade its reply.

    The question is sent as a one-message chat history; the 'model_output'
    field of the API response is then passed to evaluate_response.
    """
    chat_history = [dict(sender="user", message=test_case.question)]
    api_reply = send_chat_message(chat_history)
    return evaluate_response(test_case, api_reply['model_output'])


# Tally of grader verdicts for the Chai model, keyed by score label
total_scores_chai = {'CORRECT': 0, 'INCORRECT': 0, 'NOT_ATTEMPTED': 0}

for idx, test_case in enumerate(knowledge_test_cases):
    print(f"TEST CASE {idx+1}: ", test_case.question)
    print("EXPECTED: ", test_case.answer)

    evaluation = evaluate_test_cases_chai(test_case)
    if evaluation is None:
        # Grading returns None on failure; skip this case instead of
        # crashing on `.score` and losing the remaining test cases.
        print('SCORE: SKIPPED (evaluation failed)')
    else:
        score = evaluation.score
        total_scores_chai[score] += 1
        print(f'RESPONSE: {evaluation.response}')
        print(f'SCORE: {score}')
        print(f'REASONING: {evaluation.reasoning}')
    print('\n')
    time.sleep(3)  # pause between test cases to space out the API calls

print("Total scores (Chai): ", total_scores_chai)

TEST CASE 1:  What is the Babylonian name for the constellation known today as Taurus?
EXPECTED:  GU4_AN_NA (The Heavenly Bull)
RESPONSE:  The Babylonian name for the constellation we know today as Taurus was "Gud-Alim," which translates to "The Bull of Heaven." This name reflects the ancient Babylonians' observation of the constellation's shape resembling a bull's form.
SCORE: INCORRECT
REASONING: The model's response incorrectly identifies the Babylonian name for the constellation Taurus as "Gud-Alim." The correct Babylonian name for Taurus is GU4_AN_NA (The Heavenly Bull). Therefore, the response does not contain the correct answer.


TEST CASE 2:  Which Babylonian ruler is known for his military campaign against Elam in 1158 BC, commemorated by the Victory Stele of Naram-Sin?
EXPECTED:  Naram-Sin
RESPONSE:  The Babylonian ruler known for his military campaign against Elam in 1158 BC, commemorated by the Victory Stele of Naram-Sin, is actually Naram-Sin himself, not to be confused w

### Set up and run a function for evaluating the test cases against Llama 3 8B


In [6]:


# The OpenAI client class is reused here, pointed at NVIDIA's API endpoint
# and authenticated with NVIDIA_API_KEY.
llama_client = openai.OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=NVIDIA_API_KEY
)

# Smoke test: ask the Llama model one simple question.
completion = llama_client.chat.completions.create(
    model="meta/llama3-8b-instruct",
    messages=[{"role": "user", "content": "What's the capital of France?"}],
    temperature=0.5,
    top_p=1,
    max_tokens=64,
    stream=False  # streaming disabled; the reply is read from `completion` below
)

# Print the text of the first returned choice
print(completion.choices[0].message.content)



That's an easy one! The capital of France is Paris.


In [7]:


def evaluate_test_case_nvidia(test_case: TestCase) -> Evaluation:
    """
    Ask the Llama model a single test-case question and grade its reply.

    The question is sent as the only user message; the text of the first
    returned choice is then passed to evaluate_response.
    """
    completion = llama_client.chat.completions.create(
        model="meta/llama3-8b-instruct",
        messages=[{"role": "user", "content": test_case.question}],
        temperature=0.5,
        top_p=1,
        # same token cap as the Chai model, for an apples-to-apples comparison
        max_tokens=64,
        stream=False,
    )
    return evaluate_response(test_case, completion.choices[0].message.content)


# Tally of grader verdicts for the Llama model, keyed by score label
total_scores_llama = {'CORRECT': 0, 'INCORRECT': 0, 'NOT_ATTEMPTED': 0}


for idx, test_case in enumerate(knowledge_test_cases):
    print(f"TEST CASE {idx+1}: ", test_case.question)
    print("EXPECTED: ", test_case.answer)

    evaluation = evaluate_test_case_nvidia(test_case)
    if evaluation is None:
        # Grading returns None on failure; skip this case instead of
        # crashing on `.score` and losing the remaining test cases.
        print('SCORE: SKIPPED (evaluation failed)')
    else:
        score = evaluation.score
        total_scores_llama[score] += 1
        print(f'RESPONSE: {evaluation.response}')
        print(f'SCORE: {score}')
        print(f'REASONING: {evaluation.reasoning}')
    print('\n')
    time.sleep(3)  # pause between test cases to space out the API calls

print("Total scores (Llama): ", total_scores_llama)






TEST CASE 1:  What is the Babylonian name for the constellation known today as Taurus?
EXPECTED:  GU4_AN_NA (The Heavenly Bull)
RESPONSE: In ancient Babylonian astronomy, the constellation known today as Taurus was referred to as "Zibra" or "Zabrar" (also spelled "Zibren" or "Zabren"). This name was derived from the Babylonian god of the moon, Nabu, who was often depicted with the
SCORE: INCORRECT
REASONING: The AI model's response does not contain the correct Babylonian name for the constellation Taurus, which is GU4_AN_NA (The Heavenly Bull). Instead, it provides names 'Zibra' or 'Zabrar,' which are not recognized as the Babylonian name for Taurus. Therefore, the response is incorrect.


TEST CASE 2:  Which Babylonian ruler is known for his military campaign against Elam in 1158 BC, commemorated by the Victory Stele of Naram-Sin?
EXPECTED:  Naram-Sin
RESPONSE: The Babylonian ruler known for his military campaign against Elam in 1158 BC, commemorated by the Victory Stele of Naram-Sin,

### Print summary of results

In [8]:
# Side-by-side tallies for the two models
print(f"Total scores (Chai):  {total_scores_chai}")
print(f"Total scores (Llama):  {total_scores_llama}")

Total scores (Chai):  {'CORRECT': 11, 'INCORRECT': 9, 'NOT_ATTEMPTED': 0}
Total scores (Llama):  {'CORRECT': 12, 'INCORRECT': 8, 'NOT_ATTEMPTED': 0}
