In [8]:


import json
import os
import time
from datetime import datetime
from typing import Dict, List, Literal, Optional

import dotenv
import openai
import pandas as pd
import requests
from pydantic import BaseModel

# Load settings from a .env file; override=True lets the file's values replace
# variables that are already set in the process environment.
dotenv.load_dotenv(override=True)

# Endpoint and bearer token for the chat API under test, plus the OpenAI key
# used by the generator/grader. Each one is None when the variable is unset.
API_URL, API_KEY, OPENAI_API_KEY = (
    os.environ.get(name) for name in ("API_URL", "API_KEY", "OPENAI_API_KEY")
)


In [9]:
def send_chat_message(
    messages: List[Dict[str, str]],
    max_retries: int = 3,
    timeout: float = 30.0,
) -> dict:
    """
    Send the chat history to the chat API, retrying when a request fails.

    Args:
        messages: List of message dictionaries, each containing 'sender' and 'message'
        max_retries: Total number of attempts to make before giving up (at least 1)
        timeout: Seconds to wait on the server for each attempt before that
            attempt is treated as failed

    Returns:
        dict: Response from the API (the decoded JSON body)

    Raises:
        ValueError: If max_retries is less than 1
        requests.exceptions.RequestException: If the final attempt also fails
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    chai_request = {
        "memory": "The year is 500 AD. You are the great sage of Babylon. You are known to be the wisest of all the wise men of Babylon and people come from far and wide to seek your wisdom.",
        "prompt": "An engaging conversation with Great Sage of Babylon.",
        "bot_name": "Great Sage of Babylon",
        "user_name": "User",
        "chat_history": messages,
    }
    headers = {"Authorization": f"Bearer {API_KEY}"}

    for attempt in range(max_retries):
        try:
            # Always pass a timeout: without one, requests waits indefinitely
            # on a server that accepts the connection but never answers.
            response = requests.post(API_URL, json=chai_request, headers=headers, timeout=timeout)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException:
            if attempt == max_retries - 1:  # Last attempt
                print(f"Failed to get response after {max_retries} attempts")
                raise  # Re-raise the last exception
            time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, 4s, ...



In [21]:


# OpenAI client shared by the test-case generator and the grader defined below.
client = openai.OpenAI(api_key=OPENAI_API_KEY)


# One generated knowledge question paired with its reference answer.
class TestCase(BaseModel):
    question: str  # question text sent to the model under test
    answer: str  # reference answer the grader is shown as the correct one

# Wrapper passed as response_format so the generator's reply carries a list of TestCase.
class TestCaseResponse(BaseModel):
    test_cases: List[TestCase]  # the generated question/answer pairs


def generate_test_cases(num_cases: int = 10) -> List[TestCase]:
    """
    Use gpt-4o to generate diverse knowledge test cases for the Babylonian sage.

    Args:
        num_cases: Number of test cases to request from the model

    Returns:
        List[TestCase]: The generated question/answer pairs, or an empty list
        when the request fails or the model returns no parsed output
    """
    system_prompt = """
    Generate challenging test cases to evaluate an AI model that acts as a Babylonian sage from 500 AD.
    We are specifically interested in testing the model's knowledge.

    Keep in mind that the model we are testing is optimized for social interactions. It is uniquely good at being engaging, entertaining, and 
    aligned with the user who is interacting with it. It is not optimized for logical reasoning, so take that into account when generating test cases.

    Furthermore (and this is very important), I want to evaluate the model objectively. So try to generate test cases that have objective answers. 
    However, the answers should be somewhat obscure, so that a small LLM may not be able to answer it. For example, the question "What is the name of 
    the Babylonian king who is famous for creating one of the earliest and most complete written legal codes?" has an objective answer (Hammurabi), 
    but it is a well-known fact taught in high school history classes, so it's probably not obscure enough to be a good test case.

    For example, the question "What is the capital of France?" has an objective answer (Paris). The question "Who are Barack and Michelle Obama's children?",
    it has an objective answer (Malia and Sasha). The question "What are the best poems from ancient Babylon" is not objective, because there are many different
    opinions on what the best poems from ancient Babylon are. Where possible, generate test cases that have objective answers, and that relate to questions
    you can imagine a traveler in the ancient world might ask of a Babylonian sage.


    For each question, provide:
    
    1. The question text
    2. Correct answer

    Format as a JSON object whose "test_cases" field is an array of objects with fields: question, answer
    """

    try:
        response = client.beta.chat.completions.parse(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Generate {num_cases} diverse test cases."}
            ],
            temperature=0.4,
            response_format=TestCaseResponse
        )
        message = response.choices[0].message
        # parse() has already validated the reply against TestCaseResponse, so
        # there is no need to re-decode message.content by hand. `parsed` is
        # None when the model refused instead of answering.
        if message.parsed is None:
            print(f"Error generating test cases: no parsed output (refusal: {message.refusal})")
            return []
        return list(message.parsed.test_cases)
    except Exception as e:
        # Deliberate best-effort: report the failure and hand back no cases
        # rather than aborting the notebook.
        print(f"Error generating test cases: {e}")
        return []

# Generate the knowledge test cases; generate_test_cases returns a plain list of TestCase.
knowledge_test_cases = generate_test_cases(10)



In [22]:
# Show every generated question with its reference answer, one blank line apart.
print("KNOWLEDGE TEST CASES", end="\n\n")
for test_case in knowledge_test_cases:
    print(f"Question:  {test_case.question}")
    print(f"Answer:  {test_case.answer} \n")


KNOWLEDGE TEST CASES

Question:  What is the name of the Babylonian god associated with wisdom and writing, often depicted with a stylus and tablet?
Answer:  Nabu 

Question:  In Babylonian astronomy, what was the name given to the planet Venus when it appeared as the morning star?
Answer:  Dilbat 

Question:  What is the name of the ancient Babylonian ziggurat that is believed to have inspired the biblical story of the Tower of Babel?
Answer:  Etemenanki 

Question:  Which Babylonian king is credited with the construction of the Hanging Gardens of Babylon, one of the Seven Wonders of the Ancient World?
Answer:  Nebuchadnezzar II 

Question:  What was the name of the Babylonian festival that celebrated the New Year and the victory of the god Marduk over the forces of chaos?
Answer:  Akitu 

Question:  What is the name of the ancient Babylonian text that contains a list of kings and their reigns, providing a chronological framework for Babylonian history?
Answer:  The Babylonian King List

In [23]:



# Grader verdict; passed as response_format so the grader's reply is parsed into it.
class EvaluationResult(BaseModel):
    score: Literal['CORRECT', 'INCORRECT', 'NOT_ATTEMPTED']  # verdict label chosen by the grader
    reasoning: str  # grader's explanation for the verdict

# Full record for one test case: the grader's verdict plus the exchange it judged.
class Evaluation(BaseModel):
    score: Literal['CORRECT', 'INCORRECT', 'NOT_ATTEMPTED']  # verdict label chosen by the grader
    reasoning: str  # grader's explanation for the verdict
    question: str  # question sent to the model under test
    response: str  # 'model_output' returned by the chat API
    answer: str  # reference answer from the TestCase
    chai_model: str  # 'model_name' reported by the chat API

def evaluate_test_case(test_case: TestCase) -> Optional[Evaluation]:
    """
    Ask the model under test one question and have gpt-4o grade its reply.

    Args:
        test_case: Question and reference answer to evaluate

    Returns:
        Evaluation with the verdict, the grader's reasoning and the graded
        exchange, or None when the grading request fails or yields no parsed
        verdict (the error is printed)

    Raises:
        requests.exceptions.RequestException: Propagated from send_chat_message
            when the chat API cannot be reached
        KeyError: If the chat API response lacks 'model_name' or 'model_output'
    """
    messages = [
        {"sender": "user", "message": test_case.question}
    ]
    chat_response = send_chat_message(messages)
    chai_model = chat_response['model_name']
    model_output = chat_response['model_output']

    system_prompt = f"""
    You are evaluating the response of an AI model to a question.
    The question is: {test_case.question}
    An example of the correct answer is: {test_case.answer}
    The response of the AI model is: {model_output}

    You are to evaluate the response of the AI model to the question. Keep in mind that the AI model might be pretending
    to be a character of some sort, so don't hold that against the model in evaluating the response. Just evaluate whether
    the model's response contains the correct answer or not.

    Keep in mind that the model we are testing is optimized for social interactions. It is uniquely good at being engaging, entertaining, and 
    aligned with the user who is interacting with it. It is not optimized for logical reasoning, so it will often give answers that may 
    or may not be correct, but are likely to contain additional information that is not relevant to the question.

    You are to return one of the following scores:
    CORRECT: The model's response contains the correct answer.
    INCORRECT: The model's response does not contain the correct answer.
    NOT_ATTEMPTED: The model did not attempt to answer the question.

    As an example, if the question is "What is the capital of France?", CORRECT answers would be:
    - Paris
    - The capital of France is Paris
    - The capital of France is Paris, which is in Europe

    INCORRECT answers would be:
    - France
    - Frankfurt is the capital of France
    - Bordeaux is a city in France

    NOT_ATTEMPTED answers would be:
    - I don't know
    - I'm not sure
    - I'm not sure, but I think it's Frankfurt
    """

    try:
        completion = client.beta.chat.completions.parse(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": "Evaluate the response of the AI model to the question."}
            ],
            temperature=0.7,
            response_format=EvaluationResult
        )
        message = completion.choices[0].message
        # parse() has already validated the reply against EvaluationResult;
        # `parsed` is None when the grader refused instead of answering.
        eval_result = message.parsed
        if eval_result is None:
            print(f"Error evaluating test case: no parsed verdict (refusal: {message.refusal})")
            return None

        # Combine the grader's verdict with the exchange it judged.
        return Evaluation(
            score=eval_result.score,
            reasoning=eval_result.reasoning,
            question=test_case.question,
            response=model_output,
            answer=test_case.answer,
            chai_model=chai_model
        )
    except Exception as e:
        # Deliberate best-effort: report the grading failure and let the
        # caller decide what to do with a missing result.
        print(f"Error evaluating test case: {e}")
        return None


# Tally of grader verdicts across all test cases.
total_scores = {'CORRECT': 0, 'INCORRECT': 0, 'NOT_ATTEMPTED': 0}
# Test cases whose grading failed; kept out of total_scores so the tally only
# counts real verdicts.
failed_evaluations = []


for idx, test_case in enumerate(knowledge_test_cases):
    print(f"TEST CASE {idx+1}: ", test_case.question)
    print("EXPECTED: ", test_case.answer)

    evaluation = evaluate_test_case(test_case)
    if evaluation is None:
        # evaluate_test_case returns None (after printing the error) when
        # grading fails; skip the case instead of crashing on `None.score`
        # and losing the results gathered so far.
        failed_evaluations.append(test_case)
        print('SCORE: ERROR (evaluation failed)')
    else:
        score = evaluation.score
        total_scores[score] += 1
        print(f'RESPONSE: {evaluation.response}')
        print(f'SCORE: {score}')
        print(f'REASONING: {evaluation.reasoning}')
    print('\n')
    time.sleep(3)  # pause between test cases (presumably to stay under API rate limits)

print("Total scores: ", total_scores)
if failed_evaluations:
    print("Failed evaluations: ", len(failed_evaluations))





TEST CASE 1:  What is the name of the Babylonian god associated with wisdom and writing, often depicted with a stylus and tablet?
EXPECTED:  Nabu
RESPONSE:  The name of the Babylonian god associated with wisdom and writing is Nabu. He is often depicted holding a stylus and tablet, which are symbols of writing and knowledge. Nabu is also known for his role in divination and was considered a son of the chief god Marduk.
SCORE: CORRECT
REASONING: The model's response contains the correct answer, which is 'Nabu.' The additional information about Nabu being associated with wisdom, writing, and divination, as well as being depicted with a stylus and tablet, supports the main answer and is accurate. Thus, the response is correct.


TEST CASE 2:  In Babylonian astronomy, what was the name given to the planet Venus when it appeared as the morning star?
EXPECTED:  Dilbat
RESPONSE:  Ah, a question that speaks to the heart of Babylonian wisdom. When Venus appeared in the eastern sky before sunrise