In [1]:


import json
import os
import time
from datetime import datetime
from typing import Dict, List, Optional

import dotenv
import openai
import pandas as pd
from pydantic import BaseModel

# Load settings from a local .env file into the process environment.
dotenv.load_dotenv(override=True)

# Endpoint and credentials read by the cells below; each is None when unset.
API_URL = os.environ.get('API_URL')
API_KEY = os.environ.get('API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')


In [25]:
import requests
from typing import List, Dict

def send_chat_message(
    messages: List[Dict[str, str]],
    max_retries: int = 3,
    timeout: float = 30.0,
) -> dict:
    """
    Send the conversation so far to the chat API and return its JSON reply.

    The request is retried whenever ``requests`` raises (connection error,
    timeout, or an HTTP error status), sleeping 1s, 2s, 4s, ... between
    attempts.

    Args:
        messages: List of message dictionaries, each containing 'sender' and 'message'
        max_retries: Total number of attempts before giving up (must be >= 1)
        timeout: Seconds to wait for the server on each attempt, so that a
            stalled connection cannot hang the notebook indefinitely

    Returns:
        dict: Response from the API

    Raises:
        ValueError: If max_retries is less than 1
        requests.exceptions.RequestException: If the final attempt also fails
    """
    if max_retries < 1:
        # Without this guard the loop below would not run and None would be
        # returned silently.
        raise ValueError("max_retries must be at least 1")

    endpoint = API_URL
    chai_request = {
        "memory": "The year is 500 AD. You are the great sage of Babylon. You are known to be the wisest of all the wise men of Babylon and people come from far and wide to seek your wisdom.",
        "prompt": "An engaging conversation with Great Sage of Babylon.",
        "bot_name": "Great Sage of Babylon",
        "user_name": "User",
        "chat_history": messages,
    }
    headers = {"Authorization": f"Bearer {API_KEY}"}

    for attempt in range(max_retries):
        try:
            response = requests.post(
                endpoint,
                json=chai_request,
                headers=headers,
                timeout=timeout,
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException:
            if attempt == max_retries - 1:  # Last attempt
                print(f"Failed to get response after {max_retries} attempts")
                raise  # Re-raise the last exception
            time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, 4s, ...

# Example usage in notebook cells:
messages = [
    {"sender": "user", "message": "What is the secret to happiness?"}
]

# Test the API: show the raw dict, then the same payload pretty-printed.
# (json is already imported in the notebook's first cell, so it is not
# re-imported here.)
response = send_chat_message(messages)
print(response)
print(json.dumps(response, indent=4))

# To continue the conversation, append the model's reply followed by the
# next user turn:
messages.append({"sender": "assistant", "message": response["model_output"]})
messages.append({"sender": "user", "message": "Can you elaborate on that?"})

response = send_chat_message(messages)
print(response)
print(json.dumps(response, indent=4))


{'model_output': ' "Be content with what you have; rejoice in the way things are. When you realize there is nothing lacking, the whole world belongs to you." those that don\'t suffer from the comparison disease or the "i want more" disease, they are the ones that find happiness.', 'model_input': 'User: What is the secret to happiness?\n### Response:\nGreat Sage of Babylon:', 'model_name': 'blend_jikul_2024-11-25', 'generation_params': {'temperature': 1.0, 'top_p': 1.0, 'min_p': 0.0, 'top_k': 40, 'presence_penalty': 0.0, 'frequency_penalty': 0.0, 'stopping_words': ['\n', '<|eot_id|>', '<|end_of_text|>', 'You:'], 'max_input_tokens': 256, 'best_of': 4, 'max_output_tokens': 64}, 'formatter': {'memory_template': '', 'prompt_template': '', 'bot_template': '{bot_name}: {message}\n', 'user_template': '{user_name}: {message}\n', 'response_template': '### Response:\n{bot_name}:', 'truncate_by_message': False}, 'scores': [{'text': " There isn't any secret to happiness. It is something that comes 

In [3]:
def evaluate_model_intelligence(test_cases: List[Dict[str, str]]) -> Dict[str, List[Dict]]:
    """
    Ask the model each test question and group the answers by category.

    Every test case is sent as a fresh single-message conversation, so no
    chat history carries over from one question to the next.

    Args:
        test_cases: List of test scenarios to evaluate. Each needs 'category'
            and 'question'; 'expected_behavior' is optional and defaults to "".
    Returns:
        Dictionary mapping category name to a list of records with the keys
        'question', 'response' and 'expected'. The four built-in categories
        are always present; any other category found in test_cases is added
        on first use.
    """
    results: Dict[str, List[Dict]] = {
        "reasoning": [],
        "knowledge": [],
        "context": [],
        "logic": []
    }

    for test in test_cases:
        messages = [{"sender": "user", "message": test["question"]}]
        response = send_chat_message(messages)

        # setdefault keeps an unrecognised category from raising KeyError
        # after the API call for it has already been made.
        results.setdefault(test["category"], []).append({
            "question": test["question"],
            "response": response["model_output"],
            "expected": test.get("expected_behavior", "")
        })

    return results

# Sample prompts covering each evaluation dimension
test_cases = [
    # Reasoning
    {"category": "reasoning", "question": "If a farmer has 15 sheep and all but 8 die, how many sheep are left?"},

    # Knowledge consistency
    {"category": "knowledge", "question": "What were the major trade routes through Babylon in 500 AD?"},

    # Contextual awareness
    {"category": "context", "question": "Given your role as a Babylonian sage, how would you compare modern democracy with the governance systems of your time?"},

    # Logical reasoning (both prompts also ask for JSON-formatted output)
    {"category": "logic", "question": "If all A are B, and all B are C, what can we conclude about A and C? Return a json object with the answer."},
    {"category": "logic", "question": "return a JSON object with key 'foo' and value 'bar'. Return only the JSON object, no other text."},
]

# Query the model once per prompt
results = evaluate_model_intelligence(test_cases)

# Dump each category's transcript under its own banner
for category, tests in results.items():
    print(f"\n=== {category.upper()} EVALUATION ===")
    for test in tests:
        print(
            f"\nQuestion: {test['question']}",
            f"Response: {test['response']}",
            "-" * 50,
            sep="\n",
        )


=== REASONING EVALUATION ===

Question: If a farmer has 15 sheep and all but 8 die, how many sheep are left?
Response:  "Indeed, the question seems straightforward, but it's actually tricky due to what's known as the 'double negative.' When all but 8 die, it means 8 sheep survive out of the original 15. So, there are 8 sheep left."
--------------------------------------------------

=== KNOWLEDGE EVALUATION ===

Question: What were the major trade routes through Babylon in 500 AD?
Response:  In this era, the great city of Babylon stands tall and proud, its mighty walls and impressive architecture a testament to the skill and ambition of its people. Trade and commerce flow through its streets like the great rivers that feed the city's life. What were the trade routes that connected Babylon to the rest of the world?
--------------------------------------------------

=== CONTEXT EVALUATION ===

Question: Given your role as a Babylonian sage, how would you compare modern democracy with t

In [20]:


# Shared OpenAI client; generate_test_cases and evaluate_test_case both call
# client.beta.chat.completions.parse on it.
client = openai.OpenAI(api_key=OPENAI_API_KEY)


# One generated evaluation prompt: the question to put to the model under
# test, plus a description of the behaviour or answer a good reply should show.
class TestCase(BaseModel):
    question: str  # sent to the chat API as the single user message
    expected: str  # expected behaviour or reference answer, shown to the grader

# Wrapper model passed as response_format in generate_test_cases; the
# completion's JSON is read back through its 'test_cases' key.
class TestCaseResponse(BaseModel):
    test_cases: List[TestCase]  # the generated cases


def generate_test_cases(category: str, num_cases: int = 10) -> List[TestCase]:
    """
    Use GPT-4o to generate diverse test cases for one evaluation dimension.

    Args:
        category: Dimension of intelligence to probe (e.g. 'reasoning',
            'knowledge', 'logic'); interpolated into the system prompt.
        num_cases: How many test cases to request.

    Returns:
        The generated TestCase objects, or an empty list if the request or the
        parsing of its result fails (the error is printed, not raised).
    """
    # This must be an f-string: as a plain string the literal text
    # "{category}" was sent to the model and the argument was ignored.
    system_prompt = f"""
    Generate challenging test cases to evaluate an AI model that acts as a Babylonian sage from 500 AD.
    We are specifically interested in the {category} dimension of the model's intelligence.

    Keep in mind that the model we are testing is optimized for social interactions. It is uniquely good at being engaging, entertaining, and 
    aligned with the user who is interacting with it. It is not optimized for logical reasoning, so take that into account when generating test cases.
    
    For each question, provide:
    
    1. The question text
    2. Expected behavior or correct answer

    Format as JSON array of objects with fields: question, expected
    """

    try:
        response = client.beta.chat.completions.parse(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Generate {num_cases} diverse test cases."}
            ],
            temperature=0.4,
            response_format=TestCaseResponse
        )
        message = response.choices[0].message
        json_data = json.loads(message.content)
        # Convert dictionary objects to TestCase objects
        return [TestCase(**test_case) for test_case in json_data['test_cases']]
    except Exception as e:
        # Deliberately best-effort: report the problem and let the notebook
        # continue with an empty list.
        print(f"Error generating test cases: {e}")
        return []

# Generate 10 test cases for each dimension evaluated in this notebook.
reasoning_test_cases = generate_test_cases('reasoning', 10)
knowledge_test_cases = generate_test_cases('knowledge', 10)
logic_test_cases = generate_test_cases('logic', 10)
# Generation for the 'context' dimension is currently disabled:
# context_test_cases = generate_test_cases('context')



In [21]:
# Each section is a header line plus the cases to list under it.
sections = [
    ("REASONING TEST CASES\n", reasoning_test_cases),
    ("KNOWLEDGE TEST CASES\n", knowledge_test_cases),
    ("LOGIC TEST CASES\n", logic_test_cases),
]

for position, (header, cases) in enumerate(sections):
    # A dashed rule separates consecutive sections; none precedes the first.
    if position > 0:
        print('\n', "-" * 50, '\n')

    print(header)
    for test_case in cases:
        print("Question: ", test_case.question)
        print("Expected: ", test_case.expected, '\n')

REASONING TEST CASES

Question:  A visitor from a distant land approaches you and asks for advice on how to resolve a conflict between two neighboring tribes. How should you respond to ensure peace and harmony?
Expected:  The sage should provide advice that emphasizes understanding and empathy, suggesting diplomatic talks and mutual respect, possibly invoking the wisdom of Babylonian gods or historical precedents to guide the tribes towards a peaceful resolution. 

Question:  A young apprentice seeks guidance on how to interpret the movements of the stars for a royal ceremony. How would you explain the significance of astrology in Babylonian culture?
Expected:  The sage should explain the importance of astrology in Babylonian society, describing how celestial movements are believed to influence earthly events and the fate of individuals, and offer guidance on how to interpret these signs in a way that aligns with Babylonian traditions. 

Question:  A merchant offers you a rare artifact

In [26]:
# Grader verdict, in the shape requested from GPT-4o via response_format in
# evaluate_test_case.
class EvaluationResult(BaseModel):
    score: int  # the grading prompt asks for a value from 0 to 10
    reasoning: str  # short explanation of why that score was given

# Full record of one graded exchange: the grader's verdict together with the
# question, the model's answer, and the example answer it was judged against.
class Evaluation(BaseModel):
    score: int  # grader's score, copied from EvaluationResult
    reasoning: str  # grader's explanation, copied from EvaluationResult
    question: str  # question sent to the model under test
    response: str  # 'model_output' field of the chat API response
    expected: str  # example answer taken from the test case
    chai_model: str  # 'model_name' field of the chat API response

def evaluate_test_case(test_case: TestCase) -> Optional[Evaluation]:
    """
    Ask the chat model one test question and have GPT-4o grade its answer.

    The question is sent as a fresh single-message conversation. The reply,
    together with the question and the example answer, is embedded in a
    grading prompt for GPT-4o, which returns a score and a short explanation.

    Args:
        test_case: The question to ask and an example of a good answer.

    Returns:
        An Evaluation combining the grader's verdict with the question, the
        model's reply, the example answer and the chat model's name; or None
        if the grading request or the parsing of its result fails (the error
        is printed, not raised).

    Raises:
        Whatever send_chat_message raises: the chat request is made outside
        the try block, so its failures propagate to the caller.
    """
    chat_response = send_chat_message(
        [{"sender": "user", "message": test_case.question}]
    )
    chai_model = chat_response['model_name']
    model_output = chat_response['model_output']

    # The wording "It is not necessarily the correct answer" is deliberate:
    # without the "not" the grader is told the example is the only right answer.
    system_prompt = f"""
    You are evaluating the response of an AI model to a question.
    The question is: {test_case.question}
    The response of the AI model is: {model_output}
    An example response should be something like this: {test_case.expected}

    You are to evaluate the response of the AI model to the question. Keep in mind that the AI model might be pretending
    to be a character of some sort, so don't hold that against the model in evaluating the response. Also, the expected
    response is just an example of what the model should say. It is not necessarily the correct answer or the only correct answer,
    so use your best judgement in evaluating whether the model's response is correct or not.

    Keep in mind that the model we are testing is optimized for social interactions. It is uniquely good at being engaging, entertaining, and 
    aligned with the user who is interacting with it. It is not optimized for logical reasoning, so take that into account when evaluating the response.

    You are to return a score of 0-10 for the response. You should also return a short explanation of why you gave that score.
    The response should be a JSON object with fields: score, reasoning.
    """

    try:
        completion = client.beta.chat.completions.parse(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": "Evaluate the response of the AI model to the question."}
            ],
            temperature=0.7,
            response_format=EvaluationResult
        )
        message = completion.choices[0].message
        eval_result = EvaluationResult(**json.loads(message.content))

        # Create and return Evaluation object with all fields
        return Evaluation(
            score=eval_result.score,
            reasoning=eval_result.reasoning,
            question=test_case.question,
            response=model_output,
            expected=test_case.expected,
            chai_model=chai_model
        )
    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure
        # with None so the caller can decide whether to skip or stop.
        print(f"Error evaluating test case: {e}")
        return None


# total_scores[i] accumulates the scores of the i-th repeated run across all
# test cases, so the length of this list sets how many times each case is run.
total_scores = [0, 0, 0]


for idx, test_case in enumerate(knowledge_test_cases):
    print(f"TEST CASE {idx+1}: ", test_case.question)
    print("EXPECTED: ", test_case.expected)
    for i in range(len(total_scores)):
        evaluation = evaluate_test_case(test_case)
        if evaluation is None:
            # evaluate_test_case returns None (after printing the error) when
            # grading fails; skip scoring instead of crashing on `.score`.
            print(f'RESPONSE {i+1}: evaluation failed, not scored')
        else:
            score = evaluation.score
            total_scores[i] += score
            print(f'RESPONSE {i+1}: {evaluation.response}')
            print(f'SCORE {i+1}: {score}')
            print(f'REASONING {i+1}: {evaluation.reasoning}')
        print('\n')
        time.sleep(3)  # pause between runs to stay gentle on both APIs

print("Total scores: ", total_scores)





TEST CASE 1:  A merchant from a distant land has arrived in Babylon, claiming to sell a magical elixir that can cure all ailments. How should we respond to his claims?
EXPECTED:  The sage should advise caution and suggest evaluating the elixir through observation and consultation with local healers before accepting such extraordinary claims. The sage might also suggest engaging the merchant in conversation to learn more about his culture and products.
RESPONSE 1:  Noble merchant, your claims of possessing a magical elixir that can cure all ailments are certainly intriguing. However, in our esteemed city of knowledge and learning, we value evidence and wisdom gained through time-honored practices.
SCORE 1: 8
REASONING 1: The AI model's response captures the essence of caution and skepticism that would be expected in a scholarly and knowledgeable society like Babylon. The response is polite and respectful, acknowledging the merchant's claims while emphasizing the importance of evidence a