In [1]:

from dotenv import  load_dotenv
load_dotenv()
from Models.models import LLMModel as LLMModel2
import pytest
from langchain.prompts import HumanMessagePromptTemplate
from langchain_core.messages import (HumanMessage)
from langchain.prompts import SystemMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field, validator
import json
from Models.schemas import NotebookWiseFeedback



#----------------------------------------------------------------
class Joke(BaseModel):
    """Joke model"""
    SETUP: str = Field(..., description="setup for the joke")
    PUNCHLINE: str = Field(..., description="punchline for the joke")


prompt_template = ChatPromptTemplate.from_messages(
            [
                (
                    "human",
                    "Given this topic={topic}, generate a joke."
                ),
                (
                    "human",
                    "Also, make your output all upper case LIKE THIS."
                )
               
            ]
        )





In [2]:



conversation = {'status': 'OK',
 'metadata': {'metadata': '# Metadata\n\n**Python Topics** - algorithms > by_topic > probability\n\n**Type** - query\n\n**Target Number of Turns (User + Assistant)** - 2\n'},
 'conversation': [{'cell_pos': 1,
   'role': 'User',
   'content': 'Could you show me a Python function to calculate the probability of rolling a sum of 8 with two six-sided dice?',
   'type': 'markdown'},
  {'cell_pos': 2,
   'role': 'Assistant',
   'content': 'Certainly! When you roll two six-sided dice, there are 36 possible outcomes (as there are 6 possible outcomes for each die). To determine the probability of obtaining a sum of 8, you must identify the favorable outcomes (those resulting in a sum of 8) and then divide this count by the total number of possible outcomes.\n\nHere is a python function for it:',
   'type': 'markdown'},
  {'cell_pos': 3,
   'role': 'Assistant',
   'content': 'def probability_of_rolling_sum_eight():\n    favorable_outcomes = 0\n    total_outcomes = 36 # There are 6 * 6 = 36 total outcomes\n\n    for dice_1 in range(1, 7):\n        dice_2 = 8 - dice_1\n\n        # Check if the value is valid\n        if 1 <= dice_2 <= 6:\n            favorable_outcomes += 1\n\n    return favorable_outcomes / total_outcomes\n\n# Call the function and print the result\nprobability_of_rolling_sum_eight = probability_of_rolling_sum_eight()\n\nprint(f"The probability of rolling a sum of 8 with two six-sided dice is: {probability_of_rolling_sum_eight}")',
   'type': 'code'},
  {'cell_pos': 4,
   'role': 'Assistant',
   'content': 'The time complexity of the `probability_of_rolling_sum_eight` function is O(N) since it undergoes a constant number of iterations (6 iterations, corresponding to each possible value of dice_1 ranging from 1 to 6).The loop entails basic arithmetic operations and conditional checks, all with constant time complexity.\n\nAs a result, the time complexity remains constant and is not influenced by the input size, irrespective of the number of sides on the dice. The function is specifically tailored for six-sided dice, ensuring a consistent and limited number of iterations.',
   'type': 'markdown'},
  {'cell_pos': 5,
   'role': 'User',
   'content': 'Could you kindly provide the result with precision up to four decimal places only?',
   'type': 'markdown'},
  {'cell_pos': 6,
   'role': 'Assistant',
   'content': "Certainly! Here's the revised version of the function that will provide the outcome with accuracy up to four decimal places.",
   'type': 'markdown'},
  {'cell_pos': 7,
   'role': 'Assistant',
   'content': 'def probability_of_rolling_sum_eight():\n    favorable_outcomes = 0\n    total_outcomes = 36 # There are 6 * 6 = 36 total outcomes\n\n    for die1 in range(1, 7):\n        die2 = 8 - die1\n\n        # Check if the value is valid\n        if 1 <= die2 <= 6:\n            favorable_outcomes += 1\n\n    return favorable_outcomes / total_outcomes\n\n# Call the function and print the result\nprobability_of_rolling_sum_eight = probability_of_rolling_sum_eight()\n\nprint(f"The probability of rolling a sum of 8 with two six-sided dice is: {probability_of_rolling_sum_eight: .4f}")',
   'type': 'code'}]}

chat_template = ChatPromptTemplate.from_messages(
        [
            SystemMessagePromptTemplate.from_template(
                """Name: AI Assistant Perfector.
Profile: You are an expert in perfecting AI Assistants' response content based on the user's expertise level."""
            ),
            HumanMessagePromptTemplate.from_template(
                """
Given the following conversation between Human User and AI Assistant, find issues following the rules described below and rate the total conversation.
Single significant instance of deviation from the rules - score 1 or 2. More issues score<2. No issues=5.

Qualities we care about. Focus on them and only find issues that are directly related to them:
```
You must assume the user just started to learn about the question that is asked, so the 
reply should cover all the points that the user might be new to, and assume the 
user has basic knowledge about the prerequisites. 

This helps us keep the explanation clean, and makes it useful to the user rather 
than throwing all information about the topic to the user.

It is important to identify the query intent to gauge the user knowledge level as well as 
the code complexity to provide the most useful explanation.
```

The task:
```
Please, detect all mismatches between user's expertise level shown and the replies of the Assistant.
If User expertise level is unknown - asumme they are a beginner in that question.
Mismatches might include but not limited to:
    - too much explanation for an expert
    - too little explanation for a beginner
    - Assistant assumes the user is not the beginner in the question asked be it an algo or a technology or something else.

Assume basic knowledge of Python programming by the user and so no need to explain basic things unless asked to.
For example, if the question is about an algorithm in python, assume understnding of Python but a beginner level in algorithms UNLESS USER SHOWS OR STATES A HIGHER OR LOWER LEVEL OF EXPERTISE.

If no issues found, do not create any.
Correctness or accuracy is not your concern and will be handled by other evaluators. Focus only on the serving user's level of expertise in the most helpful manner.
```

Conversation:
CONVERSATION_START
{conversation}
CONVERSATION_END

Now, proceed to completing your task of finding issues and scoring the conversation.
"""
            ),
        ]
    )
#----------------------------------------------------------------



In [3]:
in_= {
                "conversation": conversation
            }

p = chat_template

In [4]:

def test_anthropic_models():
        models = [
            "claude-3-opus-20240229",
        ]
        provider = "anthropic_api"
        print(f"Testing models from provider: {provider}")
        for model in models:
            print(f"Testing model: {model}")
            evaluator = LLMModel2(
                provider=provider,
                model=model,
                output_schema=Joke.model_json_schema(),
                use_tool=False,
                name="aspect_evaluator",
                   prompt_template=prompt_template,
                try_to_parse=True,
                config={
                    "retry": 3,
                    
                }
            )

            # Perform evaluation
            evaluation_result = evaluator({
                "topic": "dad joke"
            })
            print(evaluation_result)
            Joke.model_validate(evaluation_result)


def test_fireworks_models():
        models = [
        "llama-v3-70b-instruct",
    
          #  "dbrx-instruct",
            "mixtral-8x22b-instruct",
            "llama-v3-8b-instruct"
        ]
        provider = "fireworks_api"
        print(f"Testing models from provider: {provider}")
        for model in models:
            print(f"Testing model: {model}")
            evaluator = LLMModel2(
                provider=provider,
                model=model,
                output_schema=Joke.model_json_schema(),
                name="aspect_evaluator",
                prompt_template=prompt_template,
                try_to_parse=True,
                config={
                    "retry": 3,
                
                }
            )

            # Perform evaluation
            evaluation_result = evaluator({
                "topic": "dad joke"
            })
            print(evaluation_result)
            Joke.model_validate(evaluation_result)




def test_openai_models(): 
        models = [
             "gpt-3.5-turbo-1106",
            "gpt-4-turbo",
            "gpt-3.5-turbo",
            "gpt-4o",
        ]
        provider = "openai_api"
        print(f"Testing models from provider: {provider}")
        for model in models:
            print(f"Testing model: {model}")
            evaluator = LLMModel2(
                provider=provider,
                model=model,
                #use_tool=False,
                output_schema=Joke.model_json_schema(),
                name="aspect_evaluator",
                  prompt_template=prompt_template,
                try_to_parse=True,
                config={
                    "retry": 4,
                }
            )

            # Perform evaluation
            evaluation_result = evaluator({
               "topic":"dad joke"
            })
            print(evaluation_result)
            Joke.model_validate(evaluation_result)

In [5]:

def test_anthropic_models():
        models = [
            "claude-3-opus-20240229",
        ]
        provider = "anthropic_api"
        print(f"Testing models from provider: {provider}")
        for model in models:
            print(f"Testing model: {model}")
            evaluator = LLMModel2(
                provider=provider,
                model=model,
                output_schema=NotebookWiseFeedback.model_json_schema(),
                name="aspect_evaluator",
                prompt_template=chat_template,
                try_to_parse=True,
                config={
                    "retry": 3,
                    
                }
            )

            # Perform evaluation
            evaluation_result = evaluator({
                "conversation": conversation
            })
            print(evaluation_result)
            NotebookWiseFeedback.model_validate(evaluation_result)


def test_fireworks_models():
        models = [
        "llama-v3-70b-instruct",
            "dbrx-instruct",
            "mixtral-8x22b-instruct",
            "llama-v3-8b-instruct"
        ]
        provider = "fireworks_api"
        print(f"Testing models from provider: {provider}")
        for model in models:
            print(f"Testing model: {model}")
            evaluator = LLMModel2(
                provider=provider,
                model=model,
                output_schema=NotebookWiseFeedback.model_json_schema(),
                name="aspect_evaluator",
                prompt_template=chat_template,
                try_to_parse=True,
                config={
                    "retry": 3,
                
                }
            )

            # Perform evaluation
            evaluation_result = evaluator({
                "conversation": conversation
            })
            print(evaluation_result)
            NotebookWiseFeedback.model_validate(evaluation_result)




def test_openai_models(): 
        models = [
             "gpt-3.5-turbo-1106",
            "gpt-4-turbo",
            "gpt-3.5-turbo",
            "gpt-4o",
        ]
        provider = "openai_api"
        print(f"Testing models from provider: {provider}")
        for model in models:
            print(f"Testing model: {model}")
            evaluator = LLMModel2(
                provider=provider,
                model=model,
                output_schema=NotebookWiseFeedback.model_json_schema(),
                name="aspect_evaluator",
                prompt_template=chat_template,
                try_to_parse=True,
                config={
                    "retry": 4,
                }
            )

            # Perform evaluation
            evaluation_result = evaluator({
                "conversation": conversation
            })
            print(evaluation_result)
            NotebookWiseFeedback.model_validate(evaluation_result)

In [6]:
test_openai_models()

Testing models from provider: openai_api
Testing model: gpt-3.5-turbo-1106
{'user_intents': [{'cell_pos': 1, 'intent': 'Request for Python function to calculate the probability of rolling a sum of 8 with two six-sided dice.'}, {'cell_pos': 5, 'intent': 'Request for the result with precision up to four decimal places only.'}], 'user_expertise_level': {'inquiry_topic': 'algorithms', 'level': 'beginner'}, 'scratchpad': 'The user has inquired about calculating the probability of rolling a sum of 8 with two six-sided dice and has requested the result with precision up to four decimal places.', 'issues': [{'type': 'Explanation Mismatch', 'reason': 'The explanation provided assumes a high level of understanding of probability concepts and Python programming.', 'cell_positions': [2, 3, 4, 6, 7], 'fix': 'The explanation should start with a basic introduction to probability and then gradually move on to the specific calculation in Python. Additionally, the Python code provided should be simplifi

In [7]:
test_anthropic_models()

Testing models from provider: anthropic_api
Testing model: claude-3-opus-20240229


  warn_beta(


{'user_intents': [{'cell_pos': 1, 'intent': 'Calculate probability of rolling sum of 8 with two six-sided dice in Python'}, {'cell_pos': 5, 'intent': 'Print the probability result with 4 decimal places precision'}], 'user_expertise_level': {'inquiry_topic': 'calculating probability in Python', 'level': 'beginner'}, 'scratchpad': "The user is a beginner in calculating probability. The assistant provides a detailed explanation and code example which is good for a beginner. However, it also explains time complexity which may be too advanced unless the user asked for it. The assistant handles the user's request for 4 decimal place precision well.", 'issues': [{'type': 'Extra information', 'reason': 'The assistant explains the time complexity of the solution, which may be too advanced for a beginner unless they specifically asked about it.', 'cell_positions': [4], 'fix': 'Only explain time complexity if the user asks about it or shows interest in more advanced topics. Otherwise keep explana

In [12]:
test_fireworks_models()

Testing models from provider: fireworks_api
Testing model: llama-v3-70b-instruct
{'user_intents': [{'cell_pos': 1, 'intent': 'Calculate the probability of rolling a sum of 8 with two six-sided dice'}, {'cell_pos': 5, 'intent': 'Provide the result with precision up to four decimal places'}], 'user_expertise_level': {'inquiry_topic': 'algorithms', 'level': 'beginner'}, 'scratchpad': "Analyzing the conversation to identify mismatches between user's expertise level and the Assistant's replies.", 'issues': [{'type': 'too much explanation', 'reason': 'The Assistant provided an explanation of the time complexity of the function, which might be too advanced for a beginner in algorithms.', 'cell_positions': [4], 'fix': 'Provide a more concise explanation or skip the time complexity explanation for a beginner.'}], 'score': 4}
Testing model: dbrx-instruct
{'user_intents': [{'cell_pos': 1, 'intent': 'request_help'}, {'cell_pos': 5, 'intent': 'request_help'}], 'user_expertise_level': {'inquiry_topi

In [9]:
from Models import schemas
schemas.NotebookWiseFeedback.model_json_schema()

{'$defs': {'Intent': {'description': 'User intent.',
   'properties': {'cell_pos': {'title': 'Cell Pos', 'type': 'integer'},
    'intent': {'description': 'User intent, concise, single sentence per user reply. Avoid Assistant intent',
     'title': 'Intent',
     'type': 'string'}},
   'required': ['cell_pos', 'intent'],
   'title': 'Intent',
   'type': 'object'},
  'Issue': {'description': 'SINGLE ISSUE',
   'properties': {'type': {'description': 'Type of the issue identified.',
     'title': 'Type',
     'type': 'string'},
    'reason': {'description': 'What is the problem, 1-2 short sentences max. Specifically, what went wrong.',
     'title': 'Reason',
     'type': 'string'},
    'cell_positions': {'description': 'List of cell positions that contain the issue.',
     'items': {'type': 'integer'},
     'title': 'Cell Positions',
     'type': 'array'},
    'fix': {'description': "Suggested fix for the issue, concise but with a clear guidance, not generic. Provide short example like e

In [10]:
a["properties"]

NameError: name 'a' is not defined

In [None]:
class Joke(BaseModel):
    """Joke model"""
    SETUP: str = Field(..., description="setup for the joke")
    PUNCHLINE: str = Field(..., description="punchline for the joke")

In [None]:
NotebookWiseFeedback.model_json_schema()

In [None]:
import json

def generate_prompt_from_schema(schema):
    properties = schema['properties']
    fields_description = []
    example_json = {}

    for field, details in properties.items():
        field_type = details['type']
        # Adjust the example based on the type
        if field_type == 'string':
            example_value = f"Example {field.lower()}"
        elif field_type == 'integer':
            example_value = 123
        elif field_type == 'boolean':
            example_value = True
        else:
            example_value = f"Example {field_type}"

        fields_description.append(f"'{field}': a {field_type}")
        example_json[field] = example_value

    fields_description_str = ', '.join(fields_description)
    example = json.dumps(example_json, indent=2)
    
    prompt = (
        f"Generate a JSON object with the following fields: {fields_description_str}. "
        f"For example, {example}"
    )
    return prompt


# Assuming you have the schema from Joke.model_json_schema()
schema = Joke.model_json_schema()
prompt = generate_prompt_from_schema(schema)

# Use this prompt in your LLMModel configuration
prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", prompt),
        ("human", "Please ensure the output is exactly in the format specified.")
    ]
)

In [None]:
print(generate_prompt_from_schema(NotebookWiseFeedback.model_json_schema()))

In [None]:

from dotenv import  load_dotenv
load_dotenv()
from Models.models import LLMModel 
import pytest
from langchain.prompts import HumanMessagePromptTemplate
from langchain_core.messages import (HumanMessage)
from langchain.prompts import SystemMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field, validator
import json
from Models.schemas import NotebookWiseFeedback



conversation = {'status': 'OK',
 'metadata': {'metadata': '# Metadata\n\n**Python Topics** - algorithms > by_topic > probability\n\n**Type** - query\n\n**Target Number of Turns (User + Assistant)** - 2\n'},
 'conversation': [{'cell_pos': 1,
   'role': 'User',
   'content': 'Could you show me a Python function to calculate the probability of rolling a sum of 8 with two six-sided dice?',
   'type': 'markdown'},
  {'cell_pos': 2,
   'role': 'Assistant',
   'content': 'Certainly! When you roll two six-sided dice, there are 36 possible outcomes (as there are 6 possible outcomes for each die). To determine the probability of obtaining a sum of 8, you must identify the favorable outcomes (those resulting in a sum of 8) and then divide this count by the total number of possible outcomes.\n\nHere is a python function for it:',
   'type': 'markdown'},
  {'cell_pos': 3,
   'role': 'Assistant',
   'content': 'def probability_of_rolling_sum_eight():\n    favorable_outcomes = 0\n    total_outcomes = 36 # There are 6 * 6 = 36 total outcomes\n\n    for dice_1 in range(1, 7):\n        dice_2 = 8 - dice_1\n\n        # Check if the value is valid\n        if 1 <= dice_2 <= 6:\n            favorable_outcomes += 1\n\n    return favorable_outcomes / total_outcomes\n\n# Call the function and print the result\nprobability_of_rolling_sum_eight = probability_of_rolling_sum_eight()\n\nprint(f"The probability of rolling a sum of 8 with two six-sided dice is: {probability_of_rolling_sum_eight}")',
   'type': 'code'},
  {'cell_pos': 4,
   'role': 'Assistant',
   'content': 'The time complexity of the `probability_of_rolling_sum_eight` function is O(N) since it undergoes a constant number of iterations (6 iterations, corresponding to each possible value of dice_1 ranging from 1 to 6).The loop entails basic arithmetic operations and conditional checks, all with constant time complexity.\n\nAs a result, the time complexity remains constant and is not influenced by the input size, irrespective of the number of sides on the dice. The function is specifically tailored for six-sided dice, ensuring a consistent and limited number of iterations.',
   'type': 'markdown'},
  {'cell_pos': 5,
   'role': 'User',
   'content': 'Could you kindly provide the result with precision up to four decimal places only?',
   'type': 'markdown'},
  {'cell_pos': 6,
   'role': 'Assistant',
   'content': "Certainly! Here's the revised version of the function that will provide the outcome with accuracy up to four decimal places.",
   'type': 'markdown'},
  {'cell_pos': 7,
   'role': 'Assistant',
   'content': 'def probability_of_rolling_sum_eight():\n    favorable_outcomes = 0\n    total_outcomes = 36 # There are 6 * 6 = 36 total outcomes\n\n    for die1 in range(1, 7):\n        die2 = 8 - die1\n\n        # Check if the value is valid\n        if 1 <= die2 <= 6:\n            favorable_outcomes += 1\n\n    return favorable_outcomes / total_outcomes\n\n# Call the function and print the result\nprobability_of_rolling_sum_eight = probability_of_rolling_sum_eight()\n\nprint(f"The probability of rolling a sum of 8 with two six-sided dice is: {probability_of_rolling_sum_eight: .4f}")',
   'type': 'code'}]}

chat_template = ChatPromptTemplate.from_messages(
        [
            SystemMessagePromptTemplate.from_template(
                """Name: AI Assistant Perfector.
Profile: You are an expert in perfecting AI Assistants' response content based on the user's expertise level."""
            ),
            HumanMessagePromptTemplate.from_template(
                """
Given the following conversation between Human User and AI Assistant, find issues following the rules described below and rate the total conversation.
Single significant instance of deviation from the rules - score 1 or 2. More issues score<2. No issues=5.

Qualities we care about. Focus on them and only find issues that are directly related to them:
```
You must assume the user just started to learn about the question that is asked, so the 
reply should cover all the points that the user might be new to, and assume the 
user has basic knowledge about the prerequisites. 

This helps us keep the explanation clean, and makes it useful to the user rather 
than throwing all information about the topic to the user.

It is important to identify the query intent to gauge the user knowledge level as well as 
the code complexity to provide the most useful explanation.
```

The task:
```
Please, detect all mismatches between user's expertise level shown and the replies of the Assistant.
If User expertise level is unknown - asumme they are a beginner in that question.
Mismatches might include but not limited to:
    - too much explanation for an expert
    - too little explanation for a beginner
    - Assistant assumes the user is not the beginner in the question asked be it an algo or a technology or something else.

Assume basic knowledge of Python programming by the user and so no need to explain basic things unless asked to.
For example, if the question is about an algorithm in python, assume understnding of Python but a beginner level in algorithms UNLESS USER SHOWS OR STATES A HIGHER OR LOWER LEVEL OF EXPERTISE.

If no issues found, do not create any.
Correctness or accuracy is not your concern and will be handled by other evaluators. Focus only on the serving user's level of expertise in the most helpful manner.
```

Conversation:
CONVERSATION_START
{conversation}
CONVERSATION_END

Now, proceed to completing your task of finding issues and scoring the conversation.
"""
            ),
        ]
    )
#----------------------------------------------------------------



class Joke(BaseModel):
    """Joke model"""
    SETUP: str = Field(..., description="setup for the joke")
    PUNCHLINE: str = Field(..., description="punchline for the joke")


prompt_template = ChatPromptTemplate.from_messages(
            [
                (
                    "human",
                    "Given this topic={topic}, generate a joke."
                ),
                (
                    "human",
                    "Generate the joke"
                )
               
            ]
        )



models = ["claude-3-opus-20240229"]     #["gpt-3.5-turbo-1106"] #
provider ="anthropic_api" #"openai_api" #
print(f"Testing models from provider: {provider}")
for model in models:
    print(f"Testing model: {model}")
    evaluator = LLMModel(
        provider=provider,
        model=model,
        output_schema=NotebookWiseFeedback,
        name="aspect_evaluator",
        prompt_template=chat_template,
        try_to_parse=True,
        config={
            "retry": 1,
            
        }
    )

    # Perform evaluation
    evaluation_result = evaluator({
                "conversation": conversation
            })
   
   

In [None]:

NotebookWiseFeedback.model_json_schema()

{'$defs': {'Intent': {'description': 'User intent.',
   'properties': {'cell_pos': {'title': 'Cell Pos', 'type': 'integer'},
    'intent': {'description': 'User intent, concise, single sentence per user reply. Avoid Assistant intent',
     'title': 'Intent',
     'type': 'string'}},
   'required': ['cell_pos', 'intent'],
   'title': 'Intent',
   'type': 'object'},
  'Issue': {'description': 'SINGLE ISSUE',
   'properties': {'type': {'description': 'Type of the issue identified.',
     'title': 'Type',
     'type': 'string'},
    'reason': {'description': 'What is the problem, 1-2 short sentences max. Specifically, what went wrong.',
     'title': 'Reason',
     'type': 'string'},
    'cell_positions': {'description': 'List of cell positions that contain the issue.',
     'items': {'type': 'integer'},
     'title': 'Cell Positions',
     'type': 'array'},
    'fix': {'description': "Suggested fix for the issue, concise but with a clear guidance, not generic. Provide short example like e