In [1]:
from Models.models import LLMModel
from langchain_core.messages import (HumanMessage)
from langchain.prompts import SystemMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field, validator
import json

## Create a simple callable model

In [9]:


# Plain chat model with no output schema attached.
model = LLMModel(provider="openai_api", model="gpt-4-1106-preview", output_schema=None)

question = "Who is the president of Nigeria"

# Calling convention 1: pass a list of langchain-core messages directly.
response = model([HumanMessage(content=question)])
print(response)

print("--------------------------------")

# Calling convention 2: pass a dict that holds the message list under "messages".
response = model({"messages": [HumanMessage(content=question)]})
print(response)


As of my last update in 2023, the President of Nigeria is Muhammadu Buhari. He has been in office since May 29, 2015, after winning the presidential election of that year. Buhari was re-elected for a second term in the 2019 presidential elections. However, political situations can change, so for the most current information, it would be best to check the latest news or official sources.
--------------------------------
As of my last update in early 2023, the President of Nigeria is Muhammadu Buhari. However, please note that Nigeria was due to hold presidential elections in February 2023. Depending on the current date and any developments since my last update, there might be a new president if the elections have taken place and a new president has been inaugurated. For the most accurate and current information, please check the latest news or official announcements.


### Adding Output Schema and prompt template


In [10]:
class Joke(BaseModel):
    """Joke model"""
    # Used as the `output_schema` of the joke-generating model in this cell.
    # Both fields are required (`...` means "no default").

    # Opening part of the joke.
    setup: str = Field(..., description="setup for the joke")
    # Closing line of the joke.
    punchline: str = Field(..., description="punchline for the joke")


# Two-turn prompt: the system turn carries the `{topic}` placeholder, the human
# turn adds a formatting instruction.
# NOTE: in the recorded output the upper-case instruction was applied to the
# JSON keys as well ('SETUP', 'PUNCHLINE'), so they differ in case from the
# field names declared on Joke.
joke_turns = [
    ("system", "Given this topic={topic}, generate a joke."),
    ("human", "Also, make your output all upper case LIKE THIS."),
]
prompt_template = ChatPromptTemplate.from_messages(joke_turns)

model1 = LLMModel(
    provider="fireworks_api",
    model="llama-v3-70b-instruct",
    prompt_template=prompt_template,
    output_schema=Joke,
    # Ask for a JSON object shaped by `output_schema`.
    try_to_parse=True,
    # On an unparsable reply, retry twice without feeding the failed reply back.
    config={"retry": 2, "retry_with_history": False},
)

# The dict keys fill the placeholders of the prompt template.
inputs = {"topic": "dad jokes"}
response = model1(inputs)

print(response)
print(type(response))

{'SETUP': 'WHY DID THE SCARECROW WIN AN AWARD?', 'PUNCHLINE': 'BECAUSE HE WAS OUTSTANDING IN HIS FIELD!'}
<class 'dict'>


## Using as an Evaluator

The Models.schemas module comes with some pre-defined pydantic schemas:
1. **FeedbackISC**: This can be used as an output schema for evaluation to get `Issues`, `Score` and `Comments`
2. **FeedbackBasic**: This can be used as an output schema to get a dictionary with a `response` key (this is the default schema when `try_to_parse` is set to true)
3. **QualityAspect**: This is used to define the Evaluation Quality aspect with an `AspectorEvaluatorInputSchema` schema
4. **AspectorRole**: This is used to specify if the evaluator should judge only the `USER`, `ASSISTANT` or both (`USER_AND_ASSISTANT`)

In [13]:
from Models.schemas import AspectorEvaluatorInputSchema, FeedbackISC, FeedbackBasic,  QualityAspect, AspectorRole


# Setting `as_evaluator=True` switches the model to its special evaluator prompt.
evaluator = LLMModel(
    name="aspect_evaluator",
    provider="fireworks_api",
    model="llama-v3-70b-instruct",
    input_schema=AspectorEvaluatorInputSchema,
    output_schema=FeedbackISC,
    as_evaluator=True,
    try_to_parse=True,
)

# Conversation the evaluator will judge, one "role: text" string per turn.
convo = [
    "user: Hello, are you doing today?",
    "assistant: I'm doing well, thank you for asking! How can I assist you today?",
    (
        "user: I was hoping you could provide some feedback, on a short essay. I wrote. "
        "Would you be able to take a look and let me know your thoughts on the grammar and structure?"
    ),
    (
        "assistant: Absolutely, I'd be happy to review, your essay and provide feedback on the grammar and overall structure. "
        "Please go ahead and send over the essay whenever you're ready, and I'll take a look. "
        "Let me know if there are any specific areas you'd like me to focus on in addition to the grammar and structure."
    ),
]

# Quality aspect to be judged.
quality_aspect = QualityAspect(
    name="grammar",
    instruction="The quality of the grammar including how well it is structured and punctuated",
)

# Evaluation task (the model input): aspect, which roles to judge, and the conversation.
evaluation_task = AspectorEvaluatorInputSchema(
    quality_aspect=quality_aspect,
    role=AspectorRole.USER_AND_ASSISTANT,
    conversation=convo,
    metadata={},
)

# Run the evaluation and show both the result and its Python type.
evaluation_result = evaluator(evaluation_task)
print(evaluation_result)
print("--------------------------------")
print(type(evaluation_result))


{'issues': ["assistant's sentence is a bit long and could be broken up for better clarity."], 'score': 4, 'comment': "The grammar and structure of both user and assistant responses are generally good, with proper use of punctuation and clear sentence structure. However, the assistant's response could be improved by breaking up a long sentence for better clarity."}
--------------------------------
<class 'dict'>


### Adding Extra Model configuration (Parameters and Model Kwargs)



In [28]:
# A single system message gives the assistant its personality.
persona_prompt = ChatPromptTemplate.from_messages(
    [("system", "You are a funny virtual assistant")]
)

model = LLMModel(
    provider="openai_api",
    model="gpt-4-1106-preview",
    prompt_template=persona_prompt,
    output_schema=FeedbackBasic,
    try_to_parse=True,
    config={
        "retry": 2,
        "retry_with_history": True,
        "params": {"temperature": 0.9},  # extra model parameters go here
        # "model_kwargs": {},  # extra kwargs can be added here
    },
)

# Three consecutive turns sent to the same model instance, each printed and
# separated by a divider line.
questions = (
    "What do you call a dog with 3 legs?",
    "What about  a dog with 2 legs?",
    "1 leg?",
)
for turn, question in enumerate(questions):
    if turn:
        print("--------------------------------")
    messages = [HumanMessage(content=question)]
    response = model(messages)
    print(response)

{'response': "A three-legged dog. But remember, it doesn't matter how many legs a dog has; it's still a dog-gone good friend!"}
--------------------------------
{'response': "You might call it a 'bi-pawed' friend, but no matter the number of legs, it's still paws-itively lovable!"}
--------------------------------
{'response': "A one-legged dog? That's a 'uni-pawed' pal – still just as fetching as any other!"}


## CHAT HISTORY AND TOKEN COUNT
1. use `model.chat_history`  to retrieve chat history without system prompt (or prompt template)
2. use `model.get_chat_history()` to retrieve chat history with system prompt (or prompt template)
3. use `model.chat_history_untouched` to retrieve chat history with system prompt and failed responses (failed parsed responses)
4. use `model.get_total_tokens()` to retrieve both the token counts and the text that was used to compute them

In [25]:
# Chat history so far, without the system prompt / prompt template.
model.chat_history

[HumanMessage(content='What do you call a dog with 3 legs?'),
 AIMessage(content='A dog with three legs can be lovingly referred to as a "tripod" or a "tricycle," but no matter what you call them, they\'re often just as capable and adorable as their four-legged pals. And remember, while it\'s okay to share a chuckle about clever names, it\'s important to always show compassion and kindness to our differently-abled canine friends. 🐾', response_metadata={'token_usage': {'completion_tokens': 81, 'prompt_tokens': 30, 'total_tokens': 111}, 'model_name': 'gpt-4-1106-preview', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-7fbcc817-d991-4048-a1e2-3211fd5cc510-0'),
 HumanMessage(content='What about  a dog with 2 legs?'),
 AIMessage(content='A dog with two legs may not have a specific funny name, but I\'ve heard some endearingly be called "bipods." However, no matter what their condition, they tend to be incredibly resilient and adaptive. Some of these remarkabl

In [26]:
# Chat history with the system prompt / prompt template included
# (it appears as the leading SystemMessage in the recorded output).
model.get_chat_history()

[SystemMessage(content='\nYou are a funny virtual assistant\n'),
 HumanMessage(content='What do you call a dog with 3 legs?'),
 AIMessage(content='A dog with three legs can be lovingly referred to as a "tripod" or a "tricycle," but no matter what you call them, they\'re often just as capable and adorable as their four-legged pals. And remember, while it\'s okay to share a chuckle about clever names, it\'s important to always show compassion and kindness to our differently-abled canine friends. 🐾', response_metadata={'token_usage': {'completion_tokens': 81, 'prompt_tokens': 30, 'total_tokens': 111}, 'model_name': 'gpt-4-1106-preview', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-7fbcc817-d991-4048-a1e2-3211fd5cc510-0'),
 HumanMessage(content='What about  a dog with 2 legs?'),
 AIMessage(content='A dog with two legs may not have a specific funny name, but I\'ve heard some endearingly be called "bipods." However, no matter what their condition, they tend

In [27]:
# Chat history that also keeps responses which failed to parse.
# NOTE(review): the section notes say the system prompt is included here too,
# but the recorded output starts with a HumanMessage — confirm.
model.chat_history_untouched

[HumanMessage(content='What do you call a dog with 3 legs?'),
 AIMessage(content='A dog with three legs can be lovingly referred to as a "tripod" or a "tricycle," but no matter what you call them, they\'re often just as capable and adorable as their four-legged pals. And remember, while it\'s okay to share a chuckle about clever names, it\'s important to always show compassion and kindness to our differently-abled canine friends. 🐾', response_metadata={'token_usage': {'completion_tokens': 81, 'prompt_tokens': 30, 'total_tokens': 111}, 'model_name': 'gpt-4-1106-preview', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-7fbcc817-d991-4048-a1e2-3211fd5cc510-0'),
 HumanMessage(content='What about  a dog with 2 legs?'),
 AIMessage(content='A dog with two legs may not have a specific funny name, but I\'ve heard some endearingly be called "bipods." However, no matter what their condition, they tend to be incredibly resilient and adaptive. Some of these remarkabl

In [29]:
# Returns a pair: the token counts, and the text those counts were computed from.
tokens, tokens_string = model.get_total_tokens()

DEBUG: ...API token count


In [30]:
# Token counts; the recorded output is a dict keyed by 'in' and 'out'.
tokens

{'in': 715, 'out': 105}

In [32]:
# Input-side text behind the 'in' count. In the recorded output each entry
# holds the system prompt, the JSON format instructions and the user message.
tokens_string["in"]

['\nYou are a funny virtual assistant\nThe output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"response": {"description": "The response string returned by the LLM", "title": "Response", "type": "string"}}, "required": ["response"]}\n```What do you call a dog with 3 legs?',
 '\nYou are a funny virtual assistant\nThe output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\

In [33]:
# Output-side text behind the 'out' count: the raw JSON replies in the recorded output.
tokens_string["out"]

['{\n  "response": "A three-legged dog. But remember, it doesn\'t matter how many legs a dog has; it\'s still a dog-gone good friend!"\n}',
 '{\n  "response": "You might call it a \'bi-pawed\' friend, but no matter the number of legs, it\'s still paws-itively lovable!"\n}',
 '{\n  "response": "A one-legged dog? That\'s a \'uni-pawed\' pal – still just as fetching as any other!"\n}']

###  Turn off chat History

This will save token cost since every new message is passed to the model without the chat history included


In [40]:
# Same provider and model as before, but with chat history switched off.
model = LLMModel(provider="openai_api", model="gpt-4-1106-preview", use_history=False)

# Each question is sent on its own; the last one probes whether earlier turns
# are still visible to the model.
questions = (
    "What do you call a dog with 3 legs?",
    "What about  a dog with 2 legs?",
    "What was my first message?",
)
for turn, question in enumerate(questions):
    if turn:
        print("--------------------------------")
    messages = [HumanMessage(content=question)]
    response = model(messages)
    print(response)




{'response': "A dog with three legs is often simply called a 'tripod' as a nickname. This term is used affectionately to describe a three-legged dog."}
--------------------------------
{'response': 'A dog with two legs would have significant challenges with mobility and would require special care and accommodations. Owners of such dogs often provide them with wheeled carts or harnesses to help them move around and live a more comfortable life.'}
--------------------------------
{'response': 'What was my first message?'}


In [38]:
model.chat_history # You will get an empty history here

[]

## JUDGE USE CASE

In [2]:
from langchain.prompts import HumanMessagePromptTemplate

In [4]:
# Sample notebook-style conversation handed to the judge.
# Top-level keys: 'status', 'metadata', and 'conversation' (a list of cells).
# Each cell carries 'cell_pos', 'role' ('User' / 'Assistant'), 'content' and
# 'type' ('markdown' / 'code').
conversation = {'status': 'OK',
 'metadata': {'metadata': '# Metadata\n\n**Python Topics** - algorithms > by_topic > probability\n\n**Type** - query\n\n**Target Number of Turns (User + Assistant)** - 2\n'},
 'conversation': [{'cell_pos': 1,
   'role': 'User',
   'content': 'Could you show me a Python function to calculate the probability of rolling a sum of 8 with two six-sided dice?',
   'type': 'markdown'},
  {'cell_pos': 2,
   'role': 'Assistant',
   'content': 'Certainly! When you roll two six-sided dice, there are 36 possible outcomes (as there are 6 possible outcomes for each die). To determine the probability of obtaining a sum of 8, you must identify the favorable outcomes (those resulting in a sum of 8) and then divide this count by the total number of possible outcomes.\n\nHere is a python function for it:',
   'type': 'markdown'},
  {'cell_pos': 3,
   'role': 'Assistant',
   'content': 'def probability_of_rolling_sum_eight():\n    favorable_outcomes = 0\n    total_outcomes = 36 # There are 6 * 6 = 36 total outcomes\n\n    for dice_1 in range(1, 7):\n        dice_2 = 8 - dice_1\n\n        # Check if the value is valid\n        if 1 <= dice_2 <= 6:\n            favorable_outcomes += 1\n\n    return favorable_outcomes / total_outcomes\n\n# Call the function and print the result\nprobability_of_rolling_sum_eight = probability_of_rolling_sum_eight()\n\nprint(f"The probability of rolling a sum of 8 with two six-sided dice is: {probability_of_rolling_sum_eight}")',
   'type': 'code'},
  {'cell_pos': 4,
   'role': 'Assistant',
   'content': 'The time complexity of the `probability_of_rolling_sum_eight` function is O(N) since it undergoes a constant number of iterations (6 iterations, corresponding to each possible value of dice_1 ranging from 1 to 6).The loop entails basic arithmetic operations and conditional checks, all with constant time complexity.\n\nAs a result, the time complexity remains constant and is not influenced by the input size, irrespective of the number of sides on the dice. The function is specifically tailored for six-sided dice, ensuring a consistent and limited number of iterations.',
   'type': 'markdown'},
  {'cell_pos': 5,
   'role': 'User',
   'content': 'Could you kindly provide the result with precision up to four decimal places only?',
   'type': 'markdown'},
  {'cell_pos': 6,
   'role': 'Assistant',
   'content': "Certainly! Here's the revised version of the function that will provide the outcome with accuracy up to four decimal places.",
   'type': 'markdown'},
  {'cell_pos': 7,
   'role': 'Assistant',
   'content': 'def probability_of_rolling_sum_eight():\n    favorable_outcomes = 0\n    total_outcomes = 36 # There are 6 * 6 = 36 total outcomes\n\n    for die1 in range(1, 7):\n        die2 = 8 - die1\n\n        # Check if the value is valid\n        if 1 <= die2 <= 6:\n            favorable_outcomes += 1\n\n    return favorable_outcomes / total_outcomes\n\n# Call the function and print the result\nprobability_of_rolling_sum_eight = probability_of_rolling_sum_eight()\n\nprint(f"The probability of rolling a sum of 8 with two six-sided dice is: {probability_of_rolling_sum_eight: .4f}")',
   'type': 'code'}]}

# System turn: fixes the judge's persona.
system_turn = SystemMessagePromptTemplate.from_template(
    """Name: AI Assistant Perfector.
Profile: You are an expert in perfecting AI Assistants' response content based on the user's expertise level."""
)

# Human turn: scoring rules, the quality being judged, the task description,
# and the conversation itself via the `{conversation}` placeholder.
human_turn = HumanMessagePromptTemplate.from_template(
    """
Given the following conversation between Human User and AI Assistant, find issues following the rules described below and rate the total conversation.
Single significant instance of deviation from the rules - score 1 or 2. More issues score<2. No issues=5.

Qualities we care about. Focus on them and only find issues that are directly related to them:
```
You must assume the user just started to learn about the question that is asked, so the 
reply should cover all the points that the user might be new to, and assume the 
user has basic knowledge about the prerequisites. 

This helps us keep the explanation clean, and makes it useful to the user rather 
than throwing all information about the topic to the user.

It is important to identify the query intent to gauge the user knowledge level as well as 
the code complexity to provide the most useful explanation.
```

The task:
```
Please, detect all mismatches between user's expertise level shown and the replies of the Assistant.
If User expertise level is unknown - asumme they are a beginner in that question.
Mismatches might include but not limited to:
    - too much explanation for an expert
    - too little explanation for a beginner
    - Assistant assumes the user is not the beginner in the question asked be it an algo or a technology or something else.

Assume basic knowledge of Python programming by the user and so no need to explain basic things unless asked to.
For example, if the question is about an algorithm in python, assume understnding of Python but a beginner level in algorithms UNLESS USER SHOWS OR STATES A HIGHER OR LOWER LEVEL OF EXPERTISE.

If no issues found, do not create any.
Correctness or accuracy is not your concern and will be handled by other evaluators. Focus only on the serving user's level of expertise in the most helpful manner.
```

Conversation:
CONVERSATION_START
{conversation}
CONVERSATION_END

Now, proceed to completing your task of finding issues and scoring the conversation.
"""
)

# Final judge prompt: system persona followed by the task description.
chat_template = ChatPromptTemplate.from_messages([system_turn, human_turn])

class UserExpertise(BaseModel):
    """User expertise level in a inquiry topic."""
    # NOTE(review): not referenced by the other cells visible here; the judge
    # below uses NotebookWiseFeedback from Models.schemas instead — confirm
    # whether this local definition is still needed.

    # Required: what the user is asking about.
    inquiry_topic: str = Field(..., description="The topic of the user's inquiry.")
    # Required: short free-text description of the user's level.
    level: str = Field(
        ...,
        description="The user's expertise level in the inquiry topic. Even if assumed. Be concise, 1 sentence max.",
    )
    
class Intent(BaseModel):
    """User intent."""
    # NOTE(review): not referenced by the other cells visible here — confirm
    # whether this local definition is still needed.

    # Presumably the `cell_pos` of the user cell this intent belongs to — TODO confirm.
    cell_pos: int
    # Required: one-sentence summary of what the user wants in that reply.
    intent: str = Field(
        ...,
        description="User intent, concise, single sentence per user reply. Avoid Assistant intent",
    )



In [6]:
from Models.schemas import NotebookWiseFeedback

In [11]:
# Judge model: uses the expertise-matching prompt and asks for replies parsed
# against the NotebookWiseFeedback schema.
evaluator = LLMModel(
    name="aspect_evaluator",
    provider="openai_api",
    model="gpt-4-turbo",
    prompt_template=chat_template,
    output_schema=NotebookWiseFeedback,
    try_to_parse=True,
)

# The whole `conversation` dict fills the template's {conversation} placeholder.
judge_inputs = {"conversation": conversation}
evaluation_result = evaluator(judge_inputs)

evaluation_result



{'user_intents': [{'cell_pos': 1,
   'intent': 'User asks for a Python function to calculate the probability of rolling a sum of 8 with two six-sided dice.'},
  {'cell_pos': 5,
   'intent': 'User requests the result to be shown with precision up to four decimal places.'}],
 'user_expertise_level': {'inquiry_topic': 'Probability algorithms in Python',
  'level': 'Beginner'},
 'scratchpad': 'The user starts by inquiring about a specific Python function related to probability, indicating they are a beginner in this specific area of probability algorithms. The assistant provides a detailed explanation along with a Python code snippet, which is appropriate for a beginner. However, the explanation about time complexity in the fourth cell might not be necessary at this stage for a beginner focusing primarily on understanding basic probability calculation, not performance optimization. This could possibly confuse or overwhelm a beginner.',
 'issues': [{'type': 'Excessive explanation for a begi