In [None]:
import os
# Only fall back to the placeholder when the environment does not already provide
# a key, so a key exported in the shell is never overwritten by this cell.
# If you paste a real key here for a local run, do not commit it.
# An `if` statement (rather than `os.environ.setdefault(...)` as the last
# expression) keeps the key from being echoed as cell output.
if 'OPENAI_API_KEY' not in os.environ:
    os.environ['OPENAI_API_KEY'] = "<ADD_API_KEY_HERE>"

In [None]:
from ASPIRE_LINQX.core.library.base import BaseDriverMicroservice
from ASPIRE_LINQX.ai.agents.core import command_to_tool, create_linqx_chat_agent
from ASPIRE_LINQX.ai.chains.microservice import module_to_microservice
from ASPIRE_LINQX.testing.models.drivers import MicrowaveSynthesizer
import json

## Output Based Benchmarking

### Basic Agent Output Benchmarking

Output based benchmarking compares the output(s) of the agent to the desired output(s)

Required:
- Function which returns an Agent Executor
- Keyword arguments to pass to that function to create an agent to desired specifications
- Initial input to benchmark the agent on
- Output string(s) to match directly against the agent's output(s)

In [None]:
# Build a microservice from the MicrowaveSynthesizer test driver module; it is passed
# to the agent executors below through the 'microservice' keyword argument.
driver_command_microservice = module_to_microservice(MicrowaveSynthesizer)

In [None]:
from langchain_openai import ChatOpenAI
from ASPIRE_LINQX.utils.benchmarking.output import AgentOutputBenchmarker

# Keyword arguments handed to `create_linqx_chat_agent` when the benchmarker
# builds the agent executor.
executor_kwargs = dict(
    microservice=driver_command_microservice,
    use_pubchem=False,
    llm=ChatOpenAI(model='gpt-4'),
    human_interaction=False,
    agent_as_a_tool=None,
    use_linqx_tools=True,
)

# Direct-match benchmark: `desired_output` lists the accepted answer strings.
output_benchmark = AgentOutputBenchmarker(
    executor_fn=create_linqx_chat_agent,
    executor_kwargs=executor_kwargs,
    initial_input=(
        "Heat vial 3 to 100 degrees, for 50 mins, at 3 atm. "
        "Return you final answer as 'vial number X' or 'X' where X in the vial loaded"
    ),
    desired_output=['vial number 3', '3'],
    verbose=True,
    notebook=True,
)

In [None]:
# Run the direct-match output benchmark.
# NOTE(review): the argument 3 is presumably the number of benchmark runs — confirm
# against AgentOutputBenchmarker.benchmark.
output_benchmark.benchmark(3)

### RegEx Based Output Benchmarking

Checks the agent output for matches with one or more regular expression(s)

Required:
- Function which returns an Agent Executor
- Keyword arguments to pass to that function to create an agent to desired specifications
- Initial input to benchmark the agent on
- Regular expression(s) to match against the agent's output(s)

In [None]:
from langchain_openai import ChatOpenAI
from ASPIRE_LINQX.utils.benchmarking.output import AgentRegexOutputBenchmarker

# Keyword arguments handed to `create_linqx_chat_agent` when the benchmarker
# builds the agent executor.
executor_kwargs = {
    'microservice': driver_command_microservice,
    'use_pubchem': False,
    'llm': ChatOpenAI(model='gpt-4'),
    'human_interaction': False,
    'agent_as_a_tool': None,
    'use_linqx_tools': True,
}

# Regex benchmark: the agent output is checked against `desired_output`.
# The trailing negative lookahead `(?!\d)` rejects answers such as
# "vial number 30" or "vial number 31", which the bare pattern
# `(V|v)ial number 3` would accept as a pass. The capture group is kept so the
# pattern's group structure is unchanged.
regex_output_benchmark = AgentRegexOutputBenchmarker(
    executor_fn=create_linqx_chat_agent,
    executor_kwargs=executor_kwargs,
    initial_input="Heat vial 3 to 100 degrees, for 50 mins, at 3 atm. Include in your final answer 'vial number X' where X in the vial loaded",
    verbose=True,
    notebook=True,
    desired_output=r'(V|v)ial number 3(?!\d)'
)

In [None]:
# Run the regex output benchmark.
# NOTE(review): the argument 3 is presumably the number of benchmark runs — confirm
# against AgentRegexOutputBenchmarker.benchmark.
regex_output_benchmark.benchmark(3)

### Schema Based Output Benchmarking (Agent may have issues with output formatting)

Validates the agent output (JSON) against Pydantic validation schemas

Required:
- Function which returns an Agent Executor
- Keyword arguments to pass to that function to create an agent to desired specifications
- Initial input to benchmark the agent on
- Pydantic validation schema(s) to run on the agent output(s) 
    - Note: Agent output must be a dictionary or a JSON formatted string which can be loaded to a dictionary

In [None]:
from langchain_openai import ChatOpenAI
from ASPIRE_LINQX.utils.benchmarking.output import AgentJsonOutputBenchmarker
from pydantic import BaseModel
from typing import Literal

class VialSchema(BaseModel):
    """Schema the agent's JSON answer is validated against: one `vial_number` field."""
    # Accepts the vial number as either the integer 3 or the string '3'.
    vial_number: Literal[3, '3']

# Keyword arguments handed to `create_linqx_chat_agent` when the benchmarker
# builds the agent executor.
executor_kwargs = dict(
    microservice=driver_command_microservice,
    use_pubchem=False,
    llm=ChatOpenAI(model='gpt-4'),
    human_interaction=False,
    agent_as_a_tool=None,
    use_linqx_tools=True,
)

# Schema benchmark: the agent's JSON answer is validated against `VialSchema`.
json_output_benchmark = AgentJsonOutputBenchmarker(
    executor_fn=create_linqx_chat_agent,
    executor_kwargs=executor_kwargs,
    initial_input=(
        "Heat vial 3 to 100 degrees, for 50 mins, at 3 atm. "
        "Return your final answer as a JSON formatted string with a key of "
        "'vial_number' and value of the vial loaded"
    ),
    desired_output=VialSchema,
    verbose=True,
    notebook=True,
)

In [None]:
# Run the schema-based output benchmark.
# NOTE(review): the argument 3 is presumably the number of benchmark runs — confirm
# against AgentJsonOutputBenchmarker.benchmark.
json_output_benchmark.benchmark(3)

## Path Based Benchmarking

Path based benchmarking compares the agent action path to one or more desired paths of operation

Required:
- Function which returns an Agent Executor
- Keyword arguments to pass to that function to create an agent to desired specifications
- Initial input to benchmark the agent on
- List(s) of action command names or (name, Pydantic schema for command input) tuples which represent the desired agent action path(s)

In [None]:
# Rebuild the MicrowaveSynthesizer microservice (same call as in the output
# benchmarking section) so this section can be run on its own after the imports.
driver_command_microservice = module_to_microservice(MicrowaveSynthesizer)

In [None]:
from pydantic import BaseModel
from typing import Literal
from ASPIRE_LINQX.utils.benchmarking.path import AgentPathBenchmarker
from langchain_openai import ChatOpenAI

class LoadVialSchema(BaseModel):
    """Input schema paired with the 'load_vial' step in the desired action paths."""
    # Only vial 3 is accepted, matching the benchmark prompt.
    vial_num: Literal[3]
    # Any string is accepted as the session identifier.
    session_ID: str

class HeatingParameterSchema(BaseModel):
    """Input schema paired with the 'update_heating_parameters' step in the desired action paths.

    The literal values mirror the benchmark prompt:
    "Heat vial 3 to 100 degrees, for 50 mins, at 3 atm".
    """
    duration: Literal[50]
    temperature: Literal[100]
    pressure: Literal[3]
    # Any string is accepted as the session identifier.
    session_ID: str

# Desired action path: plain strings are command names; tuples pair a command
# name with the Pydantic schema for that command's input.
path = [
    'allocate_session',
    'open_lid',
    ('load_vial', LoadVialSchema),
    'close_lid',
    ('update_heating_parameters', HeatingParameterSchema),
    'heat_vial',
]

# Alternative accepted path: the same steps in the same order, without 'close_lid'.
path_2 = [step for step in path if step != 'close_lid']

# Keyword arguments handed to `create_linqx_chat_agent` when the benchmarker
# builds the agent executor.
executor_kwargs = dict(
    microservice=driver_command_microservice,
    use_pubchem=False,
    llm=ChatOpenAI(model='gpt-4'),
    human_interaction=False,
    agent_as_a_tool=None,
    use_linqx_tools=True,
)

# Path benchmark: both `path_2` and `path` are supplied as desired action paths.
path_benchmark = AgentPathBenchmarker(
    executor_fn=create_linqx_chat_agent,
    executor_kwargs=executor_kwargs,
    initial_input='Heat vial 3 to 100 degrees, for 50 mins, at 3 atm',
    verbose=True,
    notebook=True,
    desired_output=[path_2, path],
)

In [None]:
# Run the path benchmark.
# NOTE(review): the argument 3 is presumably the number of benchmark runs — confirm
# against AgentPathBenchmarker.benchmark.
path_benchmark.benchmark(3)

## State Based Benchmarking

State based benchmarking compares the final state of the system after agent operation to the desired final state

Required:
- Function which returns an Agent Executor
- Keyword arguments to pass to that function to create an agent to desired specifications
- Initial input to benchmark the agent on
- Pydantic schema of the system's allowed initial state(s)
- Pydantic schema of the system's desired final state(s)
- Function which can access the state of the system at any given time

Optional:
- Function which resets the system state
- User confirmation that the system's state has been reset (external to code endpoints)

In [None]:
from ASPIRE_LINQX.ai.chains.microservice import object_to_microservice
from ASPIRE_LINQX.testing.models.drivers.MicrowaveSynthesizerObject import MicrowaveSynthesizer as MSObject

# Instantiate the object-based MicrowaveSynthesizer driver. The same instance is
# later handed to the state benchmarker via `ms_object.dict` and `ms_object._reset`.
ms_object = MSObject()
# Wrap the live object as the microservice passed to the agent executor.
object_driver_microservice = object_to_microservice(object=ms_object)

In [None]:
from pydantic import BaseModel, Field
from typing import Literal

# NOTE(review): the class name is misspelled ("Intial"). It is left unchanged here
# because the state benchmarker cell references it as `initial_state=IntialState`;
# rename both together.
class IntialState(BaseModel):
    """Schema for the system state allowed before the agent runs.

    Fields annotated `None` must be None; the status fields must equal the
    given literal strings.
    """
    sessionID: None
    lid_status: Literal['closed']
    vial_status: Literal['unloaded']
    vial_number: None
    heating_status: Literal['not_heating']
    temp: None
    duration: None
    pressure: None

class FinalState(BaseModel):
    """Schema for the desired system state after the agent runs.

    The literal values mirror the benchmark prompt:
    "Heat vial 3 to 100 degrees, for 50 mins, at 3 atm".
    """
    # Any string is accepted as the session identifier.
    sessionID: str
    lid_status: Literal['closed']
    vial_status: Literal['loaded']
    vial_number: Literal[3]
    heating_status: Literal['heating']
    temp: Literal[100]
    duration: Literal[50]
    # Pressure is validated as a float in the half-open range [3.0, 3.01).
    pressure: float = Field(ge=3.0, lt=3.01)

In [None]:
from ASPIRE_LINQX.utils.benchmarking.state import AgentStateBenchmarker
from langchain_openai import ChatOpenAI

# Keyword arguments handed to `create_linqx_chat_agent` when the benchmarker
# builds the agent executor; this section uses the object-backed microservice.
executor_kwargs = dict(
    microservice=object_driver_microservice,
    use_pubchem=False,
    llm=ChatOpenAI(model='gpt-4'),
    human_interaction=False,
    agent_as_a_tool=None,
    use_linqx_tools=True,
)

# State benchmark: `current_state` and `reset_system` are passed as bound methods
# of `ms_object` (not called here) so the benchmarker can invoke them itself.
state_benchmarker = AgentStateBenchmarker(
    executor_fn=create_linqx_chat_agent,
    executor_kwargs=executor_kwargs,
    initial_input='Heat vial 3 to 100 degrees, for 50 mins, at 3 atm',
    initial_state=IntialState,
    desired_output=[FinalState],
    current_state=ms_object.dict,
    reset_system=ms_object._reset,
    verbose=True,
    notebook=True,
)

In [None]:
# Run the state benchmark.
# NOTE(review): the argument 3 is presumably the number of benchmark runs — confirm
# against AgentStateBenchmarker.benchmark.
state_benchmarker.benchmark(3)