In [None]:
%load_ext autoreload

%autoreload 2

In [None]:
%reload_ext autoreload

## Evaluating open source models

In [None]:
from agentic_copilot.models.utils.agents_util import Speaker
from agentic_copilot.models.utils.llm_utils import LLMModels
from agentic_copilot.models.utils.agent_base import AgentFrameWork
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tests.reliability_testing import arun_single_test
import json
import pandas as pd
import numpy as np
import asyncio


In [None]:
# Read the scenario definitions: a semicolon-separated CSV decoded as utf-8-sig.
with open("data/scenarios.csv", mode="r", encoding="utf-8-sig") as scenarios_file:
    scenarios = pd.read_csv(scenarios_file, sep=";")

Running simple test cases on the worker agents to check whether they are capable of efficient tool handling

In [None]:
# Open-source LLMs under test, stored as their enum values.
open_source_models = [
    LLMModels.MIXTRAL_8X7B.value,
    LLMModels.LLAMA_GROQ_3_70B.value,
    LLMModels.GEMMA_2_9B.value,
]
frameworks = [AgentFrameWork.BASE.value, AgentFrameWork.PROMPT.value]

# Every speaker except the two orchestrators is treated as a worker agent.
worker_agents = [
    speaker.value
    for speaker in Speaker
    if speaker not in (Speaker.ORCHESTRATOR, Speaker.QUERY_ORCHESTRATOR)
]

# Keep only the scenarios typed 'simple' that target one of the worker agents.
is_simple = scenarios['type'] == 'simple'
targets_worker = scenarios['agent'].isin(worker_agents)
simple_scenarios = scenarios[is_simple & targets_worker]

for label, names in (
    ("Models: ", open_source_models),
    ("Frameworks: ", frameworks),
    ("Worker agents: ", worker_agents),
):
    print(label, ', '.join(names))

**Creating test cases for open-source models**

In [None]:
# One entry per (framework, model) pair; the framework varies slowest.
parameter_combinations = [
    {'model': model, 'agent_framework': framework}
    for framework in frameworks
    for model in open_source_models
]
params_df = pd.DataFrame(parameter_combinations)

# Cross join: every parameter combination is paired with every simple scenario.
open_tests_df = params_df.merge(simple_scenarios, how='cross')

# A missing expected response message is represented as an empty string.
open_tests_df['response_message'] = open_tests_df['response_message'].map(
    lambda message: '' if pd.isna(message) else message
)
open_tests_df.head()

In [None]:
results = []

for _, scenario in simple_scenarios.iterrows():
    for framework in frameworks:
        test_cases = open_tests_df[(open_tests_df['agent_framework'] == framework) &
                                   (open_tests_df['test_case'] == scenario['test_case'])]
        tasks = [arun_single_test(id=f"{params['model']}_{params['agent_framework']}_{params['id']}", 
                                  agent_speaker=params['agent'],
                                  agent_framework=params['agent_framework'],
                                  model=params['model'],
                                  question=params['question'],
                                  expected_response=(params['response_status'], params['response_message']),
                                  input_state_path=params['input_state'],
                                  expected_output_state_path=params['expected_state']) for _, params in test_cases.iterrows()]
        results.extend(await asyncio.gather(*tasks))
        print("Results arrived ", len(results), " / ", len(open_tests_df))

In [None]:
# Turn the collected test results into a frame and preview its first rows.
results_df = pd.DataFrame(data=results)
results_df.head()

In [None]:
# Report the 'base' framework under the label 'generic'; every other label is kept as-is.
results_df['framework'] = results_df['framework'].map(
    lambda name: 'generic' if name == 'base' else name
)

def is_rate(reasoning):
    """Tell whether a test's reasoning records a rate-limit (HTTP 429) failure.

    Args:
        reasoning: The 'reasoning' value of one result row. Only dict values
            are inspected; their 'exception_message' entry is searched for
            the text 'Error code: 429'.

    Returns:
        bool: True when the marker is found, otherwise False. Non-dict input
        and dicts without a usable 'exception_message' yield False rather
        than None or an exception.
    """
    if not isinstance(reasoning, dict):
        return False
    # Missing or empty messages are treated as "no rate-limit marker".
    message = reasoning.get('exception_message') or ''
    return 'Error code: 429' in str(message)

# Discard the runs whose reasoning reports a rate-limit error.
# .copy() makes all_results an independent frame, so adding or replacing
# columns on it later does not trigger pandas' SettingWithCopyWarning.
all_results = results_df[results_df['reasoning'].apply(lambda x: (not is_rate(x)))].copy()

In [None]:
# Mean of 'result' and 'token' for every (model, framework) pair, saved as CSV.
table = all_results.groupby(['model', 'framework'])[['result', 'token']].mean()
with open('data/open_source.csv', 'w') as csv_file:
    table.to_csv(csv_file)

In [None]:
sns.set_theme(style="whitegrid")

# `table` is indexed by (model, framework). Turn those index levels into
# regular columns so they can be referenced by name regardless of whether the
# installed seaborn version resolves index level names.
plot_data = table.reset_index()

fig, axes = plt.subplots(2, 1, figsize=(10, 12))

# Top panel: mean 'result' per model, one bar per framework.
sns.barplot(data=plot_data, x='model', y='result', hue='framework', ax=axes[0])
axes[0].set_title('p@1 by LLM and LLM-agent framework')
axes[0].set_ylabel('Result')

# Bottom panel: mean 'token' per model, one bar per framework.
sns.barplot(data=plot_data, x='model', y='token', hue='framework', ax=axes[1])
axes[1].set_title('Total tokens by LLM and LLM-agent framework')
axes[1].set_ylabel('Tokens')

fig.tight_layout()

fig.savefig('data/open_source.png')

Categorizing failures

In [None]:
# Maps a substring of an exception message to the failure category it is
# reported under. Keys are checked in insertion order; the first match wins.
category_mapping = {
    "Response status different": "Requested user input when not needed",
    "Agent didn't use its return_direct tools": "Agent didn't use its formatting tools",
    "'litellm.BadRequestError: GroqException - {\"error\":{\"message\":\"Failed to call a function.": "Bad tool usage",
    "str.join() takes exactly one argument": "Some error in query"
}


def map_category(reasoning):
    """Classify a test's reasoning into a failure category.

    Args:
        reasoning: The 'reasoning' value of one result row. Only dict values
            are classified, based on their 'exception_message' entry.

    Returns:
        The category of the first `category_mapping` key contained in the
        exception message, "other" when no key matches (or the message is
        missing), and None when `reasoning` is not a dict.
    """
    if not isinstance(reasoning, dict):
        return None

    # A missing or empty message cannot match any key and falls through to "other".
    message = str(reasoning.get('exception_message') or '')
    for key, value in category_mapping.items():
        if key in message:
            return value

    return "other"

# Tag every run with its failure category (None when the reasoning is not a dict).
# assign() builds a new frame instead of writing into all_results in place:
# all_results is a filtered selection of results_df, where in-place column
# assignment triggers pandas' SettingWithCopyWarning.
all_results = all_results.assign(category=all_results['reasoning'].apply(map_category))

In [None]:
# Report the 'base' framework as 'generic' (a no-op for labels that are already renamed).
# assign() avoids writing into a filtered selection of results_df in place.
all_results = all_results.assign(
    framework=all_results['framework'].apply(lambda x: ('generic' if x == 'base' else x))
)

# Number of runs per (model, framework, category). groupby leaves out rows
# whose category is missing by default.
table = all_results[['model', 'framework', 'category', 'id']].groupby(['model', 'framework', 'category']).count()

# DataFrame.rename targets the index unless told otherwise, so the 'id'
# column has to be renamed through `columns=`.
table = table.rename(columns={'id': 'count'})

with open('data/categories.csv', 'w') as f:
    table.to_csv(f)

## Testing Enterprise models

In [None]:
# Enterprise LLMs under test, kept as enum members (not their string values).
enterprise_models = [
    LLMModels.GPT_4O,
    LLMModels.GPT_4O_MINI,
    LLMModels.CLAUDE_3_5_SONNET,
    LLMModels.CLAUDE_3_5_HAIKU,
]
frameworks = [AgentFrameWork.PROMPT, AgentFrameWork.BASE]

# One entry per (framework, model) pair; the framework varies slowest.
parameter_combinations = [
    {'model': model, 'agent_framework': framework.value}
    for framework in frameworks
    for model in enterprise_models
]
params_df = pd.DataFrame(parameter_combinations)
agents = [speaker.value for speaker in Speaker if speaker not in (Speaker.ORCHESTRATOR,)]

# Cross join: every parameter combination is paired with every scenario.
tests = params_df.merge(scenarios, how='cross')

# A missing expected response message is represented as an empty string.
tests['response_message'] = tests['response_message'].map(
    lambda message: '' if pd.isna(message) else message
)
len(tests)

In [None]:
results = []
runs = tests
experience = "e1"

for i in range(0, 5):
    tasks = [arun_single_test(id=f"{params['model'].value}_{params['agent_framework']}_{experience}_{params['id']}_run{i}",
                            test_id=params['id'], 
                            agent_speaker=params['agent'],
                            agent_framework=params['agent_framework'],
                            model=params['model'],
                            question=params['question'],
                            expected_response=(params['response_status'], params['response_message']),
                            input_state_path=params['input_state'],
                            expected_output_state_path=params['expected_state']) for _, params in runs.iterrows()]
    results.extend(await asyncio.gather(*tasks))
    print("Results arrived ", len(results), " / ", len(runs) * 5)

In [None]:
# Persist the raw enterprise results as JSON.
with open('data/enterprise_results.json', mode='w') as results_file:
    json.dump(results, fp=results_file)

In [None]:
# Turn the collected enterprise results into a frame and preview its first rows.
results_all = pd.DataFrame(data=results)
results_all.head()

In [None]:
# NOTE(review): the origin of this factor is not visible in this notebook — confirm.
haiku_price_factor = 3.2

# Recover the scenario id from the run id, which ends in "_<scenario id>_run<i>".
results_all['test_id'] = results_all['id'].apply(lambda run_id: int(run_id.split("_")[-2]))

# Keep the unscaled price in 'price_raw' and always derive 'price' from it, so
# re-running this cell does not multiply the Haiku prices by the factor again.
if 'price_raw' not in results_all.columns:
    results_all['price_raw'] = results_all['price']
is_haiku = results_all['model'] == 'claude-3-5-haiku'
results_all['price'] = results_all['price_raw'].where(
    ~is_haiku, results_all['price_raw'] * haiku_price_factor
)

# Attach each run's scenario type. Both frames have an 'id' column, so the
# merge suffixes them; the scenario-side copy and the join key are dropped.
results_all_w_type = pd.merge(
    left=results_all,
    right=scenarios[['id', 'type']],
    left_on='test_id',
    right_on='id',
    how='left',
).drop(columns=['id_y', 'test_id'])
results_all_w_type.head(2)

In [None]:
# Mean 'time' per (agent, model, framework), then the rows for Claude 3.5 Haiku.
results_grouped = results_all.groupby(['agent', 'model', 'framework'])[['time']].mean()
results_grouped.loc[:, 'claude-3-5-haiku', :]

In [None]:
# Mean 'result' for every (agent, model, framework) combination.
results_grouped = results_all[['agent', 'model', 'framework', 'result']].groupby(['agent', 'model', 'framework']).mean()

# Display names and the distinct index values used by this and the following plots.
agents = results_grouped.index.get_level_values('agent').drop_duplicates()
agents_name_mapping = {'research_agent': 'Research agent',
 'datastream_query_agent': 'DataStream Query agent',
 'calculation_agent': 'Calculation agent',
 'planning_agent': 'Planning agent',
 'invoice_query_agent': 'Invoice agent'}
models_mapping = {'gpt-4o': 'GPT-4o',
 'gpt-4o-mini': 'GPT-4o mini',
 'claude-3-5-haiku': 'Claude 3.5 Haiku',
 'claude-3-5-sonnet': 'Claude 3.5 Sonnet'}
models = results_grouped.index.get_level_values('model').drop_duplicates()
frameworks = results_grouped.index.get_level_values('framework').drop_duplicates()

# One subplot per agent, all sharing the y axis.
fig = make_subplots(
    rows=1,
    cols=len(agents),
    shared_yaxes=True,
    subplot_titles=[agents_name_mapping[agent] for agent in agents],
    horizontal_spacing=0.01,
)

for framework in frameworks:
    for j, agent in enumerate(agents):
        # Slice out this agent/framework pair; the remaining index level is the
        # model. Taking the x labels from that same index keeps every bar
        # aligned with its label even if an agent lacks results for a model.
        agent_values = results_grouped.xs((agent, framework), level=['agent', 'framework'])['result']
        fig.add_trace(
            go.Bar(
                x=[models_mapping[model] for model in agent_values.index],
                y=agent_values,
                marker_color='#6A7FDB' if framework == 'prompt' else '#FF6F61',
                # Only the second subplot contributes legend entries, so each
                # framework is listed once.
                showlegend=(j == 1),
                name='Optimized prompt' if framework == 'prompt' else 'Generic prompt'
            ),
            row=1, col=j+1,
        )

fig.update_layout(title="P@1 rate of enterprise models by different agents")

fig.show()

In [None]:
# Mean 'price' for every (agent, model, framework) combination.
results_grouped = results_all[['agent', 'model', 'framework', 'price']].groupby(['agent', 'model', 'framework']).mean()

# One subplot per agent, all sharing the y axis.
fig = make_subplots(
    rows=1,
    cols=len(agents),
    shared_yaxes=True,
    subplot_titles=[agents_name_mapping[agent] for agent in agents],
    horizontal_spacing=0.01,
)

for framework in frameworks:
    for j, agent in enumerate(agents):
        # Slice out this agent/framework pair; the remaining index level is the
        # model. Taking the x labels from that same index keeps every bar
        # aligned with its label even if an agent lacks results for a model.
        agent_values = results_grouped.xs((agent, framework), level=['agent', 'framework'])['price']
        fig.add_trace(
            go.Bar(
                x=[models_mapping[model] for model in agent_values.index],
                y=agent_values,
                marker_color='#6A7FDB' if framework == 'prompt' else '#FF6F61',
                # Only the second subplot contributes legend entries, so each
                # framework is listed once.
                showlegend=(j == 1),
                name='Optimized prompt' if framework == 'prompt' else 'Generic prompt'
            ),
            row=1, col=j+1,
        )

fig.update_layout(title="Mean of the calls' prices by enterprise models and agents")

fig.show()



In [None]:
# Mean 'token' for every (agent, model, framework) combination.
results_grouped = results_all[['agent', 'model', 'framework', 'token']].groupby(['agent', 'model', 'framework']).mean()

# One subplot per agent, all sharing the y axis.
fig = make_subplots(
    rows=1,
    cols=len(agents),
    shared_yaxes=True,
    subplot_titles=[agents_name_mapping[agent] for agent in agents],
    horizontal_spacing=0.01,
)

for framework in frameworks:
    for j, agent in enumerate(agents):
        # Slice out this agent/framework pair; the remaining index level is the
        # model. Taking the x labels from that same index keeps every bar
        # aligned with its label even if an agent lacks results for a model.
        agent_values = results_grouped.xs((agent, framework), level=['agent', 'framework'])['token']
        fig.add_trace(
            go.Bar(
                x=[models_mapping[model] for model in agent_values.index],
                y=agent_values,
                marker_color='#6A7FDB' if framework == 'prompt' else '#FF6F61',
                # Only the second subplot contributes legend entries, so each
                # framework is listed once.
                showlegend=(j == 1),
                name='Optimized prompt' if framework == 'prompt' else 'Generic prompt'
            ),
            row=1, col=j+1,
        )

fig.update_layout(title="Total tokens by enterprise models and agents")

fig.show()


## Response times

In [None]:
# Pairwise correlation of the float64/int64 columns among result, time, token
# and price, rendered as an annotated heatmap.
metric_columns = ['result', 'time', 'token', 'price']
numeric_metrics = results_all[metric_columns].select_dtypes(include=['float64', 'int64'])
c = numeric_metrics.corr()
px.imshow(c, text_auto=True)

In [None]:
# Distinct models, in order of first appearance in the results.
models = results_all['model'].drop_duplicates().tolist()

# Leave out the planning agent and every run whose 'time' is 50 or more.
illustrate = results_all_w_type[(~results_all_w_type['agent'].isin(['planning_agent'])) &
                          (results_all_w_type['time'] < 50)].sort_values(by='time')[['time', 'model', 'type']]

bins = 30
# bins + 1 edges give exactly `bins` equal-width bins, shared by every panel,
# so the bar width computed below matches the real bin width.
fixed_bins = np.linspace(0, illustrate['time'].max() + 1, bins + 1)
bin_centers = (fixed_bins[1:] + fixed_bins[:-1]) / 2
# NOTE(review): four colours are defined but only the first two are used (one
# per scenario type) — confirm whether per-model colouring was intended.
colors = ["#3B4CCA", "#77DD77", "#FFD700", "#FF5733"]
width = (fixed_bins[-1] - fixed_bins[0]) / bins
scenario_types = ['simple', 'complex']

# One row per model, one column per scenario type. squeeze=False keeps `axs`
# two-dimensional even when there is a single model.
fig, axs = plt.subplots(nrows=len(models), ncols=len(scenario_types), sharex=True, sharey=True,
                        figsize=(12,8), squeeze=False)

# `scenario_type` (rather than `type`) avoids shadowing the builtin for the
# rest of the notebook; the histogram edges are discarded so `bins` stays an int.
for i, scenario_type in enumerate(scenario_types):
    for j, model in enumerate(models):
        counts, _edges = np.histogram(illustrate[(illustrate['model'] == model) &
                                                 (illustrate['type'] == scenario_type)]['time'], bins=fixed_bins)
        axs[j][i].bar(bin_centers, counts, width=width, color=colors[i])
        axs[j][i].set_title(f"{models_mapping[model]}")

fig.subplots_adjust(hspace=0.4)
# Figure-level captions: a shared x-axis label and one label per column.
fig.text(0.5, 0.04, "Response time", ha='center', fontsize=14)
fig.text(0.27, 0.065, "Simple test scenarios", ha='center', fontsize=10)
fig.text(0.73, 0.065, "Complex test scenarios", ha='center', fontsize=10)
fig.savefig('data/response_times.png')
fig.show()