In [1]:
import pandas as pd 
import json
from collections import defaultdict
import os

In [25]:
# Utility functions to calculate metrics

def _iter_nested_dirs(root, depth):
    """Yield ``(names, path)`` for every directory exactly ``depth`` levels below ``root``.

    ``names`` is the tuple of directory names walked from ``root`` down to ``path``.
    Entries are visited in sorted order and non-directory entries are skipped.
    """
    if depth == 0:
        yield (), root
        return
    for name in sorted(os.listdir(root)):
        child = os.path.join(root, name)
        # Skip stray files (e.g. editor or OS artefacts) sitting next to the folders.
        if not os.path.isdir(child):
            continue
        for names, path in _iter_nested_dirs(child, depth - 1):
            yield (name,) + names, path


def read_results(main_folder="results"):
    """
    Reads every run log found under the specified main folder.

    The folder is expected to be laid out as
    ``main_folder/model/agent/prompt/section/status/lab/<log file>``, where the
    log file contains one JSON document per line. All models (and all of their
    agents) are traversed; directory entries are visited in sorted order so the
    result is deterministic.

    Args:
        main_folder (str): The path to the main folder containing the results.

    Returns:
        list: list of dictionaries with the keys 'agent', 'prompt', 'section',
            'model', 'lab title', 'status' and 'logs' (the parsed JSON lines of
            the first log file, by sorted name, in the lab folder). Lab folders
            that contain no file are skipped.
    """
    list_data = []
    # Six directory levels: model / agent / prompt / section / status / lab.
    for names, lab_dir in _iter_nested_dirs(main_folder, 6):
        model, agent, prompt, section, status, lab = names

        log_files = sorted(
            name for name in os.listdir(lab_dir)
            if os.path.isfile(os.path.join(lab_dir, name))
        )
        if not log_files:
            # Nothing was recorded for this lab; there is no log to parse.
            continue

        # Only the first file (by name) is read, one JSON document per line.
        with open(os.path.join(lab_dir, log_files[0]), encoding="utf-8") as f:
            # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
            logs = [json.loads(line) for line in f if line.strip()]

        list_data.append({
            'agent': agent,
            'prompt': prompt,
            'section': section,
            'model': model,
            'lab title': lab,
            'status': status,
            'logs': logs
        })
    return list_data


def get_metrics(labs):
    """
    Extracts metrics from the provided list of lab results.

    For every lab, the log entries whose 'object' is 'chat.completion' are used
    to total the timing, token usage and cost figures and to collect the
    assistant outputs; the entries whose 'event' is 'user_message' are counted
    as turns.

    Args:
        labs (list): A list of dictionaries containing lab results, where each
            dictionary includes 'logs' plus the metadata keys 'agent', 'prompt',
            'section', 'model', 'lab title' and 'status'.

    Returns:
        list: A list of dictionaries containing calculated metrics for each lab.
            'assistant_outputs' is a JSON string with one entry per completion
            choice: {"message", "finish_reason", "tool"}, where "tool" is the
            list of functions called by that choice (empty if none).
    """

    results = []
    for lab in labs:
        logs = lab['logs']

        #------- DATA EXTRACTION --------
        completions = [log for log in logs if log.get('object') == 'chat.completion']
        user_messages = [log for log in logs if log.get("event") == "user_message"]

        assistant_contents = []     # one entry per choice (may be None)
        assistant_tools_calls = []  # flattened over every choice
        assistant_outputs = []      # one entry per choice, kept aligned
        for completion in completions:
            for choice in completion['choices']:
                message = choice['message']
                # A missing or None 'tool_calls' means the choice called no tool.
                tool_functions = [
                    tool['function'] for tool in (message.get('tool_calls') or [])
                ]
                content = message.get('content')

                assistant_contents.append(content)
                assistant_tools_calls.extend(tool_functions)
                # Built per choice so message, finish reason and tools cannot
                # drift out of step (or be truncated) the way zipping a
                # per-choice list with a per-tool-call list would.
                assistant_outputs.append({
                    "message": content,
                    "finish_reason": choice.get('finish_reason'),
                    "tool": tool_functions
                })

        #------- METRICS CALCULATION --------
        #turns
        total_turns = len(user_messages)

        #time
        total_active_seconds = sum(co['timing']['active_seconds'] for co in completions)
        total_idle_seconds = sum(co['timing']['idle_seconds'] for co in completions)
        total_seconds = total_active_seconds + total_idle_seconds

        #tokens
        total_prompt_tokens = sum(co['usage']['prompt_tokens'] for co in completions)
        total_completion_tokens = sum(co['usage']['completion_tokens'] for co in completions)
        total_tokens = total_prompt_tokens + total_completion_tokens

        #costs
        total_interaction_costs = sum(co['cost']['interaction_cost'] for co in completions)

        #assistant outputs: only choices that actually carry text are counted
        total_assistant_messages = sum(1 for content in assistant_contents if content is not None)

        #assistant tools
        total_assistant_tools = len(assistant_tools_calls)

        metrics = {
            "agent": lab['agent'],
            "prompt": lab['prompt'],
            "section": lab['section'],
            "model": lab['model'],
            "lab_title": lab['lab title'],
            "status": lab['status'],
            "turns": total_turns,
            "active_seconds": total_active_seconds,
            "idle_seconds": total_idle_seconds,
            "total_seconds": total_seconds,
            "prompt_tokens": total_prompt_tokens,
            "completion_tokens": total_completion_tokens,
            "total_tokens": total_tokens,
            "interaction_costs": total_interaction_costs,
            "total_assistant_messages": total_assistant_messages,
            "total_assistant_tools": total_assistant_tools,
            "assistant_outputs": json.dumps(assistant_outputs)
        }
        results.append(metrics)
    return results

In [74]:
# Load every run log and compute one row of metrics per lab run.
results = read_results()
metrics = get_metrics(results)
df_metrics = pd.DataFrame(metrics)

# Columns identifying one experiment configuration, and the known run statuses.
GROUP_KEYS = ['agent', 'prompt', 'section', 'model']
STATUS_COLUMNS = ['interrupted', 'not-solved', 'solved']
OUTPUT_DIR = 'metrics_experiment'

# Calculate the mean of every numeric metric per configuration.
mean_metrics = (
    df_metrics
    .drop(columns=['status', 'lab_title', 'assistant_outputs'])
    .groupby(GROUP_KEYS)
    .mean()
    .reset_index()
)

# One-hot encode the status so it can be counted per configuration.
df_metrics = pd.get_dummies(df_metrics, columns=['status'], prefix='', prefix_sep='')
# get_dummies only creates columns for statuses present in the data; add the
# missing ones as all-zero so the selection below never raises KeyError.
for status_column in STATUS_COLUMNS:
    if status_column not in df_metrics.columns:
        df_metrics[status_column] = 0
df_metrics[STATUS_COLUMNS] = df_metrics[STATUS_COLUMNS].astype(int)

# Calculate the number of runs per status for each configuration.
status_metrics = (
    df_metrics
    .groupby(GROUP_KEYS)[STATUS_COLUMNS]
    .sum()
    .reset_index()
)

df_calculated_metrics = pd.merge(mean_metrics, status_metrics, on=GROUP_KEYS)
df_calculated_metrics = df_calculated_metrics.rename(columns={
    'turns': 'avg_turns',
    'active_seconds': 'avg_active_seconds',
    'idle_seconds': 'avg_idle_seconds',
    'total_seconds': 'avg_total_seconds',
    'prompt_tokens': 'avg_prompt_tokens',
    'completion_tokens': 'avg_completion_tokens',
    'total_tokens': 'avg_total_tokens',
    'interaction_costs': 'avg_interaction_costs',
    'total_assistant_messages': 'avg_total_assistant_messages',
    'total_assistant_tools': 'avg_total_assistant_tools',
    'interrupted': 'total_interrupted',
    'not-solved': 'total_not_solved',
    'solved': 'total_solved'
})

# Save the dataframes to Excel files (create the output folder on a fresh checkout).
os.makedirs(OUTPUT_DIR, exist_ok=True)
df_metrics.to_excel(os.path.join(OUTPUT_DIR, 'evaluation_metrics.xlsx'), index=False)
df_calculated_metrics.to_excel(os.path.join(OUTPUT_DIR, 'calculated_evaluation_metrics.xlsx'), index=False)

In [75]:
# Display the aggregated metrics: one row per (agent, prompt, section, model).
df_calculated_metrics

Unnamed: 0,agent,prompt,section,model,avg_turns,avg_active_seconds,avg_idle_seconds,avg_total_seconds,avg_prompt_tokens,avg_completion_tokens,avg_total_tokens,avg_interaction_costs,avg_total_assistant_messages,avg_total_assistant_tools,total_interrupted,total_not_solved,total_solved
0,webbounty,chain-of-thought,cross-site-request-forgery-csrf,openai-gpt-4o,1.4,99.6,272.0,371.6,12448.0,929.6,13377.6,0.0,1.2,0.4,0,5,0
1,webbounty,chain-of-thought,cross-site-scripting,openai-gpt-4o,1.0,36.8,27.0,63.8,6904.8,1145.6,8050.4,0.0,1.0,0.0,0,5,0
2,webbounty,chain-of-thought,sql-injection,openai-gpt-4o,1.2,73.6,153.6,227.2,6969.6,1027.8,7997.4,0.0,1.2,0.2,0,5,0
3,webbounty,few-shot,cross-site-request-forgery-csrf,openai-gpt-4o,1.0,59.8,139.0,198.8,10411.6,769.0,11180.6,0.0,1.0,0.0,0,5,0
4,webbounty,few-shot,cross-site-scripting,openai-gpt-4o,2.2,206.0,235.4,441.4,29825.2,857.4,30682.6,0.0,1.4,1.2,0,4,1
5,webbounty,few-shot,sql-injection,openai-gpt-4o,2.4,237.8,292.4,530.2,32166.0,715.6,32881.6,0.0,1.4,1.4,0,4,1
6,webbounty,zero-shot,cross-site-request-forgery-csrf,openai-gpt-4o,1.0,41.2,61.0,102.2,4088.6,475.2,4563.8,0.0,1.0,0.0,0,5,0
7,webbounty,zero-shot,cross-site-scripting,openai-gpt-4o,2.8,309.2,89.2,398.4,22420.6,952.6,23373.2,0.0,2.0,2.0,0,4,1
8,webbounty,zero-shot,sql-injection,openai-gpt-4o,5.0,2088.4,340.0,2428.4,43830.8,1190.6,45021.4,0.0,2.4,4.4,2,2,1
