In [1]:
from llama_cpp import Llama
from bfcl.model_handler.oss_model.qwen import QwenHandler
from bfcl.model_handler.oss_model.base_oss_handler import OSSHandler
from bfcl._llm_response_generation import collect_test_cases, parse_test_category_argument
from agent.agent_framework import OpenSourceAgent, Mediator
from bfcl.model_handler.utils import system_prompt_pre_processing_chat_model
from agent.constant import DEFAULT_JOLLY_PROMPT,DEFAULT_EXPERT_PROMPT,DEFAULT_ASSISTANT_PROMPT,DEFAULT_ASSISTANT2_PROMPT
from bfcl.model_handler.utils import system_prompt_pre_processing_assistant_model
import json
import re
from pathlib import Path

# Root directory under which `write` creates one sub-folder per model.
RESULT_PATH = Path("./result")
# Prefix of the result file names built by `write`
# (f"{VERSION_PREFIX}_{test_category}_result.json").
VERSION_PREFIX = "BFCL_v3"

def prepare_test_environment(model_name_bfcl, temperature, test_category):
    """Build the OSS handler and collect the test cases for one model.

    Args:
        model_name_bfcl: A model name, or a list of model names of which only
            the first one is used.
        temperature: Temperature forwarded to ``OSSHandler``.
        test_category: A test category, or a list of categories, forwarded to
            ``parse_test_category_argument``.

    Returns:
        A ``(test_cases_total, handler)`` tuple: the value returned by
        ``collect_test_cases`` and the ``OSSHandler`` instance.

    Raises:
        IndexError: If ``model_name_bfcl`` is an empty list.
    """
    # Normalise both arguments to lists *before* creating the handler, so the
    # handler always receives a single model name and never a list.
    if not isinstance(model_name_bfcl, list):
        model_name_bfcl = [model_name_bfcl]
    if not isinstance(test_category, list):
        test_category = [test_category]
    primary_model_name = model_name_bfcl[0]

    handler = OSSHandler(model_name=primary_model_name, temperature=temperature)

    # Resolve the requested categories into test names and test file names.
    test_name_total, test_filename_total = parse_test_category_argument(test_category)

    # Test cases are collected under the "-optimized" variant of the model name.
    model_name_optimized = primary_model_name + "-optimized"
    test_cases_total = collect_test_cases(
        test_name_total, test_filename_total, model_name_optimized
    )

    return test_cases_total, handler

def prepare_system_prompts(test_entry, handler, default_jolly_prompt, default_expert_prompt, default_assistant_prompt):
    """Render the system prompts used by the agents for one test entry.

    Args:
        test_entry: Test case; ``test_entry["question"][0]`` is used as the
            message list of the first turn.
        handler: Object providing ``_pre_query_processing_prompting`` and
            ``add_first_turn_message_prompting``.
        default_jolly_prompt: Template given to the chat-model prompt builder.
        default_expert_prompt: Template given to the chat-model prompt builder.
        default_assistant_prompt: Template given to the assistant-model prompt
            builder.

    Returns:
        ``(system_instruction_action, system_jolly, system_expert,
        system_assistant)``, each being the ``'content'`` of the first message
        of the corresponding prompt data.
    """
    prepared = handler._pre_query_processing_prompting(test_entry)

    # Keep the function list before the inference data is rebound below.
    functions = prepared["function"]

    prepared = handler.add_first_turn_message_prompting(
        prepared, test_entry["question"][0]
    )
    system_instruction_action = prepared["message"][0]['content']

    # Second message of the first turn; presumably the user message that
    # follows a system message -- TODO confirm against the handler.
    user_turn = test_entry["question"][0][1]

    builders = (
        (system_prompt_pre_processing_chat_model, default_jolly_prompt),
        (system_prompt_pre_processing_chat_model, default_expert_prompt),
        (system_prompt_pre_processing_assistant_model, default_assistant_prompt),
    )
    # Every builder gets its own fresh one-element list, in case it mutates it.
    rendered = [build([user_turn], template, functions) for build, template in builders]

    system_jolly, system_expert, system_assistant = (
        messages[0]['content'] for messages in rendered
    )

    return system_instruction_action, system_jolly, system_expert, system_assistant

def prepare_system_prompt_function_parameter_extraction(test_entry, handler,response, default_assistant_prompt):
    """Build the assistant system prompt describing the function called in ``response``.

    The function name is taken from a response that begins (after optional
    whitespace) with ``[name(``. Names may contain dots, e.g.
    ``math.factorial`` or ``SomeClass.method``. Every entry of the handler's
    function list with that name is summarised as a dict with the keys
    ``'function_name'`` and ``'parameters'``.

    Args:
        test_entry: Test case forwarded to
            ``handler._pre_query_processing_prompting``.
        handler: Object whose ``_pre_query_processing_prompting`` returns a
            mapping with a ``"function"`` list of function descriptions.
        response: Model response expected to look like ``[func(arg=...)]``.
            A non-string value is treated as a response without a call.
        default_assistant_prompt: Template containing a
            ``{single_function_information}`` placeholder.

    Returns:
        ``default_assistant_prompt`` formatted with either the list of
        matching function summaries (empty when no function in the list has
        the extracted name) or a fixed fallback message when no function name
        can be extracted.
    """
    inference_data = handler._pre_query_processing_prompting(test_entry)

    # Function descriptions available for this test entry.
    system_functions_list = inference_data["function"]

    # "[" + function name + "(" ; "." is allowed so that qualified names match.
    match = (
        re.match(r'\s*\[\s*([\w.]+)\s*\(', response)
        if isinstance(response, str)
        else None
    )

    if match is None:
        single_function_information = 'The function name is not provided in the response, ask the expert to provide it for you. Explaining the function'
    else:
        function_name = match.group(1)

        single_function_information = []
        for func in system_functions_list:
            if func.get('name') != function_name:
                continue
            parameters = func.get('parameters', {})
            required_params = parameters.get('required', [])
            param_list = [
                {
                    'name': param_name,
                    'type': param_info.get('type'),
                    'description': param_info.get('description'),
                    'required': param_name in required_params
                }
                for param_name, param_info in parameters.get('properties', {}).items()
            ]
            single_function_information.append({
                'function_name': function_name,
                'parameters': param_list
            })

    return default_assistant_prompt.format(single_function_information=single_function_information)
    

def write(result, model_name):
    """Append result entries as JSON lines to the model's result files.

    Args:
        result: A single result dict or an iterable of result dicts. Each
            entry must have an ``"id"`` key; the part of the id before its
            last underscore selects the target file.
        model_name: Model name; ``"/"`` is replaced by ``"_"`` to form the
            directory name under ``RESULT_PATH``.
    """
    target_dir = RESULT_PATH / model_name.replace("/", "_")
    target_dir.mkdir(parents=True, exist_ok=True)

    entries = [result] if isinstance(result, dict) else result

    for entry in entries:
        category = entry["id"].rsplit("_", 1)[0]
        target_file = target_dir / f"{VERSION_PREFIX}_{category}_result.json"
        with open(target_file, "a+") as f:
            try:
                line = json.dumps(entry) + "\n"
                f.write(line)
            except Exception as e:
                # Fall back to storing the repr of the entry under its id.
                print(f"❗️Failed to write result: {e}")
                fallback = {"id": entry["id"], "result": repr(entry)}
                f.write(json.dumps(fallback) + "\n")

# WORKING TEST

In [None]:

# Single-example run: build the prompts for one test case, ask the action
# caller for a function call, then derive the syntax controller's system prompt
# from that answer.
# NOTE: the handler is created inside prepare_test_environment (an OSSHandler).
model_name_bfcl = 'Qwen/Qwen2.5-1.5B-Instruct'  # Replace with the actual model name
temperature = 0.9  # Replace with the desired temperature value
test_category = "java"

test_cases_total, handler = prepare_test_environment(model_name_bfcl, temperature, test_category)
# Use one test case (index 42) for this run.
test_entry = test_cases_total[42]


# system_jolly is not used below; system_assistant is reassigned further down.
system_instruction_action, system_jolly, system_expert, system_assistant = prepare_system_prompts(
    test_entry, 
    handler,
    DEFAULT_JOLLY_PROMPT,
    DEFAULT_EXPERT_PROMPT,
    DEFAULT_ASSISTANT_PROMPT)

# Create the two agents, each with its own system instruction
action_caller = OpenSourceAgent(name="action_caller", system_instruction=system_instruction_action)
syntax_expert = OpenSourceAgent(name="syntax_expert", system_instruction=system_expert)
# Create a Mediator and register both agents with it
mediator = Mediator()
mediator.add_agent(action_caller)
mediator.add_agent(syntax_expert)


# Send the content of the second message of the first turn to the action caller.
res = mediator.send("user",action_caller.id, test_entry["question"][0][1]['content'])

print('RESPONSE:', res)

# Build the assistant prompt from the function named in the response.
system_assistant = prepare_system_prompt_function_parameter_extraction(test_entry, handler,res, DEFAULT_ASSISTANT2_PROMPT )

print('SYSTEM ASSISTANT PROMPT:', system_assistant) 


syntax_controller = OpenSourceAgent(name="syntax_controller", system_instruction=system_assistant)


mediator.add_agent(syntax_controller)

# Multi-agent refinement call, currently disabled:
#res = mediator.math_problem_solving_architecture(assistant_id=action_caller.id, assistant2_id=syntax_controller.id, expert_id=syntax_expert.id, user_input=test_entry["question"][0][1]['content'], str_condition="ok", max_turns=2)



In [None]:
# Show the current value of `res` (set by whichever run cell was executed last).
print(res)

# ERROR IMPROVEMENT

In [None]:
import json

# Run configuration (the handler itself is created by prepare_test_environment).
model_name_bfcl = 'Qwen/Qwen2.5-1.5B-Instruct'  # Replace with the actual model name
temperature = 0.9  # Replace with the desired temperature value
test_category = "simple"

# Number of leading test cases to skip.
offset = 0
test_cases_total, handler = prepare_test_environment(model_name_bfcl, temperature, test_category)
test_cases_total = test_cases_total[offset:]
# When True, the answer is refined through the multi-agent architecture below.
with_agent = False


# Collected results of this run
results = []

# Iterate over all test cases
for i, test_entry in enumerate(test_cases_total):
    global_index = i + offset
    # Prepare the system prompts for this test case
    system_instruction_action, system_jolly, system_expert, system_assistant = prepare_system_prompts(
        test_entry,
        handler,
        DEFAULT_JOLLY_PROMPT,
        DEFAULT_EXPERT_PROMPT,
        DEFAULT_ASSISTANT_PROMPT)

    # Create fresh agents for this test case
    action_caller = OpenSourceAgent(name="action_caller", system_instruction=system_instruction_action)
    syntax_expert = OpenSourceAgent(name="syntax_expert", system_instruction=system_expert)

    # Create a fresh Mediator for this test case
    mediator = Mediator()
    mediator.add_agent(action_caller)
    mediator.add_agent(syntax_expert)

    res = mediator.send("user",action_caller.id, test_entry["question"][0][1]['content'])
    print('RESPONSE:', res)

    if with_agent:

        system_assistant = prepare_system_prompt_function_parameter_extraction(test_entry, handler,res, DEFAULT_ASSISTANT2_PROMPT )
        syntax_assistant = OpenSourceAgent(name="syntax_assistant", system_instruction=system_assistant)
        mediator.add_agent(syntax_assistant)

        res = mediator.math_problem_solving_architecture(
            assistant_id=action_caller.id,
            assistant2_id=syntax_assistant.id,
            expert_id=syntax_expert.id,
            user_input=test_entry["question"][0][1]['content'],
            str_condition="ok",
            max_turns=4
        )

    result_entry = {
        # Prefer the id carried by the test case itself (presumably every BFCL
        # entry has one -- TODO confirm). An id synthesised from the category
        # and the loop position is wrong when test_category is "all" or a
        # list, or when the collected cases are not contiguous. The old
        # synthesised id is kept as a fallback.
        "id": test_entry.get("id", f"{test_category}_{global_index}"),
        "result": res
    }
    results.append(result_entry)

    # Append immediately so that partial progress survives an interruption.
    write(result_entry, model_name_bfcl)

# Save all results of this run to one JSON file (optional full backup)
with open('result-giammy-complete.json', 'w') as f:
    json.dump(results, f, indent=4)

# EVALUATION 

**NOTE:** remember that the following assertion inside `eval_runner`, which checks that the model results, the prompts and the possible answers all have the same length, has been commented out:

assert (
     len(model_result) == len(prompt) == len(possible_answer)
), f"The length of the model result ({len(model_result)}) does not match the length of the prompt ({len(prompt)}) or possible answer ({len(possible_answer)}). Please check the input files for completeness."


In [5]:
import subprocess

import os
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv(dotenv_path='berkeley-function-call-leaderboard/.env')  # Replace with your .env path

# Evaluation parameters
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" # "gpt-3.5-turbo-0125"
TEST_CATEGORY = "all" # e.g. 'simple', 'java', 'javascript', 'multiple', 'parallel', 'parallel_multiple'
NUM_THREADS = 1  # NOTE(review): not passed to the command below

# Command line for the `bfcl evaluate` CLI, passed as an argument list (no shell)
command = [
    "bfcl",
    "evaluate",
    "--model", MODEL_NAME,
    "--test-category", TEST_CATEGORY,
]

try:
    # Only stderr is captured; stdout is not redirected here.
    # check=True raises CalledProcessError on a non-zero exit status.
    result = subprocess.run(command, stderr=subprocess.PIPE, check=True)
    print("RESULT:", result)
    if result.stderr:
        print("Command Error:")
        print(result.stderr.decode())
except subprocess.CalledProcessError as e:
    # Report the failure together with the captured stderr of the command.
    print(f"An error occurred: {e}")
    print(e.stderr.decode())

DOTENV_PATH: /home/ago/giammy/TESI-MAGISTRALE/thesis/berkeley-function-call-leaderboard/.env
🦍 Model: Qwen_Qwen2.5-1.5B-Instruct
🔍 Running test: simple
✅ Test completed: simple. 🎯 Accuracy: 0.7375
🔍 Running test: parallel
An error occurred: Command '['bfcl', 'evaluate', '--model', 'Qwen/Qwen2.5-1.5B-Instruct', '--test-category', 'all']' returned non-zero exit status 1.
Number of models evaluated:   0%|          | 0/1 [00:00<?, ?it/s]
[31m╭─[0m[31m────────────────────[0m[31m [0m[1;31mTraceback [0m[1;2;31m(most recent call last)[0m[31m [0m[31m─────────────────────[0m[31m─╮[0m
[31m│[0m [2;33m/home/ago/giammy/TESI-MAGISTRALE/thesis/berkeley-function-call-leaderboard/b[0m [31m│[0m
[31m│[0m [2;33mfcl/[0m[1;33m__main__.py[0m:[94m222[0m in [92mevaluate[0m                                              [31m│[0m
[31m│[0m                                                                              [31m│[0m
[31m│[0m   [2m219 [0m[2;33m│   [0m[33m"""[0m   