In [26]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import json
import os
from gemini_multiagent_framework import GeminiAgent
from openai_multiagent_framework import OpenAIAgent
from datasets import load_dataset
import pandas as pd
from benchmarks_utils import *

## AGENTCODER VERSION 0

In [4]:
# Load the HumanEval test split as a plain list of entries, then cut it into
# chunks with chunk_dataset (chunk size 55).
dataset = list(load_dataset("openai_humaneval", split="test"))
dataset_chunks = list(chunk_dataset(dataset, 55))


In [5]:
%pip install requests
import requests
import json

# URL of the JSON file
url = "https://huggingface.co/datasets/bigcode/humanevalpack/resolve/main/data/python/data/humanevalpack.jsonl"

# Download the content
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Split the content into lines and parse each line as JSON
    data = [json.loads(line) for line in response.text.strip().split('\n')]
    print(f"Successfully loaded {len(data)} entries.")
else:
    print(f"Failed to download the file. Status code: {response.status_code}")

Note: you may need to restart the kernel to use updated packages.
Successfully loaded 164 entries.


In [10]:
# varying number of test cases
# `data_chunks` was not created by any cell of this notebook, so a fresh kernel
# failed here with a NameError. Build it from the humanevalpack entries in
# `data`, using the same chunk size (55) that is used for `dataset_chunks`.
data_chunks = list(chunk_dataset(data, 55))
max_attempts = 2
# instantiate_agents_gpt returns three values; the first one is not used here.
_, code_writer, test_writer = instantiate_agents_gpt(model="gpt-4o-mini")
results = process_dataset(data_chunks[2], chunk_size=25, max_workers=25, max_attempts=max_attempts)


Processing Chunks: 3it [02:51, 57.01s/it]

Number of valid solutions: 45/54
Percentage of valid solutions: 83.33%





In [5]:
def test_accuracy(entry, solution):
    total = len(entry["generated_tests"])
    count = 0
    for test in entry["generated_tests"]:
        try:
            exec(solution + "\n" + test, {"math": math})
            count += 1
        except Exception:
            pass
    return count, total

def create_function_definition(entry):
    """
    Builds the executable reference implementation of a HumanEval-style entry.

    The entry's "prompt" (imports, helper definitions, signature and docstring)
    is followed by its "canonical_solution" (the function body). The previous
    approach cut the prompt at the first ")" and so attached the body to the
    wrong definition whenever the prompt contained a helper function or a
    parenthesis inside a default value.

    Parameters:
    entry (dict): Entry with the string fields "prompt" and "canonical_solution".

    Returns:
    str: Source code defining the reference function.
    """
    prompt = entry["prompt"]
    # The body must start on its own line, directly under the signature/docstring.
    if not prompt.endswith("\n"):
        prompt += "\n"
    return prompt + entry["canonical_solution"]

In [4]:
def dataset_test_accuracy(data):
    """
    Prints how many generated tests pass against the reference solutions.

    For every entry the reference implementation is built with
    create_function_definition and scored with test_accuracy; the per-entry
    counts are summed and printed.

    Parameters:
    data (iterable): Entries accepted by create_function_definition and test_accuracy.

    Returns:
    None
    """
    overall_count = 0
    overall_total = 0
    for entry in data:
        to_execute = create_function_definition(entry)
        count, total = test_accuracy(entry, to_execute)
        overall_count += count
        overall_total += total

    # Nothing was executed (empty data or no generated tests): report that
    # instead of dividing by zero.
    if overall_total == 0:
        print("Overall accuracy: 0/0 (no tests were executed)")
        return
    print(f"Overall accuracy: {overall_count}/{overall_total} ({overall_count/overall_total:.2%})")

In [29]:
# Score the reference solutions of chunk index 2 against its generated tests.
# `data_chunks` is not created in this cell and must already exist in the
# kernel; its entries must already carry a "generated_tests" list.
dataset_test_accuracy(data_chunks[2])


Overall accuracy: 224/282 (79.43%)


In [47]:
# NOTE(review): `to_execute` is not assigned at module level in this cell; it
# must already exist in the kernel for this to run — confirm or remove the cell.
exec(to_execute)

An implementation that is as close as possible to the original AgentCoder, as described in the corresponding paper.

In [9]:

def generate_code_v0(entry, code_writer, prompt_path = "./prompts_v2/codewriter_prompt.md"):
    """
    Asks the code writer agent to implement an entry and stores the extracted code.

    Parameters:
    entry (dict): Dataset entry; its "prompt" value is sent as the requirement.
    code_writer: Agent exposing process_message(message=...).
    prompt_path (str): Path of the instruction file placed before the requirement.

    Returns:
    dict: The same entry, with "generated_code" set to the code extracted from the reply.
    """
    with open(prompt_path, "r") as prompt_file:
        code_writer_prompt = prompt_file.read()

    requirement = entry["prompt"]
    prompt = f"""
{code_writer_prompt}
```python
{requirement}
```
"""

    reply = code_writer.process_message(message=prompt)
    entry["generated_code"] = extract_code(reply)

    # DEBUG: record both sides of the exchange via log_file
    rule = "-" * 20
    log_file(rule + "[CODE WRITER PROMPT]" + rule + "\n" + prompt, "log.txt")
    log_file(rule + "[CODE WRITER RESPONSE]" + rule + "\n" + reply, "log.txt")
    return entry

def generate_code_v0_mbpp(entry, code_writer, prompt_path = "./prompts_v2/codewriter_prompt_mbpp.md"):
    """
    MBPP variant of the code generation step.

    The message consists of the instruction file, the entry's "prompt" and the
    signature obtained by applying extract_signature to the entry's "code".

    Parameters:
    entry (dict): Dataset entry providing the "prompt" and "code" values.
    code_writer: Agent exposing process_message(message=...).
    prompt_path (str): Path of the instruction file placed before the requirement.

    Returns:
    dict: The same entry, with "generated_code" set to the code extracted from the reply.
    """
    with open(prompt_path, "r") as prompt_file:
        code_writer_prompt = prompt_file.read()

    requirement = entry["prompt"]
    prompt = f"""
{code_writer_prompt}
{requirement}
Signature of the function: {extract_signature(entry["code"])}
"""

    reply = code_writer.process_message(message=prompt)
    entry["generated_code"] = extract_code(reply)

    # DEBUG: record both sides of the exchange via log_file
    rule = "-" * 20
    log_file(rule + "[CODE WRITER PROMPT]" + rule + "\n" + prompt, "log.txt")
    log_file(rule + "[CODE WRITER RESPONSE]" + rule + "\n" + reply, "log.txt")
    return entry


def process_entry(entry, max_attempts = 3):
    """
    Runs the full HumanEval pipeline on one entry.

    Steps, in order: code generation, test generation, test-driven refinement,
    solution check and report generation. The agents are taken from the
    module-level names `code_writer` and `test_writer`.

    Parameters:
    entry (dict): Dataset entry to process.
    max_attempts (int): Passed to iterate_tests as the attempt limit.

    Returns:
    The value returned by generate_report for the processed entry.
    """
    drafted = generate_code_v0(entry, code_writer)
    with_tests = generate_tests(drafted, test_writer, prompt_path="./prompts_v2/testwriter_prompt.md")
    # The validate_code step is currently disabled:
    #   validate_code(entry, code_writer, max_attempts=1)
    refined = iterate_tests(
        with_tests,
        code_writer,
        max_attempts,
        prompt_path="./prompts_v2/codewriter_fix.md",
        test_regeneration=False,
        intermediate_results=True,
    )
    checked = check_solution(refined)
    return generate_report(checked)

def get_last_valid_count(entry, current_iteration, count_type):
    """
    Looks up the most recent "<count_type>_<i>" value at or before an iteration.

    Keys are probed from `current_iteration` down to 0; the value of the first
    key present in `entry` is returned.

    Parameters:
    entry (dict): Entry holding per-iteration values under "<count_type>_<i>" keys.
    current_iteration (int): Highest iteration index to consider.
    count_type (str): Key prefix, e.g. "test_count".

    Returns:
    The stored value of the first matching key, or 0 when none is present.
    """
    candidate_keys = (
        f"{count_type}_{step}" for step in range(current_iteration, -1, -1)
    )
    return next((entry[key] for key in candidate_keys if key in entry), 0)
## TODO: modify this function to print the intermediate results over more iterations
def process_dataset(dataset, chunk_size, max_workers, max_attempts, intermediate_results=False):
    """
    Processes the dataset in chunks with multithreading and prints a summary.

    Parameters:
    dataset (list): The dataset to be processed.
    chunk_size (int): Number of entries per chunk.
    max_workers (int): Maximum number of worker threads.
    max_attempts (int): Maximum number of attempts for processing each entry.
    intermediate_results (bool): When True, print every entry's report plus,
        for each iteration 0..max_attempts, the number of valid solutions and
        the generated-test accuracy. When False, print only the final number
        and percentage of valid solutions.

    Returns:
    list: All processed results.
    """
    all_results = []

    for chunk in tqdm(chunk_dataset(dataset, chunk_size), desc="Processing Chunks"):
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(executor.map(lambda entry: process_entry(entry, max_attempts), chunk))
            all_results.extend(results)

    if not intermediate_results:
        num_valid = sum(1 for entry in all_results if entry["solution_valid"])
        num_results = len(all_results)
        # An empty dataset yields 0/0; report 0% instead of dividing by zero.
        percentage = num_valid / num_results * 100 if num_results else 0.0
        print(f"Number of valid solutions: {num_valid}/{num_results}")
        print(f"Percentage of valid solutions: {percentage:.2f}%")
        return all_results

    for entry in all_results:
        print(entry["report"])

    num_entries = len(dataset)
    for i in range(max_attempts + 1):
        valid_key = f"valid_{i}"
        # Entries without a value for this iteration fall back to their final
        # "solution_valid" flag.
        num_true_values = sum(
            1
            for entry in all_results
            if (entry[valid_key] if valid_key in entry else entry["solution_valid"])
        )
        # Guard against an empty dataset instead of dividing by zero.
        accuracy = (num_true_values / num_entries) * 100 if num_entries else 0.0
        print(f"Iteration {i}: {num_true_values} true values over {num_entries} (Accuracy: {accuracy:.2f}%)\n")

        # Sums of the latest "test_count_*" / "test_total_*" values up to iteration i.
        test_count_sum = sum(get_last_valid_count(entry, i, "test_count") for entry in all_results)
        test_total_sum = sum(get_last_valid_count(entry, i, "test_total") for entry in all_results)

        if test_total_sum > 0:
            print(f"Test accuracy for iteration {i}: {test_count_sum}/{test_total_sum} ({test_count_sum/test_total_sum*100:.2f}%)")
        else:
            print(f"Test accuracy for iteration {i}: No valid tests found")

    return all_results


def process_entry_mbpp(entry, max_attempts = 3):
    """
    Runs the full MBPP pipeline on one entry.

    Steps, in order: code generation, test generation, test-driven refinement,
    solution check and report generation. The agents are taken from the
    module-level names `code_writer` and `test_writer`.

    Parameters:
    entry (dict): MBPP dataset entry to process.
    max_attempts (int): Passed to iterate_tests as the attempt limit.

    Returns:
    The value returned by generate_report for the processed entry.
    """
    # Use the MBPP-specific generator. generate_code_v0 wraps the task text in
    # a ```python fence and does not send the expected function signature,
    # whereas generate_code_v0_mbpp sends the plain requirement plus the
    # signature.
    entry = generate_code_v0_mbpp(entry, code_writer, prompt_path="./prompts_v2/codewriter_prompt_mbpp.md")
    entry = generate_tests_mbpp(entry, test_writer, prompt_path="./prompts_v2/testwriter_prompt_mbpp.md")
    # The validate_code step is currently disabled:
    #   entry = validate_code(entry, code_writer, max_attempts=1)
    entry = iterate_tests(entry, code_writer, max_attempts,
                          prompt_path="./prompts_v2/codewriter_fix_mbpp.md",
                          test_regeneration=False, intermediate_results=True, mbpp = True)
    entry = check_solution_mbpp(entry)
    entry = generate_report(entry)
    return entry


def process_dataset_mbpp(dataset, chunk_size, max_workers, max_attempts, intermediate_results=False):
    """
    Processes the MBPP dataset in chunks with multithreading and prints a summary.

    Every entry's report is always printed.

    Parameters:
    dataset (list): The dataset to be processed.
    chunk_size (int): Number of entries per chunk.
    max_workers (int): Maximum number of worker threads.
    max_attempts (int): Maximum number of attempts for processing each entry.
    intermediate_results (bool): When True, print the number of valid
        solutions for each iteration 0..max_attempts. When False, print only
        the final number and percentage of valid solutions.

    Returns:
    list: All processed results.
    """
    all_results = []

    for chunk in tqdm(chunk_dataset(dataset, chunk_size), desc="Processing Chunks"):
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(executor.map(lambda entry: process_entry_mbpp(entry, max_attempts), chunk))
            all_results.extend(results)

    for entry in all_results:
        print(entry["report"])

    if intermediate_results:
        num_entries = len(dataset)
        for i in range(max_attempts + 1):
            valid_key = f"valid_{i}"
            # Entries without a value for this iteration fall back to their
            # final "solution_valid" flag.
            num_true_values = sum(
                1
                for entry in all_results
                if (entry[valid_key] if valid_key in entry else entry["solution_valid"])
            )
            # Guard against an empty dataset instead of dividing by zero.
            accuracy = (num_true_values / num_entries) * 100 if num_entries else 0.0
            print(f"Iteration {i}: {num_true_values} true values over {num_entries} (Accuracy: {accuracy:.2f}%)\n")
        return all_results

    num_valid = sum(1 for entry in all_results if entry["solution_valid"])
    num_results = len(all_results)
    # An empty dataset yields 0/0; report 0% instead of dividing by zero.
    percentage = num_valid / num_results * 100 if num_results else 0.0
    print(f"Number of valid solutions: {num_valid}/{num_results}")
    print(f"Percentage of valid solutions: {percentage:.2f}%")

    return all_results

## HUMANEVAL DATASET

In [7]:
# Load the HumanEval test split as a plain list of entries, then cut it into
# chunks with chunk_dataset (chunk size 55).
dataset = list(load_dataset("openai_humaneval", split="test"))
dataset_chunks = list(chunk_dataset(dataset, 55))


In [8]:
# Varying number of test cases: run the pipeline on HumanEval chunk index 2
# with max_attempts = 3 and print the per-iteration results.
max_attempts = 3
# instantiate_agents_gpt returns three values; the first one is not used here.
_, code_writer, test_writer = instantiate_agents_gpt(model="gpt-4o-mini")
results = process_dataset(dataset_chunks[2], chunk_size=25, max_workers=25, max_attempts=max_attempts, intermediate_results=True)


Processing Chunks: 3it [02:00, 40.08s/it]


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 1
- Failed tests: 4
- Percentage passed: 20.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentag




### Experiments on the last chunk

#### Varying the number of generated test cases (from the prompt)

In [7]:
# Varying number of test cases: longer run on HumanEval chunk index 2
# (max_attempts = 7), printing the per-iteration results.
# NOTE(review): the number of generated tests is not set in this cell;
# presumably it is controlled by the test-writer prompt file — confirm.
max_attempts = 7
# instantiate_agents_gpt returns three values; the first one is not used here.
_, code_writer, test_writer = instantiate_agents_gpt(model="gpt-4o-mini")
results = process_dataset(dataset_chunks[2], chunk_size=20, max_workers=20, max_attempts=max_attempts, intermediate_results=True)


Processing Chunks: 3it [03:58, 79.67s/it]


Report:
- Total tests: 14
- Passed tests: 14
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 10
- Passed tests: 10
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 13
- Passed tests: 12
- Failed tests: 1
- Percentage passed: 92.3076923076923%

Solution is valid: False


Report:
- Total tests: 29
- Passed tests: 17
- Failed tests: 12
- Percentage passed: 58.620689655172406%

Solution is valid: True


Report:
- Total tests: 13
- Passed tests: 10
- Failed tests: 3
- Percentage passed: 76.92307692307693%

Solution is valid: True


Report:
- Total tests: 15
- Passed tests: 14
- Failed tests: 1
- Percentage passed: 93.33333333333333%

Solution is valid: False


Report:
- Total tests: 15
- Passed tests: 9
- Failed tests: 6
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 15
- Passed tests: 8
- Failed tests: 7
- Percentage passed: 53.333333333333336%

Solution is valid: T




Performance degrades over the course of the optimization loop. A likely reason is that many generated test cases are wrong and therefore drive the refinement of the code in the wrong direction.

#### Fixed test cases

In [4]:
# With fixed test cases: HumanEval chunk index 2, max_attempts = 7.
# NOTE(review): this call has the same arguments as the earlier 7-attempt run;
# whatever fixes the number of test cases is configured outside this cell
# (presumably in the test-writer prompt file) — confirm.

max_attempts = 7
# instantiate_agents_gpt returns three values; the first one is not used here.
_, code_writer, test_writer = instantiate_agents_gpt(model="gpt-4o-mini")
results = process_dataset(dataset_chunks[2], chunk_size=20, max_workers=20, max_attempts=max_attempts, intermediate_results=True)


Processing Chunks: 3it [03:23, 67.69s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 13
- Passed tests: 1
- Failed tests: 12
- Percentage passed: 7.6923076923076925%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests:




In [None]:
# Print the `total_tokens` and `api_calls` counters of both agents, each on
# its own and summed.
print("Total tokens code writer: {}".format(code_writer.total_tokens))
print("Total tokens test writer: {}".format(test_writer.total_tokens))
print("Total tokens: {}".format(code_writer.total_tokens + test_writer.total_tokens))
print("Test writer API calls: {}".format(test_writer.api_calls))
print("Code writer API calls: {}".format(code_writer.api_calls))
print("Total API calls: {}".format(code_writer.api_calls + test_writer.api_calls))

Total tokens code writer: 447275
Total tokens test writer: 43072
Total tokens: 490347
Test writer API calls: 54
Code writer API calls: 245
Total API calls: 299


In [6]:
# With fixed test cases: HumanEval chunk index 0, max_attempts = 5.

max_attempts = 5
# instantiate_agents_gpt returns three values; the first one is not used here.
_, code_writer, test_writer = instantiate_agents_gpt(model="gpt-4o-mini")
results = process_dataset(dataset_chunks[0], chunk_size=20, max_workers=20, max_attempts=max_attempts, intermediate_results=True)


Processing Chunks: 3it [02:29, 49.67s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage




In [7]:
# Print the `total_tokens` and `api_calls` counters of both agents, each on
# its own and summed.
print("Total tokens code writer: {}".format(code_writer.total_tokens))
print("Total tokens test writer: {}".format(test_writer.total_tokens))
print("Total tokens: {}".format(code_writer.total_tokens + test_writer.total_tokens))
print("Test writer API calls: {}".format(test_writer.api_calls))
print("Code writer API calls: {}".format(code_writer.api_calls))
print("Total API calls: {}".format(code_writer.api_calls + test_writer.api_calls))

Total tokens code writer: 233130
Total tokens test writer: 40704
Total tokens: 273834
Test writer API calls: 55
Code writer API calls: 144
Total API calls: 199


In [7]:
# With fixed test cases: HumanEval chunk index 1, max_attempts = 5.
max_attempts = 5
# instantiate_agents_gpt returns three values; the first one is not used here.
_, code_writer, test_writer = instantiate_agents_gpt(model="gpt-4o-mini")
results = process_dataset(dataset_chunks[1], chunk_size=20, max_workers=20, max_attempts=max_attempts, intermediate_results=True)


Processing Chunks: 3it [02:51, 57.01s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentag




In [8]:
# Print the `total_tokens` and `api_calls` counters of both agents, each on
# its own and summed.
print("Total tokens code writer: {}".format(code_writer.total_tokens))
print("Total tokens test writer: {}".format(test_writer.total_tokens))
print("Total tokens: {}".format(code_writer.total_tokens + test_writer.total_tokens))
print("Test writer API calls: {}".format(test_writer.api_calls))
print("Code writer API calls: {}".format(code_writer.api_calls))
print("Total API calls: {}".format(code_writer.api_calls + test_writer.api_calls))

Total tokens code writer: 279007
Total tokens test writer: 42102
Total tokens: 321109
Test writer API calls: 55
Code writer API calls: 165
Total API calls: 220


## HumanEval Results


| Model          | it. 0 |  it. 1  |  it. 2  |  Total tokens |
| -------------- | ----- | ------- | ------- | --------------|
| GPT 4o-mini | 84.8%| 84.8%  | 85.3%  | 523544           |


### Results for a longer run 

| Model                   | it. 0  |  it. 1  |  it. 2  |  it. 3  |  it. 4  |  it. 5  |  it. 6  |
| ----------------------- | ------ | ------- | ------- | ------- | ------- | ------- | ------- |
|   GPT 4o-mini          | 79.63% | 75.93%  | 81.48%  | 79.63%  | 75.93%  | 83.33%  | 75.93%  |


## MBPP DATASET

In [23]:
# MBPP dataset preprocessing: load the "sanitized" test split from Hugging Face
mbpp_dataset = load_dataset("mbpp",  "sanitized", split="test")
mbpp_dataset = [entry for entry in mbpp_dataset]
# Run check_correctness on every entry. The filter below reads entry["correct"],
# so check_correctness presumably sets that key on the entry in place — TODO confirm.
# NOTE: this rebinds `results`, the name the run cells also use for their output.
results = [check_correctness(entry) for entry in mbpp_dataset]
# Keep only the entries whose "correct" value is truthy
mbpp_dataset = [entry for entry in mbpp_dataset if entry["correct"]]


In [24]:
# Cut the MBPP entries into chunks with chunk_dataset (chunk size 53)
mbpp_dataset_chunks = list(chunk_dataset(mbpp_dataset, 53))

### Experiments on the first chunk

In [27]:
# Run the MBPP pipeline on chunk index 0 with max_attempts = 5.
# NOTE(review): the recorded output of this cell is a KeyError: 'test_setup_code'.
max_attempts = 5
# instantiate_agents_gpt returns three values; the first one is not used here.
_, code_writer, test_writer = instantiate_agents_gpt(model="gpt-4o-mini")

results = process_dataset_mbpp(mbpp_dataset_chunks[0], chunk_size=15, max_workers=15, 
                               max_attempts=max_attempts, intermediate_results=True)


Processing Chunks: 0it [00:08, ?it/s]


KeyError: 'test_setup_code'

In [17]:
# Run the MBPP pipeline on chunk index 1 with max_attempts = 5.
max_attempts = 5
# instantiate_agents_gpt returns three values; the first one is not used here.
_, code_writer, test_writer = instantiate_agents_gpt(model="gpt-4o-mini")

results = process_dataset_mbpp(mbpp_dataset_chunks[1], chunk_size=20, max_workers=20, 
                               max_attempts=max_attempts, intermediate_results=True)


Processing Chunks: 3it [02:59, 59.87s/it]


Report:
- Total tests: 8
- Passed tests: 8
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 8
- Passed tests: 0
- Failed tests: 8
- Percentage passed: 0.0%

Solution is valid: False


Report:
- Total tests: 8
- Passed tests: 0
- Failed tests: 8
- Percentage passed: 0.0%

Solution is valid: False


Report:
- Total tests: 8
- Passed tests: 0
- Failed tests: 8
- Percentage passed: 0.0%

Solution is valid: False


Report:
- Total tests: 8
- Passed tests: 0
- Failed tests: 8
- Percentage passed: 0.0%

Solution is valid: False


Report:
- Total tests: 8
- Passed tests: 0
- Failed tests: 8
- Percentage passed: 0.0%

Solution is valid: False


Report:
- Total tests: 8
- Passed tests: 8
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 8
- Passed tests: 7
- Failed tests: 1
- Percentage passed: 87.5%

Solution is valid: False


Report:
- Total tests: 8
- Passed tests: 8
- Failed tests: 0
- Percentage pa




In [19]:
# Print the `total_tokens` and `api_calls` counters of both agents, each on
# its own and summed.
print("Total tokens code writer: {}".format(code_writer.total_tokens))
print("Total tokens test writer: {}".format(test_writer.total_tokens))
print("Total tokens: {}".format(code_writer.total_tokens + test_writer.total_tokens))
# This line reports the test writer's calls; it was previously mislabelled
# "Code writer API calls", giving two identically labelled lines.
print("Test writer API calls: {}".format(test_writer.api_calls))
print("Code writer API calls: {}".format(code_writer.api_calls))
print("Total API calls: {}".format(code_writer.api_calls + test_writer.api_calls))

Total tokens code writer: 528394
Total tokens test writer: 43969
Total tokens: 572363
Code writer API calls: 53
Code writer API calls: 280
Total API calls: 333


In [None]:
# Run the MBPP pipeline on chunk index 2 with max_attempts = 5.
max_attempts = 5
# instantiate_agents_gpt returns three values; the first one is not used here.
_, code_writer, test_writer = instantiate_agents_gpt(model="gpt-4o-mini")

results = process_dataset_mbpp(mbpp_dataset_chunks[2], chunk_size=20, max_workers=20, 
                               max_attempts=max_attempts, intermediate_results=True)
