In [1]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import json
import os
from inture import GeminiAgent
from datasets import load_dataset
import pandas as pd
from benchmarks_utils import *

ModuleNotFoundError: No module named 'inture'

In [13]:
def process_entry(entry, max_attempts = 3):
    entry = build_prompt(entry, prompt_builder)
    entry = generate_code(entry, code_writer)
    entry = generate_tests(entry, test_writer)
   # entry = validate_code(entry, code_writer, max_attempts=1)
    entry = iterate_tests(entry, code_writer, max_attempts)
    entry = check_solution(entry)
    entry = generate_report(entry)
    return entry


def process_dataset(dataset, chunk_size, max_workers, max_attempts):
    """
    Processes the dataset in chunks with multithreading.

    Parameters:
    dataset (list): The dataset to be processed.
    chunk_size (int): Number of entries per chunk.
    max_workers (int): Maximum number of worker threads.
    max_attempts (int): Maximum number of attempts for processing each entry.

    Returns:
    list: All processed results.
    """
    all_results = []

    for chunk in tqdm(chunk_dataset(dataset, chunk_size), desc="Processing Chunks"):
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(executor.map(lambda entry: process_entry(entry, max_attempts), chunk))
            all_results.extend(results)
            
    for entry in all_results:
        print(entry["report"])

    valid_solutions = [entry for entry in all_results if entry["solution_valid"]]
    # Print the number of valid solutions as a percentage of the total number of solutions
    print(f"Number of valid solutions: {len(valid_solutions)}/{len(all_results)}")
    print(f"Percentage of valid solutions: {len(valid_solutions) / len(all_results) * 100:.2f}%")

    return all_results


def process_entry_mbpp(entry, max_attempts = 3):
    entry = build_prompt_mbpp(entry, prompt_builder)
    entry = generate_code(entry, code_writer)
    entry = generate_tests_mbpp(entry, test_writer)
   # entry = validate_code(entry, code_writer, max_attempts=1)
    entry = iterate_tests(entry, code_writer, max_attempts)
    entry = check_solution_mbpp(entry)
    entry = generate_report(entry)
    return entry


def process_dataset_mbpp(dataset, chunk_size, max_workers, max_attempts):
    """
    Processes the dataset in chunks with multithreading.

    Parameters:
    dataset (list): The dataset to be processed.
    chunk_size (int): Number of entries per chunk.
    max_workers (int): Maximum number of worker threads.
    max_attempts (int): Maximum number of attempts for processing each entry.

    Returns:
    list: All processed results.
    """
    all_results = []

    for chunk in tqdm(chunk_dataset(dataset, chunk_size), desc="Processing Chunks"):
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(executor.map(lambda entry: process_entry_mbpp(entry, max_attempts), chunk))
            all_results.extend(results)
            
    for entry in all_results:
        print(entry["report"])

    valid_solutions = [entry for entry in all_results if entry["solution_valid"]]
    # Print the number of valid solutions as a percentage of the total number of solutions
    print(f"Number of valid solutions: {len(valid_solutions)}/{len(all_results)}")
    print(f"Percentage of valid solutions: {len(valid_solutions) / len(all_results) * 100:.2f}%")

    return all_results

# VERSION 1
The agents communicate by sharing a dataframe which stores valuable information and the generated responses.
### Setup:
- Prompt Builder: takes as input the problem, builds a prompt using chain-of-thought to analyze and produce a solution to the problem (pseudocode algorithm).
- Code Writer: takes as input the output of the prompt builder and implements the algorithm in python.
- Test Writer: takes as input the task, produces <b>5 tests</b> in the form of assertions per task.
- Test Executor (not an actual agent)

### Optimization loop:
For each generated code, it executes the tests and keeps tracks of the non passed ones. The code + non passed tests + error messages are prompted back to the code writer which eventually regenerates the code. Each time only the non passed tests are re-executed, for a maximum of <b>max_attempts</b> times.


In [16]:
def instantiate_agents():
    prompt_builder = GeminiAgent(
                name="Prompt Builder",
                system_instruction="Build a prompt for the problem provided."
                "The prompt should be clear and concise, and should be able to generate the code for the problem.",
                )
    code_writer = GeminiAgent(
                name="Code Writer",
                system_instruction="Write the code for the problem provided and return just the full code."
                "If it is told you what you are doing is wrong, try to fix it and rewrite the code again fixing the errors."
                "Do not explain nothing about the code, just return the full code.",
                )
    test_writer = GeminiAgent(
                name="Test Writer",
                system_instruction="Write 5 tests for the code that is provided."
                "The tests should cover the main cases and should be able to test the code."
                "Return the tests in a list format.",
                )
    return prompt_builder, code_writer, test_writer


In [17]:
def log_file(text, filename):
    with open(filename, "a") as log_open:
        log_open.write(text + "\n")
    
def build_prompt(entry, prompt_builder, prompt_path = "./prompts_v1/promptbuilder_code.md"):
    
    with open(prompt_path, "r") as f:
        code_writer_prompt = f.read()
    requirement = entry["prompt"]
    
    prompt = f"""
{code_writer_prompt}
```python
{requirement}
```
"""
    res = prompt_builder.process_message(message=prompt)
    entry["prompt_built"] = res
    log_file("-" * 20 + "[PROMPT]"+ "-" * 20 + "\n" +res, "log.txt") # DEBUG
    return entry

# TO REVISE
def extract_code(text):
    if f"```python" in text:
        text = text[text.find(f"```python")+len(f"```python"):]
        text = text[:text.find("```")]
    else:
        print("Error: No code block found")
    return text

def generate_code(entry, code_writer):
    prompt = entry["prompt_built"]
    res = code_writer.process_message(message=prompt)
    code = extract_code(res) 
    entry["generated_code"] = code
    log_file("-" * 20 + "[CODE]"+ "-" * 20 + "\n"+res, "log.txt") # DEBUG
    return entry


def generate_tests(entry, test_writer, prompt_path = "./prompts_v1/testwriter_prompt.md"):
    with open(prompt_path, "r") as f:
        test_writer_prompt = f.read()
    requirement = entry["prompt"]
    prompt = f"""
{test_writer_prompt}
```python
{requirement}
```
"""
    res = test_writer.process_message(message=prompt)
    tests = extract_code(res) # REVISE
    tests = tests.split("\n")
    # Filter out empty strings
    tests = [test for test in tests if test]
    entry["generated_tests"] = tests
    log_file("-" * 20 + "[TESTS]"+ "-" * 20 + "\n"+ "\n".join(tests), "log.txt") # DEBUG
    return entry

def validate_code(entry, code_writer, max_attempts=3, prompt_path = "./prompts_v1/codewriter_fix.md"):
    code = entry["generated_code"]
    out = ""
    counter = max_attempts
    # Load prompt
    
    with open(prompt_path, "r") as f:
        codewriter_fix = f.read()

    while counter > 0:
        try:
            exec(code)
            log_file("-" * 20 + "[VALIDATION]: Code is valid", "log.txt") # DEBUG
            break
        except Exception as e:
            s = str(e)
            out = f"ERROR: {s}"
            prompt = f"""
{codewriter_fix}
{out}
```python
{code}
```
"""
            res = code_writer.process_message(message=prompt)
            code = extract_code(res) # REVISE
            counter -= 1
            log_file("-" * 20 + "[VALIDATION]\n"+out, "log.txt") # DEBUG
    
    entry["generated_code"] = code
    return entry

def iterate_tests(entry, code_writer, max_attempts=3, prompt_path = "./prompts_v1/codewriter_fix.md"):
    code = entry["generated_code"]
    tests = entry["generated_tests"]
    counter = max_attempts
    idx_passed_tests = {idx: False for idx in range(len(tests))}  # Initialize as dict
    error_list = []

    # Load prompt
    
    with open(prompt_path, "r") as f:
        codewriter_fix = f.read()
        
    while counter > 0:
        for idx, test in enumerate(tests):
            if idx_passed_tests[idx]:  # Check if test has passed
                continue
            try:
                code_to_exec = code + "\n" + test
                # Just print the first iteration as debug
                if counter == max_attempts:
                    pass
                    log_file("-" * 20 + "[TEST " + str(idx) + "]"+ "-" * 20 + "\n"+code_to_exec, "log.txt") # DEBUG
                exec(code_to_exec)
                idx_passed_tests[idx] = True  # Mark test as passed
                
                log_file("-" * 20 + "[TEST PASSED]\n", "log.txt") # DEBUG
            except Exception as e:
                s = str(e)
                error_list.append(f"ERROR: {s}")
                log_file("-" * 20 + "[ERROR]\n"+s, "log.txt") # DEBUG
        
        if all(idx_passed_tests.values()):
            break # Break if all tests passed
        else:
            # Filter tests to include only those that haven't passed
            non_passed_tests = [tests[i] for i, passed in idx_passed_tests.items() if not passed]
            prompt = f"""
{codewriter_fix}
```python
{code}
{non_passed_tests}
```
ERRORS:
{error_list}
"""
            res = code_writer.process_message(message=prompt)
            code = extract_code(res)
        counter -= 1
    passed_tests = [test for i, test in enumerate(tests) if idx_passed_tests[i]]
    entry["generated_code"] = code
    entry["validated_tests"] = passed_tests
    return entry

def generate_report(entry):
    tests = entry["generated_tests"]
    validated_tests = entry["validated_tests"]
    solution_valid = entry["solution_valid"]

    # Calculate the number of tests passed
    num_tests = len(tests)
    num_passed_tests = len(validated_tests)
    num_failed_tests = num_tests - num_passed_tests
    
    # Calculate the percentage of tests passed
    percentage_passed = num_passed_tests / num_tests * 100

    # Generate the report
    report = f"""
Report:
- Total tests: {num_tests}
- Passed tests: {num_passed_tests}
- Failed tests: {num_failed_tests}
- Percentage passed: {percentage_passed}%

Solution is valid: {solution_valid}
"""
    entry["report"] = report
    return entry

In [18]:
def is_test_valid(entry_point, code, test):
    try:
        exec(code + "\n" + test + "\n" + "check(" + entry_point + ")" + "\n")
        return True
    except Exception:
        return False


def extract_check_function(test_str):
    # Use regular expression to find the check function within the string
    match = re.search(r'def check\(.*?\):.*?assert.*', test_str, re.DOTALL)
    if match:
        return match.group(0)
    else:
        return None

def check_solution(entry):
    code = entry["generated_code"]
    test_function = extract_check_function(entry["test"])
    valid = is_test_valid(entry["entry_point"], code, test_function)
    entry["solution_valid"] = valid
    return entry


In [19]:
# Function to divide dataset into chunks of size n
def chunk_dataset(data, n):
    for i in range(0, len(data), n):
        yield data[i:i + n]

# HUMANEVAL Dataset


## GEMINI FLASH 1.5

## 2 iterations

In [8]:
max_attempts = 2  # Set the value of max_attempts here
dataset = load_dataset("openai_humaneval", split="test")
dataset = [entry for entry in dataset]
dataset = dataset[:55]
max_attempts = 2  # Set the value of max_attempts here
results = process_dataset(dataset, chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/all_results_gemini-1.5-flash_2.json', 'w') as f:
    json.dump(results, f)

Processing Chunks: 3it [00:53, 17.99s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage 




In [10]:
max_attempts = 2  # Set the value of max_attempts here
dataset = load_dataset("openai_humaneval", split="test")
dataset = [entry for entry in dataset]
dataset = dataset[55:109]
max_attempts = 2  # Set the value of max_attempts here
results = process_dataset(dataset, chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/all_results_gemini-1.5-flash_2.json', 'w') as f:
    json.dump(results, f)

Processing Chunks: 3it [01:38, 32.73s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentag




In [48]:
max_attempts = 2  # Set the value of max_attempts here
dataset = load_dataset("openai_humaneval", split="test")
dataset = [entry for entry in dataset]
dataset = dataset[109:164]
max_attempts = 2  # Set the value of max_attempts here
results = process_dataset(dataset, chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/all_results_gemini-1.5-flash_2.json', 'w') as f:
    json.dump(results, f)

Processing Chunks: 3it [03:00, 60.14s/it]


Report:
- Total tests: 5
- Passed tests: 2
- Failed tests: 3
- Percentage passed: 40.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 1
- Failed tests: 4
- Percentage passed: 20.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage 




### Token Overhead (calculated using only the last chunk)


In [49]:
gemini_flash = prompt_builder.model
print("PROMPT BUILDER  ->", gemini_flash.count_tokens(prompt_builder.chat_session.history))
gemini_flash = test_writer.model
print("TEST WRITER  ->", gemini_flash.count_tokens(test_writer.chat_session.history))
gemini_flash = code_writer.model
print("CODE WRITER ->", gemini_flash.count_tokens(code_writer.chat_session.history))

PROMPT BUILDER  -> total_tokens: 31436

TEST WRITER  -> total_tokens: 18772

CODE WRITER -> total_tokens: 38239



In [52]:
token_oberhead_flash = 31436 + 18772 + 38239
print("Total tokens overhead ->", token_oberhead_flash)

Total tokens overhead -> 88447


### pass@1

In [50]:
total = (29 + 41 + 46) / 164
print(total)

0.7073170731707317


## 3 iterations

In [10]:
dataset = load_dataset("openai_humaneval", split="test")
dataset = [entry for entry in dataset]
dataset = dataset[:55]
max_attempts = 3  # Set the value of max_attempts here
results = process_dataset(dataset, chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/all_results_gemini-1.5-flash_3.json', 'w') as f:
    json.dump(results, f)

Processing Chunks: 3it [02:23, 47.78s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage 




In [13]:
dataset = load_dataset("openai_humaneval", split="test")
dataset = [entry for entry in dataset]
dataset = dataset[55:109]
max_attempts = 3  # Set the value of max_attempts here
results = process_dataset(dataset, chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/all_results_gemini-1.5-flash_3.json', 'w') as f:
    json.dump(results, f)

Processing Chunks: 3it [03:28, 69.64s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 0
- Failed tests: 5
- Percentage passed: 0.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentag




In [56]:
dataset = load_dataset("openai_humaneval", split="test")
dataset = [entry for entry in dataset]
dataset = dataset[109:164]
max_attempts = 3  # Set the value of max_attempts here
results = process_dataset(dataset, chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/all_results_gemini-1.5-flash_3.json', 'w') as f:
    json.dump(results, f)

Processing Chunks: 3it [02:46, 55.39s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 2
- Failed tests: 3
- Percentage passed: 40.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 0
- Failed tests: 5
- Percentage passed: 0.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage 




In [59]:
gemini_flash = prompt_builder.model
pb_tokens = gemini_flash.count_tokens(prompt_builder.chat_session.history)
print("PROMPT BUILDER  ->", pb_tokens)
gemini_flash = test_writer.model
tw_tokens = gemini_flash.count_tokens(test_writer.chat_session.history)
print("TEST WRITER  ->", tw_tokens)
gemini_flash = code_writer.model
cw_tokens = gemini_flash.count_tokens(code_writer.chat_session.history)
print("CODE WRITER ->", cw_tokens)

PROMPT BUILDER  -> total_tokens: 19531

TEST WRITER  -> total_tokens: 31559

CODE WRITER -> total_tokens: 43526



In [65]:
total = 19531 + 31559 + 43426
print("Total tokens overhead ->", total)

Total tokens overhead -> 94516


### pass@1

In [57]:
pass_rate = (32 + 41 + 46) / 164
print(pass_rate)

0.725609756097561


## GEMINI PRO 1.5

## 3 iterations

In [19]:
dataset = load_dataset("openai_humaneval", split="test")
dataset = [entry for entry in dataset]
dataset = dataset[:55]
results = process_dataset(dataset, chunk_size=20, max_workers=20, max_attempts=3)


Processing Chunks: 3it [02:37, 52.54s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentag




In [22]:
dataset = load_dataset("openai_humaneval", split="test")
dataset = [entry for entry in dataset]
dataset = dataset[55:110]
results = process_dataset(dataset, chunk_size=20, max_workers=20, max_attempts=3)

with open('results/all_results_gemini-1.5-pro_001.json', 'w') as f:
    json.dump(results, f)

Processing Chunks: 3it [04:33, 91.19s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percenta




In [26]:
dataset = load_dataset("openai_humaneval", split="test")
dataset = [entry for entry in dataset]
dataset = dataset[110:164]
results = process_dataset(dataset, chunk_size=20, max_workers=20, max_attempts=3)
with open('results/all_results_gemini-1.5-pro_001.json', 'w') as f:
    json.dump(results, f)

Processing Chunks: 3it [04:24, 88.22s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage




In [27]:
total = (40 + 46 + 47)/164
print(total)

0.8109756097560976


## 2 iterations

In [24]:

dataset = load_dataset("openai_humaneval", split="test")
dataset = [entry for entry in dataset]
dataset = dataset[:55]
results = process_dataset(dataset, chunk_size=20, max_workers=20, max_attempts=2)

with open('results/all_results_gemini-1.5-pro_2.json', 'w') as f:
    json.dump(results, f)



Processing Chunks: 3it [01:01, 20.34s/it]


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 1
- Failed tests: 4
- Percentage passed: 20.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage 




In [6]:




dataset = load_dataset("openai_humaneval", split="test")
dataset = [entry for entry in dataset]
dataset = dataset[55:109]
results = process_dataset(dataset, chunk_size=20, max_workers=20, max_attempts=2)

with open('results/all_results_gemini-1.5-pro_2.json', 'w') as f:
    json.dump(results, f)




Processing Chunks: 3it [02:18, 46.11s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percenta




In [6]:
dataset = load_dataset("openai_humaneval", split="test")
dataset = [entry for entry in dataset]
dataset = dataset[109:164]

results = process_dataset(dataset, chunk_size=20, max_workers=20, max_attempts=2)

with open('results/all_results_gemini-1.5-pro_2.json', 'w') as f:
    json.dump(results, f)


Processing Chunks: 3it [02:36, 52.12s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage 




In [7]:
total = (40 + 46 + 46) / 164
print(total)

0.8048780487804879


                   
| Model   | pass@1 (it.=2)| pass@1 (it.=3)|
| --------------   | ----------   | ------------ |
| Gemini 1.5 flash |    71%          |         73%  |
| Gemini 1.5 pro   |   80%        |         81%    |


| Model   | total tokens (it.=2)| total tokens (it.=3)|
| --------------   | ----------   | ------------ |
| Gemini 1.5 flash |    88447         |         94516  |

## MBPP DATASET

- test_list: one assert per item in the list (directly executable);
- prompt: text description (no code to extract) in the form of a string;
- code: the solution as the definition of a function;
- test_imports: to put before test_list for execution.

In [2]:
def extract_signature(func_str):
    # Use regex to find the function signature
    match = re.search(r'def\s+(\w+\([^)]*\))', func_str)
    if match:
        return match.group(1)
    return ""

def build_prompt_mbpp(entry, prompt_builder, prompt_path = "./prompts_v1/promptbuilder_mbpp.md"):
    
    with open(prompt_path, "r") as f:
        code_writer_prompt = f.read()
    requirement = entry["prompt"]
    
    prompt = f"""
{code_writer_prompt}
{requirement}
Signature of the function: {extract_signature(entry["code"])}
"""
    res = prompt_builder.process_message(message=prompt)
    entry["prompt_built"] = res
    log_file("-" * 20 + "[PROMPT]"+ "-" * 20 + "\n" +res, "log.txt") # DEBUG
    return entry


def generate_tests_mbpp(entry, test_writer, prompt_path = "./prompts_v1/testwriter_mbpp_prompt.md"):
    
    with open(prompt_path, "r") as f:
        test_writer_prompt = f.read()
    requirement = entry["prompt"]
    prompt = f"""
{test_writer_prompt}
{requirement}
Signature of the function: {extract_signature(entry["code"])}
"""
    res = test_writer.process_message(message=prompt)
    tests = extract_code(res) # REVISE
    tests = tests.split("\n")
    # Filter out empty strings
    tests = [test for test in tests if test]
    entry["generated_tests"] = tests
    log_file("-" * 20 + "[TESTS]"+ "-" * 20 + "\n"+ "\n".join(tests), "log.txt") # DEBUG
    return entry

In [45]:
def is_test_valid_mbpp(code, tests):
    try:
        for test in tests:
            exec(code + "\n" + test + "\n")
        return True
    except Exception:
        return False


def check_solution_mbpp(entry):
    tests = ["\n".join(entry["test_imports"]) + "\n" + test for test in entry["test_list"]]
    # Append all items in test_imports to each item in test_list
    # tests = [entry["test_imports"] + test for test in entry["test_list"]]
    # Assuming is_test_valid_mbpp takes the generated code and a list of combined tests
    valid = is_test_valid_mbpp(entry["generated_code"], tests)
    
    entry["solution_valid"] = valid
    return entry

In [33]:
def check_correctness(entry):
    tests = [test for test in entry["test_list"]]
    
    # Assuming is_test_valid_mbpp takes the generated code and a list of combined tests
    valid = is_test_valid_mbpp(entry["code"], tests)
    
    entry["correct"] = valid
    return entry

### import MBPP locally :

In [9]:
# load a json file at "/home/enosim/Desktop/mbpp.json" and convert it into a list of dictionaries
# (for when hugging face datasets are not available)
with open("/home/enosim/Desktop/mbpp.json", "r") as f:
    mbpp_dataset = json.load(f)
    
results = [check_correctness(entry) for entry in mbpp_dataset]
# remove the entries for which entry["solution_valid"] is False
mbpp_dataset = [entry for entry in mbpp_dataset if entry["correct"]]

### import MBPP from HuggingFace:

In [22]:
# MBPP dataset preprocessing, import from hugging face
mbpp_dataset = load_dataset("mbpp", "sanitized", split="test")
mbpp_dataset = [entry for entry in mbpp_dataset]
# apply check_solution_mbpp to all entries in the dataset and print the indices of the entries for which entry["solution_valid"] is False
results = [check_correctness(entry) for entry in mbpp_dataset]
# remove the entries for which entry["solution_valid"] is False
mbpp_dataset = [entry for entry in mbpp_dataset if entry["correct"]]

### chunk it

In [24]:
mbpp_dataset_chunks = list(chunk_dataset(mbpp_dataset, 53))

## GEMINI FLASH

## 2 Iterations

In [16]:
max_attempts = 2  
results = process_dataset_mbpp(mbpp_dataset_chunks[0], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-flash_2.json', 'w') as f:
    json.dump(results, f)

Processing Chunks: 3it [00:44, 14.74s/it]


Report:
- Total tests: 5
- Passed tests: 2
- Failed tests: 3
- Percentage passed: 40.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage p




In [19]:
prompt_builder, code_writer, test_writer = instantiate_agents()
results = process_dataset_mbpp(mbpp_dataset_chunks[1], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-flash_2.json', 'w') as f:
    json.dump(results, f)

Processing Chunks: 3it [00:47, 15.86s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 1
- Failed tests: 4
- Percentage passed: 20.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percenta




In [20]:
prompt_builder, code_writer, test_writer = instantiate_agents()
results = process_dataset_mbpp(mbpp_dataset_chunks[2], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-flash_2.json', 'w') as f:
    json.dump(results, f)

Processing Chunks: 3it [00:51, 17.26s/it]


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 2
- Failed tests: 3
- Percentage passed: 40.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 0
- Failed tests: 5
- Percentage passed: 0.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 2
- Failed tests: 3
- Percentage passed: 40.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage pa




In [14]:
max_attempts = 2
prompt_builder, code_writer, test_writer = instantiate_agents()
results = process_dataset_mbpp(mbpp_dataset_chunks[3], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-flash_2.json', 'a') as f:
    json.dump(results, f)

Processing Chunks: 3it [00:43, 14.46s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 2
- Failed tests: 3
- Percentage passed: 40.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 2
- Failed tests: 3
- Percent




### pass@1

In [15]:
(31 + 27 + 40 + 41) / 210

0.6619047619047619

### Token Overhead

In [16]:
gemini_flash = prompt_builder.model
print("PROMPT BUILDER  ->", gemini_flash.count_tokens(prompt_builder.chat_session.history))
gemini_flash = test_writer.model
print("TEST WRITER  ->", gemini_flash.count_tokens(test_writer.chat_session.history))
gemini_flash = code_writer.model
print("CODE WRITER ->", gemini_flash.count_tokens(code_writer.chat_session.history))

PROMPT BUILDER  -> total_tokens: 21459

TEST WRITER  -> total_tokens: 13598

CODE WRITER -> total_tokens: 18829



In [17]:
21459 + 13598 + 18829

53886

## 3 Iterations

In [15]:
max_attempts = 3
prompt_builder, code_writer, test_writer = instantiate_agents()
results = process_dataset_mbpp(mbpp_dataset_chunks[0], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-flash_3.json', 'w') as f:
    json.dump(results, f)

Processing Chunks: 3it [00:54, 18.25s/it]


Report:
- Total tests: 5
- Passed tests: 2
- Failed tests: 3
- Percentage passed: 40.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage p




In [16]:
prompt_builder, code_writer, test_writer = instantiate_agents()
results = process_dataset_mbpp(mbpp_dataset_chunks[1], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-flash_3.json', 'a') as f:
    json.dump(results, f)

Processing Chunks: 3it [00:47, 15.82s/it]


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 0
- Failed tests: 5
- Percentage passed: 0.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage




In [55]:
max_attempts = 3
prompt_builder, code_writer, test_writer = instantiate_agents()
results = process_dataset_mbpp(mbpp_dataset_chunks[2], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-flash_3.json', 'a') as f:
    json.dump(results, f)

Processing Chunks: 3it [00:48, 16.13s/it]


Report:
- Total tests: 5
- Passed tests: 0
- Failed tests: 5
- Percentage passed: 0.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 1
- Failed tests: 4
- Percentage passed: 20.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 0
- Failed tests: 5
- Percentage passed: 0.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 1
- Failed tests: 4
- Percentage passed: 20.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage 




In [56]:
prompt_builder, code_writer, test_writer = instantiate_agents()
results = process_dataset_mbpp(mbpp_dataset_chunks[3], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-flash_3.json', 'a') as f:
    json.dump(results, f)

Processing Chunks: 3it [00:45, 15.09s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 0
- Failed tests: 5
- Percen




In [58]:
(31 + 25 + 40 + 36) / 210

0.6285714285714286

### Token Overhead

In [59]:
gemini_flash = prompt_builder.model
print("PROMPT BUILDER  ->", gemini_flash.count_tokens(prompt_builder.chat_session.history))
gemini_flash = test_writer.model
print("TEST WRITER  ->", gemini_flash.count_tokens(test_writer.chat_session.history))
gemini_flash = code_writer.model
print("CODE WRITER ->", gemini_flash.count_tokens(code_writer.chat_session.history))

PROMPT BUILDER  -> total_tokens: 28115

TEST WRITER  -> total_tokens: 17871

CODE WRITER -> total_tokens: 23394



In [60]:
28115 + 17871 + 23394

69380

## GEMINI PRO

### 2 iterations

In [10]:
prompt_builder, code_writer, test_writer = instantiate_agents()
max_attempts = 2  
results = process_dataset_mbpp(mbpp_dataset_chunks[0], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-pro_2.json', 'w') as f:
    json.dump(results, f)

Processing Chunks: 3it [01:54, 38.25s/it]


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage




In [11]:
prompt_builder, code_writer, test_writer = instantiate_agents()
max_attempts = 2  
results = process_dataset_mbpp(mbpp_dataset_chunks[1], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-pro_2.json', 'a') as f:
    json.dump(results, f)

Processing Chunks: 3it [03:30, 70.16s/it]


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 2
- Failed tests: 3
- Percentage passed: 40.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentag




In [14]:
prompt_builder, code_writer, test_writer = instantiate_agents()
max_attempts = 2  
results = process_dataset_mbpp(mbpp_dataset_chunks[2], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-pro_2.json', 'a') as f:
    json.dump(results, f)

Processing Chunks: 2it [01:23, 41.47s/it]

40
52
40
0
24


Processing Chunks: 3it [02:03, 41.05s/it]


Report:
- Total tests: 5
- Passed tests: 0
- Failed tests: 5
- Percentage passed: 0.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage 




In [18]:
prompt_builder, code_writer, test_writer = instantiate_agents()
max_attempts = 2  
results = process_dataset_mbpp(mbpp_dataset_chunks[3], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-pro_2.json', 'a') as f:
    json.dump(results, f)

Processing Chunks: 3it [01:48, 36.27s/it]


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 2
- Failed tests: 3
- Percentage passed: 40.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 2
- Failed tests: 3
- Percentage passed: 40.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 1
- Failed tests: 4
- Percenta




In [19]:
(35 + 26 + 39 + 43) / 210

0.680952380952381

## 3 Iterations

In [25]:
prompt_builder, code_writer, test_writer = instantiate_agents()
max_attempts = 3  
results = process_dataset_mbpp(mbpp_dataset_chunks[0], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-pro_3.json', 'w') as f:
    json.dump(results, f)

Processing Chunks: 3it [02:13, 44.50s/it]


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percentage passed: 60.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage




In [26]:
prompt_builder, code_writer, test_writer = instantiate_agents()
max_attempts = 3  
results = process_dataset_mbpp(mbpp_dataset_chunks[1], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-pro_3.json', 'a') as f:
    json.dump(results, f)

Processing Chunks: 3it [02:05, 41.79s/it]


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percenta




In [27]:
prompt_builder, code_writer, test_writer = instantiate_agents()
max_attempts = 3  
results = process_dataset_mbpp(mbpp_dataset_chunks[2], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-pro_3.json', 'a') as f:
    json.dump(results, f)

Processing Chunks: 2it [02:58, 91.80s/it]

48
75
-52
24
0


Processing Chunks: 3it [04:46, 95.42s/it]


Report:
- Total tests: 5
- Passed tests: 0
- Failed tests: 5
- Percentage passed: 0.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage 




In [28]:
prompt_builder, code_writer, test_writer = instantiate_agents()
max_attempts = 3  
results = process_dataset_mbpp(mbpp_dataset_chunks[3], chunk_size=20, max_workers=20, max_attempts=max_attempts)
os.makedirs('results', exist_ok=True)
with open('results/mbpp_gemini-1.5-pro_3.json', 'a') as f:
    json.dump(results, f)

Processing Chunks: 3it [02:01, 40.58s/it]


Report:
- Total tests: 5
- Passed tests: 4
- Failed tests: 1
- Percentage passed: 80.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 2
- Failed tests: 3
- Percentage passed: 40.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: True


Report:
- Total tests: 5
- Passed tests: 5
- Failed tests: 0
- Percentage passed: 100.0%

Solution is valid: False


Report:
- Total tests: 5
- Passed tests: 3
- Failed tests: 2
- Percent




### pass@1

In [29]:
(34 + 29 + 40 + 44) / 210

0.7

                   
| Model   | pass@1 (it.=2)| pass@1 (it.=3)|
| --------------   | ----------   | ------------ |
| Gemini 1.5 flash |    66%          |    65%       |
| Gemini 1.5 pro   |     68%     |     70%       |




| Model   | total tokens (it.=2)| total tokens (it.=3)|
| --------------   | ----------   | ------------ |
| Gemini 1.5 flash |    53886         |  69380         |