In [1]:
import concurrent.futures
import io
import json
import os
from pprint import pprint
import re
import tokenize
import traceback

from human_eval.data import read_problems
from human_eval.execution import check_correctness
import litellm
from pydantic import BaseModel
from tqdm import tqdm

In [2]:
# Model identifier handed to litellm.completion.
# Other model names noted for this setting: "gpt-4o-mini", "o1-preview", "o1-mini".
LLM_NAME = 'gpt-4o'
MAX_TOKENS = 512   # `max_tokens` cap for each completion request
MAX_RETRIES = 10   # default number of attempts per problem
TEST_TIMEOUT = 15  # seconds given to each check_correctness run

# Sampling options forwarded verbatim (as keyword arguments) to litellm.completion.
GENERATION_KWARGS = dict(temperature=1.4, top_p=0.1)

In [3]:
# Shape of the JSON object the LLM is asked to produce. The class itself is
# passed to litellm.completion as `response_format` when the model name
# contains 'gpt'; the two field names match the keys requested in the prompt.
# (Documented with `#` comments rather than a docstring so the model's
# generated schema is left untouched.)
class BugResponse(BaseModel):
    bug_explanation: str  # short description of the bug that was introduced
    buggy_code: str       # function body (after signature/docstring) containing the bug


def _strip_code_fence(text):
    """Return `text` without a surrounding Markdown code fence, if it has one.

    A fence is recognised when the text (ignoring surrounding whitespace)
    starts with three backticks. The whole opening fence line is dropped,
    including any language tag, and a closing fence line is dropped when
    present. Text without a fence is returned unchanged, so indentation of
    un-fenced code is preserved.
    """
    lines = text.strip().split('\n')
    if not lines[0].startswith('```'):
        return text
    inner = lines[1:]  # drop the opening fence line together with its language tag
    if inner and inner[-1].strip() == '```':
        inner = inner[:-1]
    return '\n'.join(inner)


def _remove_comments(code):
    """Strip `#` comments from Python `code` without touching string literals.

    The tokenizer is used instead of a regular expression so that a `#`
    inside a string (e.g. ``s.count('#')``) is left alone. Line structure is
    preserved; trailing whitespace left behind by a removed comment is
    trimmed. Code that cannot be tokenized is returned unchanged.
    """
    try:
        tokens = list(tokenize.generate_tokens(io.StringIO(code).readline))
    except (tokenize.TokenError, SyntaxError):
        # Malformed code: leave it as-is and let the test run judge it.
        return code
    lines = code.split('\n')
    for token in tokens:
        if token.type == tokenize.COMMENT:
            row, col = token.start  # row is 1-based, col is 0-based
            lines[row - 1] = lines[row - 1][:col].rstrip()
    return '\n'.join(lines)


def introduce_bug(problem):
    """Ask the LLM to inject a bug into a problem's canonical solution.

    Args:
        problem: mapping with at least the keys 'prompt' (function signature
            and docstring) and 'canonical_solution' (reference body).

    Returns:
        The parsed JSON reply as a dict. Its 'buggy_code' entry has any
        Markdown code fence and all comments removed and ends with a newline;
        'bug_explanation' is returned as the model wrote it.

    Raises:
        ValueError: if the returned code does not start with a four-space
            indent (it must be usable as a function body).
        json.JSONDecodeError / KeyError: if the reply is not the requested
            JSON object.
    """
    prompt = f"""
    Here's a Python function:

    {problem['prompt']}
    {problem['canonical_solution']}

    Introduce a bug in this function. It could be very simple and obvious or complex and subtle. Provide your response in JSON format with two fields:
    1. 'bug_explanation': A brief (maximum 2 sentences) description of the bug you introduced.
    2. 'buggy_code': The function with the bug introduced (code should start from just after the function signature and docstring). The first line should start with a four space indent. Do not include comments in the code.
    """
    response = litellm.completion(
        model=LLM_NAME,
        messages=[{'role': 'user', 'content': prompt}],
        # The structured-output schema is only sent for models whose name contains 'gpt'.
        response_format=BugResponse if 'gpt' in LLM_NAME.lower() else None,
        max_tokens=MAX_TOKENS,
        **GENERATION_KWARGS,
    )

    # Without a response_format the reply may itself be wrapped in a
    # ```json fence, so unwrap it before parsing.
    content = json.loads(_strip_code_fence(response.choices[0].message.content))

    # Unwrap a fenced code field *before* validating the indent; otherwise a
    # fenced answer can never pass the indent check.
    code = _strip_code_fence(content['buggy_code'])
    if not code.strip('\n').startswith('    '):
        raise ValueError('Code does not start with an indent')
    if not code.endswith('\n'):
        code += '\n'

    content['buggy_code'] = _remove_comments(code)
    return content


In [4]:
def create_problem_with_bug(problem, max_attempts=MAX_RETRIES):
    """Produce a copy of `problem` carrying a bug that its tests detect.

    Repeatedly asks `introduce_bug` for a buggy body and runs it through
    `check_correctness`; the first candidate that does NOT pass is accepted.

    Args:
        problem: HumanEval problem mapping, forwarded to `introduce_bug` and
            `check_correctness`.
        max_attempts: maximum number of generate-and-test rounds.

    Returns:
        A shallow copy of `problem` with two extra keys, 'incorrect_solution'
        (the buggy body) and 'bug_explanation', or None when no attempt
        yielded a failing solution.
    """
    for _ in range(max_attempts):
        try:
            bug_info = introduce_bug(problem)
            buggy_completion = bug_info['buggy_code']

            # A candidate only counts as buggy if the test-suite catches it.
            result = check_correctness(problem, buggy_completion, TEST_TIMEOUT)
            if result['passed']:
                continue

            problem_with_bug = problem.copy()
            problem_with_bug['incorrect_solution'] = buggy_completion
            problem_with_bug['bug_explanation'] = bug_info['bug_explanation']
            return problem_with_bug
        except Exception as e:
            # Best effort: a malformed LLM reply or API error just burns one attempt.
            print(f"Error in create_problem_with_bug: {e}")
            print(traceback.format_exc())
    # All attempts exhausted; callers treat a falsy result as failure.
    return None


def process_problem(item):
    """Worker entry point: turn one `(task_id, problem)` pair into `(task_id, result)`.

    The result is whatever `create_problem_with_bug` returns for the problem.
    """
    task_key, task_problem = item
    return task_key, create_problem_with_bug(task_problem)


def create_problems_with_bug_parallel(problems, max_workers=8):
    """Run `process_problem` over every problem using a thread pool.

    Args:
        problems: mapping of task_id -> problem.
        max_workers: size of the thread pool.

    Returns:
        Dict of task_id -> buggy problem for every task that succeeded,
        ordered like `problems` (not by completion time) so repeated runs
        produce the same key order. Tasks that failed or raised are reported
        and omitted.
    """
    collected = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_task = {
            executor.submit(process_problem, item): item[0]
            for item in problems.items()
        }
        for future in tqdm(concurrent.futures.as_completed(future_to_task), total=len(future_to_task)):
            task_id = future_to_task[future]
            try:
                _, result = future.result()
            except Exception as e:
                # tqdm.write keeps messages from breaking up the progress bar.
                tqdm.write(f"Error processing task {task_id}: {e}")
                continue
            if result:
                collected[task_id] = result
            else:
                tqdm.write(f"Failed to create buggy problem for task {task_id}")

    # Re-key in input order; completion order depends on thread scheduling.
    return {task_id: collected[task_id] for task_id in problems if task_id in collected}


In [5]:
# problems = read_problems()
# introduce_bug(problems[list(problems.keys())[0]])

In [6]:
problems = read_problems()
buggy_problems = create_problems_with_bug_parallel(problems)

# Print some statistics
print(f"Total problems: {len(problems)}")
print(f"Buggy problems created: {len(buggy_problems)}")

# Save the buggy problems to disk. The output directory is created first so
# that a missing `data/` folder cannot fail the write after the long
# (and paid) generation run above.
output_path = 'data/human_eval_with_bugs.json'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    json.dump(buggy_problems, f)

 12%|█▏        | 19/164 [00:12<01:04,  2.26it/s]

Failed to create buggy problem for task HumanEval/7


 19%|█▉        | 31/164 [00:21<02:43,  1.23s/it]

Failed to create buggy problem for task HumanEval/16


 20%|█▉        | 32/164 [00:22<02:43,  1.24s/it]

Failed to create buggy problem for task HumanEval/28


 22%|██▏       | 36/164 [00:25<01:36,  1.32it/s]

Failed to create buggy problem for task HumanEval/27


 23%|██▎       | 37/164 [00:25<01:21,  1.56it/s]

Failed to create buggy problem for task HumanEval/29


 26%|██▌       | 42/164 [00:26<00:49,  2.49it/s]

Failed to create buggy problem for task HumanEval/34


 28%|██▊       | 46/164 [00:29<00:56,  2.10it/s]

Failed to create buggy problem for task HumanEval/35


 34%|███▍      | 56/164 [00:39<02:53,  1.61s/it]

Failed to create buggy problem for task HumanEval/42


 36%|███▌      | 59/164 [00:41<01:44,  1.00it/s]

Failed to create buggy problem for task HumanEval/53


 41%|████      | 67/164 [00:46<00:57,  1.70it/s]

Failed to create buggy problem for task HumanEval/48


 44%|████▍     | 72/164 [00:48<00:37,  2.44it/s]

Failed to create buggy problem for task HumanEval/47


 51%|█████     | 83/164 [00:56<01:05,  1.24it/s]

Failed to create buggy problem for task HumanEval/59


 65%|██████▌   | 107/164 [01:09<00:29,  1.96it/s]

Failed to create buggy problem for task HumanEval/86


 87%|████████▋ | 142/164 [01:41<00:28,  1.30s/it]

Failed to create buggy problem for task HumanEval/135


 88%|████████▊ | 144/164 [01:45<00:30,  1.54s/it]

Failed to create buggy problem for task HumanEval/124


 88%|████████▊ | 145/164 [01:45<00:22,  1.16s/it]

Failed to create buggy problem for task HumanEval/127


 96%|█████████▌| 157/164 [01:58<00:09,  1.39s/it]

Failed to create buggy problem for task HumanEval/146


 98%|█████████▊| 160/164 [02:06<00:10,  2.56s/it]

Failed to create buggy problem for task HumanEval/159


 98%|█████████▊| 161/164 [02:07<00:06,  2.03s/it]

Failed to create buggy problem for task HumanEval/162


 99%|█████████▉| 162/164 [02:09<00:03,  1.99s/it]

Failed to create buggy problem for task HumanEval/143


 99%|█████████▉| 163/164 [02:20<00:04,  4.75s/it]

Failed to create buggy problem for task HumanEval/148


100%|██████████| 164/164 [02:21<00:00,  1.16it/s]

Failed to create buggy problem for task HumanEval/79
Total problems: 164
Buggy problems created: 142



