In [7]:
import concurrent.futures
import json
import traceback
from pathlib import Path

from human_eval.data import read_problems
from human_eval.execution import check_correctness
import litellm
from pydantic import BaseModel
from tqdm import tqdm

In [2]:
# Model that generates the buggy variants.
# Candidates: "gpt-4o-mini", "gpt-4o", "o1-preview", "o1-mini"
LLM_NAME = 'gpt-4o-mini'

# Generation limits.
MAX_TOKENS = 512   # forwarded as `max_tokens` on every completion request
MAX_RETRIES = 6    # attempts per problem before giving up on it

# Test execution.
TEST_TIMEOUT = 10  # seconds

In [3]:
# Structured-output schema for the LLM reply. `introduce_bug` passes this class
# as `response_format` when the model name contains 'gpt'.
# NOTE(review): documented with `#` comments rather than a class docstring so
# the schema derived from this class stays exactly as it is today — confirm
# before converting these notes into a docstring or reordering the fields.
class BugResponse(BaseModel):
    # Short (the prompt asks for at most 2 sentences) description of the bug.
    bug_explanation: str
    # Function body with the bug introduced; the prompt asks for code that
    # starts just after the signature and docstring, on a new, indented line.
    buggy_code: str
    
    
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if present.

    Accepts a bare fence or one tagged ``json``, with or without whitespace
    around it. Text that does not start with a fence is only whitespace-trimmed.
    """
    text = text.strip()
    if not text.startswith('```'):
        return text

    # Slice the fences off explicitly. `str.strip('```json')` would treat its
    # argument as a *set of characters*, not as a prefix/suffix.
    text = text[3:]
    if text.endswith('```'):
        text = text[:-3]

    # Drop the optional language tag that follows the opening fence.
    if text[:4].lower() == 'json':
        text = text[4:]

    return text.strip()


def introduce_bug(problem):
    """Ask the LLM to inject a subtle bug into a problem's canonical solution.

    Args:
        problem: Mapping with at least the keys 'prompt' (function signature
            and docstring) and 'canonical_solution' (reference body).

    Returns:
        The decoded JSON reply. The prompt requests the keys 'bug_explanation'
        and 'buggy_code', but their presence is not validated here.

    Raises:
        ValueError: If the reply carries no message content.
        json.JSONDecodeError: If the reply is not valid JSON (for example when
            it was cut off by the ``MAX_TOKENS`` limit).
    """
    prompt = f"""
    Here's a Python function:

    {problem['prompt']}
    {problem['canonical_solution']}

    Introduce a subtle bug in this function. Provide your response in JSON format with two fields:
    1. 'bug_explanation': A brief (maximum 2 sentences) description of the bug you introduced.
    2. 'buggy_code': The function with the bug introduced (code should start from just after the function signature and docstring). Code should start with a new line and indent.
    """
    response = litellm.completion(
        model=LLM_NAME,
        messages=[{
            'role': 'user',
            'content': prompt
        }],
        # Structured output is only requested for model names containing 'gpt'.
        response_format=BugResponse if 'gpt' in LLM_NAME.lower() else None,
        max_tokens=MAX_TOKENS,
    )

    content = response.choices[0].message.content
    if content is None:
        # Fail with a clear message instead of an AttributeError further down.
        raise ValueError('LLM response contained no message content')

    # Models without structured output often wrap the JSON in a code fence.
    return json.loads(_strip_code_fence(content))

In [4]:
def create_problem_with_bug(problem, max_attempts=None):
    """Request buggy variants of ``problem`` until one fails its tests.

    Each attempt asks ``introduce_bug`` for a buggy completion and runs it
    through ``check_correctness``. The first completion that does not pass is
    accepted. Errors raised during an attempt are printed and the attempt is
    counted as used.

    Args:
        problem: Problem dict, forwarded to ``introduce_bug`` and
            ``check_correctness``.
        max_attempts: Maximum number of attempts. ``None`` (the default) means
            "use the current value of ``MAX_RETRIES``"; it is resolved at call
            time so that editing the config cell takes effect without
            re-running this cell.

    Returns:
        A shallow copy of ``problem`` with 'incorrect_solution' and
        'bug_explanation' added, or ``None`` if no attempt produced a failing
        solution.
    """
    if max_attempts is None:
        max_attempts = MAX_RETRIES

    for attempt in range(1, max_attempts + 1):
        try:
            bug_info = introduce_bug(problem)
            buggy_completion = bug_info['buggy_code']
            bug_explanation = bug_info['bug_explanation']

            # The bug only counts if the buggy solution fails the tests.
            result = check_correctness(problem, buggy_completion, TEST_TIMEOUT)

            if not result['passed']:
                # Bug successfully introduced; leave the input dict untouched.
                problem_with_bug = problem.copy()
                problem_with_bug['incorrect_solution'] = buggy_completion
                problem_with_bug['bug_explanation'] = bug_explanation
                return problem_with_bug
        except Exception as e:
            # Best effort: report the failure and move on to the next attempt.
            print(f"Error in create_problem_with_bug (attempt {attempt}/{max_attempts}): {e}")
            print(traceback.format_exc())

    # Every attempt either passed the tests or raised.
    return None


def process_problem(item):
    """Worker entry point: build the buggy variant for one ``(task_id, problem)`` pair.

    Returns:
        ``(task_id, buggy_problem)``, where ``buggy_problem`` is whatever
        ``create_problem_with_bug`` returned for the problem.
    """
    key, spec = item
    return key, create_problem_with_bug(spec)


def create_problems_with_bug_parallel(problems, max_workers=8):
    """Build buggy variants for every problem using a pool of worker threads.

    Args:
        problems: Mapping of task id to problem.
        max_workers: Size of the thread pool.

    Returns:
        Dict of task id to buggy problem. Tasks whose worker returned a falsy
        result or raised are reported on stdout and left out.
    """
    collected = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which task each future belongs to so that failures can be
        # attributed even when the worker raised before returning its id.
        pending = {}
        for entry in problems.items():
            pending[pool.submit(process_problem, entry)] = entry[0]

        finished = concurrent.futures.as_completed(pending)
        for done in tqdm(finished, total=len(pending)):
            task_id = pending[done]
            try:
                task_id, outcome = done.result()
                if not outcome:
                    print(f"Failed to create buggy problem for task {task_id}")
                else:
                    collected[task_id] = outcome
            except Exception as e:
                print(f"Error processing task {task_id}: {e}")

    return collected

In [5]:
# problems = read_problems()
# introduce_bug(problems[list(problems.keys())[0]])

In [6]:
problems = read_problems()
buggy_problems = create_problems_with_bug_parallel(problems)

# Print some statistics
print(f"Total problems: {len(problems)}")
print(f"Buggy problems created: {len(buggy_problems)}")

# Save the results. Create the output directory first so that a long run is
# not lost to FileNotFoundError when `data/` does not exist yet.
output_path = Path('data/human_eval_with_bugs.json')
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open('w') as f:
    json.dump(buggy_problems, f)

Failed to create buggy problem for task HumanEval/24
Failed to create buggy problem for task HumanEval/35
Failed to create buggy problem for task HumanEval/44
Failed to create buggy problem for task HumanEval/111
Failed to create buggy problem for task HumanEval/99
Failed to create buggy problem for task HumanEval/137
Failed to create buggy problem for task HumanEval/161
Failed to create buggy problem for task HumanEval/156
Failed to create buggy problem for task HumanEval/163
Total problems: 164
Buggy problems created: 155
