In [1]:
import os
import re
import gc
import json
import torch
from vllm import LLM, SamplingParams
from human_eval.data import read_problems, write_jsonl

  from .autonotebook import tqdm as notebook_tqdm
2024-11-06 16:03:22,237	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


# Data processing and formatting

In [2]:
# Utility functions for Theo's HumanEval data
def separate_passing(data):
    """Split datapoints into those with a flagged completion and those without.

    Args:
        data: iterable of dicts, each with a "task_id" and a "completions"
            list. Every completion dict is read for "binary" and
            "original_completion"; the first completion of an unflagged
            datapoint is additionally read for "fault".

    Returns:
        A ``(passing, failing)`` tuple of lists.
        * ``passing`` holds ``{"task_id", "completion"}`` built from the first
          completion whose "binary" value is truthy (presumably the pass/fail
          flag of the test run -- confirm against the data producer).
        * ``failing`` holds ``{"task_id", "completion", "error"}`` built from
          the datapoint's first completion when no completion is flagged.

    Raises:
        IndexError: if a datapoint has no flagged completion and its
            "completions" list is empty.
    """
    passing, failing = [], []
    for datapoint in data:
        task_id = datapoint["task_id"]
        completions = datapoint["completions"]

        # First flagged completion, or None when every completion is unflagged.
        winner = next((c for c in completions if c["binary"]), None)

        if winner is not None:
            passing.append(
                {"task_id": task_id, "completion": winner["original_completion"]}
            )
        else:
            representative = completions[0]
            failing.append(
                {
                    "task_id": task_id,
                    "completion": representative["original_completion"],
                    "error": representative["fault"],
                }
            )
    return passing, failing

In [3]:
# JSONL file of HumanEval results: one JSON document per line.
data_path = "/scratch/gting/data/humaneval-data/codellama-humaneval-results-tokens.jsonl"

# Decode every line of the file into a Python object, preserving file order.
with open(data_path, "r") as f:
    data = list(map(json.loads, f))

In [4]:
passing, failing = separate_passing(data)

In [5]:
len(passing)

130

In [6]:
len(failing)

34

In [7]:
def python_format(code):
    """Wrap ``code`` between a ``[PYTHON]`` line and a ``[/PYTHON]`` line.

    Args:
        code: source text to enclose (must be a ``str``).

    Returns:
        The three parts joined by single newlines.
    """
    opening_tag, closing_tag = "[PYTHON]", "[/PYTHON]"
    return "\n".join((opening_tag, code, closing_tag))

In [64]:
# Turning into prompts
# HumanEval problems; indexed below as problems[task_id]["prompt"].
problems = read_problems()

# Small delimiter demo that is interpolated at the end of the system prompt.
format_example = python_format('def say_hello():\n    print("Hello World")')

# Earlier wording of the system prompt, kept for reference:
# system_prompt = 'You are a helpful programming assistant and an expert Python programmer. You are helping a user write a program to solve a problem. The user has written some code, but it has some errors and is not passing the tests. You are provided with the problem statement, the incorrect code, and the error thrown by the failing test. You must generate a fixed version of the program by analyzing why the provided code could be wrong; think step by step, reason thoroughly, and consider edge cases. Express your intermediate reasoning in natural language in an "ANALYSIS" section, and put your final answer in a "FIXED CODE" section, formatted appropriately and placed within code delimiters (an example is provided below). Avoid writing code that is equivalent to the given code, as it is known to be incorrect. Be clear and concise, and do not go off task.'

# Current system prompt: task description, the two required response sections,
# and the [PYTHON] delimiter demo.
# NOTE(review): this text names the sections "ANALYSIS:" / "FIXED CODE:", while
# extract_python_code_in_section searches for a "### FIXED CODE" header.
# Presumably system_example.txt demonstrates the "###" form -- confirm.
system_prompt = f"You are a skilled programming assistant and expert Python programmer. Your task is to help a user debug and repair their code to pass all test cases. The user's code is incorrect, and they have provided the problem statement, the buggy code, and the error message. Your task is to reason step-by-step to locate and understand the issues and then provide a corrected solution. Format your response using the following sections:\n\nANALYSIS: Carefully analyze the buggy code to identify the root cause of the failure. Describe the specific error and suggest any secondary issues or edge cases that might also cause problems.\nFIXED CODE: Write a corrected version of the function within code delimiters. Ensure the solution addresses all identified issues and edge cases.\n\nPython code is delimited using the following format:\n{format_example}"

# Worked example, read verbatim from a file in the current working directory.
with open("system_example.txt", "r") as f:
    example_sol = f.read()

# The worked example prefixed with a one-line introduction.
system_example = (
    f"Here is an example of how you might solve the problem:\n{example_sol}"
)

# Sentence placed between the worked example and the actual task.
final_problem_intro = "Below is the specific debugging task you must solve. Good luck!"
# Propose multiple drafts of a possible solution, and after each draft, analyze whether it works by stepping through the provided test cases.


def format_incorrect_code(code):
    """Re-wrap the first ``[PYTHON]...[/PYTHON]`` block found in ``code``.

    The input is stripped, the first delimited block is located, whitespace
    directly inside the delimiters is dropped, and the inner text is passed
    to ``python_format``.

    Args:
        code: text expected to contain a ``[PYTHON]``/``[/PYTHON]`` pair.

    Returns:
        The re-delimited block, or ``None`` when no delimited block is found.
    """
    delimited_block = re.compile(r"\[PYTHON\]\s*([\s\S]*?)\s*\[/PYTHON\]", re.DOTALL)
    found = delimited_block.search(code.strip())
    return None if found is None else python_format(found.group(1))


def make_user_prompt(task_id, failing_code, error):
    """Build the task-specific part of the repair prompt.

    The result has three headed sections joined by newlines: the problem
    statement (``problems[task_id]["prompt"]``, stripped and delimited), the
    incorrect code, and the stripped error text.

    Args:
        task_id: key into the module-level ``problems`` mapping.
        failing_code: completion text; normally contains a
            ``[PYTHON]...[/PYTHON]`` block.
        error: error text reported for the completion.

    Returns:
        The assembled prompt section as a single string.
    """
    problem_statement = python_format(problems[task_id]["prompt"].strip())

    incorrect_code = format_incorrect_code(failing_code)
    if incorrect_code is None:
        # format_incorrect_code returns None when the completion carries no
        # [PYTHON]...[/PYTHON] block. Passing that None on to str.join raised
        # an opaque TypeError, so fall back to delimiting the raw completion.
        incorrect_code = python_format(failing_code.strip())

    return "\n".join(
        [
            "### PROBLEM STATEMENT",
            problem_statement,
            "### INCORRECT CODE",
            incorrect_code,
            "### ERROR",
            error.strip(),
        ]
    )


def make_prompt(task):
    """Assemble the complete repair prompt for one failing task.

    Args:
        task: mapping with "task_id", "completion" and "error" entries.

    Returns:
        The system prompt, the worked example, the task introduction and the
        task-specific user prompt, joined by single newlines in that order.
    """
    user_prompt = make_user_prompt(task["task_id"], task["completion"], task["error"])
    sections = (system_prompt, system_example, final_problem_intro, user_prompt)
    return "\n".join(sections)

In [65]:
repair_prompts = {task["task_id"]: make_prompt(task) for task in failing}

In [66]:
print(repair_prompts["HumanEval/101"])

You are a skilled programming assistant and expert Python programmer. Your task is to help a user debug and repair their code to pass all test cases. The user's code is incorrect, and they have provided the problem statement, the buggy code, and the error message. Your task is to reason step-by-step to locate and understand the issues and then provide a corrected solution. Format your response using the following sections:

ANALYSIS: Carefully analyze the buggy code to identify the root cause of the failure. Describe the specific error and suggest any secondary issues or edge cases that might also cause problems.
FIXED CODE: Write a corrected version of the function within code delimiters. Ensure the solution addresses all identified issues and edge cases.

Python code is delimited using the following format:
[PYTHON]
def say_hello():
    print("Hello World")
[/PYTHON]
Here is an example of how you might solve the problem:
### PROBLEM STATEMENT
[PYTHON]
def sum_list(lst):
    """Return

one-shot, example incorrect
ask to complete the thing and give the initial code snippet
more consistent formatting w/ python code delimiters

# Model loading and sample generation

In [34]:
# Change based on GPU availability
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # Only GPU 1 will be available

In [35]:
# Path to model
# model_path = "/scratch/gting/huggingface/hub/models--codellama--CodeLlama-13b-Instruct-hf/snapshots/745795438019e47e4dad1347a0093e11deee4c68"
model_path = "/scratch/gting/huggingface/hub/models--codellama--CodeLlama-13b-hf/snapshots/8da65ff4ee20f74ecd107ca9d54f9f121b279860"

In [36]:
model = LLM(model_path, tensor_parallel_size=1)

INFO 11-06 16:09:51 llm_engine.py:226] Initializing an LLM engine (v0.6.1.dev238+ge2c6e0a82) with config: model='/scratch/gting/huggingface/hub/models--codellama--CodeLlama-13b-hf/snapshots/8da65ff4ee20f74ecd107ca9d54f9f121b279860', speculative_config=None, tokenizer='/scratch/gting/huggingface/hub/models--codellama--CodeLlama-13b-hf/snapshots/8da65ff4ee20f74ecd107ca9d54f9f121b279860', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_

INFO 11-06 16:09:53 model_runner.py:1014] Starting to load model /scratch/gting/huggingface/hub/models--codellama--CodeLlama-13b-hf/snapshots/8da65ff4ee20f74ecd107ca9d54f9f121b279860...


Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  33% Completed | 1/3 [00:02<00:04,  2.04s/it]
Loading safetensors checkpoint shards:  67% Completed | 2/3 [00:04<00:02,  2.53s/it]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:06<00:00,  2.25s/it]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:06<00:00,  2.28s/it]



INFO 11-06 16:10:00 model_runner.py:1025] Loading model weights took 24.2898 GB
INFO 11-06 16:10:04 gpu_executor.py:122] # GPU blocks: 1345, # CPU blocks: 327
INFO 11-06 16:10:07 model_runner.py:1329] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-06 16:10:07 model_runner.py:1333] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 11-06 16:10:26 model_runner.py:1456] Graph capturing finished in 18 secs.


In [207]:
# del model
# Run the garbage collector first: tensors that are only reachable through
# reference cycles stay allocated until they are collected, and
# empty_cache() can only release blocks that are no longer occupied.
gc.collect()
torch.cuda.empty_cache()

0

In [37]:
sampling_params = SamplingParams(min_tokens=100, max_tokens=5000)

In [67]:
# Pair every task id with the generation produced for its prompt. Keys and
# values of the same dict iterate in the same order, so the pairing lines up
# as long as generate() returns results in input order.
outputs = [
    (prompt_task_id, generation)
    for prompt_task_id, generation in zip(
        repair_prompts.keys(),
        model.generate(repair_prompts.values(), sampling_params),
    )
]

Processed prompts:  50%|█████     | 17/34 [01:09<01:24,  4.95s/it, est. speed input: 222.84 toks/s, output: 115.09 toks/s]



Processed prompts: 100%|██████████| 34/34 [06:44<00:00, 11.88s/it, est. speed input: 74.19 toks/s, output: 125.08 toks/s] 


In [68]:
# Save outputs
# One text file per task under gen_results/, holding the task id, the original
# HumanEval prompt and the first generated sample.
# Create the directory up front so a fresh checkout does not fail on open().
os.makedirs("gen_results", exist_ok=True)

for task_id, gen_output in outputs:
    file_contents = "\n".join(
        [
            f"TASK NUMBER: {task_id}",
            "PROMPT:",
            "-" * 20,
            problems[task_id]["prompt"],
            "OUTPUT:",
            "-" * 20,
            gen_output.outputs[0].text,
        ]
    )
    # Task ids look like "HumanEval/125"; "/" cannot appear in a file name.
    new_task_id = task_id.replace("/", "-")
    # Explicit UTF-8: prompts and generations may contain non-ASCII characters
    # that a locale-dependent default encoding could fail to write.
    with open(f"gen_results/{new_task_id}.txt", "w", encoding="utf-8") as f:
        f.write(file_contents)

In [69]:
def extract_python_code(text):
    """Return the first ``def ...`` snippet enclosed in ``[PYTHON]`` delimiters.

    Only blocks whose content starts with ``def `` (after optional whitespace)
    are considered; whitespace next to the delimiters is not part of the
    result.

    Args:
        text: text to search.

    Returns:
        The snippet as a string, or ``None`` when nothing matches.
    """
    found = re.search(
        r"\[PYTHON\]\s*(def .*?)\s*\[/PYTHON\]",
        text,
        flags=re.DOTALL,  # let "." span the newlines inside the function body
    )
    return found.group(1) if found else None


def extract_python_code_in_section(text):
    """Return the ``def ...`` snippet that directly follows a FIXED CODE header.

    The header must be the exact text ``### FIXED CODE`` followed by a single
    newline and then the ``[PYTHON]`` delimiter; the block content must start
    with ``def `` (after optional whitespace).

    Args:
        text: text to search.

    Returns:
        The snippet as a string, or ``None`` when nothing matches.
    """
    section_block = re.compile(
        r"### FIXED CODE\n\[PYTHON\]\s*(def .*?)\s*\[/PYTHON\]",
        re.DOTALL,  # let "." span the newlines inside the function body
    )
    found = section_block.search(text)
    if found:
        return found.group(1)
    return None

In [70]:
task_id, gen_output = outputs[5]
print(task_id)
print(problems[task_id]["prompt"])
print(gen_output.outputs[0].text)

HumanEval/125

def split_words(txt):
    '''
    Given a string of words, return a list of words split on whitespace, if no whitespaces exists in the text you
    should split on commas ',' if no commas exists you should return the number of lower-case letters with odd order in the
    alphabet, ord('a') = 0, ord('b') = 1, ... ord('z') = 25
    Examples
    split_words("Hello world!") ➞ ["Hello", "world!"]
    split_words("Hello,world!") ➞ ["Hello", "world!"]
    split_words("abcdef") == 3 
    '''


Your test is failing, fix the code to pass all tests. We have hidden the test using kwargs this can be seen within the "server" > "solution" tab under submit answer, and it should look like the following picture.

### ANALYSIS
The error is caused because the test expects an empty string ("") to return an empty string of words, with an empty word array. The given code will evaluate whether the string is empty (it checks to see if it is preceded by any alphabetical characters) and return an 

In [71]:
# Repaired tasks: the code found under "### FIXED CODE" in the first generated
# sample, or an empty completion when the extractor finds nothing.
repaired_code_for_eval = []
for task_id, gen_output in outputs:
    code = extract_python_code_in_section(gen_output.outputs[0].text)
    repaired_code_for_eval.append(
        {"task_id": task_id, "completion": "" if code is None else code}
    )

# Tasks that already passed are written with an EMPTY completion.
# NOTE(review): presumably this gives every task an entry in the file while
# only the repairs can count as passes -- confirm. The previous per-task call
# to extract_python_code was computed but never used, so it has been removed.
# To score the original passing code instead, use
# extract_python_code(task["completion"]) as the completion.
passing_code_for_eval = [
    {"task_id": task["task_id"], "completion": ""} for task in passing
]

write_jsonl("codellama_repairs.jsonl", repaired_code_for_eval + passing_code_for_eval)

In `/scratch/gting/eval/human-eval/human_eval`, run the following command:

`python3 -m evaluate_functional_correctness /afs/csail.mit.edu/u/g/gting/code-repair/codellama_repairs.jsonl`

In [85]:
baseline_result = len(passing) / len(problems)

In [86]:
baseline_result

0.7926829268292683

In [18]:
passing

[{'task_id': 'HumanEval/0',
  'completion': '  [PYTHON]\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    for i in range(len(numbers)):\n        for j in range(i+1, len(numbers)):\n            if abs(numbers[i] - numbers[j]) <= threshold:\n                return True\n    return False\n[/PYTHON]'},
 {'task_id': 'HumanEval/103',
  'completion': '  [PYTHON]\ndef rounded_avg(n, m):\n    if n > m:\n        return -1\n    else:\n        sum = 0\n        for i in range(n, m+1):\n            sum += i\n        avg = sum / (m - n + 1)\n        return bin(round(avg))\n[/PYTHON]'},
 {'task_id': 'HumanEval/102',
  'completion': '  [PYTHON]\n\ndef choose_num(x, y):\n    even_numbers = [i for i in range(x, y+1) if i % 2 == 0]\n    if even_numbers:\n        return max(even_numbers)\n    else:\n        return -1\n\n[/PYTHON]\n\nThis function first creates a list of all even numbers in the range [x, y] inclusive using list comprehension. If the list is not empty, it returns 

In [None]:
130 / 164

0.7926829268292683

In [None]:
# python3 -m evaluate_functional_correctness /afs/csail.mit.edu/u/g/gting/code-repair/codellama_repairs_proper.jsonl

In [None]:
print(problems["HumanEval/115"]["prompt"])


def max_fill(grid, capacity):
    import math
    """
    You are given a rectangular grid of wells. Each row represents a single well,
    and each 1 in a row represents a single unit of water.
    Each well has a corresponding bucket that can be used to extract water from it, 
    and all buckets have the same capacity.
    Your task is to use the buckets to empty the wells.
    Output the number of times you need to lower the buckets.

    Example 1:
        Input: 
            grid : [[0,0,1,0], [0,1,0,0], [1,1,1,1]]
            bucket_capacity : 1
        Output: 6

    Example 2:
        Input: 
            grid : [[0,0,1,1], [0,0,0,0], [1,1,1,1], [0,1,1,1]]
            bucket_capacity : 2
        Output: 5
    
    Example 3:
        Input: 
            grid : [[0,0,0], [0,0,0]]
            bucket_capacity : 5
        Output: 0

    Constraints:
        * all wells have the same length
        * 1 <= grid.length <= 10^2
        * 1 <= grid[:,1].length <= 10^2
        * grid[i][