From bfe275266434d2015b3c8effabe5ff9cb4257a72 Mon Sep 17 00:00:00 2001 From: jemeza-codegen Date: Thu, 13 Feb 2025 16:58:48 -0800 Subject: [PATCH 1/2] feat: extension that clones and parses swe bench codebases --- pyproject.toml | 1 + src/codegen/extensions/swe_bench/__init__.py | 0 .../extensions/swe_bench/swe_bench_wrapper.py | 80 +++++++++++++++++++ src/codegen/extensions/swe_bench/utils.py | 42 ++++++++++ 4 files changed, 123 insertions(+) create mode 100644 src/codegen/extensions/swe_bench/__init__.py create mode 100644 src/codegen/extensions/swe_bench/swe_bench_wrapper.py create mode 100644 src/codegen/extensions/swe_bench/utils.py diff --git a/pyproject.toml b/pyproject.toml index f820b7ae9..1cc11bf23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,7 @@ dependencies = [ "numpy>=2.2.2", "mcp[cli]", "neo4j", + "datasets", ] license = { text = "Apache-2.0" } diff --git a/src/codegen/extensions/swe_bench/__init__.py b/src/codegen/extensions/swe_bench/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/codegen/extensions/swe_bench/swe_bench_wrapper.py b/src/codegen/extensions/swe_bench/swe_bench_wrapper.py new file mode 100644 index 000000000..3bde3edfd --- /dev/null +++ b/src/codegen/extensions/swe_bench/swe_bench_wrapper.py @@ -0,0 +1,80 @@ +import shutil +from collections.abc import Generator +from typing import Any + +from datasets import load_dataset + +from codegen.extensions.swe_bench.utils import NO_ENV_SETUP, SWEBenchEntry, SWEBenchEnvSetup, SWEBenchSplit, construct_codebase +from codegen.sdk.core.codebase import Codebase + + +class SWEBenchWrapper: + def __init__(self, remove_after_run: bool = False): + print("Loading SWE-bench dataset...") + self.ds = load_dataset("princeton-nlp/SWE-bench") + print("SWE-bench dataset loaded.") + self.remove_after_run = remove_after_run + self.repo_groups = self.create_repo_groups() + + def create_repo_groups(self) -> dict: + # Create a list of all possible splits + SPLITS: list[SWEBenchSplit] = ["train", "dev", "test"] + + # Create a nested dictionary with explicit type hints + repo_groups: dict[SWEBenchSplit, dict[str, dict[str, list[Any]]]] = {} + + # Group entries from all splits + for split in SPLITS: + repo_groups[split] = {} + for entry in self.ds[split]: + repo = entry["repo"] + environment_setup_commit = entry["environment_setup_commit"] + + # Initialize nested dictionaries if they don't exist + if repo not in repo_groups[split]: + repo_groups[split][repo] = {} + if environment_setup_commit not in repo_groups[split][repo]: + repo_groups[split][repo][environment_setup_commit] = [] + + repo_groups[split][repo][environment_setup_commit].append(entry) + + return repo_groups + + def get_entries_for_split(self, split: SWEBenchSplit) -> Generator[tuple[SWEBenchEnvSetup | SWEBenchEntry, Codebase], None, None]: + # ===== [ For each repo in the split ] ===== + for repo in self.repo_groups[split]: + # construct the codebase for the repo + codebase = construct_codebase(repo_full_name=repo) + # ===== [ For each environment setup commit ] ===== + for environment_setup_commit in self.repo_groups[split][repo]: + # yield the environment setup commit + if environment_setup_commit: + # no need to parse the codebase on the environment commit + codebase.checkout(commit=environment_setup_commit, remote=True) + yield SWEBenchEnvSetup(split=split, environment_setup_commit=environment_setup_commit), codebase + else: + yield SWEBenchEnvSetup(split=split, environment_setup_commit=NO_ENV_SETUP), codebase + # ===== [ For each 
test setup commit ] ===== + for entry in self.repo_groups[split][repo][environment_setup_commit]: + codebase.checkout(commit=entry["base_commit"], remote=True) + # yield the test entry with a parsed codebase object + yield SWEBenchEntry(entry=entry, split=split), codebase + + if codebase and self.remove_after_run: + # remove the repo from the tmp_dir + shutil.rmtree(f"/tmp/codegen/{repo}") + + +if __name__ == "__main__": + swe_bench_wrapper = SWEBenchWrapper() + for entry, codebase in swe_bench_wrapper.get_entries_for_split("train"): + if isinstance(entry, SWEBenchEnvSetup): + print(f"Environment setup commit: {entry.environment_setup_commit}") + # install dependencies... + elif isinstance(entry, SWEBenchEntry): + print(f"Entry: {entry.entry['instance_id']}") + problem_statement = entry.entry["problem_statement"] + print(f"Task: {problem_statement[:20]}") + # send of agent to solve tasks.... + + print(f"Number of files: {len(codebase.files)}") diff --git a/src/codegen/extensions/swe_bench/utils.py b/src/codegen/extensions/swe_bench/utils.py new file mode 100644 index 000000000..007ae2603 --- /dev/null +++ b/src/codegen/extensions/swe_bench/utils.py @@ -0,0 +1,42 @@ +from typing import Literal + +from pydantic import BaseModel + +from codegen.git.repo_operator.remote_repo_operator import RemoteRepoOperator +from codegen.git.schemas.repo_config import RepoConfig +from codegen.sdk.codebase.config import ProjectConfig +from codegen.sdk.core.codebase import Codebase, PyCodebaseType + +# Define the SWEBenchSplit type using Literal +SWEBenchSplit = Literal["train", "dev", "test"] +NO_ENV_SETUP = "NO_ENV_SETUP" + + +class SWEBenchEnvSetup(BaseModel): + split: SWEBenchSplit + environment_setup_commit: str = NO_ENV_SETUP + + +class SWEBenchEntry(BaseModel): + split: SWEBenchSplit + entry: dict + + +def construct_codebase(repo_full_name: str) -> PyCodebaseType: + repo_name = repo_full_name.split("/")[-1] + repo_config = RepoConfig(name=repo_name, full_name=repo_full_name, base_dir="/tmp/codegen") + + # clone or pull the repo + print(f"Cloning or pulling {repo_full_name}...") + remote_operator = RemoteRepoOperator(repo_config=repo_config, bot_commit=False) + print(f"Cloned or pulled {repo_full_name}.") + + # create the project config + projects = [ProjectConfig(repo_operator=remote_operator, base_path=None, subdirectories=None)] + + # parse the codebase + print("Parsing codebase...") + codebase = Codebase(projects=projects) + print("Codebase parsed.") + + return codebase From 18cea4cffbd0a5f1e834b2dfa899a5e8e66c0a2f Mon Sep 17 00:00:00 2001 From: Victor Cheng Date: Sun, 16 Feb 2025 08:45:12 -0800 Subject: [PATCH 2/2] treehacks --- src/codegen/extensions/swebench/README.md | 29 ++ src/codegen/extensions/swebench/agent.py | 129 ++++++ src/codegen/extensions/swebench/harness.py | 383 ++++++++++++++++++ src/codegen/extensions/swebench/report.py | 280 +++++++++++++ .../swebench/results/all_preds.jsonl | 3 + .../swebench/results/pallets__flask-4992.json | 29 ++ .../swebench/results/psf__requests-1963.json | 27 ++ .../results/pytest-dev__pytest-7168.json | 27 ++ src/codegen/extensions/swebench/tests.py | 237 +++++++++++ src/codegen/extensions/swebench/utils.py | 186 +++++++++ 10 files changed, 1330 insertions(+) create mode 100644 src/codegen/extensions/swebench/README.md create mode 100644 src/codegen/extensions/swebench/agent.py create mode 100644 src/codegen/extensions/swebench/harness.py create mode 100755 src/codegen/extensions/swebench/report.py create mode 100644 
src/codegen/extensions/swebench/results/all_preds.jsonl
 create mode 100644 src/codegen/extensions/swebench/results/pallets__flask-4992.json
 create mode 100644 src/codegen/extensions/swebench/results/psf__requests-1963.json
 create mode 100644 src/codegen/extensions/swebench/results/pytest-dev__pytest-7168.json
 create mode 100755 src/codegen/extensions/swebench/tests.py
 create mode 100644 src/codegen/extensions/swebench/utils.py

diff --git a/src/codegen/extensions/swebench/README.md b/src/codegen/extensions/swebench/README.md
new file mode 100644
index 000000000..8ba547cf0
--- /dev/null
+++ b/src/codegen/extensions/swebench/README.md
@@ -0,0 +1,29 @@
+## Codegen Harness and Evaluator for SWE Bench
+
+This folder contains a harness and evaluator for the SWE Bench leaderboard, enabling developers to test and evaluate their codegen models on SWE Bench tasks.
+
+It integrates directly into the Codegen agentic framework and is designed to be built on top of.
+
+### Setup
+
+Remember to install all the project dependencies for this environment first.
+
+### Usage
+
+#### Edit agent.py, your codegen agent
+
+This file contains the main logic for the agent.
+
+The agent taps into Codegen's tree-sitter-backed codebase tools. You can modify it by adding additional tools, extending its capabilities, adjusting its prompts, and more.
+
+It is invoked by the harness script.
+
+#### Run harness.py to run the agent
+
+This script gathers the selected dataset, runs the agent on each instance, and saves the results.
+
+#### Run report.py to generate a report
+
+This script generates a report from the results: it loops through all the results and evaluates each one. Currently, there is an error in the docker image.
+
+There are currently example predictions in the `predictions/results` folder.
\ No newline at end of file
diff --git a/src/codegen/extensions/swebench/agent.py b/src/codegen/extensions/swebench/agent.py
new file mode 100644
index 000000000..6f5d5784a
--- /dev/null
+++ b/src/codegen/extensions/swebench/agent.py
@@ -0,0 +1,129 @@
+from langchain_openai import ChatOpenAI
+from codegen import Codebase
+
+"""Demo implementation of an agent with Codegen tools."""
+
+from langchain.agents import AgentExecutor
+from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent
+from langchain.hub import pull
+from langchain.tools import BaseTool
+from langchain_core.chat_history import InMemoryChatMessageHistory
+from langchain_core.messages import BaseMessage
+from langchain_core.runnables.history import RunnableWithMessageHistory
+from langchain_openai import ChatOpenAI
+
+from codegen import Codebase
+
+from codegen.extensions.langchain.tools import (
+    CommitTool,
+    CreateFileTool,
+    DeleteFileTool,
+    EditFileTool,
+    GithubCreatePRCommentTool,
+    GithubCreatePRReviewCommentTool,
+    GithubCreatePRTool,
+    GithubViewPRTool,
+    ListDirectoryTool,
+    MoveSymbolTool,
+    RenameFileTool,
+    RevealSymbolTool,
+    SearchTool,
+    SemanticEditTool,
+    SemanticSearchTool,
+    ViewFileTool,
+)
+
+
+def create_codebase_agent(
+    codebase: Codebase,
+    model_name: str = "gpt-4o",
+    temperature: float = 0,
+    verbose: bool = True,
+    chat_history: list[BaseMessage] = [],
+) -> RunnableWithMessageHistory:
+    """Create an agent with all codebase tools.
+ + Args: + codebase: The codebase to operate on + model_name: Name of the model to use (default: gpt-4) + temperature: Model temperature (default: 0) + verbose: Whether to print agent's thought process (default: True) + + Returns: + Initialized agent with message history + """ + # Initialize language model + llm = ChatOpenAI( + model_name=model_name, + temperature=temperature, + ) + + # Get all codebase tools + tools = [ + ViewFileTool(codebase), + ListDirectoryTool(codebase), + SearchTool(codebase), + EditFileTool(codebase), + CreateFileTool(codebase), + DeleteFileTool(codebase), + RenameFileTool(codebase), + MoveSymbolTool(codebase), + # RevealSymbolTool(codebase), + SemanticEditTool(codebase), + SemanticSearchTool(codebase), + CommitTool(codebase), + GithubCreatePRTool(codebase), + GithubViewPRTool(codebase), + GithubCreatePRCommentTool(codebase), + GithubCreatePRReviewCommentTool(codebase), + ] + + # Get the prompt to use + prompt = pull("hwchase17/openai-functions-agent") + + # Create the agent + agent = OpenAIFunctionsAgent( + llm=llm, + tools=tools, + prompt=prompt, + ) + + # Create the agent executor + agent_executor = AgentExecutor( + agent=agent, + tools=tools, + verbose=verbose, + ) + + # Create message history handler + message_history = InMemoryChatMessageHistory(messages=chat_history) + + # Wrap with message history + return RunnableWithMessageHistory( + agent_executor, + lambda session_id: message_history, + input_messages_key="input", + history_messages_key="chat_history", + ) + + +# Initialize codebase +codebase = Codebase.from_repo("fastapi/fastapi") + +# Create the agent with GPT-4 +agent = create_codebase_agent( + codebase=codebase, + model_name="gpt-4o", + temperature=0, + verbose=True +) + + + +# Analyze dependencies +result = agent.invoke( + {"input": "What are the dependencies of the FastAPI class?"}, + config={"configurable": {"session_id": "demo"}} +) +print(result["output"]) + diff --git a/src/codegen/extensions/swebench/harness.py b/src/codegen/extensions/swebench/harness.py new file mode 100644 index 000000000..90accfac1 --- /dev/null +++ b/src/codegen/extensions/swebench/harness.py @@ -0,0 +1,383 @@ +""" +This is the harness for running an AI agent on the SWE Bench dataset. + +""" + +#!/usr/bin/env python + +import json +import random +import subprocess +import sys +import tempfile +from pathlib import Path +import datetime +import pprint + +import lox + +# Replace the dump import with pprint +# from dump import dump +# from tests import run_tests +from utils import get_full_dataset # noqa: F401 +from utils import get_lite_dataset # noqa: F401 +from utils import get_plausible, load_predictions, pick_winner + + +# coding agent +from agent import create_codebase_agent +from codegen import Codebase + + +REPOS_DNAME = Path("repos") +CHAT_LOGS_DNAME = Path("chat-logs") +PREDS_DNAME = Path("predictions") + + +def diff_versus_commit(git_dname, commit): + """ + Take a diff of `git_dname` current contents versus the `commit`. + """ + + diff_cmd = f"git -C {git_dname} diff {commit}" + diff_output = subprocess.check_output(diff_cmd.split()).decode() + return diff_output + + +def files_in_patch(patch): + """ + Extract the list of modified files from a unified diff patch string. 
+ """ + files = [] + for line in patch.split("\n"): + if line.startswith("--- a/") or line.startswith("+++ b/"): + fname = line.split("/", 1)[1] + if fname not in files: + files.append(fname) + return files + + +def checkout_repo(git_tempdir, entry): + """ + Clone the SWE Bench entry's git `repo` into `dname` at the `base_commit`. + Make a tempdir if no `dname` provided. + """ + github_url = "https://github.com/" + repo_url = github_url + entry["repo"] + commit = entry["base_commit"] + + print(repo_url, commit) + + checkout_repo_url_commit(git_tempdir, repo_url, commit) + + +def checkout_repo_url_commit(repo_dname, url, commit): + """ + Clone the git `url` into `dname` at `commit`. + Check a local cache of the bare repo to avoid pulling from github every time. + """ + + # Extract repo name from URL + repo_name = url.split("/")[-1].split(".")[0] + repo_name += ".git" + + # dump(repo_name) + pprint.pprint(repo_name) + REPOS_DNAME.mkdir(exist_ok=True) + bare_repo = REPOS_DNAME / repo_name + + if not bare_repo.exists(): + cmd = f"git clone --bare {url} {bare_repo}" + subprocess.run(cmd.split(), check=True) + + cmd = f"git clone {bare_repo} {repo_dname}" + subprocess.run(cmd.split(), check=True) + + cmd = f"git -c advice.detachedHead=false -C {repo_dname} checkout {commit}" + subprocess.run(cmd.split(), check=True) + + +def show_problems(dataset): + """ + Print out all the instance_id and problem_descriptions. + """ + for inst, entry in dataset.items(): + problem = entry["problem_statement"].splitlines()[0] + print(f"{inst}: {problem}") + + +def run_pre_existing_tests(entry, git_dname): + """Given the current contents of the `git_dname`, run the tests that + were present in the entry's `repo` at the time of the + `base_commit` or which have been added into the repo since. This + checks if the code in the `git_dname` has broken pre-existing + tests or is failing any newly added tests. + + It does NOT attempt to run the tests in the `test_patch` which + are used to evaluate whether the `model_patch` has resolved the + `problem_statement`. + + Returns None if all the tests passed. Returns the text of the + test run output if any failed. + """ + + model_patch = diff_versus_commit(git_dname, entry["base_commit"]) + # passed, output = run_tests( + # entry, + # model_patch=model_patch, + # use_test_patch=False, + # ) + # We were UNABLE to run tests + # if passed is None: + # return + + # if passed: + # return + + # Just keep the output after the (no-op) test patch applied, + # which is the actual output from the tests that were run. + # output = output.split(">>>>> Applied Patch (test)")[-1] + + # return output + + + + +def process_one_instance(entry, out_dname): + """Process one `entry` from SWE Bench using the LLM `models` at the + given `temperature`. Set `model_name_or_path` in the result json. + Store the result json and the chat log into `out_dname`. + """ + + instance_id = entry["instance_id"] + base_commit = entry["base_commit"] + + print("=" * 60) + pprint.pprint(instance_id) + print("=" * 60) + problem_statement = entry["problem_statement"] + print(problem_statement) + + ### + # DO NOT assist aider by telling it which files need to be modified! 
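+    # oracle_files would expose the gold files taken from the reference patch; with
+    # oracle=False the gold file list is only used for stats/logging further below and
+    # is never shown to the agent.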
+ oracle = False + gold_files = files_in_patch(entry["patch"]) + if oracle: + oracle_files = gold_files + else: + oracle_files = None + ### + + + results = [] + cost = 0 + winner = None + + num_tries = 1 + # Do NUM_TRIES tries for each of the models, until we find a *plausible* solution + for attempt in range(1, num_tries + 1): + codebase = Codebase.from_repo( + repo_full_name=entry["repo"], + commit=entry["base_commit"], + language="python" + ) # check out the repo + + agent = create_codebase_agent( + codebase=codebase, + model_name="gpt-4o", + temperature=0, + verbose=True + ) + + # for usage for testing for the model + # test_cmd = lambda: run_pre_existing_tests(entry, codebase.repo_path) # noqa: E731 + + pprint.pprint(instance_id) + pprint.pprint(gold_files) + + message = """Below is a real GitHub issue from a popular GitHub repository. +The issue was filed some time ago. +The repo has been checked out at the commit that existed at the moment the issue was filed. +If you are already familiar with this repo, be cautious! +You are working with an old version of the repo! +Filenames, directory names, file contents, etc may be different than what you're used to. + +Propose changes to update the repo to fix the problem below. + +""" + message += problem_statement + + try: + result = agent.invoke( + {"input": message}, + config={"configurable": {"session_id": "demo"}} + ) + except Exception as coder_err: + # swallow any exceptions during benchmarking + pprint.pprint(coder_err) + continue + + + pprint.pprint(instance_id) + pprint.pprint(gold_files) + + + # Get the diff between the current state and the original commit + model_patch = diff_versus_commit(codebase.repo_path, base_commit) + pprint.pprint(model_patch) + + # Record the results for the logs + result = dict( + # Required args for running eval tests + instance_id=instance_id, + model_patch=model_patch, + # For computing stats + gold_files=gold_files, + edited_files=files_in_patch(model_patch) + ) + result["try"] = attempt # `try` is a python keyword + results.append(result) + + pprint.pprint(result) + + # Did we get a successful edit, lint and test? If so, we found a plausible solution! + if model_patch: + winner = result + break + + + # If there's no clear winner, look for the most viable result we got... + if not winner: + winner = pick_winner(results) + + if not winner: + result = dict( + # Required args for running eval tests + instance_id=instance_id, + model_patch=None, + ) + + pprint.pprint(winner) + if not winner: + return + + print("\n\nFinal diff:\n") + print(winner["model_patch"]) + + # Avoid circular reference when we save to json + winner = dict(winner) + + winner.update( + dict( + tries=attempt, + all_results=results, # Record all the results for later analysis + cost=cost, # total cost across all results + ) + ) + + out_fname = out_dname / (instance_id + ".json") + out_fname.write_text(json.dumps(winner, indent=4)) + + +def process_instances( + dataset, threads, prior_dnames +): + """ + dataset - The subset of the SWE Bench dataset to process. + threads - How many problems to attempt concurrently. + prior_dnames - Names of predictions/ dirnames from previous runs. + If they contain a plausible solution for an instance, + don't continue looking. + """ + + # Create the predictions directory if it doesn't exist + PREDS_DNAME.mkdir(exist_ok=True) + out_dname = PREDS_DNAME / "results" + out_dname.mkdir() + + pprint.pprint(out_dname) + + # If we are restarting this run, figure out which instances are already done. 
+ done_preds = load_predictions([out_dname]) + done_instances = set(done_preds.keys()) + pprint.pprint(len(done_instances)) + + pprint.pprint(prior_dnames) + prior_preds = load_predictions(prior_dnames) + pprint.pprint(len(prior_preds)) + + plausible_instances = get_plausible(prior_preds) + pprint.pprint(len(plausible_instances)) + + if prior_preds: + # Just keep trying to solve instances that exist in the previous runs + all_instances = set(prior_preds.keys()) + else: + all_instances = set(dataset.keys()) + + remaining_instances = set(all_instances) + remaining_instances -= done_instances + remaining_instances -= plausible_instances + + remaining_instances = list(remaining_instances) + random.shuffle(remaining_instances) + + pprint.pprint(sorted(remaining_instances)) + pprint.pprint(len(remaining_instances)) + + print() + print("press enter...") + input() + + if not CHAT_LOGS_DNAME.exists(): + CHAT_LOGS_DNAME.mkdir() + + chat_history_dname = CHAT_LOGS_DNAME / "results" + chat_history_dname.mkdir(exist_ok=True) + + if threads > 1: + process_one_instance_lox = lox.process(threads)(process_one_instance) + process_one_instance_func = process_one_instance_lox.scatter + gather = process_one_instance_lox.gather + else: + process_one_instance_func = process_one_instance + + for instance_id in remaining_instances: + if instance_id in done_instances: + print("skipping", instance_id) + continue + + process_one_instance_func( + dataset[instance_id], + out_dname, + ) + + print("#" * 60) + # input() + + if threads > 1: + gather() + + +def main(): + + # Load the SWE Bench dataset + # dataset = get_full_dataset() + # dataset = get_verified_dataset() + dataset = get_lite_dataset() + threads = 10 + + # Any predictions/ dirs provided on the command line are treated + # as earlier, higher priority runs. If a plausible solution was + # found for an instance already, we don't need to keep looking in + # this run. 
+ prior_dnames = sys.argv[1:] + + process_instances( + dataset, threads, prior_dnames + ) + + +if __name__ == "__main__": + status = main() + sys.exit(status) diff --git a/src/codegen/extensions/swebench/report.py b/src/codegen/extensions/swebench/report.py new file mode 100755 index 000000000..ad36d238d --- /dev/null +++ b/src/codegen/extensions/swebench/report.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python + +import json +import os +import random +import shutil +import subprocess +import sys +from collections import defaultdict +from pathlib import Path +from pprint import pprint + + +from tests import remove_patches_to_tests, run_tests +from utils import ( + FULL_DATASET_FNAME, + choose_predictions, + get_dataset, + load_predictions, +) + +using_dataset = "lite" + +NUM_EVAL_PROCS = 5 + +def run_evals(swe_bench_tasks, log_dir, predictions_jsonl): + run_evals_cmd = f""" +python -m swebench.harness.run_evaluation + --predictions_path {predictions_jsonl} + --max_workers 32 + --run_id {log_dir.replace('/', '-')} + --dataset_name princeton-nlp/SWE-bench_Lite + --cache_level instance + --report_dir {log_dir} +""" + run_evals_cmd = " ".join([line.strip() for line in run_evals_cmd.split() if line.strip()]) + print("Running evaluation command:", run_evals_cmd) + + subprocess.run(run_evals_cmd.split(), check=True) + + +def get_report(swe_bench_tasks, log_dir, predictions_jsonl, model_name_or_path): + # Load and parse the evaluation results directly from the predictions file + results = defaultdict(list) + + with open(predictions_jsonl) as f: + for line in f: + pred = json.loads(line) + instance_id = pred['instance_id'] + + # Track basic stats + results['generated'].append(instance_id) + + # Check for evaluation logs + log_file = Path(log_dir) / f"{instance_id}.eval.log" + if log_file.exists(): + results['with_logs'].append(instance_id) + log_content = log_file.read_text() + + if "PASS" in log_content: + results['resolved'].append(instance_id) + results['applied'].append(instance_id) + elif "FAIL" in log_content: + results['applied'].append(instance_id) + else: + results['no_apply'].append(instance_id) + else: + results['no_logs'].append(instance_id) + + # Convert lists to sets for compatibility with existing code + return {k: set(v) for k, v in results.items()} + + +def update_pred_json(predictions, report): + all_instances = set(report.get("generated", [])) + all_instances.update(set(report.get("no_generation", []))) + + for instance_id, pred in predictions.items(): + # Use get() to handle missing 'resolved' key, defaulting to empty set + was_resolved = instance_id in report.get("resolved", set()) + if "resolved" in pred and pred["resolved"] == was_resolved: + continue + + assert instance_id in all_instances, instance_id + + pred["resolved"] = was_resolved + save = dict(pred) + + # Construct json_fname if it doesn't exist + if "json_fname" not in pred: + json_fname = Path("predictions/results") / f"{instance_id}.json" + else: + json_fname = pred["json_fname"] + del save["json_fname"] # Remove from save data if it exists + + Path(json_fname).write_text(json.dumps(save, indent=4)) + + return predictions + + +def preds_to_jsonl(dname, predictions): + dname = Path(dname) + + predictions_jsonl = str(dname / "all_preds.jsonl") + print(f"Creating JSONL file: {predictions_jsonl}") + + # Use a default model name since it's not in the predictions + model_name = "results" + + with open(predictions_jsonl, "w") as fh: + for inst, pred in predictions.items(): + minimal_pred = { + "model_name_or_path": model_name, 
# Use default model name + "model_patch": remove_patches_to_tests(pred["model_patch"]) if "model_patch" in pred else pred.get("patch", ""), + "instance_id": pred["instance_id"], + } + fh.write(json.dumps(minimal_pred) + "\n") + return predictions_jsonl + + +def run_evals_on_dname(dname): + dname = Path(dname) + + predictions = load_predictions([dname], devin_only=(using_dataset == "devin")) + + predictions_jsonl = preds_to_jsonl(dname, predictions) + pprint(predictions_jsonl) + + log_dir = Path("logs") / dname.name + log_dir.mkdir(exist_ok=True, parents=True) + pprint(log_dir) + + any_need_evals = any("resolved" not in pred for pred in predictions.values()) + any_need_evals = True + if any_need_evals: + run_evals(FULL_DATASET_FNAME, str(log_dir), predictions_jsonl) + + model_name_or_path = list(predictions.values())[0]["model_name_or_path"] + report = get_report(FULL_DATASET_FNAME, log_dir, predictions_jsonl, model_name_or_path) + predictions = update_pred_json(predictions, report) + + return predictions_jsonl, log_dir + + +def combine_jsonl_logs(predictions, model_name_or_path): + logs = Path("logs") + log_dir = logs / model_name_or_path + + log_dir.mkdir(exist_ok=True) + pprint(log_dir) + + preds_dir = Path("predictions") / model_name_or_path + + predictions_jsonl = preds_to_jsonl(preds_dir, predictions) + for inst, pred in predictions.items(): + from_fname = logs / pred["dname"] + # dump(from_fname, inst) + from_fname = list(from_fname.glob(f"{inst}.*.log")) + assert len(from_fname) <= 1, from_fname + if not len(from_fname): + print("Missing", pred["dname"], inst) + continue + from_fname = from_fname[0] + # dump(from_fname) + + to_fname = log_dir / f"{inst}.{model_name_or_path}.eval.log" + # dump(from_fname, to_fname) + shutil.copyfile(from_fname, to_fname) + + return predictions_jsonl, log_dir + + +def main(): + # Automatically find all JSON files in predictions/results + results_dir = Path("predictions/results") + if not results_dir.exists(): + print(f"Directory does not exist: {results_dir}") + return 1 + + prediction_files = list(results_dir.glob("*.json")) + print(f"Found {len(prediction_files)} prediction files") + + predictions = {} + for file_path in prediction_files: + try: + with open(file_path) as f: + prediction = json.load(f) + if isinstance(prediction, dict) and "instance_id" in prediction: + predictions[prediction["instance_id"]] = prediction + except json.JSONDecodeError: + print(f"Error reading JSON from {file_path}") + continue + + print(f"Successfully loaded {len(predictions)} predictions") + + if predictions: + # Create predictions JSONL file + predictions_jsonl = preds_to_jsonl("predictions/results", predictions) + print(f"\nCreated predictions JSONL: {predictions_jsonl}") + + # Setup log directory + log_dir = Path("logs/results") + log_dir.mkdir(exist_ok=True, parents=True) + print(f"Using log directory: {log_dir}") + + # Run evaluations + run_evals(FULL_DATASET_FNAME, str(log_dir), predictions_jsonl) + + # Get and display report + model_name = "results" # or whatever model name you want to use + report = get_report(FULL_DATASET_FNAME, log_dir, predictions_jsonl, model_name) + + print("\nEvaluation Results:") + print(f"Total predictions: {len(predictions)}") + print(f"Successfully applied: {len(report.get('applied', []))}") + print(f"Resolved: {len(report.get('resolved', []))}") + print(f"Failed to apply: {len(report.get('no_apply', []))}") + print(f"With logs: {len(report.get('with_logs', []))}") + print(f"No logs: {len(report.get('no_logs', []))}") + + # Update 
prediction JSONs with results + predictions = update_pred_json(predictions, report) + else: + print("No valid predictions found") + return 1 + + return 0 + + +def stats_on_tests_before_and_after(report, predictions): + num = 0 + num_before_pass = 0 + num_pass_to_fail = 0 + + dataset = get_dataset() + + random.shuffle(predictions) + + outcomes = defaultdict(int) + for pred in predictions: + instance_id = pred["instance_id"] + + # if instance_id not in has_patch_not_resolved: + # continue + + num += 1 + + entry = dataset[instance_id] + before_passed, _ = run_tests(entry) + if not before_passed: + continue + + after_passed, _ = run_tests(entry, model_patch=pred["model_patch"]) + + resolved = instance_id in report["resolved"] + pprint(before_passed, after_passed, resolved) + outcome = (before_passed, after_passed, resolved) + outcomes[outcome] += 1 + pprint(sorted(outcomes.items())) + + if before_passed: + num_before_pass += 1 + if before_passed and not after_passed: + num_pass_to_fail += 1 + + print() + pprint(num) + pprint(num_before_pass) + pprint(num_pass_to_fail) + + pct_before_pass = num_before_pass / num * 100 + pprint(pct_before_pass) + pct_pass_to_fail = num_pass_to_fail / num_before_pass * 100 + pprint(pct_pass_to_fail) + + print() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/codegen/extensions/swebench/results/all_preds.jsonl b/src/codegen/extensions/swebench/results/all_preds.jsonl new file mode 100644 index 000000000..473d123da --- /dev/null +++ b/src/codegen/extensions/swebench/results/all_preds.jsonl @@ -0,0 +1,3 @@ +{"model_name_or_path": "results", "model_patch": "diff --git a/src/_pytest/_io/saferepr.py b/src/_pytest/_io/saferepr.py\nindex 23af4d0..eb03600 100644\n--- a/src/_pytest/_io/saferepr.py\n+++ b/src/_pytest/_io/saferepr.py\n@@ -17,6 +17,8 @@ def _format_repr_exception(exc: BaseException, obj: Any) -> str:\n exc_info = _try_repr_or_str(exc)\n except (KeyboardInterrupt, SystemExit):\n raise\n+ except RuntimeError as exc:\n+ exc_info = \"RuntimeError: No active exception to reraise\"\n except BaseException as exc:\n exc_info = \"unpresentable exception ({})\".format(_try_repr_or_str(exc))\n return \"<[{} raised in repr()] {} object at 0x{:x}>\".format(\n@@ -100,4 +102,4 @@ class AlwaysDispatchingPrettyPrinter(pprint.PrettyPrinter):\n def _pformat_dispatch(object, indent=1, width=80, depth=None, *, compact=False):\n return AlwaysDispatchingPrettyPrinter(\n indent=indent, width=width, depth=depth, compact=compact\n- ).pformat(object)\n+ ).pformat(object)\n\\ No newline at end of file\n", "instance_id": "pytest-dev__pytest-7168"} +{"model_name_or_path": "results", "model_patch": "diff --git a/requests/sessions.py b/requests/sessions.py\nindex 425db22..d7afd2a 100644\n--- a/requests/sessions.py\n+++ b/requests/sessions.py\n@@ -86,6 +86,7 @@ class SessionRedirectMixin(object):\n \"\"\"Receives a Response. 
Returns a generator of Responses.\"\"\"\n \n i = 0\n+ method = req.method # Track the current method\n \n while resp.is_redirect:\n prepared_request = req.copy()\n@@ -99,7 +100,6 @@ class SessionRedirectMixin(object):\n resp.close()\n \n url = resp.headers['location']\n- method = req.method\n \n # Handle redirection without scheme (see: RFC 1808 Section 4)\n if url.startswith('//'):\n@@ -156,7 +156,7 @@ class SessionRedirectMixin(object):\n \n if 'Authorization' in headers:\n # If we get redirected to a new host, we should strip out any\n- #\u00a0authentication headers.\n+ # authentication headers.\n original_parsed = urlparse(resp.request.url)\n redirect_parsed = urlparse(url)\n \n@@ -568,4 +568,4 @@ class Session(SessionRedirectMixin):\n def session():\n \"\"\"Returns a :class:`Session` for context-management.\"\"\"\n \n- return Session()\n+ return Session()\n\\ No newline at end of file\n", "instance_id": "psf__requests-1963"} +{"model_name_or_path": "results", "model_patch": "diff --git a/src/flask/config.py b/src/flask/config.py\nindex d4fc310..f0a2fc0 100644\n--- a/src/flask/config.py\n+++ b/src/flask/config.py\n@@ -234,6 +234,7 @@ class Config(dict):\n filename: str,\n load: t.Callable[[t.IO[t.Any]], t.Mapping],\n silent: bool = False,\n+ mode: str = 'r', # Add mode parameter with default 'r'\n ) -> bool:\n \"\"\"Update the values in the config from a file that is loaded\n using the ``load`` parameter. The loaded data is passed to the\n@@ -254,6 +255,7 @@ class Config(dict):\n :type load: ``Callable[[Reader], Mapping]`` where ``Reader``\n implements a ``read`` method.\n :param silent: Ignore the file if it doesn't exist.\n+ :param mode: The mode in which to open the file. Default is 'r'.\n :return: ``True`` if the file was loaded successfully.\n \n .. versionadded:: 2.0\n@@ -261,7 +263,7 @@ class Config(dict):\n filename = os.path.join(self.root_path, filename)\n \n try:\n- with open(filename) as f:\n+ with open(filename, mode=mode) as f: # Use the mode parameter\n obj = load(f)\n except OSError as e:\n if silent and e.errno in (errno.ENOENT, errno.EISDIR):\n@@ -335,4 +337,4 @@ class Config(dict):\n return rv\n \n def __repr__(self) -> str:\n- return f\"<{type(self).__name__} {dict.__repr__(self)}>\"\n+ return f\"<{type(self).__name__} {dict.__repr__(self)}>\"\n\\ No newline at end of file\n", "instance_id": "pallets__flask-4992"} diff --git a/src/codegen/extensions/swebench/results/pallets__flask-4992.json b/src/codegen/extensions/swebench/results/pallets__flask-4992.json new file mode 100644 index 000000000..13643ac16 --- /dev/null +++ b/src/codegen/extensions/swebench/results/pallets__flask-4992.json @@ -0,0 +1,29 @@ +{ + "instance_id": "pallets__flask-4992", + "model_patch": "diff --git a/src/flask/config.py b/src/flask/config.py\nindex d4fc310..f0a2fc0 100644\n--- a/src/flask/config.py\n+++ b/src/flask/config.py\n@@ -234,6 +234,7 @@ class Config(dict):\n filename: str,\n load: t.Callable[[t.IO[t.Any]], t.Mapping],\n silent: bool = False,\n+ mode: str = 'r', # Add mode parameter with default 'r'\n ) -> bool:\n \"\"\"Update the values in the config from a file that is loaded\n using the ``load`` parameter. The loaded data is passed to the\n@@ -254,6 +255,7 @@ class Config(dict):\n :type load: ``Callable[[Reader], Mapping]`` where ``Reader``\n implements a ``read`` method.\n :param silent: Ignore the file if it doesn't exist.\n+ :param mode: The mode in which to open the file. Default is 'r'.\n :return: ``True`` if the file was loaded successfully.\n \n .. 
versionadded:: 2.0\n@@ -261,7 +263,7 @@ class Config(dict):\n filename = os.path.join(self.root_path, filename)\n \n try:\n- with open(filename) as f:\n+ with open(filename, mode=mode) as f: # Use the mode parameter\n obj = load(f)\n except OSError as e:\n if silent and e.errno in (errno.ENOENT, errno.EISDIR):\n@@ -335,4 +337,4 @@ class Config(dict):\n return rv\n \n def __repr__(self) -> str:\n- return f\"<{type(self).__name__} {dict.__repr__(self)}>\"\n+ return f\"<{type(self).__name__} {dict.__repr__(self)}>\"\n\\ No newline at end of file\ndiff --git a/tests/test_config.py b/tests/test_config.py\nindex 76c5d27..782897b 100644\n--- a/tests/test_config.py\n+++ b/tests/test_config.py\n@@ -1,6 +1,7 @@\n import json\n import os\n import textwrap\n+import tomllib\n \n import pytest\n \n@@ -239,11 +240,18 @@ def test_from_pyfile_weird_encoding(tmpdir, encoding):\n textwrap.dedent(\n f\"\"\"\n # -*- coding: {encoding} -*-\n- TEST_VALUE = \"f\u00f6\u00f6\"\n+ TEST_VALUE = \"f\u000f\u000f\"\n \"\"\"\n ).encode(encoding)\n )\n app = flask.Flask(__name__)\n app.config.from_pyfile(str(f))\n value = app.config[\"TEST_VALUE\"]\n- assert value == \"f\u00f6\u00f6\"\n+ assert value == \"f\u000f\u000f\"\n+\n+\n+def test_config_from_toml_file():\n+ app = flask.Flask(__name__)\n+ current_dir = os.path.dirname(os.path.abspath(__file__))\n+ app.config.from_file(os.path.join(current_dir, \"static\", \"config.toml\"), tomllib.load, mode=\"rb\")\n+ common_object_test(app)\n", + "gold_files": [ + "src/flask/config.py" + ], + "edited_files": [ + "src/flask/config.py", + "tests/test_config.py" + ], + "try": 1, + "tries": 1, + "all_results": [ + { + "instance_id": "pallets__flask-4992", + "model_patch": "diff --git a/src/flask/config.py b/src/flask/config.py\nindex d4fc310..f0a2fc0 100644\n--- a/src/flask/config.py\n+++ b/src/flask/config.py\n@@ -234,6 +234,7 @@ class Config(dict):\n filename: str,\n load: t.Callable[[t.IO[t.Any]], t.Mapping],\n silent: bool = False,\n+ mode: str = 'r', # Add mode parameter with default 'r'\n ) -> bool:\n \"\"\"Update the values in the config from a file that is loaded\n using the ``load`` parameter. The loaded data is passed to the\n@@ -254,6 +255,7 @@ class Config(dict):\n :type load: ``Callable[[Reader], Mapping]`` where ``Reader``\n implements a ``read`` method.\n :param silent: Ignore the file if it doesn't exist.\n+ :param mode: The mode in which to open the file. Default is 'r'.\n :return: ``True`` if the file was loaded successfully.\n \n .. 
versionadded:: 2.0\n@@ -261,7 +263,7 @@ class Config(dict):\n filename = os.path.join(self.root_path, filename)\n \n try:\n- with open(filename) as f:\n+ with open(filename, mode=mode) as f: # Use the mode parameter\n obj = load(f)\n except OSError as e:\n if silent and e.errno in (errno.ENOENT, errno.EISDIR):\n@@ -335,4 +337,4 @@ class Config(dict):\n return rv\n \n def __repr__(self) -> str:\n- return f\"<{type(self).__name__} {dict.__repr__(self)}>\"\n+ return f\"<{type(self).__name__} {dict.__repr__(self)}>\"\n\\ No newline at end of file\ndiff --git a/tests/test_config.py b/tests/test_config.py\nindex 76c5d27..782897b 100644\n--- a/tests/test_config.py\n+++ b/tests/test_config.py\n@@ -1,6 +1,7 @@\n import json\n import os\n import textwrap\n+import tomllib\n \n import pytest\n \n@@ -239,11 +240,18 @@ def test_from_pyfile_weird_encoding(tmpdir, encoding):\n textwrap.dedent(\n f\"\"\"\n # -*- coding: {encoding} -*-\n- TEST_VALUE = \"f\u00f6\u00f6\"\n+ TEST_VALUE = \"f\u000f\u000f\"\n \"\"\"\n ).encode(encoding)\n )\n app = flask.Flask(__name__)\n app.config.from_pyfile(str(f))\n value = app.config[\"TEST_VALUE\"]\n- assert value == \"f\u00f6\u00f6\"\n+ assert value == \"f\u000f\u000f\"\n+\n+\n+def test_config_from_toml_file():\n+ app = flask.Flask(__name__)\n+ current_dir = os.path.dirname(os.path.abspath(__file__))\n+ app.config.from_file(os.path.join(current_dir, \"static\", \"config.toml\"), tomllib.load, mode=\"rb\")\n+ common_object_test(app)\n", + "gold_files": [ + "src/flask/config.py" + ], + "edited_files": [ + "src/flask/config.py", + "tests/test_config.py" + ], + "try": 1 + } + ], + "cost": 0, + "resolved": false +} \ No newline at end of file diff --git a/src/codegen/extensions/swebench/results/psf__requests-1963.json b/src/codegen/extensions/swebench/results/psf__requests-1963.json new file mode 100644 index 000000000..8146ce1d5 --- /dev/null +++ b/src/codegen/extensions/swebench/results/psf__requests-1963.json @@ -0,0 +1,27 @@ +{ + "instance_id": "psf__requests-1963", + "model_patch": "diff --git a/requests/sessions.py b/requests/sessions.py\nindex 425db22..d7afd2a 100644\n--- a/requests/sessions.py\n+++ b/requests/sessions.py\n@@ -86,6 +86,7 @@ class SessionRedirectMixin(object):\n \"\"\"Receives a Response. 
Returns a generator of Responses.\"\"\"\n \n i = 0\n+ method = req.method # Track the current method\n \n while resp.is_redirect:\n prepared_request = req.copy()\n@@ -99,7 +100,6 @@ class SessionRedirectMixin(object):\n resp.close()\n \n url = resp.headers['location']\n- method = req.method\n \n # Handle redirection without scheme (see: RFC 1808 Section 4)\n if url.startswith('//'):\n@@ -156,7 +156,7 @@ class SessionRedirectMixin(object):\n \n if 'Authorization' in headers:\n # If we get redirected to a new host, we should strip out any\n- #\u00a0authentication headers.\n+ # authentication headers.\n original_parsed = urlparse(resp.request.url)\n redirect_parsed = urlparse(url)\n \n@@ -568,4 +568,4 @@ class Session(SessionRedirectMixin):\n def session():\n \"\"\"Returns a :class:`Session` for context-management.\"\"\"\n \n- return Session()\n+ return Session()\n\\ No newline at end of file\n", + "gold_files": [ + "requests/sessions.py" + ], + "edited_files": [ + "requests/sessions.py" + ], + "try": 1, + "tries": 1, + "all_results": [ + { + "instance_id": "psf__requests-1963", + "model_patch": "diff --git a/requests/sessions.py b/requests/sessions.py\nindex 425db22..d7afd2a 100644\n--- a/requests/sessions.py\n+++ b/requests/sessions.py\n@@ -86,6 +86,7 @@ class SessionRedirectMixin(object):\n \"\"\"Receives a Response. Returns a generator of Responses.\"\"\"\n \n i = 0\n+ method = req.method # Track the current method\n \n while resp.is_redirect:\n prepared_request = req.copy()\n@@ -99,7 +100,6 @@ class SessionRedirectMixin(object):\n resp.close()\n \n url = resp.headers['location']\n- method = req.method\n \n # Handle redirection without scheme (see: RFC 1808 Section 4)\n if url.startswith('//'):\n@@ -156,7 +156,7 @@ class SessionRedirectMixin(object):\n \n if 'Authorization' in headers:\n # If we get redirected to a new host, we should strip out any\n- #\u00a0authentication headers.\n+ # authentication headers.\n original_parsed = urlparse(resp.request.url)\n redirect_parsed = urlparse(url)\n \n@@ -568,4 +568,4 @@ class Session(SessionRedirectMixin):\n def session():\n \"\"\"Returns a :class:`Session` for context-management.\"\"\"\n \n- return Session()\n+ return Session()\n\\ No newline at end of file\n", + "gold_files": [ + "requests/sessions.py" + ], + "edited_files": [ + "requests/sessions.py" + ], + "try": 1 + } + ], + "cost": 0, + "resolved": false +} \ No newline at end of file diff --git a/src/codegen/extensions/swebench/results/pytest-dev__pytest-7168.json b/src/codegen/extensions/swebench/results/pytest-dev__pytest-7168.json new file mode 100644 index 000000000..e9a37a220 --- /dev/null +++ b/src/codegen/extensions/swebench/results/pytest-dev__pytest-7168.json @@ -0,0 +1,27 @@ +{ + "instance_id": "pytest-dev__pytest-7168", + "model_patch": "diff --git a/src/_pytest/_io/saferepr.py b/src/_pytest/_io/saferepr.py\nindex 23af4d0..eb03600 100644\n--- a/src/_pytest/_io/saferepr.py\n+++ b/src/_pytest/_io/saferepr.py\n@@ -17,6 +17,8 @@ def _format_repr_exception(exc: BaseException, obj: Any) -> str:\n exc_info = _try_repr_or_str(exc)\n except (KeyboardInterrupt, SystemExit):\n raise\n+ except RuntimeError as exc:\n+ exc_info = \"RuntimeError: No active exception to reraise\"\n except BaseException as exc:\n exc_info = \"unpresentable exception ({})\".format(_try_repr_or_str(exc))\n return \"<[{} raised in repr()] {} object at 0x{:x}>\".format(\n@@ -100,4 +102,4 @@ class AlwaysDispatchingPrettyPrinter(pprint.PrettyPrinter):\n def _pformat_dispatch(object, indent=1, width=80, depth=None, *, 
compact=False):\n return AlwaysDispatchingPrettyPrinter(\n indent=indent, width=width, depth=depth, compact=compact\n- ).pformat(object)\n+ ).pformat(object)\n\\ No newline at end of file\n", + "gold_files": [ + "src/_pytest/_io/saferepr.py" + ], + "edited_files": [ + "src/_pytest/_io/saferepr.py" + ], + "try": 1, + "tries": 1, + "all_results": [ + { + "instance_id": "pytest-dev__pytest-7168", + "model_patch": "diff --git a/src/_pytest/_io/saferepr.py b/src/_pytest/_io/saferepr.py\nindex 23af4d0..eb03600 100644\n--- a/src/_pytest/_io/saferepr.py\n+++ b/src/_pytest/_io/saferepr.py\n@@ -17,6 +17,8 @@ def _format_repr_exception(exc: BaseException, obj: Any) -> str:\n exc_info = _try_repr_or_str(exc)\n except (KeyboardInterrupt, SystemExit):\n raise\n+ except RuntimeError as exc:\n+ exc_info = \"RuntimeError: No active exception to reraise\"\n except BaseException as exc:\n exc_info = \"unpresentable exception ({})\".format(_try_repr_or_str(exc))\n return \"<[{} raised in repr()] {} object at 0x{:x}>\".format(\n@@ -100,4 +102,4 @@ class AlwaysDispatchingPrettyPrinter(pprint.PrettyPrinter):\n def _pformat_dispatch(object, indent=1, width=80, depth=None, *, compact=False):\n return AlwaysDispatchingPrettyPrinter(\n indent=indent, width=width, depth=depth, compact=compact\n- ).pformat(object)\n+ ).pformat(object)\n\\ No newline at end of file\n", + "gold_files": [ + "src/_pytest/_io/saferepr.py" + ], + "edited_files": [ + "src/_pytest/_io/saferepr.py" + ], + "try": 1 + } + ], + "cost": 0, + "resolved": false +} \ No newline at end of file diff --git a/src/codegen/extensions/swebench/tests.py b/src/codegen/extensions/swebench/tests.py new file mode 100755 index 000000000..b81b6b274 --- /dev/null +++ b/src/codegen/extensions/swebench/tests.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python + +import json +import random +import sys +import pprint +from collections import defaultdict +from pathlib import Path + +from utils import get_dataset, load_predictions # noqa: F401 + +# A no-op patch which creates an empty file is used to stand in for +# the `model_patch` and/or `test_patch` when running SWE Bench tests +# without one or both of those patches. +NOOP_PATCH = ( + "diff --git a/empty.file.{nonce}.ignore b/empty.file.{nonce}.ignore\n" + "new file mode 100644\n" + "index 0000000..e69de29\n" +) + +def remove_patches_to_tests(model_patch): + """ + Remove any changes to the tests directory from the provided patch. + This is to ensure that the model_patch does not disturb the repo's + tests when doing acceptance testing with the `test_patch`. + """ + if not model_patch: + return model_patch + + lines = model_patch.splitlines(keepends=True) + filtered_lines = [] + is_tests = False + + for line in lines: + if line.startswith("diff --git a/"): + pieces = line.split() + to = pieces[-1] + if to.startswith("b/") and ( + "/test/" in to + or "/tests/" in to + or "/testing/" in to + or "/test_" in to + or "/tox.ini" in to + ): + is_tests = True + else: + is_tests = False + + if not is_tests: + filtered_lines.append(line) + + return "".join(filtered_lines) + +def files_in_patch(patch): + """ + Extract the list of modified files from a unified diff patch string. 
+ """ + files = [] + for line in patch.split("\n"): + if line.startswith("--- a/") or line.startswith("+++ b/"): + fname = line.split("/", 1)[1] + if fname not in files: + files.append(fname) + return files + +def run_tests(entry, model_patch=None, use_test_patch=False, model_name_or_path="none"): + """ + Run tests for the SWE Bench `entry`, optionally applying a `model_patch` first. + + If `use_test_patch` is True, then also apply the `test_patch` to bring in + the tests which determine if the issue is resolved. So False means + only run the tests that existed at the `base_commit` and any new/changed + tests contained in the `model_patch`. + + Optionally specify a `model_name_or_path`, which isn't really used since + the log_dir for the tests is a temp dir which is discarded. + """ + instance_id = entry["instance_id"] + + test_type = MAP_REPO_TO_TEST_FRAMEWORK[entry["repo"]] + test_directives = get_test_directives(entry) + test_cmd = f"{test_type} {' '.join(test_directives)}" + + # Use a no-op patch if no model_patch is provided + if not model_patch: + model_patch = NOOP_PATCH.format(nonce="model_patch") + + # Use a no-op patch if use_test_patch is False + if use_test_patch: + test_patch = entry["test_patch"] + else: + test_patch = NOOP_PATCH.format(nonce="test_patch") + + if model_patch and use_test_patch: + # Make sure the model_patch does not disturb the repo's tests + # when doing acceptance testing with the `test_patch`. + print("=" * 30) + print(model_patch) + model_patch = remove_patches_to_tests(model_patch) + print("=" * 30) + print(model_patch) + print("=" * 30) + + entry_instance = { + "repo": entry["repo"], + "version": entry["version"], + "base_commit": entry["base_commit"], + "instance_id": entry["instance_id"], + "model_name_or_path": model_name_or_path, + "model_patch": model_patch, + "test_patch": test_patch, + "test_directives": test_directives, + "test_cmd": test_cmd, + } + + namespace = "aorwall" + with tempfile.TemporaryDirectory(dir="/mnt/aider") as log_dir: + timeout = 60 + log_suffix = "" + + asyncio.run(run_docker_evaluation(entry_instance, namespace, log_dir, timeout, log_suffix)) + + log_fname = Path(log_dir) / f"{instance_id}.{model_name_or_path}.eval.log" + if not log_fname.exists(): + return None, "" + + log_text = log_fname.read_text() + log_lines = log_text.splitlines() + log_lines = [line for line in log_lines if line.startswith(">>>>")] + print("\n".join(log_lines)) + + passed = ">>>>> All Tests Passed" in log_text + + return passed, log_text + + +def main_check_docker_images(): + dataset = get_dataset() + + # instances = get_devin_instance_ids() + instances = list(dataset.keys()) + random.shuffle(instances) + + cache_fname = Path("tmp.dockerimages.json") + if cache_fname.exists(): + data = json.loads(cache_fname.read_text()) + good_dockers = defaultdict(int, data["good"]) + bad_dockers = defaultdict(int, data["bad"]) + seen_instances = set(data["instances"]) + else: + good_dockers = defaultdict(int) + bad_dockers = defaultdict(int) + seen_instances = set() + + for instance_id in instances: + entry = dataset[instance_id] + + if instance_id in seen_instances: + continue + + seen_instances.add(instance_id) + + docker_image = get_docker_image(entry) + if docker_image in bad_dockers: + bad_dockers[docker_image] += 1 + continue + + if docker_image in good_dockers: + good_dockers[docker_image] += 1 + continue + + pprint.pprint(instance_id) + pprint.pprint(docker_image) + + passed, test_text = run_tests( + entry, + model_patch=None, + use_test_patch=False, + ) + 
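+        # run_tests() returns passed=None when no eval log was produced at all,
+        # i.e. the docker image for this instance could not be run.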
if passed is None: + bad_dockers[docker_image] += 1 + else: + good_dockers[docker_image] += 1 + + update_cache(cache_fname, seen_instances, good_dockers, bad_dockers) + + update_cache(cache_fname, seen_instances, good_dockers, bad_dockers) + + pprint.pprint(bad_dockers) + + +def update_cache(cache_fname, instances, good_dockers, bad_dockers): + save_dict = dict( + instances=list(instances), + good=dict(good_dockers), + bad=dict(bad_dockers), + ) + cache_fname.write_text(json.dumps(save_dict, indent=4, sort_keys=True)) + + total_instances = sum(good_dockers.values()) + sum(bad_dockers.values()) + pprint.pprint(total_instances) + bad_instances = sum(bad_dockers.values()) + pprint.pprint(bad_instances) + if total_instances: + pct_bad_instances = bad_instances / total_instances * 100 + pprint.pprint(pct_bad_instances) + pprint.pprint(len(bad_dockers)) + + +def main_preds(): + dataset = get_dataset() + + dnames = sys.argv[1:] + preds = load_predictions(dnames) + + num = 0 + num_passed = 0 + for instance_id, pred in preds.items(): + entry = dataset[instance_id] + + passed, test_text = run_tests( + entry, + model_patch=pred["model_patch"], + use_test_patch=True, + ) + + num += 1 + if passed: + num_passed += 1 + + pprint.pprint((num_passed, num)) + + +if __name__ == "__main__": + status = main_check_docker_images() + # status = main_preds() + sys.exit(status) diff --git a/src/codegen/extensions/swebench/utils.py b/src/codegen/extensions/swebench/utils.py new file mode 100644 index 000000000..0593a1a59 --- /dev/null +++ b/src/codegen/extensions/swebench/utils.py @@ -0,0 +1,186 @@ +import datetime +import json +import shutil +from pathlib import Path +from pprint import pprint + +from datasets import load_dataset + + +FULL_DATASET = "princeton-nlp/SWE-bench" +FULL_DATASET_FNAME = FULL_DATASET.replace("/", "--") + ".json" + + +VERIFIED_DATASET = "princeton-nlp/SWE-bench-verified" +VERIFIED_DATASET_FNAME = VERIFIED_DATASET.replace("/", "--") + ".json" + +LITE_DATASET = "princeton-nlp/SWE-bench_Lite" +LITE_DATASET_FNAME = LITE_DATASET.replace("/", "--") + ".json" + + +def dump_dataset(dataset, fname): + """ + Save the dataset to json. + """ + entries = list(dataset) + for entry in entries: + entry["FAIL_TO_PASS"] = json.loads(entry["FAIL_TO_PASS"]) + entry["PASS_TO_PASS"] = json.loads(entry["PASS_TO_PASS"]) + + with open(fname, "w") as f: + json.dump(entries, f, indent=4) + + +def get_full_dataset(): + return get_dataset(FULL_DATASET, FULL_DATASET_FNAME) + + +def get_lite_dataset(): + return get_dataset(LITE_DATASET, LITE_DATASET_FNAME) + + +def get_verified_dataset(): + return get_dataset(VERIFIED_DATASET, VERIFIED_DATASET_FNAME) + + +def get_dataset(dataset, fname): + """ + Load the `DATASET` from hugging face, and turn it into a dict + keyed on `instance_id`. + Cache the dict locally in a json file. 
+ """ + + fname = Path(fname) + if fname.exists(): + dataset = json.loads(fname.read_text()) + else: + pprint(dataset) + dataset = load_dataset(dataset) + dataset = dataset["test"] + dump_dataset(dataset, fname) + pprint(dataset) + + res = dict() + for entry in dataset: + res[entry["instance_id"]] = entry + + return res + + +def load_predictions(paths): + prediction_paths = [] + for path in paths: + path = Path(path) + if path.is_file(): + prediction_paths.append(path) + elif path.is_dir(): + prediction_paths += list(path.glob("*.json")) + else: + assert False, path + + # prediction_paths.sort(key=lambda p: p.stat().st_mtime) + + predictions = dict() + for fname in prediction_paths: + try: + pred = json.loads(fname.read_text()) + except json.decoder.JSONDecodeError as err: + pprint(fname) + raise err + + if "instance_id" not in pred: + print("Skipping json without instance_id", fname) + continue + + inst = pred["instance_id"] + pred["json_fname"] = str(fname) + predictions[inst] = pred + + return predictions + + +def is_plausible(pred): + attrs = "model_patch edit_outcome lint_outcome test_outcome".split() + for attr in attrs: + if not pred.get(attr): + return + return True + + +def get_plausible(preds): + return set(inst for inst, pred in preds.items() if is_plausible(pred)) + + +def check_criteria(pred, criteria): + attrs = criteria.split() + for attr in attrs: + if not pred[attr]: + return False + return True + + +def pick_winner(results): + """ + Given that we didn't obtain a result with all good outcomes, + try a series of weaker outcome sets to find the strongest result. + """ + priority = ( + "model_patch edit_outcome lint_outcome test_outcome", # all good! + "model_patch edit_outcome lint_outcome", # all good but test_outcome + "model_patch lint_outcome", # a patch that lints? + "model_patch edit_outcome", # a patch that had no edit errors? + "model_patch", # anything with an actual patch! + ) + + # choose the best result available + for criteria in priority: + for res in results: + if check_criteria(res, criteria): + return res + + # choose the first result as a last resort + if results: + return results[0] + + +def choose_pred(inst, all_preds, dnames): + results = [] + for i in range(len(all_preds)): + preds = all_preds[i] + dname = dnames[i] + + if inst not in preds: + continue + pred = dict(preds[inst]) + pred["dname"] = Path(dname).name + results.append(pred) + + return pick_winner(results) + + +def choose_predictions(dnames, model_name_or_path=None, copy_md=False, devin_only=False): + all_preds = [load_predictions([dname], devin_only=devin_only) for dname in dnames] + all_instances = set() + for preds in all_preds: + all_instances.update(preds.keys()) + + chosen = dict() + for inst in all_instances: + res = choose_pred(inst, all_preds, dnames) + chosen[inst] = res + + if copy_md: + pred_dname = Path("predictions") + md_fname = pred_dname / res["dname"] / (inst + ".md") + assert md_fname.exists(), md_fname + new_md_fname = pred_dname / model_name_or_path / (inst + ".md") + shutil.copyfile(md_fname, new_md_fname) + + for inst in chosen: + pred = dict(chosen[inst]) + pred["model_name_or_path"] = model_name_or_path + chosen[inst] = pred + + pprint(len(chosen)) + pprint(chosen) + return chosen
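A minimal usage sketch of how the pieces added in this patch fit together. It assumes it is run from src/codegen/extensions/swebench/ so the patch's script-style imports (from utils import ..., from agent import ...) resolve, that the lite split contains an instance id such as pallets__flask-4992, and that OPENAI_API_KEY is set for the LangChain model; note that, as written, importing agent also executes the FastAPI demo at the bottom of agent.py.

    from agent import create_codebase_agent
    from harness import diff_versus_commit, files_in_patch
    from utils import get_lite_dataset

    from codegen import Codebase

    # Load the lite split as a dict keyed by instance_id (cached to a local json file).
    dataset = get_lite_dataset()
    entry = dataset["pallets__flask-4992"]  # any instance id from the split

    # Check out and parse the repo at the commit the issue was filed against.
    codebase = Codebase.from_repo(
        repo_full_name=entry["repo"],
        commit=entry["base_commit"],
        language="python",
    )

    # Build the tool-equipped agent and ask it to fix the issue.
    agent = create_codebase_agent(codebase=codebase, model_name="gpt-4o", temperature=0, verbose=True)
    result = agent.invoke(
        {"input": entry["problem_statement"]},
        config={"configurable": {"session_id": entry["instance_id"]}},
    )

    # The candidate patch is whatever the agent changed relative to the base commit.
    model_patch = diff_versus_commit(codebase.repo_path, entry["base_commit"])
    print(files_in_patch(model_patch))
    print(model_patch)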