From bfe275266434d2015b3c8effabe5ff9cb4257a72 Mon Sep 17 00:00:00 2001 From: jemeza-codegen Date: Thu, 13 Feb 2025 16:58:48 -0800 Subject: [PATCH 1/2] feat: extension that clones and parses swe bench codebases --- pyproject.toml | 1 + src/codegen/extensions/swe_bench/__init__.py | 0 .../extensions/swe_bench/swe_bench_wrapper.py | 80 +++++++++++++++++++ src/codegen/extensions/swe_bench/utils.py | 42 ++++++++++ 4 files changed, 123 insertions(+) create mode 100644 src/codegen/extensions/swe_bench/__init__.py create mode 100644 src/codegen/extensions/swe_bench/swe_bench_wrapper.py create mode 100644 src/codegen/extensions/swe_bench/utils.py diff --git a/pyproject.toml b/pyproject.toml index f820b7ae9..1cc11bf23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,7 @@ dependencies = [ "numpy>=2.2.2", "mcp[cli]", "neo4j", + "datasets", ] license = { text = "Apache-2.0" } diff --git a/src/codegen/extensions/swe_bench/__init__.py b/src/codegen/extensions/swe_bench/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/codegen/extensions/swe_bench/swe_bench_wrapper.py b/src/codegen/extensions/swe_bench/swe_bench_wrapper.py new file mode 100644 index 000000000..3bde3edfd --- /dev/null +++ b/src/codegen/extensions/swe_bench/swe_bench_wrapper.py @@ -0,0 +1,80 @@ +import shutil +from collections.abc import Generator +from typing import Any + +from datasets import load_dataset + +from codegen.extensions.swe_bench.utils import NO_ENV_SETUP, SWEBenchEntry, SWEBenchEnvSetup, SWEBenchSplit, construct_codebase +from codegen.sdk.core.codebase import Codebase + + +class SWEBenchWrapper: + def __init__(self, remove_after_run: bool = False): + print("Loading SWE-bench dataset...") + self.ds = load_dataset("princeton-nlp/SWE-bench") + print("SWE-bench dataset loaded.") + self.remove_after_run = remove_after_run + self.repo_groups = self.create_repo_groups() + + def create_repo_groups(self) -> dict: + # Create a list of all possible splits + SPLITS: list[SWEBenchSplit] = ["train", "dev", "test"] + + # Create a nested dictionary with explicit type hints + repo_groups: dict[SWEBenchSplit, dict[str, dict[str, list[Any]]]] = {} + + # Group entries from all splits + for split in SPLITS: + repo_groups[split] = {} + for entry in self.ds[split]: + repo = entry["repo"] + environment_setup_commit = entry["environment_setup_commit"] + + # Initialize nested dictionaries if they don't exist + if repo not in repo_groups[split]: + repo_groups[split][repo] = {} + if environment_setup_commit not in repo_groups[split][repo]: + repo_groups[split][repo][environment_setup_commit] = [] + + repo_groups[split][repo][environment_setup_commit].append(entry) + + return repo_groups + + def get_entries_for_split(self, split: SWEBenchSplit) -> Generator[tuple[SWEBenchEnvSetup | SWEBenchEntry, Codebase], None, None]: + # ===== [ For each repo in the split ] ===== + for repo in self.repo_groups[split]: + # construct the codebase for the repo + codebase = construct_codebase(repo_full_name=repo) + # ===== [ For each environment setup commit ] ===== + for environment_setup_commit in self.repo_groups[split][repo]: + # yield the environment setup commit + if environment_setup_commit: + # no need to parse the codebase on the environment commit + codebase.checkout(commit=environment_setup_commit, remote=True) + yield SWEBenchEnvSetup(split=split, environment_setup_commit=environment_setup_commit), codebase + else: + yield SWEBenchEnvSetup(split=split, environment_setup_commit=NO_ENV_SETUP), codebase + # ===== [ For each 
test setup commit ] ===== + for entry in self.repo_groups[split][repo][environment_setup_commit]: + codebase.checkout(commit=entry["base_commit"], remote=True) + # yield the test entry with a parsed codebase object + yield SWEBenchEntry(entry=entry, split=split), codebase + + if codebase and self.remove_after_run: + # remove the repo from the tmp_dir + shutil.rmtree(f"/tmp/codegen/{repo}") + + +if __name__ == "__main__": + swe_bench_wrapper = SWEBenchWrapper() + for entry, codebase in swe_bench_wrapper.get_entries_for_split("train"): + if isinstance(entry, SWEBenchEnvSetup): + print(f"Environment setup commit: {entry.environment_setup_commit}") + # install dependencies... + elif isinstance(entry, SWEBenchEntry): + print(f"Entry: {entry.entry['instance_id']}") + problem_statement = entry.entry["problem_statement"] + print(f"Task: {problem_statement[:20]}") + # send of agent to solve tasks.... + + print(f"Number of files: {len(codebase.files)}") diff --git a/src/codegen/extensions/swe_bench/utils.py b/src/codegen/extensions/swe_bench/utils.py new file mode 100644 index 000000000..007ae2603 --- /dev/null +++ b/src/codegen/extensions/swe_bench/utils.py @@ -0,0 +1,42 @@ +from typing import Literal + +from pydantic import BaseModel + +from codegen.git.repo_operator.remote_repo_operator import RemoteRepoOperator +from codegen.git.schemas.repo_config import RepoConfig +from codegen.sdk.codebase.config import ProjectConfig +from codegen.sdk.core.codebase import Codebase, PyCodebaseType + +# Define the SWEBenchSplit type using Literal +SWEBenchSplit = Literal["train", "dev", "test"] +NO_ENV_SETUP = "NO_ENV_SETUP" + + +class SWEBenchEnvSetup(BaseModel): + split: SWEBenchSplit + environment_setup_commit: str = NO_ENV_SETUP + + +class SWEBenchEntry(BaseModel): + split: SWEBenchSplit + entry: dict + + +def construct_codebase(repo_full_name: str) -> PyCodebaseType: + repo_name = repo_full_name.split("/")[-1] + repo_config = RepoConfig(name=repo_name, full_name=repo_full_name, base_dir="/tmp/codegen") + + # clone or pull the repo + print(f"Cloning or pulling {repo_full_name}...") + remote_operator = RemoteRepoOperator(repo_config=repo_config, bot_commit=False) + print(f"Cloned or pulled {repo_full_name}.") + + # create the project config + projects = [ProjectConfig(repo_operator=remote_operator, base_path=None, subdirectories=None)] + + # parse the codebase + print("Parsing codebase...") + codebase = Codebase(projects=projects) + print("Codebase parsed.") + + return codebase From 18cea4cffbd0a5f1e834b2dfa899a5e8e66c0a2f Mon Sep 17 00:00:00 2001 From: Victor Cheng Date: Sun, 16 Feb 2025 08:45:12 -0800 Subject: [PATCH 2/2] treehacks --- src/codegen/extensions/swebench/README.md | 29 ++ src/codegen/extensions/swebench/agent.py | 129 ++++++ src/codegen/extensions/swebench/harness.py | 383 ++++++++++++++++++ src/codegen/extensions/swebench/report.py | 280 +++++++++++++ .../swebench/results/all_preds.jsonl | 3 + .../swebench/results/pallets__flask-4992.json | 29 ++ .../swebench/results/psf__requests-1963.json | 27 ++ .../results/pytest-dev__pytest-7168.json | 27 ++ src/codegen/extensions/swebench/tests.py | 237 +++++++++++ src/codegen/extensions/swebench/utils.py | 186 +++++++++ 10 files changed, 1330 insertions(+) create mode 100644 src/codegen/extensions/swebench/README.md create mode 100644 src/codegen/extensions/swebench/agent.py create mode 100644 src/codegen/extensions/swebench/harness.py create mode 100755 src/codegen/extensions/swebench/report.py create mode 100644 
src/codegen/extensions/swebench/results/all_preds.jsonl
 create mode 100644 src/codegen/extensions/swebench/results/pallets__flask-4992.json
 create mode 100644 src/codegen/extensions/swebench/results/psf__requests-1963.json
 create mode 100644 src/codegen/extensions/swebench/results/pytest-dev__pytest-7168.json
 create mode 100755 src/codegen/extensions/swebench/tests.py
 create mode 100644 src/codegen/extensions/swebench/utils.py

diff --git a/src/codegen/extensions/swebench/README.md b/src/codegen/extensions/swebench/README.md
new file mode 100644
index 000000000..8ba547cf0
--- /dev/null
+++ b/src/codegen/extensions/swebench/README.md
@@ -0,0 +1,29 @@
+## Codegen Harness and Evaluator for SWE Bench
+
+This folder contains a harness and evaluator for the SWE Bench leaderboard, enabling developers to test and evaluate their codegen models on SWE Bench tasks.
+
+It integrates directly into the Codegen agentic framework and is designed to be built on top of.
+
+### Setup
+
+Remember to install all the project dependencies for this environment first.
+
+### Usage
+
+#### Edit agent.py, your codegen agent
+
+This file contains the main logic for the agent.
+
+The agent taps into Codegen's tree-sitter-backed codebase tools. You can modify it by adding additional tools, extending its capabilities, adjusting its prompts, and more.
+
+It is invoked by the harness script.
+
+#### Run harness.py to run the agent
+
+This script gathers the selected dataset, runs the agent on each instance, and saves the results.
+
+#### Run report.py to generate a report
+
+This script generates a report from the results: it loops through all the results and evaluates each one. Currently, there is an error in the docker image.
+
+There are currently example predictions in the `predictions/results` folder.
\ No newline at end of file
diff --git a/src/codegen/extensions/swebench/agent.py b/src/codegen/extensions/swebench/agent.py
new file mode 100644
index 000000000..6f5d5784a
--- /dev/null
+++ b/src/codegen/extensions/swebench/agent.py
@@ -0,0 +1,129 @@
+from langchain_openai import ChatOpenAI
+from codegen import Codebase
+
+"""Demo implementation of an agent with Codegen tools."""
+
+from langchain.agents import AgentExecutor
+from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent
+from langchain.hub import pull
+from langchain.tools import BaseTool
+from langchain_core.chat_history import InMemoryChatMessageHistory
+from langchain_core.messages import BaseMessage
+from langchain_core.runnables.history import RunnableWithMessageHistory
+from langchain_openai import ChatOpenAI
+
+from codegen import Codebase
+
+from codegen.extensions.langchain.tools import (
+    CommitTool,
+    CreateFileTool,
+    DeleteFileTool,
+    EditFileTool,
+    GithubCreatePRCommentTool,
+    GithubCreatePRReviewCommentTool,
+    GithubCreatePRTool,
+    GithubViewPRTool,
+    ListDirectoryTool,
+    MoveSymbolTool,
+    RenameFileTool,
+    RevealSymbolTool,
+    SearchTool,
+    SemanticEditTool,
+    SemanticSearchTool,
+    ViewFileTool,
+)
+
+
+def create_codebase_agent(
+    codebase: Codebase,
+    model_name: str = "gpt-4o",
+    temperature: float = 0,
+    verbose: bool = True,
+    chat_history: list[BaseMessage] = [],
+) -> RunnableWithMessageHistory:
+    """Create an agent with all codebase tools.
+ + Args: + codebase: The codebase to operate on + model_name: Name of the model to use (default: gpt-4) + temperature: Model temperature (default: 0) + verbose: Whether to print agent's thought process (default: True) + + Returns: + Initialized agent with message history + """ + # Initialize language model + llm = ChatOpenAI( + model_name=model_name, + temperature=temperature, + ) + + # Get all codebase tools + tools = [ + ViewFileTool(codebase), + ListDirectoryTool(codebase), + SearchTool(codebase), + EditFileTool(codebase), + CreateFileTool(codebase), + DeleteFileTool(codebase), + RenameFileTool(codebase), + MoveSymbolTool(codebase), + # RevealSymbolTool(codebase), + SemanticEditTool(codebase), + SemanticSearchTool(codebase), + CommitTool(codebase), + GithubCreatePRTool(codebase), + GithubViewPRTool(codebase), + GithubCreatePRCommentTool(codebase), + GithubCreatePRReviewCommentTool(codebase), + ] + + # Get the prompt to use + prompt = pull("hwchase17/openai-functions-agent") + + # Create the agent + agent = OpenAIFunctionsAgent( + llm=llm, + tools=tools, + prompt=prompt, + ) + + # Create the agent executor + agent_executor = AgentExecutor( + agent=agent, + tools=tools, + verbose=verbose, + ) + + # Create message history handler + message_history = InMemoryChatMessageHistory(messages=chat_history) + + # Wrap with message history + return RunnableWithMessageHistory( + agent_executor, + lambda session_id: message_history, + input_messages_key="input", + history_messages_key="chat_history", + ) + + +# Initialize codebase +codebase = Codebase.from_repo("fastapi/fastapi") + +# Create the agent with GPT-4 +agent = create_codebase_agent( + codebase=codebase, + model_name="gpt-4o", + temperature=0, + verbose=True +) + + + +# Analyze dependencies +result = agent.invoke( + {"input": "What are the dependencies of the FastAPI class?"}, + config={"configurable": {"session_id": "demo"}} +) +print(result["output"]) + diff --git a/src/codegen/extensions/swebench/harness.py b/src/codegen/extensions/swebench/harness.py new file mode 100644 index 000000000..90accfac1 --- /dev/null +++ b/src/codegen/extensions/swebench/harness.py @@ -0,0 +1,383 @@ +""" +This is the harness for running an AI agent on the SWE Bench dataset. + +""" + +#!/usr/bin/env python + +import json +import random +import subprocess +import sys +import tempfile +from pathlib import Path +import datetime +import pprint + +import lox + +# Replace the dump import with pprint +# from dump import dump +# from tests import run_tests +from utils import get_full_dataset # noqa: F401 +from utils import get_lite_dataset # noqa: F401 +from utils import get_plausible, load_predictions, pick_winner + + +# coding agent +from agent import create_codebase_agent +from codegen import Codebase + + +REPOS_DNAME = Path("repos") +CHAT_LOGS_DNAME = Path("chat-logs") +PREDS_DNAME = Path("predictions") + + +def diff_versus_commit(git_dname, commit): + """ + Take a diff of `git_dname` current contents versus the `commit`. + """ + + diff_cmd = f"git -C {git_dname} diff {commit}" + diff_output = subprocess.check_output(diff_cmd.split()).decode() + return diff_output + + +def files_in_patch(patch): + """ + Extract the list of modified files from a unified diff patch string. 
+ """ + files = [] + for line in patch.split("\n"): + if line.startswith("--- a/") or line.startswith("+++ b/"): + fname = line.split("/", 1)[1] + if fname not in files: + files.append(fname) + return files + + +def checkout_repo(git_tempdir, entry): + """ + Clone the SWE Bench entry's git `repo` into `dname` at the `base_commit`. + Make a tempdir if no `dname` provided. + """ + github_url = "https://github.com/" + repo_url = github_url + entry["repo"] + commit = entry["base_commit"] + + print(repo_url, commit) + + checkout_repo_url_commit(git_tempdir, repo_url, commit) + + +def checkout_repo_url_commit(repo_dname, url, commit): + """ + Clone the git `url` into `dname` at `commit`. + Check a local cache of the bare repo to avoid pulling from github every time. + """ + + # Extract repo name from URL + repo_name = url.split("/")[-1].split(".")[0] + repo_name += ".git" + + # dump(repo_name) + pprint.pprint(repo_name) + REPOS_DNAME.mkdir(exist_ok=True) + bare_repo = REPOS_DNAME / repo_name + + if not bare_repo.exists(): + cmd = f"git clone --bare {url} {bare_repo}" + subprocess.run(cmd.split(), check=True) + + cmd = f"git clone {bare_repo} {repo_dname}" + subprocess.run(cmd.split(), check=True) + + cmd = f"git -c advice.detachedHead=false -C {repo_dname} checkout {commit}" + subprocess.run(cmd.split(), check=True) + + +def show_problems(dataset): + """ + Print out all the instance_id and problem_descriptions. + """ + for inst, entry in dataset.items(): + problem = entry["problem_statement"].splitlines()[0] + print(f"{inst}: {problem}") + + +def run_pre_existing_tests(entry, git_dname): + """Given the current contents of the `git_dname`, run the tests that + were present in the entry's `repo` at the time of the + `base_commit` or which have been added into the repo since. This + checks if the code in the `git_dname` has broken pre-existing + tests or is failing any newly added tests. + + It does NOT attempt to run the tests in the `test_patch` which + are used to evaluate whether the `model_patch` has resolved the + `problem_statement`. + + Returns None if all the tests passed. Returns the text of the + test run output if any failed. + """ + + model_patch = diff_versus_commit(git_dname, entry["base_commit"]) + # passed, output = run_tests( + # entry, + # model_patch=model_patch, + # use_test_patch=False, + # ) + # We were UNABLE to run tests + # if passed is None: + # return + + # if passed: + # return + + # Just keep the output after the (no-op) test patch applied, + # which is the actual output from the tests that were run. + # output = output.split(">>>>> Applied Patch (test)")[-1] + + # return output + + + + +def process_one_instance(entry, out_dname): + """Process one `entry` from SWE Bench using the LLM `models` at the + given `temperature`. Set `model_name_or_path` in the result json. + Store the result json and the chat log into `out_dname`. + """ + + instance_id = entry["instance_id"] + base_commit = entry["base_commit"] + + print("=" * 60) + pprint.pprint(instance_id) + print("=" * 60) + problem_statement = entry["problem_statement"] + print(problem_statement) + + ### + # DO NOT assist aider by telling it which files need to be modified! 
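+    # oracle_files would expose the gold files taken from the reference patch; with
+    # oracle=False the gold file list is only used for stats/logging further below and
+    # is never shown to the agent.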
+ oracle = False + gold_files = files_in_patch(entry["patch"]) + if oracle: + oracle_files = gold_files + else: + oracle_files = None + ### + + + results = [] + cost = 0 + winner = None + + num_tries = 1 + # Do NUM_TRIES tries for each of the models, until we find a *plausible* solution + for attempt in range(1, num_tries + 1): + codebase = Codebase.from_repo( + repo_full_name=entry["repo"], + commit=entry["base_commit"], + language="python" + ) # check out the repo + + agent = create_codebase_agent( + codebase=codebase, + model_name="gpt-4o", + temperature=0, + verbose=True + ) + + # for usage for testing for the model + # test_cmd = lambda: run_pre_existing_tests(entry, codebase.repo_path) # noqa: E731 + + pprint.pprint(instance_id) + pprint.pprint(gold_files) + + message = """Below is a real GitHub issue from a popular GitHub repository. +The issue was filed some time ago. +The repo has been checked out at the commit that existed at the moment the issue was filed. +If you are already familiar with this repo, be cautious! +You are working with an old version of the repo! +Filenames, directory names, file contents, etc may be different than what you're used to. + +Propose changes to update the repo to fix the problem below. + +""" + message += problem_statement + + try: + result = agent.invoke( + {"input": message}, + config={"configurable": {"session_id": "demo"}} + ) + except Exception as coder_err: + # swallow any exceptions during benchmarking + pprint.pprint(coder_err) + continue + + + pprint.pprint(instance_id) + pprint.pprint(gold_files) + + + # Get the diff between the current state and the original commit + model_patch = diff_versus_commit(codebase.repo_path, base_commit) + pprint.pprint(model_patch) + + # Record the results for the logs + result = dict( + # Required args for running eval tests + instance_id=instance_id, + model_patch=model_patch, + # For computing stats + gold_files=gold_files, + edited_files=files_in_patch(model_patch) + ) + result["try"] = attempt # `try` is a python keyword + results.append(result) + + pprint.pprint(result) + + # Did we get a successful edit, lint and test? If so, we found a plausible solution! + if model_patch: + winner = result + break + + + # If there's no clear winner, look for the most viable result we got... + if not winner: + winner = pick_winner(results) + + if not winner: + result = dict( + # Required args for running eval tests + instance_id=instance_id, + model_patch=None, + ) + + pprint.pprint(winner) + if not winner: + return + + print("\n\nFinal diff:\n") + print(winner["model_patch"]) + + # Avoid circular reference when we save to json + winner = dict(winner) + + winner.update( + dict( + tries=attempt, + all_results=results, # Record all the results for later analysis + cost=cost, # total cost across all results + ) + ) + + out_fname = out_dname / (instance_id + ".json") + out_fname.write_text(json.dumps(winner, indent=4)) + + +def process_instances( + dataset, threads, prior_dnames +): + """ + dataset - The subset of the SWE Bench dataset to process. + threads - How many problems to attempt concurrently. + prior_dnames - Names of predictions/ dirnames from previous runs. + If they contain a plausible solution for an instance, + don't continue looking. + """ + + # Create the predictions directory if it doesn't exist + PREDS_DNAME.mkdir(exist_ok=True) + out_dname = PREDS_DNAME / "results" + out_dname.mkdir() + + pprint.pprint(out_dname) + + # If we are restarting this run, figure out which instances are already done. 
+ done_preds = load_predictions([out_dname]) + done_instances = set(done_preds.keys()) + pprint.pprint(len(done_instances)) + + pprint.pprint(prior_dnames) + prior_preds = load_predictions(prior_dnames) + pprint.pprint(len(prior_preds)) + + plausible_instances = get_plausible(prior_preds) + pprint.pprint(len(plausible_instances)) + + if prior_preds: + # Just keep trying to solve instances that exist in the previous runs + all_instances = set(prior_preds.keys()) + else: + all_instances = set(dataset.keys()) + + remaining_instances = set(all_instances) + remaining_instances -= done_instances + remaining_instances -= plausible_instances + + remaining_instances = list(remaining_instances) + random.shuffle(remaining_instances) + + pprint.pprint(sorted(remaining_instances)) + pprint.pprint(len(remaining_instances)) + + print() + print("press enter...") + input() + + if not CHAT_LOGS_DNAME.exists(): + CHAT_LOGS_DNAME.mkdir() + + chat_history_dname = CHAT_LOGS_DNAME / "results" + chat_history_dname.mkdir(exist_ok=True) + + if threads > 1: + process_one_instance_lox = lox.process(threads)(process_one_instance) + process_one_instance_func = process_one_instance_lox.scatter + gather = process_one_instance_lox.gather + else: + process_one_instance_func = process_one_instance + + for instance_id in remaining_instances: + if instance_id in done_instances: + print("skipping", instance_id) + continue + + process_one_instance_func( + dataset[instance_id], + out_dname, + ) + + print("#" * 60) + # input() + + if threads > 1: + gather() + + +def main(): + + # Load the SWE Bench dataset + # dataset = get_full_dataset() + # dataset = get_verified_dataset() + dataset = get_lite_dataset() + threads = 10 + + # Any predictions/ dirs provided on the command line are treated + # as earlier, higher priority runs. If a plausible solution was + # found for an instance already, we don't need to keep looking in + # this run. 
+ prior_dnames = sys.argv[1:] + + process_instances( + dataset, threads, prior_dnames + ) + + +if __name__ == "__main__": + status = main() + sys.exit(status) diff --git a/src/codegen/extensions/swebench/report.py b/src/codegen/extensions/swebench/report.py new file mode 100755 index 000000000..ad36d238d --- /dev/null +++ b/src/codegen/extensions/swebench/report.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python + +import json +import os +import random +import shutil +import subprocess +import sys +from collections import defaultdict +from pathlib import Path +from pprint import pprint + + +from tests import remove_patches_to_tests, run_tests +from utils import ( + FULL_DATASET_FNAME, + choose_predictions, + get_dataset, + load_predictions, +) + +using_dataset = "lite" + +NUM_EVAL_PROCS = 5 + +def run_evals(swe_bench_tasks, log_dir, predictions_jsonl): + run_evals_cmd = f""" +python -m swebench.harness.run_evaluation + --predictions_path {predictions_jsonl} + --max_workers 32 + --run_id {log_dir.replace('/', '-')} + --dataset_name princeton-nlp/SWE-bench_Lite + --cache_level instance + --report_dir {log_dir} +""" + run_evals_cmd = " ".join([line.strip() for line in run_evals_cmd.split() if line.strip()]) + print("Running evaluation command:", run_evals_cmd) + + subprocess.run(run_evals_cmd.split(), check=True) + + +def get_report(swe_bench_tasks, log_dir, predictions_jsonl, model_name_or_path): + # Load and parse the evaluation results directly from the predictions file + results = defaultdict(list) + + with open(predictions_jsonl) as f: + for line in f: + pred = json.loads(line) + instance_id = pred['instance_id'] + + # Track basic stats + results['generated'].append(instance_id) + + # Check for evaluation logs + log_file = Path(log_dir) / f"{instance_id}.eval.log" + if log_file.exists(): + results['with_logs'].append(instance_id) + log_content = log_file.read_text() + + if "PASS" in log_content: + results['resolved'].append(instance_id) + results['applied'].append(instance_id) + elif "FAIL" in log_content: + results['applied'].append(instance_id) + else: + results['no_apply'].append(instance_id) + else: + results['no_logs'].append(instance_id) + + # Convert lists to sets for compatibility with existing code + return {k: set(v) for k, v in results.items()} + + +def update_pred_json(predictions, report): + all_instances = set(report.get("generated", [])) + all_instances.update(set(report.get("no_generation", []))) + + for instance_id, pred in predictions.items(): + # Use get() to handle missing 'resolved' key, defaulting to empty set + was_resolved = instance_id in report.get("resolved", set()) + if "resolved" in pred and pred["resolved"] == was_resolved: + continue + + assert instance_id in all_instances, instance_id + + pred["resolved"] = was_resolved + save = dict(pred) + + # Construct json_fname if it doesn't exist + if "json_fname" not in pred: + json_fname = Path("predictions/results") / f"{instance_id}.json" + else: + json_fname = pred["json_fname"] + del save["json_fname"] # Remove from save data if it exists + + Path(json_fname).write_text(json.dumps(save, indent=4)) + + return predictions + + +def preds_to_jsonl(dname, predictions): + dname = Path(dname) + + predictions_jsonl = str(dname / "all_preds.jsonl") + print(f"Creating JSONL file: {predictions_jsonl}") + + # Use a default model name since it's not in the predictions + model_name = "results" + + with open(predictions_jsonl, "w") as fh: + for inst, pred in predictions.items(): + minimal_pred = { + "model_name_or_path": model_name, 
# Use default model name + "model_patch": remove_patches_to_tests(pred["model_patch"]) if "model_patch" in pred else pred.get("patch", ""), + "instance_id": pred["instance_id"], + } + fh.write(json.dumps(minimal_pred) + "\n") + return predictions_jsonl + + +def run_evals_on_dname(dname): + dname = Path(dname) + + predictions = load_predictions([dname], devin_only=(using_dataset == "devin")) + + predictions_jsonl = preds_to_jsonl(dname, predictions) + pprint(predictions_jsonl) + + log_dir = Path("logs") / dname.name + log_dir.mkdir(exist_ok=True, parents=True) + pprint(log_dir) + + any_need_evals = any("resolved" not in pred for pred in predictions.values()) + any_need_evals = True + if any_need_evals: + run_evals(FULL_DATASET_FNAME, str(log_dir), predictions_jsonl) + + model_name_or_path = list(predictions.values())[0]["model_name_or_path"] + report = get_report(FULL_DATASET_FNAME, log_dir, predictions_jsonl, model_name_or_path) + predictions = update_pred_json(predictions, report) + + return predictions_jsonl, log_dir + + +def combine_jsonl_logs(predictions, model_name_or_path): + logs = Path("logs") + log_dir = logs / model_name_or_path + + log_dir.mkdir(exist_ok=True) + pprint(log_dir) + + preds_dir = Path("predictions") / model_name_or_path + + predictions_jsonl = preds_to_jsonl(preds_dir, predictions) + for inst, pred in predictions.items(): + from_fname = logs / pred["dname"] + # dump(from_fname, inst) + from_fname = list(from_fname.glob(f"{inst}.*.log")) + assert len(from_fname) <= 1, from_fname + if not len(from_fname): + print("Missing", pred["dname"], inst) + continue + from_fname = from_fname[0] + # dump(from_fname) + + to_fname = log_dir / f"{inst}.{model_name_or_path}.eval.log" + # dump(from_fname, to_fname) + shutil.copyfile(from_fname, to_fname) + + return predictions_jsonl, log_dir + + +def main(): + # Automatically find all JSON files in predictions/results + results_dir = Path("predictions/results") + if not results_dir.exists(): + print(f"Directory does not exist: {results_dir}") + return 1 + + prediction_files = list(results_dir.glob("*.json")) + print(f"Found {len(prediction_files)} prediction files") + + predictions = {} + for file_path in prediction_files: + try: + with open(file_path) as f: + prediction = json.load(f) + if isinstance(prediction, dict) and "instance_id" in prediction: + predictions[prediction["instance_id"]] = prediction + except json.JSONDecodeError: + print(f"Error reading JSON from {file_path}") + continue + + print(f"Successfully loaded {len(predictions)} predictions") + + if predictions: + # Create predictions JSONL file + predictions_jsonl = preds_to_jsonl("predictions/results", predictions) + print(f"\nCreated predictions JSONL: {predictions_jsonl}") + + # Setup log directory + log_dir = Path("logs/results") + log_dir.mkdir(exist_ok=True, parents=True) + print(f"Using log directory: {log_dir}") + + # Run evaluations + run_evals(FULL_DATASET_FNAME, str(log_dir), predictions_jsonl) + + # Get and display report + model_name = "results" # or whatever model name you want to use + report = get_report(FULL_DATASET_FNAME, log_dir, predictions_jsonl, model_name) + + print("\nEvaluation Results:") + print(f"Total predictions: {len(predictions)}") + print(f"Successfully applied: {len(report.get('applied', []))}") + print(f"Resolved: {len(report.get('resolved', []))}") + print(f"Failed to apply: {len(report.get('no_apply', []))}") + print(f"With logs: {len(report.get('with_logs', []))}") + print(f"No logs: {len(report.get('no_logs', []))}") + + # Update 
prediction JSONs with results + predictions = update_pred_json(predictions, report) + else: + print("No valid predictions found") + return 1 + + return 0 + + +def stats_on_tests_before_and_after(report, predictions): + num = 0 + num_before_pass = 0 + num_pass_to_fail = 0 + + dataset = get_dataset() + + random.shuffle(predictions) + + outcomes = defaultdict(int) + for pred in predictions: + instance_id = pred["instance_id"] + + # if instance_id not in has_patch_not_resolved: + # continue + + num += 1 + + entry = dataset[instance_id] + before_passed, _ = run_tests(entry) + if not before_passed: + continue + + after_passed, _ = run_tests(entry, model_patch=pred["model_patch"]) + + resolved = instance_id in report["resolved"] + pprint(before_passed, after_passed, resolved) + outcome = (before_passed, after_passed, resolved) + outcomes[outcome] += 1 + pprint(sorted(outcomes.items())) + + if before_passed: + num_before_pass += 1 + if before_passed and not after_passed: + num_pass_to_fail += 1 + + print() + pprint(num) + pprint(num_before_pass) + pprint(num_pass_to_fail) + + pct_before_pass = num_before_pass / num * 100 + pprint(pct_before_pass) + pct_pass_to_fail = num_pass_to_fail / num_before_pass * 100 + pprint(pct_pass_to_fail) + + print() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/codegen/extensions/swebench/results/all_preds.jsonl b/src/codegen/extensions/swebench/results/all_preds.jsonl new file mode 100644 index 000000000..473d123da --- /dev/null +++ b/src/codegen/extensions/swebench/results/all_preds.jsonl @@ -0,0 +1,3 @@ +{"model_name_or_path": "results", "model_patch": "diff --git a/src/_pytest/_io/saferepr.py b/src/_pytest/_io/saferepr.py\nindex 23af4d0..eb03600 100644\n--- a/src/_pytest/_io/saferepr.py\n+++ b/src/_pytest/_io/saferepr.py\n@@ -17,6 +17,8 @@ def _format_repr_exception(exc: BaseException, obj: Any) -> str:\n exc_info = _try_repr_or_str(exc)\n except (KeyboardInterrupt, SystemExit):\n raise\n+ except RuntimeError as exc:\n+ exc_info = \"RuntimeError: No active exception to reraise\"\n except BaseException as exc:\n exc_info = \"unpresentable exception ({})\".format(_try_repr_or_str(exc))\n return \"<[{} raised in repr()] {} object at 0x{:x}>\".format(\n@@ -100,4 +102,4 @@ class AlwaysDispatchingPrettyPrinter(pprint.PrettyPrinter):\n def _pformat_dispatch(object, indent=1, width=80, depth=None, *, compact=False):\n return AlwaysDispatchingPrettyPrinter(\n indent=indent, width=width, depth=depth, compact=compact\n- ).pformat(object)\n+ ).pformat(object)\n\\ No newline at end of file\n", "instance_id": "pytest-dev__pytest-7168"} +{"model_name_or_path": "results", "model_patch": "diff --git a/requests/sessions.py b/requests/sessions.py\nindex 425db22..d7afd2a 100644\n--- a/requests/sessions.py\n+++ b/requests/sessions.py\n@@ -86,6 +86,7 @@ class SessionRedirectMixin(object):\n \"\"\"Receives a Response. 
Returns a generator of Responses.\"\"\"\n \n i = 0\n+ method = req.method # Track the current method\n \n while resp.is_redirect:\n prepared_request = req.copy()\n@@ -99,7 +100,6 @@ class SessionRedirectMixin(object):\n resp.close()\n \n url = resp.headers['location']\n- method = req.method\n \n # Handle redirection without scheme (see: RFC 1808 Section 4)\n if url.startswith('//'):\n@@ -156,7 +156,7 @@ class SessionRedirectMixin(object):\n \n if 'Authorization' in headers:\n # If we get redirected to a new host, we should strip out any\n- #\u00a0authentication headers.\n+ # authentication headers.\n original_parsed = urlparse(resp.request.url)\n redirect_parsed = urlparse(url)\n \n@@ -568,4 +568,4 @@ class Session(SessionRedirectMixin):\n def session():\n \"\"\"Returns a :class:`Session` for context-management.\"\"\"\n \n- return Session()\n+ return Session()\n\\ No newline at end of file\n", "instance_id": "psf__requests-1963"} +{"model_name_or_path": "results", "model_patch": "diff --git a/src/flask/config.py b/src/flask/config.py\nindex d4fc310..f0a2fc0 100644\n--- a/src/flask/config.py\n+++ b/src/flask/config.py\n@@ -234,6 +234,7 @@ class Config(dict):\n filename: str,\n load: t.Callable[[t.IO[t.Any]], t.Mapping],\n silent: bool = False,\n+ mode: str = 'r', # Add mode parameter with default 'r'\n ) -> bool:\n \"\"\"Update the values in the config from a file that is loaded\n using the ``load`` parameter. The loaded data is passed to the\n@@ -254,6 +255,7 @@ class Config(dict):\n :type load: ``Callable[[Reader], Mapping]`` where ``Reader``\n implements a ``read`` method.\n :param silent: Ignore the file if it doesn't exist.\n+ :param mode: The mode in which to open the file. Default is 'r'.\n :return: ``True`` if the file was loaded successfully.\n \n .. versionadded:: 2.0\n@@ -261,7 +263,7 @@ class Config(dict):\n filename = os.path.join(self.root_path, filename)\n \n try:\n- with open(filename) as f:\n+ with open(filename, mode=mode) as f: # Use the mode parameter\n obj = load(f)\n except OSError as e:\n if silent and e.errno in (errno.ENOENT, errno.EISDIR):\n@@ -335,4 +337,4 @@ class Config(dict):\n return rv\n \n def __repr__(self) -> str:\n- return f\"<{type(self).__name__} {dict.__repr__(self)}>\"\n+ return f\"<{type(self).__name__} {dict.__repr__(self)}>\"\n\\ No newline at end of file\n", "instance_id": "pallets__flask-4992"} diff --git a/src/codegen/extensions/swebench/results/pallets__flask-4992.json b/src/codegen/extensions/swebench/results/pallets__flask-4992.json new file mode 100644 index 000000000..13643ac16 --- /dev/null +++ b/src/codegen/extensions/swebench/results/pallets__flask-4992.json @@ -0,0 +1,29 @@ +{ + "instance_id": "pallets__flask-4992", + "model_patch": "diff --git a/src/flask/config.py b/src/flask/config.py\nindex d4fc310..f0a2fc0 100644\n--- a/src/flask/config.py\n+++ b/src/flask/config.py\n@@ -234,6 +234,7 @@ class Config(dict):\n filename: str,\n load: t.Callable[[t.IO[t.Any]], t.Mapping],\n silent: bool = False,\n+ mode: str = 'r', # Add mode parameter with default 'r'\n ) -> bool:\n \"\"\"Update the values in the config from a file that is loaded\n using the ``load`` parameter. The loaded data is passed to the\n@@ -254,6 +255,7 @@ class Config(dict):\n :type load: ``Callable[[Reader], Mapping]`` where ``Reader``\n implements a ``read`` method.\n :param silent: Ignore the file if it doesn't exist.\n+ :param mode: The mode in which to open the file. Default is 'r'.\n :return: ``True`` if the file was loaded successfully.\n \n .. 
versionadded:: 2.0\n@@ -261,7 +263,7 @@ class Config(dict):\n filename = os.path.join(self.root_path, filename)\n \n try:\n- with open(filename) as f:\n+ with open(filename, mode=mode) as f: # Use the mode parameter\n obj = load(f)\n except OSError as e:\n if silent and e.errno in (errno.ENOENT, errno.EISDIR):\n@@ -335,4 +337,4 @@ class Config(dict):\n return rv\n \n def __repr__(self) -> str:\n- return f\"<{type(self).__name__} {dict.__repr__(self)}>\"\n+ return f\"<{type(self).__name__} {dict.__repr__(self)}>\"\n\\ No newline at end of file\ndiff --git a/tests/test_config.py b/tests/test_config.py\nindex 76c5d27..782897b 100644\n--- a/tests/test_config.py\n+++ b/tests/test_config.py\n@@ -1,6 +1,7 @@\n import json\n import os\n import textwrap\n+import tomllib\n \n import pytest\n \n@@ -239,11 +240,18 @@ def test_from_pyfile_weird_encoding(tmpdir, encoding):\n textwrap.dedent(\n f\"\"\"\n # -*- coding: {encoding} -*-\n- TEST_VALUE = \"f\u00f6\u00f6\"\n+ TEST_VALUE = \"f\u000f\u000f\"\n \"\"\"\n ).encode(encoding)\n )\n app = flask.Flask(__name__)\n app.config.from_pyfile(str(f))\n value = app.config[\"TEST_VALUE\"]\n- assert value == \"f\u00f6\u00f6\"\n+ assert value == \"f\u000f\u000f\"\n+\n+\n+def test_config_from_toml_file():\n+ app = flask.Flask(__name__)\n+ current_dir = os.path.dirname(os.path.abspath(__file__))\n+ app.config.from_file(os.path.join(current_dir, \"static\", \"config.toml\"), tomllib.load, mode=\"rb\")\n+ common_object_test(app)\n", + "gold_files": [ + "src/flask/config.py" + ], + "edited_files": [ + "src/flask/config.py", + "tests/test_config.py" + ], + "try": 1, + "tries": 1, + "all_results": [ + { + "instance_id": "pallets__flask-4992", + "model_patch": "diff --git a/src/flask/config.py b/src/flask/config.py\nindex d4fc310..f0a2fc0 100644\n--- a/src/flask/config.py\n+++ b/src/flask/config.py\n@@ -234,6 +234,7 @@ class Config(dict):\n filename: str,\n load: t.Callable[[t.IO[t.Any]], t.Mapping],\n silent: bool = False,\n+ mode: str = 'r', # Add mode parameter with default 'r'\n ) -> bool:\n \"\"\"Update the values in the config from a file that is loaded\n using the ``load`` parameter. The loaded data is passed to the\n@@ -254,6 +255,7 @@ class Config(dict):\n :type load: ``Callable[[Reader], Mapping]`` where ``Reader``\n implements a ``read`` method.\n :param silent: Ignore the file if it doesn't exist.\n+ :param mode: The mode in which to open the file. Default is 'r'.\n :return: ``True`` if the file was loaded successfully.\n \n .. 
versionadded:: 2.0\n@@ -261,7 +263,7 @@ class Config(dict):\n filename = os.path.join(self.root_path, filename)\n \n try:\n- with open(filename) as f:\n+ with open(filename, mode=mode) as f: # Use the mode parameter\n obj = load(f)\n except OSError as e:\n if silent and e.errno in (errno.ENOENT, errno.EISDIR):\n@@ -335,4 +337,4 @@ class Config(dict):\n return rv\n \n def __repr__(self) -> str:\n- return f\"<{type(self).__name__} {dict.__repr__(self)}>\"\n+ return f\"<{type(self).__name__} {dict.__repr__(self)}>\"\n\\ No newline at end of file\ndiff --git a/tests/test_config.py b/tests/test_config.py\nindex 76c5d27..782897b 100644\n--- a/tests/test_config.py\n+++ b/tests/test_config.py\n@@ -1,6 +1,7 @@\n import json\n import os\n import textwrap\n+import tomllib\n \n import pytest\n \n@@ -239,11 +240,18 @@ def test_from_pyfile_weird_encoding(tmpdir, encoding):\n textwrap.dedent(\n f\"\"\"\n # -*- coding: {encoding} -*-\n- TEST_VALUE = \"f\u00f6\u00f6\"\n+ TEST_VALUE = \"f\u000f\u000f\"\n \"\"\"\n ).encode(encoding)\n )\n app = flask.Flask(__name__)\n app.config.from_pyfile(str(f))\n value = app.config[\"TEST_VALUE\"]\n- assert value == \"f\u00f6\u00f6\"\n+ assert value == \"f\u000f\u000f\"\n+\n+\n+def test_config_from_toml_file():\n+ app = flask.Flask(__name__)\n+ current_dir = os.path.dirname(os.path.abspath(__file__))\n+ app.config.from_file(os.path.join(current_dir, \"static\", \"config.toml\"), tomllib.load, mode=\"rb\")\n+ common_object_test(app)\n", + "gold_files": [ + "src/flask/config.py" + ], + "edited_files": [ + "src/flask/config.py", + "tests/test_config.py" + ], + "try": 1 + } + ], + "cost": 0, + "resolved": false +} \ No newline at end of file diff --git a/src/codegen/extensions/swebench/results/psf__requests-1963.json b/src/codegen/extensions/swebench/results/psf__requests-1963.json new file mode 100644 index 000000000..8146ce1d5 --- /dev/null +++ b/src/codegen/extensions/swebench/results/psf__requests-1963.json @@ -0,0 +1,27 @@ +{ + "instance_id": "psf__requests-1963", + "model_patch": "diff --git a/requests/sessions.py b/requests/sessions.py\nindex 425db22..d7afd2a 100644\n--- a/requests/sessions.py\n+++ b/requests/sessions.py\n@@ -86,6 +86,7 @@ class SessionRedirectMixin(object):\n \"\"\"Receives a Response. 
Returns a generator of Responses.\"\"\"\n \n i = 0\n+ method = req.method # Track the current method\n \n while resp.is_redirect:\n prepared_request = req.copy()\n@@ -99,7 +100,6 @@ class SessionRedirectMixin(object):\n resp.close()\n \n url = resp.headers['location']\n- method = req.method\n \n # Handle redirection without scheme (see: RFC 1808 Section 4)\n if url.startswith('//'):\n@@ -156,7 +156,7 @@ class SessionRedirectMixin(object):\n \n if 'Authorization' in headers:\n # If we get redirected to a new host, we should strip out any\n- #\u00a0authentication headers.\n+ # authentication headers.\n original_parsed = urlparse(resp.request.url)\n redirect_parsed = urlparse(url)\n \n@@ -568,4 +568,4 @@ class Session(SessionRedirectMixin):\n def session():\n \"\"\"Returns a :class:`Session` for context-management.\"\"\"\n \n- return Session()\n+ return Session()\n\\ No newline at end of file\n", + "gold_files": [ + "requests/sessions.py" + ], + "edited_files": [ + "requests/sessions.py" + ], + "try": 1, + "tries": 1, + "all_results": [ + { + "instance_id": "psf__requests-1963", + "model_patch": "diff --git a/requests/sessions.py b/requests/sessions.py\nindex 425db22..d7afd2a 100644\n--- a/requests/sessions.py\n+++ b/requests/sessions.py\n@@ -86,6 +86,7 @@ class SessionRedirectMixin(object):\n \"\"\"Receives a Response. Returns a generator of Responses.\"\"\"\n \n i = 0\n+ method = req.method # Track the current method\n \n while resp.is_redirect:\n prepared_request = req.copy()\n@@ -99,7 +100,6 @@ class SessionRedirectMixin(object):\n resp.close()\n \n url = resp.headers['location']\n- method = req.method\n \n # Handle redirection without scheme (see: RFC 1808 Section 4)\n if url.startswith('//'):\n@@ -156,7 +156,7 @@ class SessionRedirectMixin(object):\n \n if 'Authorization' in headers:\n # If we get redirected to a new host, we should strip out any\n- #\u00a0authentication headers.\n+ # authentication headers.\n original_parsed = urlparse(resp.request.url)\n redirect_parsed = urlparse(url)\n \n@@ -568,4 +568,4 @@ class Session(SessionRedirectMixin):\n def session():\n \"\"\"Returns a :class:`Session` for context-management.\"\"\"\n \n- return Session()\n+ return Session()\n\\ No newline at end of file\n", + "gold_files": [ + "requests/sessions.py" + ], + "edited_files": [ + "requests/sessions.py" + ], + "try": 1 + } + ], + "cost": 0, + "resolved": false +} \ No newline at end of file diff --git a/src/codegen/extensions/swebench/results/pytest-dev__pytest-7168.json b/src/codegen/extensions/swebench/results/pytest-dev__pytest-7168.json new file mode 100644 index 000000000..e9a37a220 --- /dev/null +++ b/src/codegen/extensions/swebench/results/pytest-dev__pytest-7168.json @@ -0,0 +1,27 @@ +{ + "instance_id": "pytest-dev__pytest-7168", + "model_patch": "diff --git a/src/_pytest/_io/saferepr.py b/src/_pytest/_io/saferepr.py\nindex 23af4d0..eb03600 100644\n--- a/src/_pytest/_io/saferepr.py\n+++ b/src/_pytest/_io/saferepr.py\n@@ -17,6 +17,8 @@ def _format_repr_exception(exc: BaseException, obj: Any) -> str:\n exc_info = _try_repr_or_str(exc)\n except (KeyboardInterrupt, SystemExit):\n raise\n+ except RuntimeError as exc:\n+ exc_info = \"RuntimeError: No active exception to reraise\"\n except BaseException as exc:\n exc_info = \"unpresentable exception ({})\".format(_try_repr_or_str(exc))\n return \"<[{} raised in repr()] {} object at 0x{:x}>\".format(\n@@ -100,4 +102,4 @@ class AlwaysDispatchingPrettyPrinter(pprint.PrettyPrinter):\n def _pformat_dispatch(object, indent=1, width=80, depth=None, *, 
compact=False):\n return AlwaysDispatchingPrettyPrinter(\n indent=indent, width=width, depth=depth, compact=compact\n- ).pformat(object)\n+ ).pformat(object)\n\\ No newline at end of file\n", + "gold_files": [ + "src/_pytest/_io/saferepr.py" + ], + "edited_files": [ + "src/_pytest/_io/saferepr.py" + ], + "try": 1, + "tries": 1, + "all_results": [ + { + "instance_id": "pytest-dev__pytest-7168", + "model_patch": "diff --git a/src/_pytest/_io/saferepr.py b/src/_pytest/_io/saferepr.py\nindex 23af4d0..eb03600 100644\n--- a/src/_pytest/_io/saferepr.py\n+++ b/src/_pytest/_io/saferepr.py\n@@ -17,6 +17,8 @@ def _format_repr_exception(exc: BaseException, obj: Any) -> str:\n exc_info = _try_repr_or_str(exc)\n except (KeyboardInterrupt, SystemExit):\n raise\n+ except RuntimeError as exc:\n+ exc_info = \"RuntimeError: No active exception to reraise\"\n except BaseException as exc:\n exc_info = \"unpresentable exception ({})\".format(_try_repr_or_str(exc))\n return \"<[{} raised in repr()] {} object at 0x{:x}>\".format(\n@@ -100,4 +102,4 @@ class AlwaysDispatchingPrettyPrinter(pprint.PrettyPrinter):\n def _pformat_dispatch(object, indent=1, width=80, depth=None, *, compact=False):\n return AlwaysDispatchingPrettyPrinter(\n indent=indent, width=width, depth=depth, compact=compact\n- ).pformat(object)\n+ ).pformat(object)\n\\ No newline at end of file\n", + "gold_files": [ + "src/_pytest/_io/saferepr.py" + ], + "edited_files": [ + "src/_pytest/_io/saferepr.py" + ], + "try": 1 + } + ], + "cost": 0, + "resolved": false +} \ No newline at end of file diff --git a/src/codegen/extensions/swebench/tests.py b/src/codegen/extensions/swebench/tests.py new file mode 100755 index 000000000..b81b6b274 --- /dev/null +++ b/src/codegen/extensions/swebench/tests.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python + +import json +import random +import sys +import pprint +from collections import defaultdict +from pathlib import Path + +from utils import get_dataset, load_predictions # noqa: F401 + +# A no-op patch which creates an empty file is used to stand in for +# the `model_patch` and/or `test_patch` when running SWE Bench tests +# without one or both of those patches. +NOOP_PATCH = ( + "diff --git a/empty.file.{nonce}.ignore b/empty.file.{nonce}.ignore\n" + "new file mode 100644\n" + "index 0000000..e69de29\n" +) + +def remove_patches_to_tests(model_patch): + """ + Remove any changes to the tests directory from the provided patch. + This is to ensure that the model_patch does not disturb the repo's + tests when doing acceptance testing with the `test_patch`. + """ + if not model_patch: + return model_patch + + lines = model_patch.splitlines(keepends=True) + filtered_lines = [] + is_tests = False + + for line in lines: + if line.startswith("diff --git a/"): + pieces = line.split() + to = pieces[-1] + if to.startswith("b/") and ( + "/test/" in to + or "/tests/" in to + or "/testing/" in to + or "/test_" in to + or "/tox.ini" in to + ): + is_tests = True + else: + is_tests = False + + if not is_tests: + filtered_lines.append(line) + + return "".join(filtered_lines) + +def files_in_patch(patch): + """ + Extract the list of modified files from a unified diff patch string. 
+ """ + files = [] + for line in patch.split("\n"): + if line.startswith("--- a/") or line.startswith("+++ b/"): + fname = line.split("/", 1)[1] + if fname not in files: + files.append(fname) + return files + +def run_tests(entry, model_patch=None, use_test_patch=False, model_name_or_path="none"): + """ + Run tests for the SWE Bench `entry`, optionally applying a `model_patch` first. + + If `use_test_patch` is True, then also apply the `test_patch` to bring in + the tests which determine if the issue is resolved. So False means + only run the tests that existed at the `base_commit` and any new/changed + tests contained in the `model_patch`. + + Optionally specify a `model_name_or_path`, which isn't really used since + the log_dir for the tests is a temp dir which is discarded. + """ + instance_id = entry["instance_id"] + + test_type = MAP_REPO_TO_TEST_FRAMEWORK[entry["repo"]] + test_directives = get_test_directives(entry) + test_cmd = f"{test_type} {' '.join(test_directives)}" + + # Use a no-op patch if no model_patch is provided + if not model_patch: + model_patch = NOOP_PATCH.format(nonce="model_patch") + + # Use a no-op patch if use_test_patch is False + if use_test_patch: + test_patch = entry["test_patch"] + else: + test_patch = NOOP_PATCH.format(nonce="test_patch") + + if model_patch and use_test_patch: + # Make sure the model_patch does not disturb the repo's tests + # when doing acceptance testing with the `test_patch`. + print("=" * 30) + print(model_patch) + model_patch = remove_patches_to_tests(model_patch) + print("=" * 30) + print(model_patch) + print("=" * 30) + + entry_instance = { + "repo": entry["repo"], + "version": entry["version"], + "base_commit": entry["base_commit"], + "instance_id": entry["instance_id"], + "model_name_or_path": model_name_or_path, + "model_patch": model_patch, + "test_patch": test_patch, + "test_directives": test_directives, + "test_cmd": test_cmd, + } + + namespace = "aorwall" + with tempfile.TemporaryDirectory(dir="/mnt/aider") as log_dir: + timeout = 60 + log_suffix = "" + + asyncio.run(run_docker_evaluation(entry_instance, namespace, log_dir, timeout, log_suffix)) + + log_fname = Path(log_dir) / f"{instance_id}.{model_name_or_path}.eval.log" + if not log_fname.exists(): + return None, "" + + log_text = log_fname.read_text() + log_lines = log_text.splitlines() + log_lines = [line for line in log_lines if line.startswith(">>>>")] + print("\n".join(log_lines)) + + passed = ">>>>> All Tests Passed" in log_text + + return passed, log_text + + +def main_check_docker_images(): + dataset = get_dataset() + + # instances = get_devin_instance_ids() + instances = list(dataset.keys()) + random.shuffle(instances) + + cache_fname = Path("tmp.dockerimages.json") + if cache_fname.exists(): + data = json.loads(cache_fname.read_text()) + good_dockers = defaultdict(int, data["good"]) + bad_dockers = defaultdict(int, data["bad"]) + seen_instances = set(data["instances"]) + else: + good_dockers = defaultdict(int) + bad_dockers = defaultdict(int) + seen_instances = set() + + for instance_id in instances: + entry = dataset[instance_id] + + if instance_id in seen_instances: + continue + + seen_instances.add(instance_id) + + docker_image = get_docker_image(entry) + if docker_image in bad_dockers: + bad_dockers[docker_image] += 1 + continue + + if docker_image in good_dockers: + good_dockers[docker_image] += 1 + continue + + pprint.pprint(instance_id) + pprint.pprint(docker_image) + + passed, test_text = run_tests( + entry, + model_patch=None, + use_test_patch=False, + ) + 
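+        # run_tests() returns passed=None when no eval log was produced at all,
+        # i.e. the docker image for this instance could not be run.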
if passed is None: + bad_dockers[docker_image] += 1 + else: + good_dockers[docker_image] += 1 + + update_cache(cache_fname, seen_instances, good_dockers, bad_dockers) + + update_cache(cache_fname, seen_instances, good_dockers, bad_dockers) + + pprint.pprint(bad_dockers) + + +def update_cache(cache_fname, instances, good_dockers, bad_dockers): + save_dict = dict( + instances=list(instances), + good=dict(good_dockers), + bad=dict(bad_dockers), + ) + cache_fname.write_text(json.dumps(save_dict, indent=4, sort_keys=True)) + + total_instances = sum(good_dockers.values()) + sum(bad_dockers.values()) + pprint.pprint(total_instances) + bad_instances = sum(bad_dockers.values()) + pprint.pprint(bad_instances) + if total_instances: + pct_bad_instances = bad_instances / total_instances * 100 + pprint.pprint(pct_bad_instances) + pprint.pprint(len(bad_dockers)) + + +def main_preds(): + dataset = get_dataset() + + dnames = sys.argv[1:] + preds = load_predictions(dnames) + + num = 0 + num_passed = 0 + for instance_id, pred in preds.items(): + entry = dataset[instance_id] + + passed, test_text = run_tests( + entry, + model_patch=pred["model_patch"], + use_test_patch=True, + ) + + num += 1 + if passed: + num_passed += 1 + + pprint.pprint((num_passed, num)) + + +if __name__ == "__main__": + status = main_check_docker_images() + # status = main_preds() + sys.exit(status) diff --git a/src/codegen/extensions/swebench/utils.py b/src/codegen/extensions/swebench/utils.py new file mode 100644 index 000000000..0593a1a59 --- /dev/null +++ b/src/codegen/extensions/swebench/utils.py @@ -0,0 +1,186 @@ +import datetime +import json +import shutil +from pathlib import Path +from pprint import pprint + +from datasets import load_dataset + + +FULL_DATASET = "princeton-nlp/SWE-bench" +FULL_DATASET_FNAME = FULL_DATASET.replace("/", "--") + ".json" + + +VERIFIED_DATASET = "princeton-nlp/SWE-bench-verified" +VERIFIED_DATASET_FNAME = VERIFIED_DATASET.replace("/", "--") + ".json" + +LITE_DATASET = "princeton-nlp/SWE-bench_Lite" +LITE_DATASET_FNAME = LITE_DATASET.replace("/", "--") + ".json" + + +def dump_dataset(dataset, fname): + """ + Save the dataset to json. + """ + entries = list(dataset) + for entry in entries: + entry["FAIL_TO_PASS"] = json.loads(entry["FAIL_TO_PASS"]) + entry["PASS_TO_PASS"] = json.loads(entry["PASS_TO_PASS"]) + + with open(fname, "w") as f: + json.dump(entries, f, indent=4) + + +def get_full_dataset(): + return get_dataset(FULL_DATASET, FULL_DATASET_FNAME) + + +def get_lite_dataset(): + return get_dataset(LITE_DATASET, LITE_DATASET_FNAME) + + +def get_verified_dataset(): + return get_dataset(VERIFIED_DATASET, VERIFIED_DATASET_FNAME) + + +def get_dataset(dataset, fname): + """ + Load the `DATASET` from hugging face, and turn it into a dict + keyed on `instance_id`. + Cache the dict locally in a json file. 
+ """ + + fname = Path(fname) + if fname.exists(): + dataset = json.loads(fname.read_text()) + else: + pprint(dataset) + dataset = load_dataset(dataset) + dataset = dataset["test"] + dump_dataset(dataset, fname) + pprint(dataset) + + res = dict() + for entry in dataset: + res[entry["instance_id"]] = entry + + return res + + +def load_predictions(paths): + prediction_paths = [] + for path in paths: + path = Path(path) + if path.is_file(): + prediction_paths.append(path) + elif path.is_dir(): + prediction_paths += list(path.glob("*.json")) + else: + assert False, path + + # prediction_paths.sort(key=lambda p: p.stat().st_mtime) + + predictions = dict() + for fname in prediction_paths: + try: + pred = json.loads(fname.read_text()) + except json.decoder.JSONDecodeError as err: + pprint(fname) + raise err + + if "instance_id" not in pred: + print("Skipping json without instance_id", fname) + continue + + inst = pred["instance_id"] + pred["json_fname"] = str(fname) + predictions[inst] = pred + + return predictions + + +def is_plausible(pred): + attrs = "model_patch edit_outcome lint_outcome test_outcome".split() + for attr in attrs: + if not pred.get(attr): + return + return True + + +def get_plausible(preds): + return set(inst for inst, pred in preds.items() if is_plausible(pred)) + + +def check_criteria(pred, criteria): + attrs = criteria.split() + for attr in attrs: + if not pred[attr]: + return False + return True + + +def pick_winner(results): + """ + Given that we didn't obtain a result with all good outcomes, + try a series of weaker outcome sets to find the strongest result. + """ + priority = ( + "model_patch edit_outcome lint_outcome test_outcome", # all good! + "model_patch edit_outcome lint_outcome", # all good but test_outcome + "model_patch lint_outcome", # a patch that lints? + "model_patch edit_outcome", # a patch that had no edit errors? + "model_patch", # anything with an actual patch! + ) + + # choose the best result available + for criteria in priority: + for res in results: + if check_criteria(res, criteria): + return res + + # choose the first result as a last resort + if results: + return results[0] + + +def choose_pred(inst, all_preds, dnames): + results = [] + for i in range(len(all_preds)): + preds = all_preds[i] + dname = dnames[i] + + if inst not in preds: + continue + pred = dict(preds[inst]) + pred["dname"] = Path(dname).name + results.append(pred) + + return pick_winner(results) + + +def choose_predictions(dnames, model_name_or_path=None, copy_md=False, devin_only=False): + all_preds = [load_predictions([dname], devin_only=devin_only) for dname in dnames] + all_instances = set() + for preds in all_preds: + all_instances.update(preds.keys()) + + chosen = dict() + for inst in all_instances: + res = choose_pred(inst, all_preds, dnames) + chosen[inst] = res + + if copy_md: + pred_dname = Path("predictions") + md_fname = pred_dname / res["dname"] / (inst + ".md") + assert md_fname.exists(), md_fname + new_md_fname = pred_dname / model_name_or_path / (inst + ".md") + shutil.copyfile(md_fname, new_md_fname) + + for inst in chosen: + pred = dict(chosen[inst]) + pred["model_name_or_path"] = model_name_or_path + chosen[inst] = pred + + pprint(len(chosen)) + pprint(chosen) + return chosen
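A minimal usage sketch of how the pieces added in this patch fit together. It assumes it is run from src/codegen/extensions/swebench/ so the patch's script-style imports (from utils import ..., from agent import ...) resolve, that the lite split contains an instance id such as pallets__flask-4992, and that OPENAI_API_KEY is set for the LangChain model; note that, as written, importing agent also executes the FastAPI demo at the bottom of agent.py.

    from agent import create_codebase_agent
    from harness import diff_versus_commit, files_in_patch
    from utils import get_lite_dataset

    from codegen import Codebase

    # Load the lite split as a dict keyed by instance_id (cached to a local json file).
    dataset = get_lite_dataset()
    entry = dataset["pallets__flask-4992"]  # any instance id from the split

    # Check out and parse the repo at the commit the issue was filed against.
    codebase = Codebase.from_repo(
        repo_full_name=entry["repo"],
        commit=entry["base_commit"],
        language="python",
    )

    # Build the tool-equipped agent and ask it to fix the issue.
    agent = create_codebase_agent(codebase=codebase, model_name="gpt-4o", temperature=0, verbose=True)
    result = agent.invoke(
        {"input": entry["problem_statement"]},
        config={"configurable": {"session_id": entry["instance_id"]}},
    )

    # The candidate patch is whatever the agent changed relative to the base commit.
    model_patch = diff_versus_commit(codebase.repo_path, entry["base_commit"])
    print(files_in_patch(model_patch))
    print(model_patch)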