diff --git a/commit0/__main__.py b/commit0/__main__.py index 50b3125..8758562 100644 --- a/commit0/__main__.py +++ b/commit0/__main__.py @@ -11,6 +11,7 @@ from hydra.core.config_store import ConfigStore from commit0.configs.config_class import Commit0Config from commit0.harness.constants import COMMANDS, SPLIT +from omegaconf import OmegaConf def main() -> None: @@ -24,9 +25,17 @@ def main() -> None: cs.store(name="user", group="Commit0Config", node=Commit0Config) # have hydra to ignore all command-line arguments sys_argv = copy.deepcopy(sys.argv) - sys.argv = [sys.argv[0]] + cfg_arg = next((arg for arg in sys_argv if arg.startswith("--cfg=")), None) + hydra.initialize(version_base=None, config_path="configs") config = hydra.compose(config_name="user") + + if cfg_arg: + sys_argv.remove(cfg_arg) + config_name = cfg_arg.split("=")[1] + user_config = OmegaConf.load(config_name) + config = OmegaConf.merge(config, user_config) + # after hydra gets all configs, put command-line arguments back sys.argv = sys_argv # repo_split: split from command line has a higher priority than split in hydra diff --git a/commit0/harness/docker_build.py b/commit0/harness/docker_build.py index dea9055..264f379 100644 --- a/commit0/harness/docker_build.py +++ b/commit0/harness/docker_build.py @@ -7,6 +7,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from typing import Any +import sys from commit0.harness.constants import ( BASE_IMAGE_BUILD_DIR, @@ -39,6 +40,8 @@ def setup_logger(repo: str, log_file: Path, mode: str = "w") -> logging.Logger: log_file.parent.mkdir(parents=True, exist_ok=True) logger = logging.getLogger(f"{repo}.{log_file.name}") handler = logging.FileHandler(log_file, mode=mode) + stdout_handler = logging.StreamHandler(sys.stdout) + logger.addHandler(stdout_handler) formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) diff --git 
a/commit0/harness/run_pytest_ids.py b/commit0/harness/run_pytest_ids.py index 37b5d6d..058cac0 100644 --- a/commit0/harness/run_pytest_ids.py +++ b/commit0/harness/run_pytest_ids.py @@ -100,8 +100,10 @@ def main( eval_file.write_text(eval_script) if ExecutionBackend(backend) == ExecutionBackend.MODAL: + logger.info("Runnning on Modal") execution_context = Modal elif ExecutionBackend(backend) == ExecutionBackend.LOCAL: + logger.info("Runnning locally") execution_context = Docker else: raise ValueError( diff --git a/docs/about.md b/docs/about.md index c5f34d3..bd2455e 100644 --- a/docs/about.md +++ b/docs/about.md @@ -1 +1 @@ -Spec2Repo is made by ... +Commit0 is .. diff --git a/docs/distributed.md b/docs/distributed.md new file mode 100644 index 0000000..0c4ca1e --- /dev/null +++ b/docs/distributed.md @@ -0,0 +1,33 @@ +# Distributed + +One of the main advantages of `commit0` is that it can run +a range of unit tests in distributed environments. + +By default, the library is configured to work with [modal](https://modal.com/). + +```bash +pip install modal +modal token new +``` + +## Modal Setup + +To enable distributed run, first +create a file called `distributed.yaml` + +```yaml +backend: modal +base_dir: repos.dist/ +``` + +You can pass this configuration file as an argument to clone. + +```bash +commit0 clone lite --cfg=distributed.yaml +``` + +Next to run tests you can run the standard test command. + +```bash +commit0 test simpy master tests/test_event.py::test_succeed --cfg=distributed.yaml +``` diff --git a/docs/index.md b/docs/index.md index 13a5ca4..5b9f22d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,11 +1,79 @@ + ![](logo.webp) # -Commit-0 is a new challenge for AI code generation. +Commit-0 is a real-world AI coding challenge. +Can your agent generate a working library from commit 0? + +The benchmark consists of 57 core Python libraries. 
+Libraries are selected based on: + +* Significant unit-test coverage +* Detailed specification and documentation +* Lint and type checking + +The [commit0 tool](setup) allows you to: -Given a specification, can you generate a repository? +* Efficiently run interactive tests in isolated environments +* Distribute testing and development across cloud systems +* Track and log all changes made throughout. -- YYY Lines of code -- XXX Unit Test -- 50 Repositories +| | Name | Repo | Commit0 | Tests | | +|--|--------|-------|----|----|------| +| | [minitorch](https://minitorch.github.io/) | [[orig](http://github.com/minitorch/minitorch)] | [[commit0](http://github.com/commit-0/minitorch)] | 230 | | +| | [simpy](https://simpy.readthedocs.io/en/4.1.1/) | [[orig](http://github.com/wenting-zhao/simpy)] | [[commit0](http://github.com/commit-0/simpy)] | 140 | | +| | [bitstring](https://bitstring.readthedocs.io/en/stable/) | [[orig](http://github.com/scott-griffiths/bitstring)] | [[commit0](http://github.com/commit-0/bitstring)] | 834 | | +| | [tinydb](https://tinydb.readthedocs.io/_/downloads/en/v4.8.0/pdf/) | [[orig](http://github.com/msiemens/tinydb)] | [[commit0](http://github.com/commit-0/tinydb)] | 201 | | +| | [marshmallow](https://marshmallow.readthedocs.io/_/downloads/en/stable/pdf/) | [[orig](http://github.com/marshmallow-code/marshmallow)] | [[commit0](http://github.com/commit-0/marshmallow)] | 1229 | | +| | [python-prompt-toolkit](https://python-prompt-toolkit.readthedocs.io/_/downloads/en/3.0.43/pdf/) | [[orig](http://github.com/prompt-toolkit/python-prompt-toolkit)] | [[commit0](http://github.com/commit-0/python-prompt-toolkit)] | 151 | | +| | [parsel](https://parsel.readthedocs.io/_/downloads/en/latest/pdf/) | [[orig](http://github.com/scrapy/parsel)] | [[commit0](http://github.com/commit-0/parsel)] | 343 | | +| pyjwt | [pyjwt](https://pyjwt.readthedocs.io/_/downloads/en/2.8.0/pdf/) | [[orig](http://github.com/jpadilla/pyjwt)] | 
[[commit0](http://github.com/commit-0/pyjwt)] | 259 | | +| | [networkx](https://networkx.org/documentation/networkx-3.3/) | [[orig](http://github.com/networkx/networkx)] | [[commit0](http://github.com/commit-0/networkx)] | 5440 | | +| | [graphene](https://docs.graphene-python.org/en/stable/) | [[orig](http://github.com/graphql-python/graphene)] | [[commit0](http://github.com/commit-0/graphene)] | 447 | | +| tlslite-ng | [tlslite-ng](https://tlslite-ng.readthedocs.io/en/latest/) | [[orig](http://github.com/tlsfuzzer/tlslite-ng)] | [[commit0](http://github.com/commit-0/tlslite-ng)] | 1653 | | +| wcwidth | [wcwidth](https://wcwidth.readthedocs.io/en/stable/) | [[orig](http://github.com/jquast/wcwidth)] | [[commit0](http://github.com/commit-0/wcwidth)] | 38 | | +| chardet | [chardet](https://chardet.readthedocs.io/_/downloads/en/stable/pdf/) | [[orig](http://github.com/chardet/chardet)] | [[commit0](http://github.com/commit-0/chardet)] | 376 | | +| dnspython | [dnspython](https://dnspython.readthedocs.io/en/stable/) | [[orig](http://github.com/rthalley/dnspython)] | [[commit0](http://github.com/commit-0/dnspython)] | 1304 | | +| imapclient | [imapclient](https://imapclient.readthedocs.io/en/3.0.1/) | [[orig](http://github.com/mjs/imapclient)] | [[commit0](http://github.com/commit-0/imapclient)] | 267 | | +| | [virtualenv](https://virtualenv.pypa.io/en/20.26.3/) | [[orig](http://github.com/pypa/virtualenv)] | [[commit0](http://github.com/commit-0/virtualenv)] | 284 | | +| pexpect | [pexpect](https://pexpect.readthedocs.io/_/downloads/en/stable/pdf/) | [[orig](http://github.com/pexpect/pexpect)] | [[commit0](http://github.com/commit-0/pexpect)] | 255 | | +| | [web3.py](https://web3py.readthedocs.io/_/downloads/en/v6.20.2/pdf/) | [[orig](http://github.com/ethereum/web3.py)] | [[commit0](http://github.com/commit-0/web3.py)] | 40433 | | +| | [babel](https://babel.pocoo.org/_/downloads/en/stable/pdf/) | [[orig](http://github.com/python-babel/babel)] | 
[[commit0](http://github.com/commit-0/babel)] | 5663 | | +| | [geopandas](https://geopandas.org/en/stable/) | [[orig](http://github.com/geopandas/geopandas)] | [[commit0](http://github.com/commit-0/geopandas)] | 2196 | | +| dulwich | [dulwich](https://dulwich.readthedocs.io/_/downloads/en/latest/pdf/) | [[orig](http://github.com/jelmer/dulwich)] | [[commit0](http://github.com/commit-0/dulwich)] | 1522 | | +| | [flask](https://flask.palletsprojects.com/en/3.0.x/) | [[orig](http://github.com/pallets/flask)] | [[commit0](http://github.com/commit-0/flask)] | 477 | | +| voluptuous | [voluptuous](https://alecthomas.github.io/voluptuous/docs/_build/html/) | [[orig](http://github.com/alecthomas/voluptuous)] | [[commit0](http://github.com/commit-0/voluptuous)] | 149 | | +| | [jinja](https://jinja.palletsprojects.com/en/3.1.x/) | [[orig](http://github.com/pallets/jinja)] | [[commit0](http://github.com/commit-0/jinja)] | 851 | | +| | [seaborn](https://seaborn.pydata.org/) | [[orig](http://github.com/mwaskom/seaborn)] | [[commit0](http://github.com/commit-0/seaborn)] | 2362 | | +| requests | [requests](https://requests.readthedocs.io/_/downloads/en/latest/pdf/) | [[orig](http://github.com/psf/requests)] | [[commit0](http://github.com/commit-0/requests)] | 590 | | +| | [scrapy](https://docs.scrapy.org/_/downloads/en/2.11/pdf/) | [[orig](http://github.com/scrapy/scrapy)] | [[commit0](http://github.com/commit-0/scrapy)] | 2904 | | +| | [fastapi](https://fastapi.tiangolo.com/reference/) | [[orig](http://github.com/fastapi/fastapi)] | [[commit0](http://github.com/commit-0/fastapi)] | 2013 | | +| | [click](https://click.palletsprojects.com/en/8.1.x/) | [[orig](http://github.com/pallets/click)] | [[commit0](http://github.com/commit-0/click)] | 589 | | +| | [python-rsa](https://stuvel.eu/python-rsa-doc/) | [[orig](http://github.com/sybrenstuvel/python-rsa)] | [[commit0](http://github.com/commit-0/python-rsa)] | 86 | | +| | [statsmodels](https://www.statsmodels.org/stable/) | 
[[orig](http://github.com/statsmodels/statsmodels)] | [[commit0](http://github.com/commit-0/statsmodels)] | 17669 | | +| more-itertools | [more-itertools](https://more-itertools.readthedocs.io/en/v10.4.0/) | [[orig](http://github.com/more-itertools/more-itertools)] | [[commit0](http://github.com/commit-0/more-itertools)] | 662 | | +| | [moviepy](https://zulko.github.io/moviepy/) | [[orig](http://github.com/Zulko/moviepy)] | [[commit0](http://github.com/commit-0/moviepy)] | 109 | | +| deprecated | [deprecated](https://deprecated.readthedocs.io/en/latest/) | [[orig](http://github.com/laurent-laporte-pro/deprecated)] | [[commit0](http://github.com/commit-0/deprecated)] | 171 | | +| | [pydantic](https://docs.pydantic.dev/2.8/) | [[orig](http://github.com/pydantic/pydantic)] | [[commit0](http://github.com/commit-0/pydantic)] | 5091 | | +| | [loguru](https://loguru.readthedocs.io/_/downloads/en/0.7.2/pdf/) | [[orig](http://github.com/Delgan/loguru)] | [[commit0](http://github.com/commit-0/loguru)] | 1461 | | +| | [pypdf](https://pypdf.readthedocs.io/_/downloads/en/4.3.1/pdf/) | [[orig](http://github.com/py-pdf/pypdf)] | [[commit0](http://github.com/commit-0/pypdf)] | 911 | | +| | [attrs](https://www.attrs.org/en/24.2.0/) | [[orig](http://github.com/python-attrs/attrs)] | [[commit0](http://github.com/commit-0/attrs)] | 1414 | | +| | [mimesis](https://mimesis.name/en/v17.0.0/) | [[orig](http://github.com/lk-geimfari/mimesis)] | [[commit0](http://github.com/commit-0/mimesis)] | 6159 | | +| | [cookiecutter](https://cookiecutter.readthedocs.io/_/downloads/en/2.6.0/pdf/) | [[orig](http://github.com/cookiecutter/cookiecutter)] | [[commit0](http://github.com/commit-0/cookiecutter)] | 367 | | +| | [tornado](https://www.tornadoweb.org/_/downloads/en/stable/pdf/) | [[orig](http://github.com/tornadoweb/tornado)] | [[commit0](http://github.com/commit-0/tornado)] | 1150 | | +| | [imbalanced-learn](https://imbalanced-learn.org/stable/) | 
[[orig](http://github.com/scikit-learn-contrib/imbalanced-learn)] | [[commit0](http://github.com/commit-0/imbalanced-learn)] | 2310 | | +| | [python-progressbar](https://progressbar-2.readthedocs.io/_/downloads/en/stable/pdf/) | [[orig](http://github.com/wolph/python-progressbar)] | [[commit0](http://github.com/commit-0/python-progressbar)] | 385 | | +| | [PyBoy](https://docs.pyboy.dk/) | [[orig](http://github.com/Baekalfen/PyBoy)] | [[commit0](http://github.com/commit-0/PyBoy)] | 201 | | +| | [pytest](https://docs.pytest.org/_/downloads/en/8.3.x/pdf/) | [[orig](http://github.com/pytest-dev/pytest)] | [[commit0](http://github.com/commit-0/pytest)] | 3612 | | +| | [pylint](https://pylint.readthedocs.io/en/v3.2.6/) | [[orig](http://github.com/pylint-dev/pylint)] | [[commit0](http://github.com/commit-0/pylint)] | 1878 | | +| | [sphinx](https://www.sphinx-doc.org/en/master/) | [[orig](http://github.com/sphinx-doc/sphinx)] | [[commit0](http://github.com/commit-0/sphinx)] | 2187 | | +| | [joblib](https://joblib.readthedocs.io/en/stable/) | [[orig](http://github.com/joblib/joblib)] | [[commit0](http://github.com/commit-0/joblib)] | 1450 | | +| | [xarray](https://docs.xarray.dev/en/v2024.07.0/) | [[orig](http://github.com/pydata/xarray)] | [[commit0](http://github.com/commit-0/xarray)] | 15643 | | +| cachetools | [cachetools](https://cachetools.readthedocs.io/en/v5.5.0/) | [[orig](http://github.com/tkem/cachetools)] | [[commit0](http://github.com/commit-0/cachetools)] | 215 | | +| paramiko | [paramiko](https://www.paramiko.org/) | [[orig](http://github.com/paramiko/paramiko)] | [[commit0](http://github.com/commit-0/paramiko)] | 557 | | +| | [fabric](https://www.fabfile.org/) | [[orig](http://github.com/fabric/fabric)] | [[commit0](http://github.com/commit-0/fabric)] | 353 | | +| | [filesystem_spec](https://filesystem-spec.readthedocs.io/en/stable/) | [[orig](http://github.com/fsspec/filesystem_spec)] | [[commit0](http://github.com/commit-0/filesystem_spec)] | 698 | | +| 
jedi | [jedi](https://jedi.readthedocs.io/en/stable/) | [[orig](http://github.com/davidhalter/jedi)] | [[commit0](http://github.com/commit-0/jedi)] | 3854 | | +| sqlparse | [sqlparse](https://sqlparse.readthedocs.io/en/stable/) | [[orig](http://github.com/andialbrecht/sqlparse)] | [[commit0](http://github.com/commit-0/sqlparse)] | 461 | | +| | [portalocker](https://portalocker.readthedocs.io/en/stable/) | [[orig](http://github.com/wolph/portalocker)] | [[commit0](http://github.com/commit-0/portalocker)] | 38 | | diff --git a/docs/make_md.py b/docs/make_md.py new file mode 100644 index 0000000..2e8f9d7 --- /dev/null +++ b/docs/make_md.py @@ -0,0 +1,88 @@ +import datasets +import subprocess + +import requests +from bs4 import BeautifulSoup + +def get_github_avatar(repo): + """ + Given a GitHub repo in the format 'owner/repo', get the avatar URL of the organization or user. + """ + try: + org = repo.split("/")[0] + # Construct the URL for the repo + url = f"https://github.com/{org}" + + # Make a request to the page + response = requests.get(url) + + # Check if the request was successful + if response.status_code != 200: + print(f"Failed to fetch page for {repo}. 
Status code: {response.status_code}") + return None + + # Parse the HTML content using BeautifulSoup + soup = BeautifulSoup(response.content, 'html.parser') + + # Find the meta tag with property "og:image" which contains the avatar URL + meta_tag = soup.find('meta', property='og:image') + + if meta_tag and 'content' in meta_tag.attrs: + avatar_url = meta_tag['content'] + return avatar_url + else: + print(f"Avatar URL not found for {repo}") + return None + + except Exception as e: + print(f"An error occurred: {e}") + return None + +d = datasets.load_dataset("wentingzhao/commit0_docstring", split="test") + +print(d) + + + +print("| | Name | Repo | Commit0 | Tests | | ") +print("|--|--------|-------|----|----|------| ") +overload = { + "simpy" : "https://simpy.readthedocs.io/en/4.1.1/_images/simpy-logo-small.png", + "tinydb" : "https://raw.githubusercontent.com/msiemens/tinydb/master/artwork/logo.png", + "bitstring": "https://bitstring.readthedocs.io/en/stable/_images/bitstring_logo.png", + "seaborn": "https://raw.githubusercontent.com/mwaskom/seaborn/master/doc/_static/logo-wide-lightbg.svg", + "statsmodels": "https://raw.githubusercontent.com/statsmodels/statsmodels/main/docs/source/images/statsmodels-logo-v2-horizontal.svg", + "pyboy" : "https://github.com/Baekalfen/PyBoy/raw/master/extras/README/pyboy.svg", +} +skip = { + "pyjwt", + "wcwidth", + "chardet", + "dnspython", + "imapclient", + "pexpect", + "dulwich", + "voluptuous", + "requests", + "tlslite-ng", + "more-itertools", + "deprecated", + "cachetools", + "paramiko", + "jedi", + "sqlparse", +} +for i, ex in enumerate(d): + img = get_github_avatar(ex["original_repo"]) + + name = ex["repo"].split("/")[1] + result = subprocess.check_output(f"commit0 get-tests {name} | wc", shell=True, text=True) + + tests = int(result.split()[0]) + if name.lower() not in skip and name.lower() not in overload: + img = f"" + elif name.lower() in overload: + img = f"" + else: + img = f"{name}" + print(f"| {img} | 
[{name}]({ex['setup']['specification']}) | [[orig](http://github.com/{ex['original_repo']})] | [[commit0](http://github.com/{ex['repo']})] | {tests} | |") diff --git a/docs/setup.md b/docs/setup.md index e6cdce0..756d305 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -1,3 +1,7 @@ +# Quickstart + +## Install + First be sure that you have docker tools installed. ```bash @@ -7,42 +11,129 @@ apt install docker To install the benchmark run, ```bash -pip install spec2repo +pip install commit0 ``` -Then run +## Commands + +The system is a command-line tool that allows you to run unit-tests on a +variety of libraries in isolated environments. To get started with the full +setup run the `clone` command which will clone the code of a subset +of libraries to your `repos/` directory. ```bash -spec2repo new local +commit0 clone lite ``` -This will generate a file `spec2repo.yml` in your project. -To launch the benchmark suite run +Next run the `build` command which will configure Docker containers for +each of the libraries with isolated virtual environments. The command uses the +[uv](https://github.com/astral-sh/uv) library for efficient builds. ```bash -spec2repo launch +commit0 build lite ``` -This will launch a set of docker instances for each of the repos as well as a -local master. +The main operation you can do with these environments is to run tests. +Here we run [a test](https://github.com/commit-0/simpy/blob/master/tests/test_event.py#L11) in the `simpy` library. + +```bash +commit0 test simpy tests/test_event.py::test_succeed +``` -Now let's apply a patch to one of our repos: +This test should run and pass, but others will fail. + +```bash +commit0 test minitorch tests/test_operators.py::test_relu +``` + +Let's now manually go in and change that repo. +This is all just standard shell commands. ```bash cd repos/minitorch/ -git checkout -b first_change -patch ../../minitorch.example.patch . 
-spec2repo test minitorch first_change test_add +git checkout -b mychange ``` -This will run the `test_add` in the MiniTorch Repository and show the results. +And apply and commit this patch. -To get your current score on a repository you can run +``` +--- a/minitorch/operators.py ++++ b/minitorch/operators.py +@@ -81,7 +81,7 @@ def relu(x: float) -> float: + (See https://en.wikipedia.org/wiki/Rectifier_(neural_networks) .) + """ + # TODO: Implement for Task 0.1. +- raise NotImplementedError('Need to implement for Task 0.1') ++ return 1. if x > 0. else 0. +``` + +Once this is done we can run `test` with +a branch and the environment will sync and run. + +```bash +commit0 test minitorch branch=mychange tests/test_operators.py::test_relu +``` + +## Running an Agent + +Next we will see how this can be run with an AI agent system. +We will use [Aider](https://aider.chat/) which is a nice +command-line oriented agent system. + +To setup Aider first set your api key. +We recommend using Claude Sonnet. ```bash -spec2repo score minitorch +# Work with Claude 3.5 Sonnet on your repo +export ANTHROPIC_API_KEY=your-key-goes-here ``` -## Running Aider +Once this is setup you can run Aider with the following command. +This will edit the files locally in your branch, but +run the tests inside the environment. -... +```bash +aider --model sonnet --file repos/minitorch/operators.py --message "fill in" \ + --auto-test --test \ + --test-cmd 'commit0 test minitorch branch=mychange tests/test_operators.py::test_relu' \ + --yes +``` + +This will run an LLM agent that will try to fill in the +functions in one file of the minitorch library. + +For a full example baseline system that tries to solve +all the tests in the library see the [baseline](baseline) documentation. + + +## Distributed Tests + +One of the main advantages of `commit0` is that it can run +a range of unit tests in distributed environments. + +By default, the library is configured to work with [modal](https://modal.com/). 
+ +```bash +pip install modal +modal token new +``` + +To enable distributed run, first +create a file called `distributed.yaml` + +```yaml +backend: modal +base_dir: repos.dist/ +``` + +You can pass this configuration file as an argument to clone. + +```bash +commit0 clone lite --cfg=distributed.yaml +``` + +Next to run tests you can run the standard test command. + +```bash +commit0 test simpy master tests/test_event.py::test_succeed --cfg=distributed.yaml +```