diff --git a/commit0/__main__.py b/commit0/__main__.py
index 50b3125..8758562 100644
--- a/commit0/__main__.py
+++ b/commit0/__main__.py
@@ -11,6 +11,7 @@
from hydra.core.config_store import ConfigStore
from commit0.configs.config_class import Commit0Config
from commit0.harness.constants import COMMANDS, SPLIT
+from omegaconf import OmegaConf
def main() -> None:
@@ -24,9 +25,17 @@ def main() -> None:
cs.store(name="user", group="Commit0Config", node=Commit0Config)
# have hydra to ignore all command-line arguments
sys_argv = copy.deepcopy(sys.argv)
- sys.argv = [sys.argv[0]]
+ cfg_arg = next((arg for arg in sys_argv if arg.startswith("--cfg=")), None)
+
hydra.initialize(version_base=None, config_path="configs")
config = hydra.compose(config_name="user")
+
+ if cfg_arg:
+ sys_argv.remove(cfg_arg)
+ config_name = cfg_arg.split("=")[1]
+ user_config = OmegaConf.load(config_name)
+ config = OmegaConf.merge(config, user_config)
+
# after hydra gets all configs, put command-line arguments back
sys.argv = sys_argv
# repo_split: split from command line has a higher priority than split in hydra
diff --git a/commit0/harness/docker_build.py b/commit0/harness/docker_build.py
index dea9055..264f379 100644
--- a/commit0/harness/docker_build.py
+++ b/commit0/harness/docker_build.py
@@ -7,6 +7,7 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Any
+import sys
from commit0.harness.constants import (
BASE_IMAGE_BUILD_DIR,
@@ -39,6 +40,8 @@ def setup_logger(repo: str, log_file: Path, mode: str = "w") -> logging.Logger:
log_file.parent.mkdir(parents=True, exist_ok=True)
logger = logging.getLogger(f"{repo}.{log_file.name}")
handler = logging.FileHandler(log_file, mode=mode)
+ stdout_handler = logging.StreamHandler(sys.stdout)
+ logger.addHandler(stdout_handler)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
diff --git a/commit0/harness/run_pytest_ids.py b/commit0/harness/run_pytest_ids.py
index 37b5d6d..058cac0 100644
--- a/commit0/harness/run_pytest_ids.py
+++ b/commit0/harness/run_pytest_ids.py
@@ -100,8 +100,10 @@ def main(
eval_file.write_text(eval_script)
if ExecutionBackend(backend) == ExecutionBackend.MODAL:
+ logger.info("Runnning on Modal")
execution_context = Modal
elif ExecutionBackend(backend) == ExecutionBackend.LOCAL:
+ logger.info("Runnning locally")
execution_context = Docker
else:
raise ValueError(
diff --git a/docs/about.md b/docs/about.md
index c5f34d3..bd2455e 100644
--- a/docs/about.md
+++ b/docs/about.md
@@ -1 +1 @@
-Spec2Repo is made by ...
+Commit0 is ..
diff --git a/docs/distributed.md b/docs/distributed.md
new file mode 100644
index 0000000..0c4ca1e
--- /dev/null
+++ b/docs/distributed.md
@@ -0,0 +1,33 @@
+# Distributed
+
+One of the main advantages of `commit0` is that it can run
+a range of unit tests in distributed environments.
+
+By default, the library is configured to work with [modal](https://modal.com/).
+
+```bash
+pip install modal
+modal token new
+```
+
+## Modal Setup
+
+To enable distributed runs, first
+create a file called `distributed.yaml`:
+
+```yaml
+backend: modal
+base_dir: repos.dist/
+```
+
+You can pass this configuration file as an argument to `clone`.
+
+```bash
+commit0 clone lite --cfg=distributed.yaml
+```
+
+Next, to run tests, you can run the standard test command.
+
+```bash
+commit0 test simpy master tests/test_event.py::test_succeed --cfg=distributed.yaml
+```
diff --git a/docs/index.md b/docs/index.md
index 13a5ca4..5b9f22d 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,11 +1,79 @@
+

#
-Commit-0 is a new challenge for AI code generation.
+Commit-0 is a real-world AI coding challenge.
+Can your agent generate a working library from commit 0?
+
+The benchmark consists of 57 core Python libraries.
+Libraries are selected based on:
+
+* Significant unit-test coverage
+* Detailed specification and documentation
+* Lint and type checking
+
+The [commit0 tool](setup) allows you to:
-Given a specification, can you generate a repository?
+* Efficiently run interactive tests in isolated environments
+* Distribute testing and development across cloud systems
+* Track and log all changes made throughout.
-- YYY Lines of code
-- XXX Unit Test
-- 50 Repositories
+| | Name | Repo | Commit0 | Tests | |
+|--|--------|-------|----|----|------|
+|
| [minitorch](https://minitorch.github.io/) | [[orig](http://github.com/minitorch/minitorch)] | [[commit0](http://github.com/commit-0/minitorch)] | 230 |
|
+|
| [simpy](https://simpy.readthedocs.io/en/4.1.1/) | [[orig](http://github.com/wenting-zhao/simpy)] | [[commit0](http://github.com/commit-0/simpy)] | 140 |
|
+|
| [bitstring](https://bitstring.readthedocs.io/en/stable/) | [[orig](http://github.com/scott-griffiths/bitstring)] | [[commit0](http://github.com/commit-0/bitstring)] | 834 |
|
+|
| [tinydb](https://tinydb.readthedocs.io/_/downloads/en/v4.8.0/pdf/) | [[orig](http://github.com/msiemens/tinydb)] | [[commit0](http://github.com/commit-0/tinydb)] | 201 |
|
+|
| [marshmallow](https://marshmallow.readthedocs.io/_/downloads/en/stable/pdf/) | [[orig](http://github.com/marshmallow-code/marshmallow)] | [[commit0](http://github.com/commit-0/marshmallow)] | 1229 |
|
+|
| [python-prompt-toolkit](https://python-prompt-toolkit.readthedocs.io/_/downloads/en/3.0.43/pdf/) | [[orig](http://github.com/prompt-toolkit/python-prompt-toolkit)] | [[commit0](http://github.com/commit-0/python-prompt-toolkit)] | 151 |
|
+|
| [parsel](https://parsel.readthedocs.io/_/downloads/en/latest/pdf/) | [[orig](http://github.com/scrapy/parsel)] | [[commit0](http://github.com/commit-0/parsel)] | 343 |
|
+| pyjwt | [pyjwt](https://pyjwt.readthedocs.io/_/downloads/en/2.8.0/pdf/) | [[orig](http://github.com/jpadilla/pyjwt)] | [[commit0](http://github.com/commit-0/pyjwt)] | 259 |
|
+|
| [networkx](https://networkx.org/documentation/networkx-3.3/) | [[orig](http://github.com/networkx/networkx)] | [[commit0](http://github.com/commit-0/networkx)] | 5440 |
|
+|
| [graphene](https://docs.graphene-python.org/en/stable/) | [[orig](http://github.com/graphql-python/graphene)] | [[commit0](http://github.com/commit-0/graphene)] | 447 |
|
+| tlslite-ng | [tlslite-ng](https://tlslite-ng.readthedocs.io/en/latest/) | [[orig](http://github.com/tlsfuzzer/tlslite-ng)] | [[commit0](http://github.com/commit-0/tlslite-ng)] | 1653 |
|
+| wcwidth | [wcwidth](https://wcwidth.readthedocs.io/en/stable/) | [[orig](http://github.com/jquast/wcwidth)] | [[commit0](http://github.com/commit-0/wcwidth)] | 38 |
|
+| chardet | [chardet](https://chardet.readthedocs.io/_/downloads/en/stable/pdf/) | [[orig](http://github.com/chardet/chardet)] | [[commit0](http://github.com/commit-0/chardet)] | 376 |
|
+| dnspython | [dnspython](https://dnspython.readthedocs.io/en/stable/) | [[orig](http://github.com/rthalley/dnspython)] | [[commit0](http://github.com/commit-0/dnspython)] | 1304 |
|
+| imapclient | [imapclient](https://imapclient.readthedocs.io/en/3.0.1/) | [[orig](http://github.com/mjs/imapclient)] | [[commit0](http://github.com/commit-0/imapclient)] | 267 |
|
+|
| [virtualenv](https://virtualenv.pypa.io/en/20.26.3/) | [[orig](http://github.com/pypa/virtualenv)] | [[commit0](http://github.com/commit-0/virtualenv)] | 284 |
|
+| pexpect | [pexpect](https://pexpect.readthedocs.io/_/downloads/en/stable/pdf/) | [[orig](http://github.com/pexpect/pexpect)] | [[commit0](http://github.com/commit-0/pexpect)] | 255 |
|
+|
| [web3.py](https://web3py.readthedocs.io/_/downloads/en/v6.20.2/pdf/) | [[orig](http://github.com/ethereum/web3.py)] | [[commit0](http://github.com/commit-0/web3.py)] | 40433 |
|
+|
| [babel](https://babel.pocoo.org/_/downloads/en/stable/pdf/) | [[orig](http://github.com/python-babel/babel)] | [[commit0](http://github.com/commit-0/babel)] | 5663 |
|
+|
| [geopandas](https://geopandas.org/en/stable/) | [[orig](http://github.com/geopandas/geopandas)] | [[commit0](http://github.com/commit-0/geopandas)] | 2196 |
|
+| dulwich | [dulwich](https://dulwich.readthedocs.io/_/downloads/en/latest/pdf/) | [[orig](http://github.com/jelmer/dulwich)] | [[commit0](http://github.com/commit-0/dulwich)] | 1522 |
|
+|
| [flask](https://flask.palletsprojects.com/en/3.0.x/) | [[orig](http://github.com/pallets/flask)] | [[commit0](http://github.com/commit-0/flask)] | 477 |
|
+| voluptuous | [voluptuous](https://alecthomas.github.io/voluptuous/docs/_build/html/) | [[orig](http://github.com/alecthomas/voluptuous)] | [[commit0](http://github.com/commit-0/voluptuous)] | 149 |
|
+|
| [jinja](https://jinja.palletsprojects.com/en/3.1.x/) | [[orig](http://github.com/pallets/jinja)] | [[commit0](http://github.com/commit-0/jinja)] | 851 |
|
+|
| [seaborn](https://seaborn.pydata.org/) | [[orig](http://github.com/mwaskom/seaborn)] | [[commit0](http://github.com/commit-0/seaborn)] | 2362 |
|
+| requests | [requests](https://requests.readthedocs.io/_/downloads/en/latest/pdf/) | [[orig](http://github.com/psf/requests)] | [[commit0](http://github.com/commit-0/requests)] | 590 |
|
+|
| [scrapy](https://docs.scrapy.org/_/downloads/en/2.11/pdf/) | [[orig](http://github.com/scrapy/scrapy)] | [[commit0](http://github.com/commit-0/scrapy)] | 2904 |
|
+|
| [fastapi](https://fastapi.tiangolo.com/reference/) | [[orig](http://github.com/fastapi/fastapi)] | [[commit0](http://github.com/commit-0/fastapi)] | 2013 |
|
+|
| [click](https://click.palletsprojects.com/en/8.1.x/) | [[orig](http://github.com/pallets/click)] | [[commit0](http://github.com/commit-0/click)] | 589 |
|
+|
| [python-rsa](https://stuvel.eu/python-rsa-doc/) | [[orig](http://github.com/sybrenstuvel/python-rsa)] | [[commit0](http://github.com/commit-0/python-rsa)] | 86 |
|
+|
| [statsmodels](https://www.statsmodels.org/stable/) | [[orig](http://github.com/statsmodels/statsmodels)] | [[commit0](http://github.com/commit-0/statsmodels)] | 17669 |
|
+| more-itertools | [more-itertools](https://more-itertools.readthedocs.io/en/v10.4.0/) | [[orig](http://github.com/more-itertools/more-itertools)] | [[commit0](http://github.com/commit-0/more-itertools)] | 662 |
|
+|
| [moviepy](https://zulko.github.io/moviepy/) | [[orig](http://github.com/Zulko/moviepy)] | [[commit0](http://github.com/commit-0/moviepy)] | 109 |
|
+| deprecated | [deprecated](https://deprecated.readthedocs.io/en/latest/) | [[orig](http://github.com/laurent-laporte-pro/deprecated)] | [[commit0](http://github.com/commit-0/deprecated)] | 171 |
|
+|
| [pydantic](https://docs.pydantic.dev/2.8/) | [[orig](http://github.com/pydantic/pydantic)] | [[commit0](http://github.com/commit-0/pydantic)] | 5091 |
|
+|
| [loguru](https://loguru.readthedocs.io/_/downloads/en/0.7.2/pdf/) | [[orig](http://github.com/Delgan/loguru)] | [[commit0](http://github.com/commit-0/loguru)] | 1461 |
|
+|
| [pypdf](https://pypdf.readthedocs.io/_/downloads/en/4.3.1/pdf/) | [[orig](http://github.com/py-pdf/pypdf)] | [[commit0](http://github.com/commit-0/pypdf)] | 911 |
|
+|
| [attrs](https://www.attrs.org/en/24.2.0/) | [[orig](http://github.com/python-attrs/attrs)] | [[commit0](http://github.com/commit-0/attrs)] | 1414 |
|
+|
| [mimesis](https://mimesis.name/en/v17.0.0/) | [[orig](http://github.com/lk-geimfari/mimesis)] | [[commit0](http://github.com/commit-0/mimesis)] | 6159 |
|
+|
| [cookiecutter](https://cookiecutter.readthedocs.io/_/downloads/en/2.6.0/pdf/) | [[orig](http://github.com/cookiecutter/cookiecutter)] | [[commit0](http://github.com/commit-0/cookiecutter)] | 367 |
|
+|
| [tornado](https://www.tornadoweb.org/_/downloads/en/stable/pdf/) | [[orig](http://github.com/tornadoweb/tornado)] | [[commit0](http://github.com/commit-0/tornado)] | 1150 |
|
+|
| [imbalanced-learn](https://imbalanced-learn.org/stable/) | [[orig](http://github.com/scikit-learn-contrib/imbalanced-learn)] | [[commit0](http://github.com/commit-0/imbalanced-learn)] | 2310 |
|
+|
| [python-progressbar](https://progressbar-2.readthedocs.io/_/downloads/en/stable/pdf/) | [[orig](http://github.com/wolph/python-progressbar)] | [[commit0](http://github.com/commit-0/python-progressbar)] | 385 |
|
+|
| [PyBoy](https://docs.pyboy.dk/) | [[orig](http://github.com/Baekalfen/PyBoy)] | [[commit0](http://github.com/commit-0/PyBoy)] | 201 |
|
+|
| [pytest](https://docs.pytest.org/_/downloads/en/8.3.x/pdf/) | [[orig](http://github.com/pytest-dev/pytest)] | [[commit0](http://github.com/commit-0/pytest)] | 3612 |
|
+|
| [pylint](https://pylint.readthedocs.io/en/v3.2.6/) | [[orig](http://github.com/pylint-dev/pylint)] | [[commit0](http://github.com/commit-0/pylint)] | 1878 |
|
+|
| [sphinx](https://www.sphinx-doc.org/en/master/) | [[orig](http://github.com/sphinx-doc/sphinx)] | [[commit0](http://github.com/commit-0/sphinx)] | 2187 |
|
+|
| [joblib](https://joblib.readthedocs.io/en/stable/) | [[orig](http://github.com/joblib/joblib)] | [[commit0](http://github.com/commit-0/joblib)] | 1450 |
|
+|
| [xarray](https://docs.xarray.dev/en/v2024.07.0/) | [[orig](http://github.com/pydata/xarray)] | [[commit0](http://github.com/commit-0/xarray)] | 15643 |
|
+| cachetools | [cachetools](https://cachetools.readthedocs.io/en/v5.5.0/) | [[orig](http://github.com/tkem/cachetools)] | [[commit0](http://github.com/commit-0/cachetools)] | 215 |
|
+| paramiko | [paramiko](https://www.paramiko.org/) | [[orig](http://github.com/paramiko/paramiko)] | [[commit0](http://github.com/commit-0/paramiko)] | 557 |
|
+|
| [fabric](https://www.fabfile.org/) | [[orig](http://github.com/fabric/fabric)] | [[commit0](http://github.com/commit-0/fabric)] | 353 |
|
+|
| [filesystem_spec](https://filesystem-spec.readthedocs.io/en/stable/) | [[orig](http://github.com/fsspec/filesystem_spec)] | [[commit0](http://github.com/commit-0/filesystem_spec)] | 698 |
|
+| jedi | [jedi](https://jedi.readthedocs.io/en/stable/) | [[orig](http://github.com/davidhalter/jedi)] | [[commit0](http://github.com/commit-0/jedi)] | 3854 |
|
+| sqlparse | [sqlparse](https://sqlparse.readthedocs.io/en/stable/) | [[orig](http://github.com/andialbrecht/sqlparse)] | [[commit0](http://github.com/commit-0/sqlparse)] | 461 |
|
+|
| [portalocker](https://portalocker.readthedocs.io/en/stable/) | [[orig](http://github.com/wolph/portalocker)] | [[commit0](http://github.com/commit-0/portalocker)] | 38 |
|
diff --git a/docs/make_md.py b/docs/make_md.py
new file mode 100644
index 0000000..2e8f9d7
--- /dev/null
+++ b/docs/make_md.py
@@ -0,0 +1,88 @@
+import datasets
+import subprocess
+
+import requests
+from bs4 import BeautifulSoup
+
+def get_github_avatar(repo):
+ """
+ Given a GitHub repo in the format 'owner/repo', get the avatar URL of the organization or user.
+ """
+ try:
+ org = repo.split("/")[0]
+ # Construct the URL for the repo
+ url = f"https://github.com/{org}"
+
+ # Make a request to the page
+ response = requests.get(url)
+
+ # Check if the request was successful
+ if response.status_code != 200:
+ print(f"Failed to fetch page for {repo}. Status code: {response.status_code}")
+ return None
+
+ # Parse the HTML content using BeautifulSoup
+ soup = BeautifulSoup(response.content, 'html.parser')
+
+ # Find the meta tag with property "og:image" which contains the avatar URL
+ meta_tag = soup.find('meta', property='og:image')
+
+ if meta_tag and 'content' in meta_tag.attrs:
+ avatar_url = meta_tag['content']
+ return avatar_url
+ else:
+ print(f"Avatar URL not found for {repo}")
+ return None
+
+ except Exception as e:
+ print(f"An error occurred: {e}")
+ return None
+
+d = datasets.load_dataset("wentingzhao/commit0_docstring", split="test")
+
+print(d)
+
+
+
+print("| | Name | Repo | Commit0 | Tests | | ")
+print("|--|--------|-------|----|----|------| ")
+overload = {
+ "simpy" : "https://simpy.readthedocs.io/en/4.1.1/_images/simpy-logo-small.png",
+ "tinydb" : "https://raw.githubusercontent.com/msiemens/tinydb/master/artwork/logo.png",
+ "bitstring": "https://bitstring.readthedocs.io/en/stable/_images/bitstring_logo.png",
+ "seaborn": "https://raw.githubusercontent.com/mwaskom/seaborn/master/doc/_static/logo-wide-lightbg.svg",
+ "statsmodels": "https://raw.githubusercontent.com/statsmodels/statsmodels/main/docs/source/images/statsmodels-logo-v2-horizontal.svg",
+ "pyboy" : "https://github.com/Baekalfen/PyBoy/raw/master/extras/README/pyboy.svg",
+}
+skip = {
+ "pyjwt",
+ "wcwidth",
+ "chardet",
+ "dnspython",
+ "imapclient",
+ "pexpect",
+ "dulwich",
+ "voluptuous",
+ "requests",
+ "tlslite-ng",
+ "more-itertools",
+ "deprecated",
+ "cachetools",
+ "paramiko",
+ "jedi",
+ "sqlparse",
+}
+for i, ex in enumerate(d):
+ img = get_github_avatar(ex["original_repo"])
+
+ name = ex["repo"].split("/")[1]
+ result = subprocess.check_output(f"commit0 get-tests {name} | wc", shell=True, text=True)
+
+ tests = int(result.split()[0])
+ if name.lower() not in skip and name.lower() not in overload:
+ img = f"
"
+ elif name.lower() in overload:
+ img = f"
"
+ else:
+ img = f"{name}"
+ print(f"| {img} | [{name}]({ex['setup']['specification']}) | [[orig](http://github.com/{ex['original_repo']})] | [[commit0](http://github.com/{ex['repo']})] | {tests} |
|")
diff --git a/docs/setup.md b/docs/setup.md
index e6cdce0..756d305 100644
--- a/docs/setup.md
+++ b/docs/setup.md
@@ -1,3 +1,7 @@
+# Quickstart
+
+## Install
+
First be sure that you have docker tools installed.
```bash
@@ -7,42 +11,129 @@ apt install docker
To install the benchmark run,
```bash
-pip install spec2repo
+pip install commit0
```
-Then run
+## Commands
+
+The system is a command-line tool that allows you to run unit tests on a
+variety of libraries in isolated environments. To get started with the full
+setup, run the `clone` command, which will clone the code of a subset
+of libraries to your `repos/` directory.
```bash
-spec2repo new local
+commit0 clone lite
```
-This will generate a file `spec2repo.yml` in your project.
-To launch the benchmark suite run
+Next run the `build` command which will configure Docker containers for
+each of the libraries with isolated virtual environments. The command uses the
+[uv](https://github.com/astral-sh/uv) library for efficient builds.
```bash
-spec2repo launch
+commit0 build lite
```
-This will launch a set of docker instances for each of the repos as well as a
-local master.
+The main operation you can do with these environments is to run tests.
+Here we run [a test](https://github.com/commit-0/simpy/blob/master/tests/test_event.py#L11) in the `simpy` library.
+
+```bash
+commit0 test simpy tests/test_event.py::test_succeed
+```
-Now let's apply a patch to one of our repos:
+This test should run and pass, but others will fail.
+
+```bash
+commit0 test minitorch tests/test_operators.py::test_relu
+```
+
+Let's now manually go in and change that repo.
+This is all just standard shell commands.
```bash
cd repos/minitorch/
-git checkout -b first_change
-patch ../../minitorch.example.patch .
-spec2repo test minitorch first_change test_add
+git checkout -b mychange
```
-This will run the `test_add` in the MiniTorch Repository and show the results.
+And apply and commit this patch.
-To get your current score on a repository you can run
+```
+--- a/minitorch/operators.py
++++ b/minitorch/operators.py
+@@ -81,7 +81,7 @@ def relu(x: float) -> float:
+ (See https://en.wikipedia.org/wiki/Rectifier_(neural_networks) .)
+ """
+ # TODO: Implement for Task 0.1.
+- raise NotImplementedError('Need to implement for Task 0.1')
++    return x if x > 0. else 0.
+```
+
+Once this is done we can run `test` with
+a branch and the environment will sync and run.
+
+```bash
+commit0 test minitorch branch=mychange tests/test_operators.py::test_relu
+```
+
+## Running an Agent
+
+Next we will see how this can be run with an AI agent system.
+We will use [Aider](https://aider.chat/) which is a nice
+command-line oriented agent system.
+
+To set up Aider, first set your API key.
+We recommend using Claude Sonnet.
```bash
-spec2repo score minitorch
+# Work with Claude 3.5 Sonnet on your repo
+export ANTHROPIC_API_KEY=your-key-goes-here
```
-## Running Aider
+Once this is set up, you can run Aider with the following command.
+This will edit the files locally in your branch, but
+run the tests inside the environment.
-...
+```bash
+aider --model sonnet --file repos/minitorch/minitorch/operators.py --message "fill in" \
+ --auto-test --test \
+ --test-cmd 'commit0 test minitorch branch=mychange tests/test_operators.py::test_relu' \
+ --yes
+```
+
+This will run an LLM agent that will try to fill in the
+functions in one file of the minitorch library.
+
+For a full example baseline system that tries to solve
+all the tests in the library see the [baseline](baseline) documentation.
+
+
+## Distributed Tests
+
+One of the main advantages of `commit0` is that it can run
+a range of unit tests in distributed environments.
+
+By default, the library is configured to work with [modal](https://modal.com/).
+
+```bash
+pip install modal
+modal token new
+```
+
+To enable distributed runs, first
+create a file called `distributed.yaml`:
+
+```yaml
+backend: modal
+base_dir: repos.dist/
+```
+
+You can pass this configuration file as an argument to `clone`.
+
+```bash
+commit0 clone lite --cfg=distributed.yaml
+```
+
+Next, to run tests, you can run the standard test command.
+
+```bash
+commit0 test simpy master tests/test_event.py::test_succeed --cfg=distributed.yaml
+```