Commit e0c79a4 (parent: a837b32)

fix: minor eval refactor (improved type names), clarified python tool instructions

File tree: 14 files changed (+108, -84 lines)

Makefile

Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ test:
 	@# if SLOW is not set, pass `-m "not slow"` to skip slow tests
 	poetry run pytest ${SRCDIRS} -v --log-level INFO --durations=5 \
 		--cov=gptme --cov-report=xml --cov-report=term-missing --cov-report=html --junitxml=junit.xml \
-		-n 8 \
+		-n 16 \
 		$(if $(EVAL), , -m "not eval") \
 		$(if $(SLOW), --timeout 60 --retries 2 --retry-delay 5, --timeout 5 -m "not slow and not eval") \
 		$(if $(PROFILE), --profile-svg)

gptme/eval/main.py

Lines changed: 21 additions & 17 deletions

@@ -20,7 +20,7 @@
 from ..message import len_tokens
 from .run import run_evals
 from .suites import suites, tests_default, tests_map
-from .types import CaseResult, ExecResult, ExecTest
+from .types import CaseResult, EvalResult, EvalSpec
 
 # Configure logging, including fully-qualified module names
 logging.basicConfig(
@@ -34,7 +34,7 @@
 project_dir = Path(__file__).parent.parent.parent
 
 
-def print_model_results(model_results: dict[str, list[ExecResult]]):
+def print_model_results(model_results: dict[str, list[EvalResult]]):
     total_tests = 0
     total_tokens = 0
 
@@ -70,7 +70,7 @@ def print_model_results(model_results: dict[str, list[ExecResult]]):
     print(f"Completed {total_tests} tests in {total_tokens}tok")
 
 
-def print_model_results_table(model_results: dict[str, list[ExecResult]]):
+def print_model_results_table(model_results: dict[str, list[EvalResult]]):
     test_names = {
         result.name for results in model_results.values() for result in results
     }
@@ -120,19 +120,23 @@ def main(
 ):
     """
     Run evals for gptme.
+    Pass eval or suite names to run, or result files to print.
 
-    Pass test names to run, or result files to print.
+    Output from evals will be captured, unless a single eval is run, and saved to the results directory.
     """
     # init
     multiprocessing_logging.install_mp_handler()
 
     models = _model or [
         "openai/gpt-4o",
+        "openai/gpt-4o-mini",
         "anthropic/claude-3-5-sonnet-20240620",
+        "anthropic/claude-3-haiku-20240307",
         "openrouter/meta-llama/llama-3.1-405b-instruct",
     ]
 
     results_files = [f for f in eval_names_or_result_files if f.endswith(".csv")]
+    eval_names = [f for f in eval_names_or_result_files if f not in results_files]
     if results_files:
         for results_file in results_files:
             p = Path(results_file)
@@ -148,20 +152,20 @@ def main(
             sys.exit(1)
         sys.exit(0)
 
-    tests_to_run: list[ExecTest] = []
-    for test_name in eval_names_or_result_files:
-        if test_name in tests_map:
-            tests_to_run.append(tests_map[test_name])
-        elif test_name in suites:
-            tests_to_run.extend(suites[test_name])
+    evals_to_run: list[EvalSpec] = []
+    for eval_name in eval_names:
+        if test := tests_map.get(eval_name):
+            evals_to_run.append(test)
+        elif suite := suites.get(eval_name) or suites.get(eval_name.replace("-", "_")):
+            evals_to_run.extend(suite)
         else:
-            raise ValueError(f"Test {test_name} not found")
+            raise ValueError(f"Test {eval_name} not found")
 
-    if not tests_to_run:
-        tests_to_run = tests_default
+    if not evals_to_run:
+        evals_to_run = tests_default
 
     print("=== Running evals ===")
-    model_results = run_evals(tests_to_run, models, timeout, parallel)
+    model_results = run_evals(evals_to_run, models, timeout, parallel)
     print("\n=== Finished ===\n")
 
     print("\n=== Model Results ===")
@@ -211,7 +215,7 @@ def read_log_file(file_path: Path) -> str:
         return ""
 
 
-def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
+def read_results_from_csv(filename: str) -> dict[str, list[EvalResult]]:
     model_results = defaultdict(list)
     results_dir = Path(filename).parent
     with open(filename, newline="") as csvfile:
@@ -220,7 +224,7 @@ def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
             model = row["Model"]
            test_dir = results_dir / model / row["Test"]
 
-            result = ExecResult(
+            result = EvalResult(
                 name=row["Test"],
                 status="success" if row["Passed"] == "true" else "error",
                 results=list(_read_case_results(test_dir / "cases.csv")),
@@ -238,7 +242,7 @@ def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
     return dict(model_results)
 
 
-def write_results(model_results: dict[str, list[ExecResult]]):
+def write_results(model_results: dict[str, list[EvalResult]]):
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     # get current commit hash and dirty status, like: a8b2ef0-dirty
     # TODO: don't assume we are in the gptme repo, use other version identifiers if available
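The rewritten name resolution above now accepts both eval and suite names, with a hyphen-to-underscore fallback for suites. A minimal standalone sketch of that lookup, with hypothetical specs and suite contents standing in for the real registry:

```python
# Standalone sketch of the name-resolution logic from main.py above.
# The example specs and suite contents are made up; only the lookup pattern
# (tests_map first, then suites with a "-"/"_" fallback) mirrors the diff.
hello = {"name": "hello"}
init_git = {"name": "init-git"}

suites = {"init_projects": [init_git]}
tests_map = {"hello": hello, "init-git": init_git}


def resolve(eval_names: list[str]) -> list[dict]:
    evals_to_run: list[dict] = []
    for eval_name in eval_names:
        if test := tests_map.get(eval_name):
            evals_to_run.append(test)
        elif suite := suites.get(eval_name) or suites.get(eval_name.replace("-", "_")):
            evals_to_run.extend(suite)
        else:
            raise ValueError(f"Test {eval_name} not found")
    return evals_to_run


# "init-projects" is not a spec name, so it falls back to the "init_projects" suite
print(resolve(["hello", "init-projects"]))  # [{'name': 'hello'}, {'name': 'init-git'}]
```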

gptme/eval/run.py

Lines changed: 14 additions & 14 deletions

@@ -19,8 +19,8 @@
 from .execenv import SimpleExecutionEnv
 from .types import (
     CaseResult,
-    ExecResult,
-    ExecTest,
+    EvalResult,
+    EvalSpec,
     ResultContext,
     Status,
 )
@@ -52,8 +52,8 @@ class SyncedDict(TypedDict):
 
 
 def run_evals(
-    tests: list[ExecTest], models: list[str], timeout: int, parallel: int
-) -> dict[str, list[ExecResult]]:
+    evals: list[EvalSpec], models: list[str], timeout: int, parallel: int
+) -> dict[str, list[EvalResult]]:
     """
     Run evals for a list of tests.
     """
@@ -67,14 +67,14 @@ def run_evals(
     else:
         cleanup_on_sigterm()
 
-    n_runs = len(tests) * len(models)
-    model_results: dict[str, dict[str, ExecResult]] = defaultdict(dict)
+    n_runs = len(evals) * len(models)
+    model_results: dict[str, dict[str, EvalResult]] = defaultdict(dict)
     parallel = min(n_runs, parallel)
     with ProcessPoolExecutor(parallel) as executor:
         futures = []
         future_to_model_test = {}
         for model in models:
-            for test in tests:
+            for test in evals:
                 future = executor.submit(
                     execute,
                     test,
@@ -103,7 +103,7 @@ def _handle_future(future: Future):
                 logger.exception(
                     f"Test {test_name} for model {model} generated an exception when trying to get result"
                 )
-            result = ExecResult(
+            result = EvalResult(
                 name=test_name,
                 status=status,
                 results=[],
@@ -116,7 +116,7 @@ def _handle_future(future: Future):
         model_results[model][test_name] = result
 
     # worse-case run time, with some buffer to account for overhead
-    max_timeout = timeout * len(tests) / parallel + 10
+    max_timeout = timeout * len(evals) / parallel + 10
     completed = set()
     try:
         # TODO: can we do better than this? handle timeouts within futures instead?
@@ -147,19 +147,19 @@ def _handle_future(future: Future):
             process.terminate()
             process.join()
 
-    model_results_final: dict[str, list[ExecResult]] = defaultdict(list)
+    model_results_final: dict[str, list[EvalResult]] = defaultdict(list)
     for model in model_results:
         # sort results by test order
         model_results_final[model] = sorted(
             model_results[model].values(),
-            key=lambda result: [test["name"] for test in tests].index(result.name),
+            key=lambda result: [test["name"] for test in evals].index(result.name),
         )
 
     return model_results_final
 
 
 # TODO: rewrite to run in Docker? Would help with capturing output + process management.
-def execute(test: ExecTest, agent: Agent, timeout: int, parallel: bool) -> ExecResult:
+def execute(test: EvalSpec, agent: Agent, timeout: int, parallel: bool) -> EvalResult:
     """
     Executes the code for a specific model with a timeout.
     """
@@ -206,7 +206,7 @@ def execute(test: EvalSpec, agent: Agent, timeout: int, parallel: bool) -> EvalR
         gen_stderr = result.get("stderr", "")
     else:
         logger.error("No result in shared dictionary")
-        return ExecResult(
+        return EvalResult(
            name=test["name"],
             status="error",
             results=[],
@@ -256,7 +256,7 @@ def execute(test: EvalSpec, agent: Agent, timeout: int, parallel: bool) -> EvalR
         results = []
         stdout_run, stderr_run = "", ""
 
-    return ExecResult(
+    return EvalResult(
         name=test["name"],
         status=status,
         results=results,
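Aside from the renames, run_evals still sorts each model's results back into the order the specs were passed in. A small sketch of that sort-by-spec-order pattern, using plain dicts as stand-ins for EvalSpec/EvalResult:

```python
# Sketch of the "sort results by test order" step in run_evals above.
# Plain dicts stand in for the EvalSpec/EvalResult types; only the sorting
# pattern (index into the original spec order) is taken from the diff.
evals = [{"name": "hello"}, {"name": "prime100"}, {"name": "init-git"}]
results = [{"name": "init-git"}, {"name": "hello"}]  # completion order, not spec order

order = [spec["name"] for spec in evals]
sorted_results = sorted(results, key=lambda r: order.index(r["name"]))
print([r["name"] for r in sorted_results])  # ['hello', 'init-git']
```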

gptme/eval/suites/__init__.py

Lines changed: 5 additions & 5 deletions

@@ -1,16 +1,16 @@
-from ..types import ExecTest
+from ..types import EvalSpec
 from .basic import tests as tests_basic
 from .browser import tests as tests_browser
 from .init_projects import tests as tests_init_projects
 
-suites: dict[str, list[ExecTest]] = {
+suites: dict[str, list[EvalSpec]] = {
     "basic": tests_basic,
     "init_projects": tests_init_projects,
     "browser": tests_browser,
 }
 
-tests: list[ExecTest] = [test for suite in suites.values() for test in suite]
-tests_map: dict[str, ExecTest] = {test["name"]: test for test in tests}
+tests: list[EvalSpec] = [test for suite in suites.values() for test in suite]
+tests_map: dict[str, EvalSpec] = {test["name"]: test for test in tests}
 
 tests_default_ids: list[str] = [
     "hello",
@@ -19,4 +19,4 @@
     "prime100",
     "init-git",
 ]
-tests_default: list[ExecTest] = [tests_map[test_id] for test_id in tests_default_ids]
+tests_default: list[EvalSpec] = [tests_map[test_id] for test_id in tests_default_ids]
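For context, a hedged usage sketch of the registry defined here (assumes gptme is installed and importable; the printed values reflect the specs shown elsewhere in this commit):

```python
# Hypothetical usage of the suite registry above: look up a default eval by name.
from gptme.eval.suites import suites, tests_default, tests_map

spec = tests_map["hello"]            # an EvalSpec (TypedDict), keyed by its "name"
print(spec["name"], spec["run"])     # "hello", "python hello.py" (per suites/basic.py)
print(list(suites))                  # ["basic", "init_projects", "browser"]
print([s["name"] for s in tests_default])
```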

gptme/eval/suites/basic.py

Lines changed: 6 additions & 6 deletions

@@ -1,7 +1,7 @@
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from gptme.eval.main import ExecTest
+    from gptme.eval.main import EvalSpec
 
 
 def correct_output_hello_world(ctx):
@@ -28,30 +28,30 @@ def check_output_hello_ask(ctx):
     return "Hello, Erik!" in ctx.stdout
 
 
-tests: list["ExecTest"] = [
+tests: list["EvalSpec"] = [
     {
         "name": "hello",
         "files": {},
         "run": "python hello.py",
-        "prompt": "write a script hello.py which prints 'Hello, world!'",
+        "prompt": 'write a script hello.py which prints "Hello, world!"',
         "expect": {
             "correct output": correct_output_hello_world,
             "correct file": check_exists_hello,
         },
     },
     {
         "name": "hello-patch",
-        "files": {"hello.py": "print('Hello, world!')"},
+        "files": {"hello.py": 'print("Hello, world!")'},
         "run": "python hello.py",
-        "prompt": "Patch the code in hello.py to print 'Hello, human!'",
+        "prompt": 'Patch the code in hello.py to print "Hello, human!"',
         "expect": {
             "correct output": correct_output_hello_human,
             "correct file": check_exists_hello,
         },
     },
     {
         "name": "hello-ask",
-        "files": {"hello.py": "print('Hello, world!')"},
+        "files": {"hello.py": 'print("Hello, world!")'},
         "run": "echo 'Erik' | python hello.py",
         # TODO: work around the "don't try to execute it" part by improving gptme such that it just gives EOF to stdin in non-interactive mode
         "prompt": "modify hello.py to ask the user for their name and print 'Hello, <name>!'. don't try to execute it",

gptme/eval/suites/browser.py

Lines changed: 2 additions & 2 deletions

@@ -1,14 +1,14 @@
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from gptme.eval.main import ExecTest
+    from gptme.eval.main import EvalSpec
 
 
 def check_output_erik(ctx):
     return "Erik" in ctx.stdout
 
 
-tests: list["ExecTest"] = [
+tests: list["EvalSpec"] = [
     {
         "name": "whois-superuserlabs-ceo",
         "files": {},

gptme/eval/suites/init_projects.py

Lines changed: 2 additions & 2 deletions

@@ -1,7 +1,7 @@
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from gptme.eval.main import ExecTest
+    from gptme.eval.main import EvalSpec
 
 
 def check_clean_exit(ctx):
@@ -41,7 +41,7 @@ def check_exists_main(ctx):
     return "main.py" in ctx.files
 
 
-tests: list["ExecTest"] = [
+tests: list["EvalSpec"] = [
     {
         "name": "init-git",
         "files": {},

gptme/eval/types.py

Lines changed: 4 additions & 4 deletions

@@ -31,9 +31,9 @@ class CaseResult:
 
 
 @dataclass
-class ExecResult:
+class EvalResult:
     """
-    Result of executing a prompt.
+    Result of executing an eval.
     """
 
     name: str
@@ -46,9 +46,9 @@ class ExecResult:
     run_stderr: str
 
 
-class ExecTest(TypedDict):
+class EvalSpec(TypedDict):
     """
-    Test case for executing a prompt.
+    Specification for an eval/test case.
     """
 
     name: str
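A hedged sketch of what a spec looks like under the new EvalSpec name, using only the keys visible in this commit ("name", "files", "run", "prompt", "expect"); any further required keys of the real TypedDict are not shown in the diff, and the eval itself is made up for illustration:

```python
# Hypothetical EvalSpec instance, mirroring the shape used in gptme/eval/suites/*.py.
# Only keys visible in this commit are used; the spec and check are illustrative.
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from gptme.eval.types import EvalSpec


def check_exists_readme(ctx) -> bool:  # hypothetical check, same style as the suite helpers
    return "README.md" in ctx.files


example: "EvalSpec" = {
    "name": "write-readme",  # made-up eval name, for illustration only
    "files": {},
    "run": "cat README.md",
    "prompt": "write a short README.md for this project",
    "expect": {"file exists": check_exists_readme},
}
```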

gptme/llm.py

Lines changed: 3 additions & 2 deletions

@@ -92,10 +92,11 @@ def print_clear():
             # need to flush stdout to get the print to show up
             sys.stdout.flush()
 
+            # TODO: make this more robust/general, maybe with a callback that runs on each char/chunk
             # pause inference on finished code-block, letting user run the command before continuing
             tooluses = list(ToolUse.iter_from_content(output))
-            if tooluses:
-                logger.debug("Found tool use, breaking")
+            if tooluses and any(tooluse.is_runnable for tooluse in tooluses):
+                logger.warning("Found tool use, breaking")
                 break
     except KeyboardInterrupt:
         return Message("assistant", output + "... ^C Interrupted")
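The new stop condition only pauses streaming when a detected tool use is actually runnable. A sketch of that check in isolation, with a stand-in ToolUse class (the real gptme ToolUse is only assumed to expose iter_from_content and is_runnable, as used in the diff):

```python
# Sketch of the new stop condition from llm.py above, decoupled from streaming:
# only pause generation when at least one detected tool use can actually be run.
from dataclasses import dataclass


@dataclass
class ToolUse:  # stand-in for gptme's ToolUse, not the real class
    tool: str
    is_runnable: bool


def should_pause(tooluses: list[ToolUse]) -> bool:
    return bool(tooluses) and any(t.is_runnable for t in tooluses)


print(should_pause([ToolUse("save", False)]))  # False: nothing to execute, keep streaming
print(should_pause([ToolUse("shell", True)]))  # True: pause so the user can run the command
```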
