 from ..message import len_tokens
 from .run import run_evals
 from .suites import suites, tests_default, tests_map
-from .types import CaseResult, ExecResult, ExecTest
+from .types import CaseResult, EvalResult, EvalSpec
 
 # Configure logging, including fully-qualified module names
 logging.basicConfig(
...
 project_dir = Path(__file__).parent.parent.parent
 
 
-def print_model_results(model_results: dict[str, list[ExecResult]]):
+def print_model_results(model_results: dict[str, list[EvalResult]]):
     total_tests = 0
     total_tokens = 0
@@ -70,7 +70,7 @@ def print_model_results(model_results: dict[str, list[ExecResult]]):
     print(f"Completed {total_tests} tests in {total_tokens} tok")
 
 
-def print_model_results_table(model_results: dict[str, list[ExecResult]]):
+def print_model_results_table(model_results: dict[str, list[EvalResult]]):
     test_names = {
         result.name for results in model_results.values() for result in results
     }
@@ -120,19 +120,23 @@ def main(
 ):
     """
     Run evals for gptme.
+    Pass eval or suite names to run, or result files to print.
 
-    Pass test names to run, or result files to print.
+    Output from evals will be captured (unless a single eval is run) and saved to the results directory.
     """
     # init
     multiprocessing_logging.install_mp_handler()
 
     models = _model or [
         "openai/gpt-4o",
+        "openai/gpt-4o-mini",
         "anthropic/claude-3-5-sonnet-20240620",
+        "anthropic/claude-3-haiku-20240307",
         "openrouter/meta-llama/llama-3.1-405b-instruct",
     ]
 
     results_files = [f for f in eval_names_or_result_files if f.endswith(".csv")]
+    eval_names = [f for f in eval_names_or_result_files if f not in results_files]
     if results_files:
         for results_file in results_files:
             p = Path(results_file)
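
The new `eval_names` split above partitions a mixed argument list by the `.csv` suffix: results files get printed, everything else is treated as an eval or suite name. A minimal sketch of that behavior, with hypothetical argument values:

```python
# Hypothetical CLI arguments: one eval name and one results file (values are illustrative).
eval_names_or_result_files = ["hello", "eval_results/results.csv"]

# Same partitioning as in the hunk above.
results_files = [f for f in eval_names_or_result_files if f.endswith(".csv")]
eval_names = [f for f in eval_names_or_result_files if f not in results_files]

assert results_files == ["eval_results/results.csv"]
assert eval_names == ["hello"]
```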
@@ -148,20 +152,20 @@ def main(
             sys.exit(1)
         sys.exit(0)
 
-    tests_to_run: list[ExecTest] = []
-    for test_name in eval_names_or_result_files:
-        if test_name in tests_map:
-            tests_to_run.append(tests_map[test_name])
-        elif test_name in suites:
-            tests_to_run.extend(suites[test_name])
+    evals_to_run: list[EvalSpec] = []
+    for eval_name in eval_names:
+        if test := tests_map.get(eval_name):
+            evals_to_run.append(test)
+        elif suite := suites.get(eval_name) or suites.get(eval_name.replace("-", "_")):
+            evals_to_run.extend(suite)
         else:
-            raise ValueError(f"Test {test_name} not found")
+            raise ValueError(f"Test {eval_name} not found")
 
-    if not tests_to_run:
-        tests_to_run = tests_default
+    if not evals_to_run:
+        evals_to_run = tests_default
 
     print("=== Running evals ===")
-    model_results = run_evals(tests_to_run, models, timeout, parallel)
+    model_results = run_evals(evals_to_run, models, timeout, parallel)
     print("\n=== Finished ===\n")
 
     print("\n=== Model Results ===")
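
The rewritten lookup also accepts suite names written with dashes by falling back to an underscored key. A minimal sketch of the resolution order, using illustrative stand-ins for `tests_map` and `suites` (the real mappings from `.suites` hold `EvalSpec` entries, and the names used here are hypothetical):

```python
# Illustrative stand-ins; the real tests_map/suites hold EvalSpec entries.
tests_map = {"hello": {"name": "hello"}}
suites = {"init_git": [{"name": "init-git"}]}  # suites keyed with underscores

def resolve(name: str) -> list:
    # Single eval takes precedence, then suite by exact name, then dash->underscore fallback.
    if test := tests_map.get(name):
        return [test]
    if suite := suites.get(name) or suites.get(name.replace("-", "_")):
        return suite
    raise ValueError(f"Test {name} not found")

assert resolve("hello") == [{"name": "hello"}]
assert resolve("init-git") == suites["init_git"]  # dash form resolves via the fallback
```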
@@ -211,7 +215,7 @@ def read_log_file(file_path: Path) -> str:
     return ""
 
 
-def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
+def read_results_from_csv(filename: str) -> dict[str, list[EvalResult]]:
     model_results = defaultdict(list)
     results_dir = Path(filename).parent
     with open(filename, newline="") as csvfile:
@@ -220,7 +224,7 @@ def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
             model = row["Model"]
             test_dir = results_dir / model / row["Test"]
 
-            result = ExecResult(
+            result = EvalResult(
                 name=row["Test"],
                 status="success" if row["Passed"] == "true" else "error",
                 results=list(_read_case_results(test_dir / "cases.csv")),
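
For orientation, the read path above implies a results layout roughly like the sketch below; only the `Model`, `Test`, and `Passed` columns and the per-test `cases.csv` are visible in this diff, so the file and directory names here are assumptions:

```python
# Assumed layout implied by read_results_from_csv (paths are illustrative):
#
#   <results_dir>/results.csv                  # columns include Model, Test, Passed
#   <results_dir>/<model>/<test>/cases.csv     # per-test case results, parsed by _read_case_results()
#
# A row with Passed == "true" maps back to status="success"; anything else becomes "error".
```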
@@ -238,7 +242,7 @@ def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
     return dict(model_results)
 
 
-def write_results(model_results: dict[str, list[ExecResult]]):
+def write_results(model_results: dict[str, list[EvalResult]]):
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     # get current commit hash and dirty status, like: a8b2ef0-dirty
     # TODO: don't assume we are in the gptme repo, use other version identifiers if available