13 changes: 13 additions & 0 deletions ci/run_code_cov.py
@@ -0,0 +1,13 @@
#!/usr/bin/env python3
import subprocess

cmd = ['pytest', '--cov=simpleval', '--cov-report=term', '--cov-report=term-missing', '--cov-report=html', '--cov-fail-under=90']
try:
    subprocess.run(cmd, check=True)
except subprocess.CalledProcessError as e:
    print('\nCommand failed with exit code:', e.returncode)
    if e.stdout:
        print('\nSTDOUT:\n', e.stdout.decode() if isinstance(e.stdout, bytes) else e.stdout)
    if e.stderr:
        print('\nSTDERR:\n', e.stderr.decode() if isinstance(e.stderr, bytes) else e.stderr)
    raise
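Note that `subprocess.run` only populates `CalledProcessError.stdout` and `stderr` when output is captured, so in the script as merged those two branches never print and pytest's output simply streams to the terminal. A minimal sketch (not part of the PR) of a variant that captures the output so it is available on the exception:

#!/usr/bin/env python3
# Hypothetical variant of ci/run_code_cov.py: capture pytest's output so it can be
# re-printed from the CalledProcessError. The merged script streams output directly instead.
import subprocess

cmd = ['pytest', '--cov=simpleval', '--cov-fail-under=90']  # flag list trimmed for brevity
try:
    subprocess.run(cmd, check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as e:
    print('\nCommand failed with exit code:', e.returncode)
    print('\nSTDOUT:\n', e.stdout)
    print('\nSTDERR:\n', e.stderr)
    raise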
2 changes: 2 additions & 0 deletions docs/developers/dev-notes.md
@@ -39,6 +39,8 @@
* Code coverage checks - must be above 90%
* Verify uv.lock and requirements.txt are synced

Optionally, run `./ci/run_code_cov.py` to verify that code coverage meets the 90% threshold.

!!! info "React checks"
    If you made changes to the `reports-frontend` reports, run the `ci/run_checks_react.py` script.
    This will run the following steps:
18 changes: 9 additions & 9 deletions simpleval/commands/init_command/base_init.py
@@ -1,6 +1,7 @@
import json
import logging
import os
+import shutil
from abc import ABC, abstractmethod

from colorama import Fore
@@ -45,16 +46,15 @@ def run_init_command(self):
        new_testcases_folder = os.path.join(eval_dir, TESTCASES_FOLDER, testcase)
        os.makedirs(new_testcases_folder)

-        rc = 0
-        rc += os.system(f'cp {os.path.join(empty_eval_set_folder, EVAL_CONFIG_FILE)} {new_eval_set_folder}') # noqa
-        rc += os.system(f'cp {os.path.join(empty_eval_set_folder, GROUND_TRUTH_FILE)} {new_eval_set_folder}') # noqa
-        rc += os.system(f'cp {os.path.join(empty_eval_set_folder, "README.md")} {new_eval_set_folder}') # noqa
+        try:
+            shutil.copy(os.path.join(empty_eval_set_folder, EVAL_CONFIG_FILE), new_eval_set_folder)
+            shutil.copy(os.path.join(empty_eval_set_folder, GROUND_TRUTH_FILE), new_eval_set_folder)
+            shutil.copy(os.path.join(empty_eval_set_folder, 'README.md'), new_eval_set_folder)

-        rc += os.system(f'cp {os.path.join(empty_testcase_folder, "__init__.py")} {new_testcases_folder}') # noqa
-        rc += os.system(f'cp {os.path.join(empty_testcase_folder, PLUGIN_FILE_NAME)} {new_testcases_folder}') # noqa
-
-        if rc != 0:
-            raise TerminationError(f'{Fore.RED}Error occurred during creating new evaluation{Fore.RESET}')
+            shutil.copy(os.path.join(empty_testcase_folder, '__init__.py'), new_testcases_folder)
+            shutil.copy(os.path.join(empty_testcase_folder, PLUGIN_FILE_NAME), new_testcases_folder)
+        except Exception as e:
+            raise TerminationError(f'{Fore.RED}Error occurred creating the new evaluation: {e}{Fore.RESET}')

        with open(os.path.join(new_eval_set_folder, EVAL_CONFIG_FILE), 'w', encoding='utf-8') as file:
            json.dump(new_config.model_dump(exclude_none=True), file, indent=4)
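Replacing the `cp` shell-outs with `shutil.copy` surfaces failures as exceptions rather than summed return codes and removes the dependency on a Unix `cp` binary. A sketch of how the new failure path could be exercised, assuming the template flow goes through the `run_init_command` shown above; this test is hypothetical and not part of the PR, and the `TerminationError` import path is an assumption:

from unittest import mock

import pytest

from simpleval.commands.init_command import init_from_template_command
from simpleval.exceptions import TerminationError  # assumed import path for TerminationError


def test_init_from_template_wraps_copy_failures(tmp_path):
    # Force every file copy to fail and expect the error to be wrapped in TerminationError.
    with mock.patch('shutil.copy', side_effect=OSError('disk full')):
        with pytest.raises(TerminationError):
            init_from_template_command.init_from_template_command(str(tmp_path / 'broken_eval'), 'testcase')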
61 changes: 61 additions & 0 deletions tests/unit/test_init_command.py
@@ -0,0 +1,61 @@
import json
import uuid
from unittest import mock

from simpleval.commands.init_command import init_command, init_from_template_command


def verify_eval_config_file(eval_dir, expected):
    config_path = eval_dir / 'config.json'
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)

    assert config['name'] == eval_dir.name, config
    assert config['max_concurrent_judge_tasks'] == expected['max_concurrent_judge_tasks'], config
    assert config['max_concurrent_llm_tasks'] == expected['max_concurrent_llm_tasks'], config
    assert config['llm_as_a_judge_name'] == expected['llm_as_a_judge_name'], config
    assert config['eval_metrics'] == expected['eval_metrics'], config


def test_init_command_creates_eval_set(tmp_path):
    eval_dir = tmp_path / f'temp_{uuid.uuid4()}'
    testcase = 'testcase'

    input_values = [str(eval_dir), testcase]
    def input_side_effect(prompt):
        return input_values.pop(0)

    metrics_from_user = ['correctness', 'accuracy', 'relevancy']

    with mock.patch('builtins.input', side_effect=input_side_effect), \
         mock.patch('simpleval.commands.init_command.user_functions.pick_judge', return_value='dummy_judge'), \
         mock.patch('simpleval.commands.init_command.user_functions.get_model_id_from_user', return_value='dummy_model_id'), \
         mock.patch('simpleval.commands.init_command.user_functions.get_metrics_from_user', return_value=metrics_from_user), \
         mock.patch('simpleval.commands.init_command.user_functions.get_concurrency_values', return_value=(10, 10)):
        init_command.init_command()

    verify_eval_config_file(
        eval_dir,
        {
            'max_concurrent_judge_tasks': 10,
            'max_concurrent_llm_tasks': 10,
            'llm_as_a_judge_name': 'dummy_judge',
            'eval_metrics': metrics_from_user,
        }
    )


def test_init_from_template_command_runs(tmp_path):
    eval_dir = tmp_path / f'temp_{uuid.uuid4()}'
    testcase = 'testcase'
    init_from_template_command.init_from_template_command(str(eval_dir), testcase)

    verify_eval_config_file(
        eval_dir,
        {
            'max_concurrent_judge_tasks': 10,
            'max_concurrent_llm_tasks': 10,
            'llm_as_a_judge_name': 'open_ai',
            'eval_metrics': [],
        }
    )
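For reference, the input-mocking pattern used in `test_init_command_creates_eval_set` can be reduced to a standalone sketch (illustrative only, not part of the PR): patching `builtins.input` with a `side_effect` callable makes each prompt consume the next scripted answer, so the interactive init flow runs unattended. The paths and prompt strings below are made up for illustration.

# Standalone sketch of the input-mocking technique; values are hypothetical.
from unittest import mock

answers = ['/tmp/my_eval', 'my_testcase']

with mock.patch('builtins.input', side_effect=lambda prompt='': answers.pop(0)):
    eval_dir = input('Evaluation directory: ')  # returns '/tmp/my_eval'
    testcase = input('Testcase name: ')         # returns 'my_testcase'

assert (eval_dir, testcase) == ('/tmp/my_eval', 'my_testcase')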