13 changes: 13 additions & 0 deletions ci/run_code_cov.py
@@ -0,0 +1,13 @@
#!/usr/bin/env python3
import subprocess

cmd = ['pytest', '--cov=simpleval', '--cov-report=term', '--cov-report=term-missing', '--cov-report=html', '--cov-fail-under=90']
try:
    subprocess.run(cmd, check=True)
except subprocess.CalledProcessError as e:
    print('\nCommand failed with exit code:', e.returncode)
    if e.stdout:
        print('\nSTDOUT:\n', e.stdout.decode() if isinstance(e.stdout, bytes) else e.stdout)
    if e.stderr:
        print('\nSTDERR:\n', e.stderr.decode() if isinstance(e.stderr, bytes) else e.stderr)
    raise
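Note that `subprocess.run` only populates `CalledProcessError.stdout` and `stderr` when output is captured, so in the script as merged those two branches never print and pytest's output simply streams to the terminal. A minimal sketch (not part of the PR) of a variant that captures the output so it is available on the exception:

#!/usr/bin/env python3
# Hypothetical variant of ci/run_code_cov.py: capture pytest's output so it can be
# re-printed from the CalledProcessError. The merged script streams output directly instead.
import subprocess

cmd = ['pytest', '--cov=simpleval', '--cov-fail-under=90']  # flag list trimmed for brevity
try:
    subprocess.run(cmd, check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as e:
    print('\nCommand failed with exit code:', e.returncode)
    print('\nSTDOUT:\n', e.stdout)
    print('\nSTDERR:\n', e.stderr)
    raise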
2 changes: 2 additions & 0 deletions docs/developers/dev-notes.md
@@ -39,6 +39,8 @@
* Code coverage checks - must be above 90%
* Verify uv.lock and requirements.txt are synced

Optionally, run `./ci/run_code_cov.py` to verify that code coverage meets the 90% threshold.

!!! info "React checks"
    If you made changes to the `reports-frontend` reports, run the `ci/run_checks_react.py` script.
    This will run the following steps:
18 changes: 9 additions & 9 deletions simpleval/commands/init_command/base_init.py
@@ -1,6 +1,7 @@
import json
import logging
import os
+import shutil
from abc import ABC, abstractmethod

from colorama import Fore
@@ -45,16 +46,15 @@ def run_init_command(self):
        new_testcases_folder = os.path.join(eval_dir, TESTCASES_FOLDER, testcase)
        os.makedirs(new_testcases_folder)

-        rc = 0
-        rc += os.system(f'cp {os.path.join(empty_eval_set_folder, EVAL_CONFIG_FILE)} {new_eval_set_folder}') # noqa
-        rc += os.system(f'cp {os.path.join(empty_eval_set_folder, GROUND_TRUTH_FILE)} {new_eval_set_folder}') # noqa
-        rc += os.system(f'cp {os.path.join(empty_eval_set_folder, "README.md")} {new_eval_set_folder}') # noqa
+        try:
+            shutil.copy(os.path.join(empty_eval_set_folder, EVAL_CONFIG_FILE), new_eval_set_folder)
+            shutil.copy(os.path.join(empty_eval_set_folder, GROUND_TRUTH_FILE), new_eval_set_folder)
+            shutil.copy(os.path.join(empty_eval_set_folder, 'README.md'), new_eval_set_folder)

-        rc += os.system(f'cp {os.path.join(empty_testcase_folder, "__init__.py")} {new_testcases_folder}') # noqa
-        rc += os.system(f'cp {os.path.join(empty_testcase_folder, PLUGIN_FILE_NAME)} {new_testcases_folder}') # noqa
-
-        if rc != 0:
-            raise TerminationError(f'{Fore.RED}Error occurred during creating new evaluation{Fore.RESET}')
+            shutil.copy(os.path.join(empty_testcase_folder, '__init__.py'), new_testcases_folder)
+            shutil.copy(os.path.join(empty_testcase_folder, PLUGIN_FILE_NAME), new_testcases_folder)
+        except Exception as e:
+            raise TerminationError(f'{Fore.RED}Error occurred creating the new evaluation: {e}{Fore.RESET}')

        with open(os.path.join(new_eval_set_folder, EVAL_CONFIG_FILE), 'w', encoding='utf-8') as file:
            json.dump(new_config.model_dump(exclude_none=True), file, indent=4)
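Replacing the `cp` shell-outs with `shutil.copy` surfaces failures as exceptions rather than summed return codes and removes the dependency on a Unix `cp` binary. A sketch of how the new failure path could be exercised, assuming the template flow goes through the `run_init_command` shown above; this test is hypothetical and not part of the PR, and the `TerminationError` import path is an assumption:

from unittest import mock

import pytest

from simpleval.commands.init_command import init_from_template_command
from simpleval.exceptions import TerminationError  # assumed import path for TerminationError


def test_init_from_template_wraps_copy_failures(tmp_path):
    # Force every file copy to fail and expect the error to be wrapped in TerminationError.
    with mock.patch('shutil.copy', side_effect=OSError('disk full')):
        with pytest.raises(TerminationError):
            init_from_template_command.init_from_template_command(str(tmp_path / 'broken_eval'), 'testcase')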
61 changes: 61 additions & 0 deletions tests/unit/test_init_command.py
@@ -0,0 +1,61 @@
import json
import uuid
from unittest import mock

from simpleval.commands.init_command import init_command, init_from_template_command


def verify_eval_config_file(eval_dir, expected):
    config_path = eval_dir / 'config.json'
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)

    assert config['name'] == eval_dir.name, config
    assert config['max_concurrent_judge_tasks'] == expected['max_concurrent_judge_tasks'], config
    assert config['max_concurrent_llm_tasks'] == expected['max_concurrent_llm_tasks'], config
    assert config['llm_as_a_judge_name'] == expected['llm_as_a_judge_name'], config
    assert config['eval_metrics'] == expected['eval_metrics'], config


def test_init_command_creates_eval_set(tmp_path):
    eval_dir = tmp_path / f'temp_{uuid.uuid4()}'
    testcase = 'testcase'

    input_values = [str(eval_dir), testcase]
    def input_side_effect(prompt):
        return input_values.pop(0)

    metrics_from_user = ['correctness', 'accuracy', 'relevancy']

    with mock.patch('builtins.input', side_effect=input_side_effect), \
         mock.patch('simpleval.commands.init_command.user_functions.pick_judge', return_value='dummy_judge'), \
         mock.patch('simpleval.commands.init_command.user_functions.get_model_id_from_user', return_value='dummy_model_id'), \
         mock.patch('simpleval.commands.init_command.user_functions.get_metrics_from_user', return_value=metrics_from_user), \
         mock.patch('simpleval.commands.init_command.user_functions.get_concurrency_values', return_value=(10, 10)):
        init_command.init_command()

    verify_eval_config_file(
        eval_dir,
        {
            'max_concurrent_judge_tasks': 10,
            'max_concurrent_llm_tasks': 10,
            'llm_as_a_judge_name': 'dummy_judge',
            'eval_metrics': metrics_from_user,
        }
    )


def test_init_from_template_command_runs(tmp_path):
    eval_dir = tmp_path / f'temp_{uuid.uuid4()}'
    testcase = 'testcase'
    init_from_template_command.init_from_template_command(str(eval_dir), testcase)

    verify_eval_config_file(
        eval_dir,
        {
            'max_concurrent_judge_tasks': 10,
            'max_concurrent_llm_tasks': 10,
            'llm_as_a_judge_name': 'open_ai',
            'eval_metrics': [],
        }
    )
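For reference, the input-mocking pattern used in `test_init_command_creates_eval_set` can be reduced to a standalone sketch (illustrative only, not part of the PR): patching `builtins.input` with a `side_effect` callable makes each prompt consume the next scripted answer, so the interactive init flow runs unattended. The paths and prompt strings below are made up for illustration.

# Standalone sketch of the input-mocking technique; values are hypothetical.
from unittest import mock

answers = ['/tmp/my_eval', 'my_testcase']

with mock.patch('builtins.input', side_effect=lambda prompt='': answers.pop(0)):
    eval_dir = input('Evaluation directory: ')  # returns '/tmp/my_eval'
    testcase = input('Testcase name: ')         # returns 'my_testcase'

assert (eval_dir, testcase) == ('/tmp/my_eval', 'my_testcase')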