diff --git a/codeflash/verification/__init__.py b/codeflash/verification/__init__.py
index e69de29bb..9732b6753 100644
--- a/codeflash/verification/__init__.py
+++ b/codeflash/verification/__init__.py
@@ -0,0 +1,31 @@
+"""Verification module for codeflash.
+
+This module provides test running and verification functionality.
+"""
+
+
+def __getattr__(name: str):  # noqa: ANN202
+    """Lazy import for LLM tools to avoid circular imports."""
+    if name in (
+        "AVAILABLE_TOOLS",
+        "RUN_BEHAVIORAL_TESTS_TOOL_SCHEMA",
+        "execute_tool",
+        "get_all_tool_schemas",
+        "get_tool_schema",
+        "run_behavioral_tests_tool",
+    ):
+        from codeflash.verification import llm_tools
+
+        return getattr(llm_tools, name)
+    msg = f"module {__name__!r} has no attribute {name!r}"
+    raise AttributeError(msg)
+
+
+__all__ = [
+    "AVAILABLE_TOOLS",
+    "RUN_BEHAVIORAL_TESTS_TOOL_SCHEMA",
+    "execute_tool",
+    "get_all_tool_schemas",
+    "get_tool_schema",
+    "run_behavioral_tests_tool",
+]
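
The package-level __getattr__ above is PEP 562 lazy loading: the tool names are only imported from llm_tools the first time they are accessed, which avoids a circular import at package import time. A minimal sketch of how a caller is expected to use the re-export (assuming the package is importable; the printed output is illustrative):

    # Sketch: accessing a re-exported name triggers the lazy import of
    # codeflash.verification.llm_tools via the package-level __getattr__.
    from codeflash.verification import get_all_tool_schemas

    schemas = get_all_tool_schemas()  # llm_tools is imported on this first access
    print([s["function"]["name"] for s in schemas])  # expected to include "run_behavioral_tests"
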
diff --git a/codeflash/verification/llm_tools.py b/codeflash/verification/llm_tools.py
new file mode 100644
index 000000000..960b70309
--- /dev/null
+++ b/codeflash/verification/llm_tools.py
@@ -0,0 +1,321 @@
+"""LLM Tool definitions for verification functions.
+
+This module exposes verification functions as tools that can be called by LLMs.
+Each tool has a JSON schema definition and a simplified wrapper function.
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from codeflash.models.models import TestFile, TestFiles, TestType
+from codeflash.verification.parse_test_output import parse_test_xml
+from codeflash.verification.test_runner import run_behavioral_tests
+from codeflash.verification.verification_utils import TestConfig
+
+
+class TestFileInput(BaseModel):
+    """Input schema for a single test file."""
+
+    test_file_path: str = Field(description="Absolute path to the test file to run")
+    test_type: str = Field(
+        default="existing_unit_test",
+        description="Type of test: 'existing_unit_test', 'generated_regression', 'replay_test', or 'concolic_coverage_test'",
+    )
+
+
+class RunBehavioralTestsInput(BaseModel):
+    """Input schema for the run_behavioral_tests tool."""
+
+    test_files: list[TestFileInput] = Field(description="List of test files to run")
+    test_framework: str = Field(default="pytest", description="Test framework to use: 'pytest' or 'unittest'")
+    project_root: str = Field(description="Absolute path to the project root directory")
+    pytest_timeout: int | None = Field(default=30, description="Timeout in seconds for each pytest test")
+    verbose: bool = Field(default=False, description="Enable verbose output")
+
+
+class TestResultOutput(BaseModel):
+    """Output schema for a single test result."""
+
+    test_id: str = Field(description="Unique identifier for the test")
+    test_file: str = Field(description="Path to the test file")
+    test_function: str | None = Field(description="Name of the test function")
+    passed: bool = Field(description="Whether the test passed")
+    runtime_ns: int | None = Field(description="Runtime in nanoseconds, if available")
+    timed_out: bool = Field(description="Whether the test timed out")
+
+
+class RunBehavioralTestsOutput(BaseModel):
+    """Output schema for the run_behavioral_tests tool."""
+
+    success: bool = Field(description="Whether the test run completed successfully")
+    total_tests: int = Field(description="Total number of tests run")
+    passed_tests: int = Field(description="Number of tests that passed")
+    failed_tests: int = Field(description="Number of tests that failed")
+    results: list[TestResultOutput] = Field(description="Detailed results for each test")
+    stdout: str = Field(description="Standard output from the test run")
+    stderr: str = Field(description="Standard error from the test run")
+    error: str | None = Field(default=None, description="Error message if the run failed")
+
+
+# JSON Schema for OpenAI-style function calling
+RUN_BEHAVIORAL_TESTS_TOOL_SCHEMA = {
+    "type": "function",
+    "function": {
+        "name": "run_behavioral_tests",
+        "description": (
+            "Run behavioral tests to verify code correctness. "
+            "This executes test files using pytest or unittest and returns detailed results "
+            "including pass/fail status, runtime information, and any errors encountered."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "test_files": {
+                    "type": "array",
+                    "description": "List of test files to run",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "test_file_path": {
+                                "type": "string",
+                                "description": "Absolute path to the test file to run",
+                            },
+                            "test_type": {
+                                "type": "string",
+                                "enum": [
+                                    "existing_unit_test",
+                                    "generated_regression",
+                                    "replay_test",
+                                    "concolic_coverage_test",
+                                ],
+                                "default": "existing_unit_test",
+                                "description": "Type of test being run",
+                            },
+                        },
+                        "required": ["test_file_path"],
+                    },
+                },
+                "test_framework": {
+                    "type": "string",
+                    "enum": ["pytest", "unittest"],
+                    "default": "pytest",
+                    "description": "Test framework to use",
+                },
+                "project_root": {"type": "string", "description": "Absolute path to the project root directory"},
+                "pytest_timeout": {
+                    "type": "integer",
+                    "default": 30,
+                    "description": "Timeout in seconds for each pytest test",
+                },
+                "verbose": {"type": "boolean", "default": False, "description": "Enable verbose output"},
+            },
+            "required": ["test_files", "project_root"],
+        },
+    },
+}
+
+
+def _test_type_from_string(test_type_str: str) -> TestType:
+    """Convert a string test type to TestType enum."""
+    mapping = {
+        "existing_unit_test": TestType.EXISTING_UNIT_TEST,
+        "generated_regression": TestType.GENERATED_REGRESSION,
+        "replay_test": TestType.REPLAY_TEST,
+        "concolic_test": TestType.CONCOLIC_COVERAGE_TEST,
+        "concolic_coverage_test": TestType.CONCOLIC_COVERAGE_TEST,
+    }
+    return mapping.get(test_type_str.lower(), TestType.EXISTING_UNIT_TEST)
+
+
+def run_behavioral_tests_tool(
+    test_files: list[dict[str, Any]],
+    project_root: str,
+    test_framework: str = "pytest",
+    pytest_timeout: int | None = 30,
+    verbose: bool = False,  # noqa: FBT002, FBT001
+) -> dict[str, Any]:
+    """Run behavioral tests and return results in an LLM-friendly format.
+
+    This is a simplified wrapper around run_behavioral_tests that accepts
+    primitive types suitable for LLM tool calling and returns a structured
+    dictionary response.
+
+    Args:
+        test_files: List of dicts with 'test_file_path' and optional 'test_type'
+        project_root: Absolute path to the project root directory
+        test_framework: Test framework to use ('pytest' or 'unittest')
+        pytest_timeout: Timeout in seconds for each pytest test
+        verbose: Enable verbose output
+
+    Returns:
+        Dictionary containing test results with success status, counts, and details
+
+    Example:
+        >>> result = run_behavioral_tests_tool(
+        ...     test_files=[{"test_file_path": "/path/to/test_example.py"}], project_root="/path/to/project"
+        ... )
+        >>> print(result["passed_tests"], "tests passed")
+
+    """
+    try:
+        project_root_path = Path(project_root).resolve()
+
+        # Build TestFiles structure
+        test_file_objects = []
+        for tf in test_files:
+            test_file_path = Path(tf["test_file_path"]).resolve()
+            test_type_str = tf.get("test_type", "existing_unit_test")
+            test_type = _test_type_from_string(test_type_str)
+
+            test_file_objects.append(
+                TestFile(
+                    instrumented_behavior_file_path=test_file_path,
+                    benchmarking_file_path=test_file_path,
+                    original_file_path=test_file_path,
+                    test_type=test_type,
+                )
+            )
+
+        test_files_model = TestFiles(test_files=test_file_objects)
+
+        # Set up test environment
+        test_env = os.environ.copy()
+        test_env["CODEFLASH_TEST_ITERATION"] = "0"
+        test_env["CODEFLASH_TRACER_DISABLE"] = "1"
+
+        # Ensure PYTHONPATH includes project root
+        if "PYTHONPATH" not in test_env:
+            test_env["PYTHONPATH"] = str(project_root_path)
+        else:
+            test_env["PYTHONPATH"] += os.pathsep + str(project_root_path)
+
+        # Run the tests
+        result_file_path, process, _, _ = run_behavioral_tests(
+            test_paths=test_files_model,
+            test_framework=test_framework,
+            test_env=test_env,
+            cwd=project_root_path,
+            pytest_timeout=pytest_timeout,
+            verbose=verbose,
+        )
+
+        # Create test config for parsing results
+        test_config = TestConfig(
+            tests_root=project_root_path,
+            project_root_path=project_root_path,
+            test_framework=test_framework,
+            tests_project_rootdir=project_root_path,
+        )
+
+        # Parse test results
+        test_results = parse_test_xml(
+            test_xml_file_path=result_file_path,
+            test_files=test_files_model,
+            test_config=test_config,
+            run_result=process,
+        )
+
+        # Clean up result file
+        result_file_path.unlink(missing_ok=True)
+
+        # Build response
+        results_list = []
+        passed_count = 0
+        failed_count = 0
+
+        for result in test_results:
+            passed = result.did_pass
+            if passed:
+                passed_count += 1
+            else:
+                failed_count += 1
+
+            results_list.append(
+                {
+                    "test_id": result.id.id() if result.id else "",
+                    "test_file": str(result.file_name) if result.file_name else "",
+                    "test_function": result.id.test_function_name if result.id else None,
+                    "passed": passed,
+                    "runtime_ns": result.runtime,
+                    "timed_out": result.timed_out or False,
+                }
+            )
+
+        return {
+            "success": True,
+            "total_tests": len(test_results),
+            "passed_tests": passed_count,
+            "failed_tests": failed_count,
+            "results": results_list,
+            "stdout": process.stdout if process.stdout else "",
+            "stderr": process.stderr if process.stderr else "",
+            "error": None,
+        }
+
+    except Exception as e:
+        return {
+            "success": False,
+            "total_tests": 0,
+            "passed_tests": 0,
+            "failed_tests": 0,
+            "results": [],
+            "stdout": "",
+            "stderr": "",
+            "error": str(e),
+        }
+
+
+# Registry of available tools
+AVAILABLE_TOOLS = {
+    "run_behavioral_tests": {"schema": RUN_BEHAVIORAL_TESTS_TOOL_SCHEMA, "function": run_behavioral_tests_tool}
+}
+
+
+def get_tool_schema(tool_name: str) -> dict[str, Any] | None:
+    """Get the JSON schema for a tool by name.
+
+    Args:
+        tool_name: Name of the tool to get schema for
+
+    Returns:
+        JSON schema dict or None if tool not found
+
+    """
+    tool = AVAILABLE_TOOLS.get(tool_name)
+    return tool["schema"] if tool else None
+
+
+def get_all_tool_schemas() -> list[dict[str, Any]]:
+    """Get JSON schemas for all available tools.
+
+    Returns:
+        List of JSON schema dicts for all tools
+
+    """
+    return [tool["schema"] for tool in AVAILABLE_TOOLS.values()]
+
+
+def execute_tool(tool_name: str, **kwargs: Any) -> dict[str, Any]:  # noqa: ANN401
+    """Execute a tool by name with the given arguments.
+
+    Args:
+        tool_name: Name of the tool to execute
+        **kwargs: Arguments to pass to the tool function
+
+    Returns:
+        Tool execution result as a dictionary
+
+    Raises:
+        ValueError: If tool_name is not found
+
+    """
+    tool = AVAILABLE_TOOLS.get(tool_name)
+    if not tool:
+        msg = f"Unknown tool: {tool_name}"
+        raise ValueError(msg)
+    return tool["function"](**kwargs)
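
Taken together, RUN_BEHAVIORAL_TESTS_TOOL_SCHEMA, get_all_tool_schemas, and execute_tool are the pieces an LLM client needs for a function-calling loop: advertise the schemas to the model, then dispatch whatever tool call it returns. The dispatch step might look roughly like the sketch below, which assumes an OpenAI-style tool-call payload (a function name plus a JSON-encoded argument string); the handle_tool_call helper and the example paths are illustrative, not part of this patch.

    import json

    from codeflash.verification import execute_tool, get_all_tool_schemas

    # Advertise the tools to the model, e.g. as the `tools` argument of an
    # OpenAI-style chat completion request.
    tools = get_all_tool_schemas()


    def handle_tool_call(name: str, arguments: str) -> str:
        """Dispatch one model-issued tool call and serialize the result back to the model.

        `arguments` is assumed to be the JSON string an OpenAI-style API returns for a
        function call; adapt the unpacking for other providers.
        """
        kwargs = json.loads(arguments)
        result = execute_tool(name, **kwargs)  # raises ValueError for unknown tool names
        return json.dumps(result)


    # Hypothetical payload a model might emit after seeing the schema:
    reply = handle_tool_call(
        "run_behavioral_tests",
        json.dumps(
            {
                "test_files": [{"test_file_path": "/abs/path/to/test_example.py"}],
                "project_root": "/abs/path/to/project",
            }
        ),
    )
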
test_files=[{"test_file_path": str(test_file_path)}], + project_root=str(repo_root), + test_framework="pytest", + pytest_timeout=30, + ) + + assert result["success"] is True + assert result["total_tests"] >= 1 + assert result["passed_tests"] >= 1 + assert result["failed_tests"] == 0 + assert result["error"] is None + assert isinstance(result["results"], list) + + +def test_run_behavioral_tests_tool_failing_test(): + """Test running a failing test through the LLM tool.""" + test_code = """ +def test_failing(): + assert 1 == 2, "This test should fail" +""" + # Use repo root for project_root to avoid path resolution issues + repo_root = Path(__file__).resolve().parent.parent + + with tempfile.TemporaryDirectory(dir=repo_root) as temp_dir: + test_file_path = Path(temp_dir) / "test_failing.py" + test_file_path.write_text(test_code, encoding="utf-8") + + result = run_behavioral_tests_tool( + test_files=[{"test_file_path": str(test_file_path)}], + project_root=str(repo_root), + test_framework="pytest", + pytest_timeout=30, + ) + + assert result["success"] is True # The run completed, even if tests failed + assert result["failed_tests"] >= 1 + + +def test_run_behavioral_tests_tool_via_execute(): + """Test running tests through the execute_tool interface.""" + test_code = """ +def test_simple(): + assert True +""" + # Use repo root for project_root to avoid path resolution issues + repo_root = Path(__file__).resolve().parent.parent + + with tempfile.TemporaryDirectory(dir=repo_root) as temp_dir: + test_file_path = Path(temp_dir) / "test_simple.py" + test_file_path.write_text(test_code, encoding="utf-8") + + result = execute_tool( + "run_behavioral_tests", + test_files=[{"test_file_path": str(test_file_path)}], + project_root=str(repo_root), + ) + + assert result["success"] is True + assert result["error"] is None + + +def test_run_behavioral_tests_tool_invalid_path(): + """Test handling of invalid test file path.""" + # Use repo root for project_root + repo_root = Path(__file__).resolve().parent.parent + + result = run_behavioral_tests_tool( + test_files=[{"test_file_path": "/non/existent/test_file.py"}], + project_root=str(repo_root), + test_framework="pytest", + ) + + # Should complete but with no tests found + assert result["success"] is True + assert result["total_tests"] == 0 + + +def test_run_behavioral_tests_tool_with_test_type(): + """Test specifying test type.""" + test_code = """ +def test_with_type(): + assert True +""" + # Use repo root for project_root to avoid path resolution issues + repo_root = Path(__file__).resolve().parent.parent + + with tempfile.TemporaryDirectory(dir=repo_root) as temp_dir: + test_file_path = Path(temp_dir) / "test_typed.py" + test_file_path.write_text(test_code, encoding="utf-8") + + result = run_behavioral_tests_tool( + test_files=[ + { + "test_file_path": str(test_file_path), + "test_type": "existing_unit_test", + } + ], + project_root=str(repo_root), + ) + + assert result["success"] is True