From 464f6a6678b5179b2de7f81f940446e7dc7c7f86 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 18 Feb 2025 03:34:20 -0500 Subject: [PATCH 01/13] stdout comparison Update test_results.py Update parse_test_output.py Update equivalence.py --- codeflash/verification/equivalence.py | 9 ++++++++- codeflash/verification/parse_test_output.py | 17 ++++++++++++++++- codeflash/verification/test_results.py | 1 + pyproject.toml | 3 +-- 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py index 48b69e710..c3f19df02 100644 --- a/codeflash/verification/equivalence.py +++ b/codeflash/verification/equivalence.py @@ -1,6 +1,7 @@ +import difflib import sys -from codeflash.cli_cmds.console import logger +from codeflash.cli_cmds.console import console, logger from codeflash.verification.comparator import comparator from codeflash.verification.test_results import TestResults, TestType, VerificationType @@ -61,6 +62,12 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR cdd_test_result.return_value, ) break + if (original_test_result.stdout and cdd_test_result.stdout) and not comparator( + original_test_result.stdout, cdd_test_result.stdout + ): + are_equal = False + break + if original_test_result.test_type in [TestType.EXISTING_UNIT_TEST, TestType.CONCOLIC_COVERAGE_TEST] and ( cdd_test_result.did_pass != original_test_result.did_pass ): diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py index e69ee78d1..428b34767 100644 --- a/codeflash/verification/parse_test_output.py +++ b/codeflash/verification/parse_test_output.py @@ -42,6 +42,10 @@ def parse_func(file_path: Path) -> XMLParser: return parse(file_path, xml_parser) +matches_re = re.compile(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######!") +cleaner_re = re.compile(r"!######(.*?)######!") + + def parse_test_return_values_bin(file_location: Path, test_files: TestFiles, test_config: TestConfig) -> TestResults: test_results = TestResults() if not file_location.exists(): @@ -259,7 +263,13 @@ def parse_test_xml( message = testcase.result[0].message.lower() if "timed out" in message: timed_out = True - matches = re.findall(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######!", testcase.system_out or "") + + sys_stdout = testcase.system_out or "" + matches = matches_re.findall(sys_stdout) + + if sys_stdout: + sys_stdout = cleaner_re.sub("", sys_stdout) + if not matches or not len(matches): test_results.add( FunctionTestInvocation( @@ -278,6 +288,7 @@ def parse_test_xml( test_type=test_type, return_value=None, timed_out=timed_out, + stdout=sys_stdout, ) ) @@ -306,6 +317,7 @@ def parse_test_xml( test_type=test_type, return_value=None, timed_out=timed_out, + stdout=sys_stdout, ) ) @@ -393,6 +405,7 @@ def merge_test_results( verification_type=VerificationType(result_bin.verification_type) if result_bin.verification_type else None, + stdout=xml_result.stdout, ) ) elif xml_results.test_results[0].id.iteration_id is not None: @@ -422,6 +435,7 @@ def merge_test_results( verification_type=VerificationType(bin_result.verification_type) if bin_result.verification_type else None, + stdout=xml_result.stdout, ) ) else: @@ -448,6 +462,7 @@ def merge_test_results( verification_type=VerificationType(bin_result.verification_type) if bin_result.verification_type else None, + stdout=xml_result.stdout, ) ) diff --git a/codeflash/verification/test_results.py b/codeflash/verification/test_results.py index a4ecea816..c7a210a6a 100644 --- a/codeflash/verification/test_results.py +++ b/codeflash/verification/test_results.py @@ -93,6 +93,7 @@ class FunctionTestInvocation: return_value: Optional[object] # The return value of the function invocation timed_out: Optional[bool] verification_type: Optional[str] = VerificationType.FUNCTION_CALL + stdout: Optional[str] = None @property def unique_invocation_loop_id(self) -> str: diff --git a/pyproject.toml b/pyproject.toml index 1a5e63f8e..b4e5b324f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -178,8 +178,7 @@ ignore = [ "TD003", "TD004", "PLR2004", - "UP007", - "N802", # we use a lot of stdlib which follows this convention + "UP007" # remove once we drop 3.9 support. ] [tool.ruff.lint.flake8-type-checking] From 1f25df90615299c49c4a35c6439c188796194d4e Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 18 Feb 2025 05:02:57 -0500 Subject: [PATCH 02/13] strip --- codeflash/verification/parse_test_output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py index 428b34767..a5ca5637c 100644 --- a/codeflash/verification/parse_test_output.py +++ b/codeflash/verification/parse_test_output.py @@ -268,7 +268,7 @@ def parse_test_xml( matches = matches_re.findall(sys_stdout) if sys_stdout: - sys_stdout = cleaner_re.sub("", sys_stdout) + sys_stdout = cleaner_re.sub("", sys_stdout).strip() if not matches or not len(matches): test_results.add( From f40c388ef649fa02125f44b60bdc5251bc952489 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 18 Feb 2025 05:25:31 -0500 Subject: [PATCH 03/13] stdout comparison in E2E --- code_to_optimize/bubble_sort.py | 2 ++ .../end_to_end_test_bubblesort_pytest.py | 8 +++-- tests/scripts/end_to_end_test_utilities.py | 34 ++++++++++++++----- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/code_to_optimize/bubble_sort.py b/code_to_optimize/bubble_sort.py index b18994494..787cc4a90 100644 --- a/code_to_optimize/bubble_sort.py +++ b/code_to_optimize/bubble_sort.py @@ -1,8 +1,10 @@ def sorter(arr): + print("codeflash stdout: Sorting list") for i in range(len(arr)): for j in range(len(arr) - 1): if arr[j] > arr[j + 1]: temp = arr[j] arr[j] = arr[j + 1] arr[j + 1] = temp + print(f"result: {arr}") return arr \ No newline at end of file diff --git a/tests/scripts/end_to_end_test_bubblesort_pytest.py b/tests/scripts/end_to_end_test_bubblesort_pytest.py index 08fe3117f..d714703aa 100644 --- a/tests/scripts/end_to_end_test_bubblesort_pytest.py +++ b/tests/scripts/end_to_end_test_bubblesort_pytest.py @@ -11,11 +11,15 @@ def run_test(expected_improvement_pct: int) -> bool: test_framework="pytest", min_improvement_x=1.0, coverage_expectations=[ - CoverageExpectation(function_name="sorter", expected_coverage=100.0, expected_lines=[2, 3, 4, 5, 6, 7, 8]) + CoverageExpectation( + function_name="sorter", expected_coverage=100.0, expected_lines=[2, 3, 4, 5, 6, 7, 8, 9, 10] + ) ], ) cwd = (pathlib.Path(__file__).parent.parent.parent / "code_to_optimize").resolve() - return run_codeflash_command(cwd, config, expected_improvement_pct) + return run_codeflash_command( + cwd, config, expected_improvement_pct, ['print("codeflash stdout: Sorting list")', 'print(f"result: {arr}")'] + ) if __name__ == "__main__": diff --git a/tests/scripts/end_to_end_test_utilities.py b/tests/scripts/end_to_end_test_utilities.py index 891ed29f4..23a67a84a 100644 --- a/tests/scripts/end_to_end_test_utilities.py +++ b/tests/scripts/end_to_end_test_utilities.py @@ -63,19 +63,21 @@ def validate_coverage(stdout: str, expectations: list[CoverageExpectation]) -> b assert coverage_match, f"Failed to find coverage data for {expect.function_name}" coverage = float(coverage_match.group(1)) - assert ( - coverage == expect.expected_coverage - ), f"Coverage was {coverage} instead of {expect.expected_coverage} for function: {expect.function_name}" + assert coverage == expect.expected_coverage, ( + f"Coverage was {coverage} instead of {expect.expected_coverage} for function: {expect.function_name}" + ) executed_lines = list(map(int, coverage_match.group(2).split(", "))) - assert ( - executed_lines == expect.expected_lines - ), f"Executed lines were {executed_lines} instead of {expect.expected_lines} for function: {expect.function_name}" + assert executed_lines == expect.expected_lines, ( + f"Executed lines were {executed_lines} instead of {expect.expected_lines} for function: {expect.function_name}" + ) return True -def run_codeflash_command(cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int) -> bool: +def run_codeflash_command( + cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int, expected_in_stdout: list[str] = None +) -> bool: logging.basicConfig(level=logging.INFO) if config.trace_mode: return run_trace_test(cwd, config, expected_improvement_pct) @@ -97,12 +99,21 @@ def run_codeflash_command(cwd: pathlib.Path, config: TestConfig, expected_improv return_code = process.wait() stdout = "".join(output) - if not validate_output(stdout, return_code, expected_improvement_pct, config): + validated = validate_output(stdout, return_code, expected_improvement_pct, config) + if not validated: # Write original file contents back to file path_to_file.write_text(file_contents, "utf-8") logging.info("Codeflash run did not meet expected requirements for testing, reverting file changes.") return False - return True + + if expected_in_stdout: + stdout_validated = validate_stdout_in_candidate(stdout, expected_in_stdout) + if not stdout_validated: + logging.error("Failed to find expected output in candidate output") + validated = False + logging.info(f"Success: Expected output found in candidate output") + + return validated def build_command(cwd: pathlib.Path, config: TestConfig, test_root: pathlib.Path) -> list[str]: @@ -164,6 +175,11 @@ def validate_output(stdout: str, return_code: int, expected_improvement_pct: int return True +def validate_stdout_in_candidate(stdout: str, expected_in_stdout: list[str]) -> bool: + candidate_output = stdout[stdout.find("INFO Best candidate") : stdout.find("Best Candidate Explanation")] + return all(expected in candidate_output for expected in expected_in_stdout) + + def run_trace_test(cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int) -> bool: # First command: Run the tracer test_root = cwd / "tests" / (config.test_framework or "") From f164fd243190defd04e30d8a5d5924816d7f2dba Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Thu, 20 Feb 2025 19:41:17 -0500 Subject: [PATCH 04/13] add test in instrumentation --- code_to_optimize/bubble_sort_method.py | 1 + tests/test_instrument_all_and_run.py | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/code_to_optimize/bubble_sort_method.py b/code_to_optimize/bubble_sort_method.py index 3928e41fb..c95a241d8 100644 --- a/code_to_optimize/bubble_sort_method.py +++ b/code_to_optimize/bubble_sort_method.py @@ -3,6 +3,7 @@ def __init__(self, x=0): self.x = x def sorter(self, arr): + print("codeflash stdout : BubbleSorter.sorter() called") for i in range(len(arr)): for j in range(len(arr) - 1): if arr[j] > arr[j + 1]: diff --git a/tests/test_instrument_all_and_run.py b/tests/test_instrument_all_and_run.py index 643d4bde7..cb5674ed6 100644 --- a/tests/test_instrument_all_and_run.py +++ b/tests/test_instrument_all_and_run.py @@ -168,6 +168,8 @@ def test_sort(): pytest_max_loops=1, testing_time=0.1, ) + assert "codeflash stdout: Sorting list" in test_results[0].stdout + assert "result: [0, 1, 2, 3, 4, 5]" in test_results[0].stdout assert test_results[0].id.function_getting_tested == "sorter" assert test_results[0].id.iteration_id == "1_0" assert test_results[0].id.test_class_name is None @@ -179,7 +181,8 @@ def test_sort(): assert test_results[0].runtime > 0 assert test_results[0].did_pass assert test_results[0].return_value == ([0, 1, 2, 3, 4, 5],) - + assert "codeflash stdout: Sorting list" in test_results[1].stdout + assert "result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]" in test_results[1].stdout assert test_results[1].id.function_getting_tested == "sorter" assert test_results[1].id.iteration_id == "4_0" assert test_results[1].id.test_class_name is None @@ -340,13 +343,11 @@ def test_sort(): pytest_max_loops=1, testing_time=0.1, ) - assert len(test_results) == 4 assert test_results[0].id.function_getting_tested == "BubbleSorter.__init__" assert test_results[0].id.test_function_name == "test_sort" assert test_results[0].did_pass assert test_results[0].return_value[0] == {"x": 0} - assert test_results[1].id.function_getting_tested == "BubbleSorter.sorter" assert test_results[1].id.iteration_id == "2_0" assert test_results[1].id.test_class_name is None @@ -358,7 +359,8 @@ def test_sort(): assert test_results[1].runtime > 0 assert test_results[1].did_pass assert test_results[1].return_value == ([0, 1, 2, 3, 4, 5],) - + assert "codeflash stdout : BubbleSorter.sorter() called" in test_results[1].stdout + assert compare_test_results(test_results, test_results) assert test_results[2].id.function_getting_tested == "BubbleSorter.__init__" assert test_results[2].id.test_function_name == "test_sort" assert test_results[2].did_pass From d0710ce6408942db5c9d85b45916594b36b1f531 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 18 Feb 2025 03:34:20 -0500 Subject: [PATCH 05/13] stdout comparison Update test_results.py Update parse_test_output.py Update equivalence.py --- codeflash/verification/parse_test_output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py index a5ca5637c..428b34767 100644 --- a/codeflash/verification/parse_test_output.py +++ b/codeflash/verification/parse_test_output.py @@ -268,7 +268,7 @@ def parse_test_xml( matches = matches_re.findall(sys_stdout) if sys_stdout: - sys_stdout = cleaner_re.sub("", sys_stdout).strip() + sys_stdout = cleaner_re.sub("", sys_stdout) if not matches or not len(matches): test_results.add( From 944e9749fd068607e8116d8c5e2d36793d6694b7 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 18 Feb 2025 05:02:57 -0500 Subject: [PATCH 06/13] strip --- codeflash/verification/parse_test_output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py index 428b34767..a5ca5637c 100644 --- a/codeflash/verification/parse_test_output.py +++ b/codeflash/verification/parse_test_output.py @@ -268,7 +268,7 @@ def parse_test_xml( matches = matches_re.findall(sys_stdout) if sys_stdout: - sys_stdout = cleaner_re.sub("", sys_stdout) + sys_stdout = cleaner_re.sub("", sys_stdout).strip() if not matches or not len(matches): test_results.add( From 3147c4a927f98291ecc02f5769fc84bfb7f8d59c Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Thu, 20 Feb 2025 21:02:26 -0500 Subject: [PATCH 07/13] update aiservice test --- tests/test_instrumentation_run_results_aiservice.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_instrumentation_run_results_aiservice.py b/tests/test_instrumentation_run_results_aiservice.py index a35556dfd..1224b63d8 100644 --- a/tests/test_instrumentation_run_results_aiservice.py +++ b/tests/test_instrumentation_run_results_aiservice.py @@ -193,6 +193,7 @@ def __init__(self, x=1): self.x = x def sorter(self, arr): + print("codeflash stdout : BubbleSorter.sorter() called") for i in range(len(arr)): for j in range(len(arr) - 1): if arr[j] > arr[j + 1]: @@ -337,6 +338,7 @@ def __init__(self, x=1): self.x = x def sorter(self, arr): + print("codeflash stdout : BubbleSorter.sorter() called") for i in range(len(arr)): for j in range(len(arr) - 1): if arr[j] > arr[j + 1]: @@ -392,6 +394,7 @@ def __init__(self, x=0): self.y = 2 def sorter(self, arr): + print("codeflash stdout : BubbleSorter.sorter() called") for i in range(len(arr)): for j in range(len(arr) - 1): if arr[j] > arr[j + 1]: From 1a4f60e831a501c4f0e8e868d7a755915e83fdb4 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Thu, 20 Feb 2025 21:13:13 -0500 Subject: [PATCH 08/13] Update test_codeflash_capture.py --- tests/test_codeflash_capture.py | 47 ++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/tests/test_codeflash_capture.py b/tests/test_codeflash_capture.py index 83b1efd2b..8e5e237cf 100644 --- a/tests/test_codeflash_capture.py +++ b/tests/test_codeflash_capture.py @@ -485,7 +485,6 @@ def __init__(self, x=2): assert test_results[1].id.test_module_path == "code_to_optimize.tests.pytest.test_codeflash_capture_temp" assert test_results[1].id.function_getting_tested == "some_function" assert test_results[1].id.iteration_id == "11_0" - assert test_results[2].did_pass assert test_results[2].return_value[0]["x"] == 2 assert test_results[2].id.test_function_name == "test_example_test_3" @@ -494,6 +493,17 @@ def __init__(self, x=2): assert test_results[2].id.function_getting_tested == "some_function" assert test_results[2].id.iteration_id == "16_0" + test_results2, _ = func_optimizer.run_and_parse_tests( + testing_type=TestingMode.BEHAVIOR, + test_env=test_env, + test_files=func_optimizer.test_files, + optimization_iteration=0, + pytest_min_loops=1, + pytest_max_loops=1, + testing_time=0.1, + ) + assert compare_test_results(test_results, test_results2) + finally: test_path.unlink(missing_ok=True) sample_code_path.unlink(missing_ok=True) @@ -605,6 +615,18 @@ def __init__(self, *args, **kwargs): assert test_results[2].id.function_getting_tested == "some_function" assert test_results[2].id.iteration_id == "16_0" + results2, _ = func_optimizer.run_and_parse_tests( + testing_type=TestingMode.BEHAVIOR, + test_env=test_env, + test_files=func_optimizer.test_files, + optimization_iteration=0, + pytest_min_loops=1, + pytest_max_loops=1, + testing_time=0.1, + ) + + assert compare_test_results(test_results, results2) + finally: test_path.unlink(missing_ok=True) sample_code_path.unlink(missing_ok=True) @@ -720,6 +742,17 @@ def __init__(self, x=2): assert test_results[2].id.function_getting_tested == "some_function" assert test_results[2].id.iteration_id == "12_2" # Third call + test_results2, _ = func_optimizer.run_and_parse_tests( + testing_type=TestingMode.BEHAVIOR, + test_env=test_env, + test_files=func_optimizer.test_files, + optimization_iteration=0, + pytest_min_loops=1, + pytest_max_loops=1, + testing_time=0.1, + ) + + assert compare_test_results(test_results, test_results2) finally: test_path.unlink(missing_ok=True) sample_code_path.unlink(missing_ok=True) @@ -856,6 +889,18 @@ def another_helper(self): assert test_results[3].id.function_getting_tested == "AnotherHelperClass.__init__" assert test_results[3].verification_type == VerificationType.INIT_STATE_HELPER + results2, _ = func_optimizer.run_and_parse_tests( + testing_type=TestingMode.BEHAVIOR, + test_env=test_env, + test_files=func_optimizer.test_files, + optimization_iteration=0, + pytest_min_loops=1, + pytest_max_loops=1, + testing_time=0.1, + ) + + assert compare_test_results(test_results, results2) + finally: test_path.unlink(missing_ok=True) fto_file_path.unlink(missing_ok=True) From ed970eaff9f259746273e59f08e8a2519dd88f80 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Thu, 20 Feb 2025 21:23:16 -0500 Subject: [PATCH 09/13] add more unit tests --- tests/test_instrument_all_and_run.py | 24 ++++++++++++++++++++++++ tests/test_instrument_tests.py | 8 +++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/tests/test_instrument_all_and_run.py b/tests/test_instrument_all_and_run.py index cb5674ed6..d5e9f689a 100644 --- a/tests/test_instrument_all_and_run.py +++ b/tests/test_instrument_all_and_run.py @@ -193,6 +193,18 @@ def test_sort(): ) assert test_results[1].runtime > 0 assert test_results[1].did_pass + results2, _ = func_optimizer.run_and_parse_tests( + testing_type=TestingMode.BEHAVIOR, + test_env=test_env, + test_files=func_optimizer.test_files, + optimization_iteration=0, + pytest_min_loops=1, + pytest_max_loops=1, + testing_time=0.1, + ) + assert "codeflash stdout: Sorting list" in results2[0].stdout + assert "result: [0, 1, 2, 3, 4, 5]" in results2[0].stdout + assert compare_test_results(test_results, results2) finally: fto_path.write_text(original_code, "utf-8") test_path.unlink(missing_ok=True) @@ -377,6 +389,18 @@ def test_sort(): assert test_results[3].runtime > 0 assert test_results[3].did_pass + results2, _ = func_optimizer.run_and_parse_tests( + testing_type=TestingMode.BEHAVIOR, + test_env=test_env, + test_files=func_optimizer.test_files, + optimization_iteration=0, + pytest_min_loops=1, + pytest_max_loops=1, + testing_time=0.1, + ) + + assert compare_test_results(test_results, results2) + # Replace with optimized code that mutated instance attribute optimized_code = """ class BubbleSorter: diff --git a/tests/test_instrument_tests.py b/tests/test_instrument_tests.py index bf7373522..cfc0a8842 100644 --- a/tests/test_instrument_tests.py +++ b/tests/test_instrument_tests.py @@ -482,6 +482,8 @@ def test_sort(): ) assert test_results_perf[1].runtime > 0 assert test_results_perf[1].did_pass + assert "codeflash stdout: Sorting list" in test_results_perf[1].stdout + assert "result: [0, 1, 2, 3, 4, 5]" in test_results_perf[1].stdout finally: test_path.unlink(missing_ok=True) test_path_perf.unlink(missing_ok=True) @@ -693,6 +695,9 @@ def test_sort_parametrized(input, expected_output): assert test_results_perf[1].runtime > 0 assert test_results_perf[1].did_pass + assert "codeflash stdout: Sorting list" in test_results_perf[1].stdout + assert "result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]" in test_results_perf[1].stdout + assert test_results_perf[2].id.function_getting_tested == "sorter" assert test_results_perf[2].id.iteration_id == "0_2" assert test_results_perf[2].id.test_class_name is None @@ -1230,7 +1235,8 @@ def test_sort(): assert test_results[0].runtime > 0 assert test_results[0].did_pass assert test_results[0].return_value is None - + assert "codeflash stdout: Sorting list" in test_results[0].stdout + assert "result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]" in test_results[0].stdout assert test_results[1].id.function_getting_tested == "sorter" assert test_results[1].id.iteration_id == "2_2_1" assert test_results[1].id.test_class_name is None From f3972894547191da61d43f993415f3a685a18054 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Fri, 21 Feb 2025 00:18:37 -0800 Subject: [PATCH 10/13] address review. --- code_to_optimize/bubble_sort_method.py | 4 ++++ tests/test_instrument_all_and_run.py | 17 +++++++++++++---- tests/test_instrument_tests.py | 12 ++++++++++-- ...est_instrumentation_run_results_aiservice.py | 12 ++++++++++++ 4 files changed, 39 insertions(+), 6 deletions(-) diff --git a/code_to_optimize/bubble_sort_method.py b/code_to_optimize/bubble_sort_method.py index c95a241d8..962fde339 100644 --- a/code_to_optimize/bubble_sort_method.py +++ b/code_to_optimize/bubble_sort_method.py @@ -1,3 +1,6 @@ +import sys + + class BubbleSorter: def __init__(self, x=0): self.x = x @@ -10,4 +13,5 @@ def sorter(self, arr): temp = arr[j] arr[j] = arr[j + 1] arr[j + 1] = temp + print("stderr test", file=sys.stderr) return arr diff --git a/tests/test_instrument_all_and_run.py b/tests/test_instrument_all_and_run.py index d5e9f689a..bfad32f91 100644 --- a/tests/test_instrument_all_and_run.py +++ b/tests/test_instrument_all_and_run.py @@ -168,8 +168,17 @@ def test_sort(): pytest_max_loops=1, testing_time=0.1, ) - assert "codeflash stdout: Sorting list" in test_results[0].stdout - assert "result: [0, 1, 2, 3, 4, 5]" in test_results[0].stdout + + out_str = """--------------------------------- Captured Log --------------------------------- + +--------------------------------- Captured Out --------------------------------- + +codeflash stdout: Sorting list +result: [0, 1, 2, 3, 4, 5] + +codeflash stdout: Sorting list +result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]""" + assert out_str == test_results[0].stdout.strip() assert test_results[0].id.function_getting_tested == "sorter" assert test_results[0].id.iteration_id == "1_0" assert test_results[0].id.test_class_name is None @@ -181,8 +190,8 @@ def test_sort(): assert test_results[0].runtime > 0 assert test_results[0].did_pass assert test_results[0].return_value == ([0, 1, 2, 3, 4, 5],) - assert "codeflash stdout: Sorting list" in test_results[1].stdout - assert "result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]" in test_results[1].stdout + assert out_str == test_results[1].stdout.strip() + assert test_results[1].id.function_getting_tested == "sorter" assert test_results[1].id.iteration_id == "4_0" assert test_results[1].id.test_class_name is None diff --git a/tests/test_instrument_tests.py b/tests/test_instrument_tests.py index cfc0a8842..282c9f1d5 100644 --- a/tests/test_instrument_tests.py +++ b/tests/test_instrument_tests.py @@ -482,8 +482,16 @@ def test_sort(): ) assert test_results_perf[1].runtime > 0 assert test_results_perf[1].did_pass - assert "codeflash stdout: Sorting list" in test_results_perf[1].stdout - assert "result: [0, 1, 2, 3, 4, 5]" in test_results_perf[1].stdout + out_str = """--------------------------------- Captured Log --------------------------------- + +--------------------------------- Captured Out --------------------------------- +codeflash stdout: Sorting list +result: [0, 1, 2, 3, 4, 5] + +codeflash stdout: Sorting list +result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]""" + assert out_str == test_results_perf[1].stdout + finally: test_path.unlink(missing_ok=True) test_path_perf.unlink(missing_ok=True) diff --git a/tests/test_instrumentation_run_results_aiservice.py b/tests/test_instrumentation_run_results_aiservice.py index 1224b63d8..670b23fb2 100644 --- a/tests/test_instrumentation_run_results_aiservice.py +++ b/tests/test_instrumentation_run_results_aiservice.py @@ -187,6 +187,9 @@ def test_single_element_list(): # Replace with optimized code that mutated instance attribute optimized_code_mutated_attr = """ +import sys + + class BubbleSorter: def __init__(self, x=1): @@ -200,6 +203,7 @@ def sorter(self, arr): temp = arr[j] arr[j] = arr[j + 1] arr[j + 1] = temp + print("stderr test", file=sys.stderr) return arr """ fto_path.write_text(optimized_code_mutated_attr, "utf-8") @@ -332,6 +336,9 @@ def test_single_element_list(): assert test_results[1].return_value[2] == [1, 2, 3] # Replace with optimized code that mutated instance attribute optimized_code_mutated_attr = """ +import sys + + class BubbleSorter: def __init__(self, x=1): @@ -345,6 +352,7 @@ def sorter(self, arr): temp = arr[j] arr[j] = arr[j + 1] arr[j + 1] = temp + print("stderr test", file=sys.stderr) return arr """ fto_path.write_text(optimized_code_mutated_attr, "utf-8") @@ -388,6 +396,9 @@ def sorter(self, arr): ) # The test should fail because the instance attribute was mutated # Replace with optimized code that did not mutate existing instance attribute, but added a new one optimized_code_new_attr = """ +import sys + + class BubbleSorter: def __init__(self, x=0): self.x = x @@ -401,6 +412,7 @@ def sorter(self, arr): temp = arr[j] arr[j] = arr[j + 1] arr[j + 1] = temp + print("stderr test", file=sys.stderr) return arr """ fto_path.write_text(optimized_code_new_attr, "utf-8") From 11e43ea6bb86506bd9f66fa8ccd5a247f309f86d Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Mon, 24 Feb 2025 15:34:39 -0800 Subject: [PATCH 11/13] strip out captured log part --- codeflash/verification/parse_test_output.py | 3 ++- tests/test_instrument_all_and_run.py | 6 +----- tests/test_instrument_tests.py | 5 +---- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py index a5ca5637c..fa565b5b9 100644 --- a/codeflash/verification/parse_test_output.py +++ b/codeflash/verification/parse_test_output.py @@ -43,7 +43,8 @@ def parse_func(file_path: Path) -> XMLParser: matches_re = re.compile(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######!") -cleaner_re = re.compile(r"!######(.*?)######!") +cleaner_re = re.compile(r"!######.*?######!|-+\s*Captured\s+(Log|Out)\s*-+\n?") + def parse_test_return_values_bin(file_location: Path, test_files: TestFiles, test_config: TestConfig) -> TestResults: diff --git a/tests/test_instrument_all_and_run.py b/tests/test_instrument_all_and_run.py index bfad32f91..ea0ab0df9 100644 --- a/tests/test_instrument_all_and_run.py +++ b/tests/test_instrument_all_and_run.py @@ -169,11 +169,7 @@ def test_sort(): testing_time=0.1, ) - out_str = """--------------------------------- Captured Log --------------------------------- - ---------------------------------- Captured Out --------------------------------- - -codeflash stdout: Sorting list + out_str = """codeflash stdout: Sorting list result: [0, 1, 2, 3, 4, 5] codeflash stdout: Sorting list diff --git a/tests/test_instrument_tests.py b/tests/test_instrument_tests.py index 282c9f1d5..c38c8c45a 100644 --- a/tests/test_instrument_tests.py +++ b/tests/test_instrument_tests.py @@ -482,10 +482,7 @@ def test_sort(): ) assert test_results_perf[1].runtime > 0 assert test_results_perf[1].did_pass - out_str = """--------------------------------- Captured Log --------------------------------- - ---------------------------------- Captured Out --------------------------------- -codeflash stdout: Sorting list + out_str = """codeflash stdout: Sorting list result: [0, 1, 2, 3, 4, 5] codeflash stdout: Sorting list From bb1bbf9f99c1e38116b0dc614c498b12727b3445 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Thu, 27 Feb 2025 11:35:45 -0800 Subject: [PATCH 12/13] missed a few direct comparison for stdout --- tests/test_instrument_all_and_run.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/test_instrument_all_and_run.py b/tests/test_instrument_all_and_run.py index ea0ab0df9..99ff5047c 100644 --- a/tests/test_instrument_all_and_run.py +++ b/tests/test_instrument_all_and_run.py @@ -207,8 +207,12 @@ def test_sort(): pytest_max_loops=1, testing_time=0.1, ) - assert "codeflash stdout: Sorting list" in results2[0].stdout - assert "result: [0, 1, 2, 3, 4, 5]" in results2[0].stdout + out_str = """codeflash stdout: Sorting list +result: [0, 1, 2, 3, 4, 5] + +codeflash stdout: Sorting list +result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]""" + assert out_str == results2[0].stdout.strip() assert compare_test_results(test_results, results2) finally: fto_path.write_text(original_code, "utf-8") @@ -376,7 +380,12 @@ def test_sort(): assert test_results[1].runtime > 0 assert test_results[1].did_pass assert test_results[1].return_value == ([0, 1, 2, 3, 4, 5],) - assert "codeflash stdout : BubbleSorter.sorter() called" in test_results[1].stdout + out_str = """ +codeflash stdout : BubbleSorter.sorter() called + + +codeflash stdout : BubbleSorter.sorter() called""" + assert test_results[1].stdout.strip() == out_str.strip() assert compare_test_results(test_results, test_results) assert test_results[2].id.function_getting_tested == "BubbleSorter.__init__" assert test_results[2].id.test_function_name == "test_sort" From dcf9384dc03be361e15088140fea5f6b95cbaf96 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Thu, 27 Feb 2025 22:56:58 -0800 Subject: [PATCH 13/13] adress review Update pyproject.toml Update test_instrument_tests.py Update test_instrument_all_and_run.py missing test --- pyproject.toml | 1 - tests/test_instrument_all_and_run.py | 10 +++------- tests/test_instrument_tests.py | 16 ++++++++++++---- ...test_instrumentation_run_results_aiservice.py | 5 +++++ 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b4e5b324f..2e71f2a0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -119,7 +119,6 @@ types-gevent = "^24.11.0.20241230" types-greenlet = "^3.1.0.20241221" types-pexpect = "^4.9.0.20241208" types-unidiff = "^0.7.0.20240505" -sqlalchemy = "^2.0.38" uv = ">=0.6.2" [tool.poetry.build] diff --git a/tests/test_instrument_all_and_run.py b/tests/test_instrument_all_and_run.py index 99ff5047c..ce06c855a 100644 --- a/tests/test_instrument_all_and_run.py +++ b/tests/test_instrument_all_and_run.py @@ -174,7 +174,7 @@ def test_sort(): codeflash stdout: Sorting list result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]""" - assert out_str == test_results[0].stdout.strip() + assert out_str == test_results[0].stdout assert test_results[0].id.function_getting_tested == "sorter" assert test_results[0].id.iteration_id == "1_0" assert test_results[0].id.test_class_name is None @@ -380,12 +380,8 @@ def test_sort(): assert test_results[1].runtime > 0 assert test_results[1].did_pass assert test_results[1].return_value == ([0, 1, 2, 3, 4, 5],) - out_str = """ -codeflash stdout : BubbleSorter.sorter() called - - -codeflash stdout : BubbleSorter.sorter() called""" - assert test_results[1].stdout.strip() == out_str.strip() + out_str = """codeflash stdout : BubbleSorter.sorter() called\n\n\ncodeflash stdout : BubbleSorter.sorter() called""" + assert test_results[1].stdout == out_str assert compare_test_results(test_results, test_results) assert test_results[2].id.function_getting_tested == "BubbleSorter.__init__" assert test_results[2].id.test_function_name == "test_sort" diff --git a/tests/test_instrument_tests.py b/tests/test_instrument_tests.py index c38c8c45a..79f4bc5dd 100644 --- a/tests/test_instrument_tests.py +++ b/tests/test_instrument_tests.py @@ -700,8 +700,9 @@ def test_sort_parametrized(input, expected_output): assert test_results_perf[1].runtime > 0 assert test_results_perf[1].did_pass - assert "codeflash stdout: Sorting list" in test_results_perf[1].stdout - assert "result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]" in test_results_perf[1].stdout + out_str = """codeflash stdout: Sorting list +result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]""" + assert out_str == test_results_perf[1].stdout assert test_results_perf[2].id.function_getting_tested == "sorter" assert test_results_perf[2].id.iteration_id == "0_2" @@ -1240,8 +1241,15 @@ def test_sort(): assert test_results[0].runtime > 0 assert test_results[0].did_pass assert test_results[0].return_value is None - assert "codeflash stdout: Sorting list" in test_results[0].stdout - assert "result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]" in test_results[0].stdout + out_str = """codeflash stdout: Sorting list +result: [0, 1, 2, 3, 4, 5] + +codeflash stdout: Sorting list +result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + +codeflash stdout: Sorting list +result: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]""" + assert test_results[1].stdout == out_str assert test_results[1].id.function_getting_tested == "sorter" assert test_results[1].id.iteration_id == "2_2_1" assert test_results[1].id.test_class_name is None diff --git a/tests/test_instrumentation_run_results_aiservice.py b/tests/test_instrumentation_run_results_aiservice.py index 670b23fb2..888c629ef 100644 --- a/tests/test_instrumentation_run_results_aiservice.py +++ b/tests/test_instrumentation_run_results_aiservice.py @@ -179,6 +179,7 @@ def test_single_element_list(): testing_time=0.1, ) assert test_results[0].id.function_getting_tested == "sorter" + assert test_results[0].stdout == "codeflash stdout : BubbleSorter.sorter() called" assert test_results[0].id.test_function_name == "test_single_element_list" assert test_results[0].did_pass assert test_results[0].return_value[1]["arr"] == [42] @@ -222,6 +223,7 @@ def sorter(self, arr): test_results, test_results_mutated_attr ) # Without codeflash capture, the init state was not verified, and the results are verified as correct even with the attribute mutated + assert test_results_mutated_attr[0].stdout == "codeflash stdout : BubbleSorter.sorter() called" finally: fto_path.write_text(original_code, "utf-8") test_path.unlink(missing_ok=True) @@ -322,6 +324,7 @@ def test_single_element_list(): assert test_results[0].id.test_function_name == "test_single_element_list" assert test_results[0].did_pass assert test_results[0].return_value[0] == {"x": 0} + assert test_results[0].stdout == "codeflash stdout : BubbleSorter.sorter() called" # Verify function_to_optimize result assert test_results[1].id.function_getting_tested == "sorter" @@ -391,6 +394,7 @@ def sorter(self, arr): assert test_results_mutated_attr[0].id.function_getting_tested == "BubbleSorter.__init__" assert test_results_mutated_attr[0].return_value[0] == {"x": 1} assert test_results_mutated_attr[0].verification_type == VerificationType.INIT_STATE_FTO + assert test_results_mutated_attr[0].stdout == "codeflash stdout : BubbleSorter.sorter() called" assert not compare_test_results( test_results, test_results_mutated_attr ) # The test should fail because the instance attribute was mutated @@ -442,6 +446,7 @@ def sorter(self, arr): assert test_results_new_attr[0].id.function_getting_tested == "BubbleSorter.__init__" assert test_results_new_attr[0].return_value[0] == {"x": 0, "y": 2} assert test_results_new_attr[0].verification_type == VerificationType.INIT_STATE_FTO + assert test_results_new_attr[0].stdout == "codeflash stdout : BubbleSorter.sorter() called" # assert test_results_new_attr[1].return_value[1]["self"].x == 0 TODO: add self as input # assert test_results_new_attr[1].return_value[1]["self"].y == 2 TODO: add self as input assert compare_test_results(