From 464f6a6678b5179b2de7f81f940446e7dc7c7f86 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Tue, 18 Feb 2025 03:34:20 -0500
Subject: [PATCH 01/13] stdout comparison

Update test_results.py

Update parse_test_output.py

Update equivalence.py
---
 codeflash/verification/equivalence.py       |  9 ++++++++-
 codeflash/verification/parse_test_output.py | 17 ++++++++++++++++-
 codeflash/verification/test_results.py      |  1 +
 pyproject.toml                              |  3 +--
 4 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py
index 48b69e710..c3f19df02 100644
--- a/codeflash/verification/equivalence.py
+++ b/codeflash/verification/equivalence.py
@@ -1,6 +1,7 @@
+import difflib
 import sys
 
-from codeflash.cli_cmds.console import logger
+from codeflash.cli_cmds.console import console, logger
 from codeflash.verification.comparator import comparator
 from codeflash.verification.test_results import TestResults, TestType, VerificationType
 
@@ -61,6 +62,12 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR
                 cdd_test_result.return_value,
             )
             break
+        if (original_test_result.stdout and cdd_test_result.stdout) and not comparator(
+            original_test_result.stdout, cdd_test_result.stdout
+        ):
+            are_equal = False
+            break
+
         if original_test_result.test_type in [TestType.EXISTING_UNIT_TEST, TestType.CONCOLIC_COVERAGE_TEST] and (
             cdd_test_result.did_pass != original_test_result.did_pass
         ):
diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py
index e69ee78d1..428b34767 100644
--- a/codeflash/verification/parse_test_output.py
+++ b/codeflash/verification/parse_test_output.py
@@ -42,6 +42,10 @@ def parse_func(file_path: Path) -> XMLParser:
     return parse(file_path, xml_parser)
 
 
+matches_re = re.compile(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######!")
+cleaner_re = re.compile(r"!######(.*?)######!")
+
+
 def parse_test_return_values_bin(file_location: Path, test_files: TestFiles, test_config: TestConfig) -> TestResults:
     test_results = TestResults()
     if not file_location.exists():
@@ -259,7 +263,13 @@ def parse_test_xml(
                     message = testcase.result[0].message.lower()
                     if "timed out" in message:
                         timed_out = True
-            matches = re.findall(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######!", testcase.system_out or "")
+
+            sys_stdout = testcase.system_out or ""
+            matches = matches_re.findall(sys_stdout)
+
+            if sys_stdout:
+                sys_stdout = cleaner_re.sub("", sys_stdout)
+
             if not matches or not len(matches):
                 test_results.add(
                     FunctionTestInvocation(
@@ -278,6 +288,7 @@ def parse_test_xml(
                         test_type=test_type,
                         return_value=None,
                         timed_out=timed_out,
+                        stdout=sys_stdout,
                     )
                 )
 
@@ -306,6 +317,7 @@ def parse_test_xml(
                             test_type=test_type,
                             return_value=None,
                             timed_out=timed_out,
+                            stdout=sys_stdout,
                         )
                     )
 
@@ -393,6 +405,7 @@ def merge_test_results(
                         verification_type=VerificationType(result_bin.verification_type)
                         if result_bin.verification_type
                         else None,
+                        stdout=xml_result.stdout,
                     )
                 )
         elif xml_results.test_results[0].id.iteration_id is not None:
@@ -422,6 +435,7 @@ def merge_test_results(
                         verification_type=VerificationType(bin_result.verification_type)
                         if bin_result.verification_type
                         else None,
+                        stdout=xml_result.stdout,
                     )
                 )
         else:
@@ -448,6 +462,7 @@ def merge_test_results(
                         verification_type=VerificationType(bin_result.verification_type)
                         if bin_result.verification_type
                         else None,
+                        stdout=xml_result.stdout,
                     )
                 )
 
diff --git a/codeflash/verification/test_results.py b/codeflash/verification/test_results.py
index a4ecea816..c7a210a6a 100644
--- a/codeflash/verification/test_results.py
+++ b/codeflash/verification/test_results.py
@@ -93,6 +93,7 @@ class FunctionTestInvocation:
     return_value: Optional[object]  # The return value of the function invocation
     timed_out: Optional[bool]
     verification_type: Optional[str] = VerificationType.FUNCTION_CALL
+    stdout: Optional[str] = None
 
     @property
     def unique_invocation_loop_id(self) -> str:
diff --git a/pyproject.toml b/pyproject.toml
index 1a5e63f8e..b4e5b324f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -178,8 +178,7 @@ ignore = [
     "TD003",
     "TD004",
     "PLR2004",
-    "UP007",
-    "N802", # we use a lot of stdlib which follows this convention
+    "UP007" # remove once we drop 3.9 support.
 ]
 
 [tool.ruff.lint.flake8-type-checking]

From 1f25df90615299c49c4a35c6439c188796194d4e Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Tue, 18 Feb 2025 05:02:57 -0500
Subject: [PATCH 02/13] strip

---
 codeflash/verification/parse_test_output.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py
index 428b34767..a5ca5637c 100644
--- a/codeflash/verification/parse_test_output.py
+++ b/codeflash/verification/parse_test_output.py
@@ -268,7 +268,7 @@ def parse_test_xml(
             matches = matches_re.findall(sys_stdout)
 
             if sys_stdout:
-                sys_stdout = cleaner_re.sub("", sys_stdout)
+                sys_stdout = cleaner_re.sub("", sys_stdout).strip()
 
             if not matches or not len(matches):
                 test_results.add(

From f40c388ef649fa02125f44b60bdc5251bc952489 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Tue, 18 Feb 2025 05:25:31 -0500
Subject: [PATCH 03/13] stdout comparison in E2E

---
 code_to_optimize/bubble_sort.py               |  2 ++
 .../end_to_end_test_bubblesort_pytest.py      |  8 +++--
 tests/scripts/end_to_end_test_utilities.py    | 34 ++++++++++++++-----
 3 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/code_to_optimize/bubble_sort.py b/code_to_optimize/bubble_sort.py
index b18994494..787cc4a90 100644
--- a/code_to_optimize/bubble_sort.py
+++ b/code_to_optimize/bubble_sort.py
@@ -1,8 +1,10 @@
 def sorter(arr):
+    print("codeflash stdout: Sorting list")
     for i in range(len(arr)):
         for j in range(len(arr) - 1):
             if arr[j] > arr[j + 1]:
                 temp = arr[j]
                 arr[j] = arr[j + 1]
                 arr[j + 1] = temp
+    print(f"result: {arr}")
     return arr
\ No newline at end of file
diff --git a/tests/scripts/end_to_end_test_bubblesort_pytest.py b/tests/scripts/end_to_end_test_bubblesort_pytest.py
index 08fe3117f..d714703aa 100644
--- a/tests/scripts/end_to_end_test_bubblesort_pytest.py
+++ b/tests/scripts/end_to_end_test_bubblesort_pytest.py
@@ -11,11 +11,15 @@ def run_test(expected_improvement_pct: int) -> bool:
         test_framework="pytest",
         min_improvement_x=1.0,
         coverage_expectations=[
-            CoverageExpectation(function_name="sorter", expected_coverage=100.0, expected_lines=[2, 3, 4, 5, 6, 7, 8])
+            CoverageExpectation(
+                function_name="sorter", expected_coverage=100.0, expected_lines=[2, 3, 4, 5, 6, 7, 8, 9, 10]
+            )
         ],
     )
     cwd = (pathlib.Path(__file__).parent.parent.parent / "code_to_optimize").resolve()
-    return run_codeflash_command(cwd, config, expected_improvement_pct)
+    return run_codeflash_command(
+        cwd, config, expected_improvement_pct, ['print("codeflash stdout: Sorting list")', 'print(f"result: {arr}")']
+    )
 
 
 if __name__ == "__main__":
diff --git a/tests/scripts/end_to_end_test_utilities.py b/tests/scripts/end_to_end_test_utilities.py
index 891ed29f4..23a67a84a 100644
--- a/tests/scripts/end_to_end_test_utilities.py
+++ b/tests/scripts/end_to_end_test_utilities.py
@@ -63,19 +63,21 @@ def validate_coverage(stdout: str, expectations: list[CoverageExpectation]) -> b
         assert coverage_match, f"Failed to find coverage data for {expect.function_name}"
 
         coverage = float(coverage_match.group(1))
-        assert (
-            coverage == expect.expected_coverage
-        ), f"Coverage was {coverage} instead of {expect.expected_coverage} for function: {expect.function_name}"
+        assert coverage == expect.expected_coverage, (
+            f"Coverage was {coverage} instead of {expect.expected_coverage} for function: {expect.function_name}"
+        )
 
         executed_lines = list(map(int, coverage_match.group(2).split(", ")))
-        assert (
-            executed_lines == expect.expected_lines
-        ), f"Executed lines were {executed_lines} instead of {expect.expected_lines} for function: {expect.function_name}"
+        assert executed_lines == expect.expected_lines, (
+            f"Executed lines were {executed_lines} instead of {expect.expected_lines} for function: {expect.function_name}"
+        )
 
     return True
 
 
-def run_codeflash_command(cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int) -> bool:
+def run_codeflash_command(
+    cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int, expected_in_stdout: list[str] = None
+) -> bool:
     logging.basicConfig(level=logging.INFO)
     if config.trace_mode:
         return run_trace_test(cwd, config, expected_improvement_pct)
@@ -97,12 +99,21 @@ def run_codeflash_command(cwd: pathlib.Path, config: TestConfig, expected_improv
     return_code = process.wait()
     stdout = "".join(output)
 
-    if not validate_output(stdout, return_code, expected_improvement_pct, config):
+    validated = validate_output(stdout, return_code, expected_improvement_pct, config)
+    if not validated:
         # Write original file contents back to file
         path_to_file.write_text(file_contents, "utf-8")
         logging.info("Codeflash run did not meet expected requirements for testing, reverting file changes.")
         return False
-    return True
+
+    if expected_in_stdout:
+        if validate_stdout_in_candidate(stdout, expected_in_stdout):
+            logging.info("Success: Expected output found in candidate output")
+        else:
+            logging.error("Failed to find expected output in candidate output")
+            validated = False
+
+    return validated
 
 
 def build_command(cwd: pathlib.Path, config: TestConfig, test_root: pathlib.Path) -> list[str]:
@@ -164,6 +175,11 @@ def validate_output(stdout: str, return_code: int, expected_improvement_pct: int
     return True
 
 
+def validate_stdout_in_candidate(stdout: str, expected_in_stdout: list[str]) -> bool:
+    candidate_output = stdout.partition("INFO     Best candidate")[2].partition("Best Candidate Explanation")[0]
+    return all(expected in candidate_output for expected in expected_in_stdout)
+
+
 def run_trace_test(cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int) -> bool:
     # First command: Run the tracer
     test_root = cwd / "tests" / (config.test_framework or "")

From f164fd243190defd04e30d8a5d5924816d7f2dba Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Thu, 20 Feb 2025 19:41:17 -0500
Subject: [PATCH 04/13] add test in instrumentation

---
 code_to_optimize/bubble_sort_method.py |  1 +
 tests/test_instrument_all_and_run.py   | 10 ++++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/code_to_optimize/bubble_sort_method.py b/code_to_optimize/bubble_sort_method.py
index 3928e41fb..c95a241d8 100644
--- a/code_to_optimize/bubble_sort_method.py
+++ b/code_to_optimize/bubble_sort_method.py
@@ -3,6 +3,7 @@ def __init__(self, x=0):
         self.x = x
 
     def sorter(self, arr):
+        print("codeflash stdout : BubbleSorter.sorter() called")
         for i in range(len(arr)):
             for j in range(len(arr) - 1):
                 if arr[j] > arr[j + 1]:
diff --git a/tests/test_instrument_all_and_run.py b/tests/test_instrument_all_and_run.py
index 643d4bde7..cb5674ed6 100644
--- a/tests/test_instrument_all_and_run.py
+++ b/tests/test_instrument_all_and_run.py
@@ -168,6 +168,8 @@ def test_sort():
             pytest_max_loops=1,
             testing_time=0.1,
         )
+        assert "codeflash stdout: Sorting list" in test_results[0].stdout
+        assert "result: [0, 1, 2, 3, 4, 5]" in test_results[0].stdout
         assert test_results[0].id.function_getting_tested == "sorter"
         assert test_results[0].id.iteration_id == "1_0"
         assert test_results[0].id.test_class_name is None
@@ -179,7 +181,8 @@ def test_sort():
         assert test_results[0].runtime > 0
         assert test_results[0].did_pass
         assert test_results[0].return_value == ([0, 1, 2, 3, 4, 5],)
-
+        assert "codeflash stdout: Sorting list" in test_results[1].stdout
+        assert "result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]" in test_results[1].stdout
         assert test_results[1].id.function_getting_tested == "sorter"
         assert test_results[1].id.iteration_id == "4_0"
         assert test_results[1].id.test_class_name is None
@@ -340,13 +343,11 @@ def test_sort():
             pytest_max_loops=1,
             testing_time=0.1,
         )
-
         assert len(test_results) == 4
         assert test_results[0].id.function_getting_tested == "BubbleSorter.__init__"
         assert test_results[0].id.test_function_name == "test_sort"
         assert test_results[0].did_pass
         assert test_results[0].return_value[0] == {"x": 0}
-
         assert test_results[1].id.function_getting_tested == "BubbleSorter.sorter"
         assert test_results[1].id.iteration_id == "2_0"
         assert test_results[1].id.test_class_name is None
@@ -358,7 +359,8 @@ def test_sort():
         assert test_results[1].runtime > 0
         assert test_results[1].did_pass
         assert test_results[1].return_value == ([0, 1, 2, 3, 4, 5],)
-
+        assert "codeflash stdout : BubbleSorter.sorter() called" in test_results[1].stdout
+        assert compare_test_results(test_results, test_results)
         assert test_results[2].id.function_getting_tested == "BubbleSorter.__init__"
         assert test_results[2].id.test_function_name == "test_sort"
         assert test_results[2].did_pass

From d0710ce6408942db5c9d85b45916594b36b1f531 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Tue, 18 Feb 2025 03:34:20 -0500
Subject: [PATCH 05/13] stdout comparison

Update test_results.py

Update parse_test_output.py

Update equivalence.py
---
 codeflash/verification/parse_test_output.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py
index a5ca5637c..428b34767 100644
--- a/codeflash/verification/parse_test_output.py
+++ b/codeflash/verification/parse_test_output.py
@@ -268,7 +268,7 @@ def parse_test_xml(
             matches = matches_re.findall(sys_stdout)
 
             if sys_stdout:
-                sys_stdout = cleaner_re.sub("", sys_stdout).strip()
+                sys_stdout = cleaner_re.sub("", sys_stdout)
 
             if not matches or not len(matches):
                 test_results.add(

From 944e9749fd068607e8116d8c5e2d36793d6694b7 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Tue, 18 Feb 2025 05:02:57 -0500
Subject: [PATCH 06/13] strip

---
 codeflash/verification/parse_test_output.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py
index 428b34767..a5ca5637c 100644
--- a/codeflash/verification/parse_test_output.py
+++ b/codeflash/verification/parse_test_output.py
@@ -268,7 +268,7 @@ def parse_test_xml(
             matches = matches_re.findall(sys_stdout)
 
             if sys_stdout:
-                sys_stdout = cleaner_re.sub("", sys_stdout)
+                sys_stdout = cleaner_re.sub("", sys_stdout).strip()
 
             if not matches or not len(matches):
                 test_results.add(

From 3147c4a927f98291ecc02f5769fc84bfb7f8d59c Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Thu, 20 Feb 2025 21:02:26 -0500
Subject: [PATCH 07/13] update aiservice test

---
 tests/test_instrumentation_run_results_aiservice.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_instrumentation_run_results_aiservice.py b/tests/test_instrumentation_run_results_aiservice.py
index a35556dfd..1224b63d8 100644
--- a/tests/test_instrumentation_run_results_aiservice.py
+++ b/tests/test_instrumentation_run_results_aiservice.py
@@ -193,6 +193,7 @@ def __init__(self, x=1):
         self.x = x
 
     def sorter(self, arr):
+        print("codeflash stdout : BubbleSorter.sorter() called")
         for i in range(len(arr)):
             for j in range(len(arr) - 1):
                 if arr[j] > arr[j + 1]:
@@ -337,6 +338,7 @@ def __init__(self, x=1):
         self.x = x
 
     def sorter(self, arr):
+        print("codeflash stdout : BubbleSorter.sorter() called")
         for i in range(len(arr)):
             for j in range(len(arr) - 1):
                 if arr[j] > arr[j + 1]:
@@ -392,6 +394,7 @@ def __init__(self, x=0):
         self.y = 2
 
     def sorter(self, arr):
+        print("codeflash stdout : BubbleSorter.sorter() called")
         for i in range(len(arr)):
             for j in range(len(arr) - 1):
                 if arr[j] > arr[j + 1]:

From 1a4f60e831a501c4f0e8e868d7a755915e83fdb4 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Thu, 20 Feb 2025 21:13:13 -0500
Subject: [PATCH 08/13] Update test_codeflash_capture.py

---
 tests/test_codeflash_capture.py | 47 ++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/tests/test_codeflash_capture.py b/tests/test_codeflash_capture.py
index 83b1efd2b..8e5e237cf 100644
--- a/tests/test_codeflash_capture.py
+++ b/tests/test_codeflash_capture.py
@@ -485,7 +485,6 @@ def __init__(self, x=2):
         assert test_results[1].id.test_module_path == "code_to_optimize.tests.pytest.test_codeflash_capture_temp"
         assert test_results[1].id.function_getting_tested == "some_function"
         assert test_results[1].id.iteration_id == "11_0"
-
         assert test_results[2].did_pass
         assert test_results[2].return_value[0]["x"] == 2
         assert test_results[2].id.test_function_name == "test_example_test_3"
@@ -494,6 +493,17 @@ def __init__(self, x=2):
         assert test_results[2].id.function_getting_tested == "some_function"
         assert test_results[2].id.iteration_id == "16_0"
 
+        test_results2, _ = func_optimizer.run_and_parse_tests(
+            testing_type=TestingMode.BEHAVIOR,
+            test_env=test_env,
+            test_files=func_optimizer.test_files,
+            optimization_iteration=0,
+            pytest_min_loops=1,
+            pytest_max_loops=1,
+            testing_time=0.1,
+        )
+        assert compare_test_results(test_results, test_results2)
+
     finally:
         test_path.unlink(missing_ok=True)
         sample_code_path.unlink(missing_ok=True)
@@ -605,6 +615,18 @@ def __init__(self, *args, **kwargs):
         assert test_results[2].id.function_getting_tested == "some_function"
         assert test_results[2].id.iteration_id == "16_0"
 
+        results2, _ = func_optimizer.run_and_parse_tests(
+            testing_type=TestingMode.BEHAVIOR,
+            test_env=test_env,
+            test_files=func_optimizer.test_files,
+            optimization_iteration=0,
+            pytest_min_loops=1,
+            pytest_max_loops=1,
+            testing_time=0.1,
+        )
+
+        assert compare_test_results(test_results, results2)
+
     finally:
         test_path.unlink(missing_ok=True)
         sample_code_path.unlink(missing_ok=True)
@@ -720,6 +742,17 @@ def __init__(self, x=2):
         assert test_results[2].id.function_getting_tested == "some_function"
         assert test_results[2].id.iteration_id == "12_2"  # Third call
 
+        test_results2, _ = func_optimizer.run_and_parse_tests(
+            testing_type=TestingMode.BEHAVIOR,
+            test_env=test_env,
+            test_files=func_optimizer.test_files,
+            optimization_iteration=0,
+            pytest_min_loops=1,
+            pytest_max_loops=1,
+            testing_time=0.1,
+        )
+
+        assert compare_test_results(test_results, test_results2)
     finally:
         test_path.unlink(missing_ok=True)
         sample_code_path.unlink(missing_ok=True)
@@ -856,6 +889,18 @@ def another_helper(self):
         assert test_results[3].id.function_getting_tested == "AnotherHelperClass.__init__"
         assert test_results[3].verification_type == VerificationType.INIT_STATE_HELPER
 
+        results2, _ = func_optimizer.run_and_parse_tests(
+            testing_type=TestingMode.BEHAVIOR,
+            test_env=test_env,
+            test_files=func_optimizer.test_files,
+            optimization_iteration=0,
+            pytest_min_loops=1,
+            pytest_max_loops=1,
+            testing_time=0.1,
+        )
+
+        assert compare_test_results(test_results, results2)
+
     finally:
         test_path.unlink(missing_ok=True)
         fto_file_path.unlink(missing_ok=True)

From ed970eaff9f259746273e59f08e8a2519dd88f80 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Thu, 20 Feb 2025 21:23:16 -0500
Subject: [PATCH 09/13] add more unit tests

---
 tests/test_instrument_all_and_run.py | 24 ++++++++++++++++++++++++
 tests/test_instrument_tests.py       |  8 +++++++-
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/tests/test_instrument_all_and_run.py b/tests/test_instrument_all_and_run.py
index cb5674ed6..d5e9f689a 100644
--- a/tests/test_instrument_all_and_run.py
+++ b/tests/test_instrument_all_and_run.py
@@ -193,6 +193,18 @@ def test_sort():
         )
         assert test_results[1].runtime > 0
         assert test_results[1].did_pass
+        results2, _ = func_optimizer.run_and_parse_tests(
+            testing_type=TestingMode.BEHAVIOR,
+            test_env=test_env,
+            test_files=func_optimizer.test_files,
+            optimization_iteration=0,
+            pytest_min_loops=1,
+            pytest_max_loops=1,
+            testing_time=0.1,
+        )
+        assert "codeflash stdout: Sorting list" in results2[0].stdout
+        assert "result: [0, 1, 2, 3, 4, 5]" in results2[0].stdout
+        assert compare_test_results(test_results, results2)
     finally:
         fto_path.write_text(original_code, "utf-8")
         test_path.unlink(missing_ok=True)
@@ -377,6 +389,18 @@ def test_sort():
         assert test_results[3].runtime > 0
         assert test_results[3].did_pass
 
+        results2, _ = func_optimizer.run_and_parse_tests(
+            testing_type=TestingMode.BEHAVIOR,
+            test_env=test_env,
+            test_files=func_optimizer.test_files,
+            optimization_iteration=0,
+            pytest_min_loops=1,
+            pytest_max_loops=1,
+            testing_time=0.1,
+        )
+
+        assert compare_test_results(test_results, results2)
+
         # Replace with optimized code that mutated instance attribute
         optimized_code = """
 class BubbleSorter:
diff --git a/tests/test_instrument_tests.py b/tests/test_instrument_tests.py
index bf7373522..cfc0a8842 100644
--- a/tests/test_instrument_tests.py
+++ b/tests/test_instrument_tests.py
@@ -482,6 +482,8 @@ def test_sort():
         )
         assert test_results_perf[1].runtime > 0
         assert test_results_perf[1].did_pass
+        assert "codeflash stdout: Sorting list" in test_results_perf[1].stdout
+        assert "result: [0, 1, 2, 3, 4, 5]" in test_results_perf[1].stdout
     finally:
         test_path.unlink(missing_ok=True)
         test_path_perf.unlink(missing_ok=True)
@@ -693,6 +695,9 @@ def test_sort_parametrized(input, expected_output):
         assert test_results_perf[1].runtime > 0
         assert test_results_perf[1].did_pass
 
+        assert "codeflash stdout: Sorting list" in test_results_perf[1].stdout
+        assert "result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]" in test_results_perf[1].stdout
+
         assert test_results_perf[2].id.function_getting_tested == "sorter"
         assert test_results_perf[2].id.iteration_id == "0_2"
         assert test_results_perf[2].id.test_class_name is None
@@ -1230,7 +1235,8 @@ def test_sort():
         assert test_results[0].runtime > 0
         assert test_results[0].did_pass
         assert test_results[0].return_value is None
-
+        assert "codeflash stdout: Sorting list" in test_results[0].stdout
+        assert "result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]" in test_results[0].stdout
         assert test_results[1].id.function_getting_tested == "sorter"
         assert test_results[1].id.iteration_id == "2_2_1"
         assert test_results[1].id.test_class_name is None

From f3972894547191da61d43f993415f3a685a18054 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Fri, 21 Feb 2025 00:18:37 -0800
Subject: [PATCH 10/13] address review.

---
 code_to_optimize/bubble_sort_method.py          |  4 ++++
 tests/test_instrument_all_and_run.py            | 17 +++++++++++++----
 tests/test_instrument_tests.py                  | 12 ++++++++++--
 ...est_instrumentation_run_results_aiservice.py | 12 ++++++++++++
 4 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/code_to_optimize/bubble_sort_method.py b/code_to_optimize/bubble_sort_method.py
index c95a241d8..962fde339 100644
--- a/code_to_optimize/bubble_sort_method.py
+++ b/code_to_optimize/bubble_sort_method.py
@@ -1,3 +1,6 @@
+import sys
+
+
 class BubbleSorter:
     def __init__(self, x=0):
         self.x = x
@@ -10,4 +13,5 @@ def sorter(self, arr):
                     temp = arr[j]
                     arr[j] = arr[j + 1]
                     arr[j + 1] = temp
+        print("stderr test", file=sys.stderr)
         return arr
diff --git a/tests/test_instrument_all_and_run.py b/tests/test_instrument_all_and_run.py
index d5e9f689a..bfad32f91 100644
--- a/tests/test_instrument_all_and_run.py
+++ b/tests/test_instrument_all_and_run.py
@@ -168,8 +168,17 @@ def test_sort():
             pytest_max_loops=1,
             testing_time=0.1,
         )
-        assert "codeflash stdout: Sorting list" in test_results[0].stdout
-        assert "result: [0, 1, 2, 3, 4, 5]" in test_results[0].stdout
+
+        out_str = """--------------------------------- Captured Log ---------------------------------
+
+--------------------------------- Captured Out ---------------------------------
+
+codeflash stdout: Sorting list
+result: [0, 1, 2, 3, 4, 5]
+
+codeflash stdout: Sorting list
+result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]"""
+        assert out_str == test_results[0].stdout.strip()
         assert test_results[0].id.function_getting_tested == "sorter"
         assert test_results[0].id.iteration_id == "1_0"
         assert test_results[0].id.test_class_name is None
@@ -181,8 +190,8 @@ def test_sort():
         assert test_results[0].runtime > 0
         assert test_results[0].did_pass
         assert test_results[0].return_value == ([0, 1, 2, 3, 4, 5],)
-        assert "codeflash stdout: Sorting list" in test_results[1].stdout
-        assert "result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]" in test_results[1].stdout
+        assert out_str == test_results[1].stdout.strip()
+
         assert test_results[1].id.function_getting_tested == "sorter"
         assert test_results[1].id.iteration_id == "4_0"
         assert test_results[1].id.test_class_name is None
diff --git a/tests/test_instrument_tests.py b/tests/test_instrument_tests.py
index cfc0a8842..282c9f1d5 100644
--- a/tests/test_instrument_tests.py
+++ b/tests/test_instrument_tests.py
@@ -482,8 +482,16 @@ def test_sort():
         )
         assert test_results_perf[1].runtime > 0
         assert test_results_perf[1].did_pass
-        assert "codeflash stdout: Sorting list" in test_results_perf[1].stdout
-        assert "result: [0, 1, 2, 3, 4, 5]" in test_results_perf[1].stdout
+        out_str = """--------------------------------- Captured Log ---------------------------------
+
+--------------------------------- Captured Out ---------------------------------
+codeflash stdout: Sorting list
+result: [0, 1, 2, 3, 4, 5]
+
+codeflash stdout: Sorting list
+result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]"""
+        assert out_str == test_results_perf[1].stdout
+
     finally:
         test_path.unlink(missing_ok=True)
         test_path_perf.unlink(missing_ok=True)
diff --git a/tests/test_instrumentation_run_results_aiservice.py b/tests/test_instrumentation_run_results_aiservice.py
index 1224b63d8..670b23fb2 100644
--- a/tests/test_instrumentation_run_results_aiservice.py
+++ b/tests/test_instrumentation_run_results_aiservice.py
@@ -187,6 +187,9 @@ def test_single_element_list():
 
         # Replace with optimized code that mutated instance attribute
         optimized_code_mutated_attr = """
+import sys
+
+
 class BubbleSorter:
 
     def __init__(self, x=1):
@@ -200,6 +203,7 @@ def sorter(self, arr):
                     temp = arr[j]
                     arr[j] = arr[j + 1]
                     arr[j + 1] = temp
+        print("stderr test", file=sys.stderr)
         return arr
                         """
         fto_path.write_text(optimized_code_mutated_attr, "utf-8")
@@ -332,6 +336,9 @@ def test_single_element_list():
         assert test_results[1].return_value[2] == [1, 2, 3]
         # Replace with optimized code that mutated instance attribute
         optimized_code_mutated_attr = """
+import sys
+
+
 class BubbleSorter:
 
     def __init__(self, x=1):
@@ -345,6 +352,7 @@ def sorter(self, arr):
                     temp = arr[j]
                     arr[j] = arr[j + 1]
                     arr[j + 1] = temp
+        print("stderr test", file=sys.stderr)
         return arr
                         """
         fto_path.write_text(optimized_code_mutated_attr, "utf-8")
@@ -388,6 +396,9 @@ def sorter(self, arr):
         )  # The test should fail because the instance attribute was mutated
         # Replace with optimized code that did not mutate existing instance attribute, but added a new one
         optimized_code_new_attr = """
+import sys
+
+
 class BubbleSorter:
     def __init__(self, x=0):
         self.x = x
@@ -401,6 +412,7 @@ def sorter(self, arr):
                     temp = arr[j]
                     arr[j] = arr[j + 1]
                     arr[j + 1] = temp
+        print("stderr test", file=sys.stderr)
         return arr
                         """
         fto_path.write_text(optimized_code_new_attr, "utf-8")

From 11e43ea6bb86506bd9f66fa8ccd5a247f309f86d Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Mon, 24 Feb 2025 15:34:39 -0800
Subject: [PATCH 11/13] strip out captured log part

---
 codeflash/verification/parse_test_output.py | 3 ++-
 tests/test_instrument_all_and_run.py        | 6 +-----
 tests/test_instrument_tests.py              | 5 +----
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py
index a5ca5637c..fa565b5b9 100644
--- a/codeflash/verification/parse_test_output.py
+++ b/codeflash/verification/parse_test_output.py
@@ -43,7 +43,8 @@ def parse_func(file_path: Path) -> XMLParser:
 
 
 matches_re = re.compile(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######!")
-cleaner_re = re.compile(r"!######(.*?)######!")
+cleaner_re = re.compile(r"!######.*?######!|-+\s*Captured\s+(Log|Out)\s*-+\n?")
+
 
 
 def parse_test_return_values_bin(file_location: Path, test_files: TestFiles, test_config: TestConfig) -> TestResults:
diff --git a/tests/test_instrument_all_and_run.py b/tests/test_instrument_all_and_run.py
index bfad32f91..ea0ab0df9 100644
--- a/tests/test_instrument_all_and_run.py
+++ b/tests/test_instrument_all_and_run.py
@@ -169,11 +169,7 @@ def test_sort():
             testing_time=0.1,
         )
 
-        out_str = """--------------------------------- Captured Log ---------------------------------
-
---------------------------------- Captured Out ---------------------------------
-
-codeflash stdout: Sorting list
+        out_str = """codeflash stdout: Sorting list
 result: [0, 1, 2, 3, 4, 5]
 
 codeflash stdout: Sorting list
diff --git a/tests/test_instrument_tests.py b/tests/test_instrument_tests.py
index 282c9f1d5..c38c8c45a 100644
--- a/tests/test_instrument_tests.py
+++ b/tests/test_instrument_tests.py
@@ -482,10 +482,7 @@ def test_sort():
         )
         assert test_results_perf[1].runtime > 0
         assert test_results_perf[1].did_pass
-        out_str = """--------------------------------- Captured Log ---------------------------------
-
---------------------------------- Captured Out ---------------------------------
-codeflash stdout: Sorting list
+        out_str = """codeflash stdout: Sorting list
 result: [0, 1, 2, 3, 4, 5]
 
 codeflash stdout: Sorting list

From bb1bbf9f99c1e38116b0dc614c498b12727b3445 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Thu, 27 Feb 2025 11:35:45 -0800
Subject: [PATCH 12/13] missed a few direct comparisons for stdout

---
 tests/test_instrument_all_and_run.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/tests/test_instrument_all_and_run.py b/tests/test_instrument_all_and_run.py
index ea0ab0df9..99ff5047c 100644
--- a/tests/test_instrument_all_and_run.py
+++ b/tests/test_instrument_all_and_run.py
@@ -207,8 +207,12 @@ def test_sort():
             pytest_max_loops=1,
             testing_time=0.1,
         )
-        assert "codeflash stdout: Sorting list" in results2[0].stdout
-        assert "result: [0, 1, 2, 3, 4, 5]" in results2[0].stdout
+        out_str = """codeflash stdout: Sorting list
+result: [0, 1, 2, 3, 4, 5]
+
+codeflash stdout: Sorting list
+result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]"""
+        assert out_str == results2[0].stdout.strip()
         assert compare_test_results(test_results, results2)
     finally:
         fto_path.write_text(original_code, "utf-8")
@@ -376,7 +380,12 @@ def test_sort():
         assert test_results[1].runtime > 0
         assert test_results[1].did_pass
         assert test_results[1].return_value == ([0, 1, 2, 3, 4, 5],)
-        assert "codeflash stdout : BubbleSorter.sorter() called" in test_results[1].stdout
+        out_str = """
+codeflash stdout : BubbleSorter.sorter() called
+
+
+codeflash stdout : BubbleSorter.sorter() called"""
+        assert test_results[1].stdout.strip() == out_str.strip()
         assert compare_test_results(test_results, test_results)
         assert test_results[2].id.function_getting_tested == "BubbleSorter.__init__"
         assert test_results[2].id.test_function_name == "test_sort"

From dcf9384dc03be361e15088140fea5f6b95cbaf96 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Thu, 27 Feb 2025 22:56:58 -0800
Subject: [PATCH 13/13] address review

Update pyproject.toml
Update test_instrument_tests.py
Update test_instrument_all_and_run.py
missing test
---
 pyproject.toml                                   |  1 -
 tests/test_instrument_all_and_run.py             | 10 +++-------
 tests/test_instrument_tests.py                   | 16 ++++++++++++----
 ...test_instrumentation_run_results_aiservice.py |  5 +++++
 4 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index b4e5b324f..2e71f2a0a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -119,7 +119,6 @@ types-gevent = "^24.11.0.20241230"
 types-greenlet = "^3.1.0.20241221"
 types-pexpect = "^4.9.0.20241208"
 types-unidiff = "^0.7.0.20240505"
-sqlalchemy = "^2.0.38"
 uv = ">=0.6.2"
 
 [tool.poetry.build]
diff --git a/tests/test_instrument_all_and_run.py b/tests/test_instrument_all_and_run.py
index 99ff5047c..ce06c855a 100644
--- a/tests/test_instrument_all_and_run.py
+++ b/tests/test_instrument_all_and_run.py
@@ -174,7 +174,7 @@ def test_sort():
 
 codeflash stdout: Sorting list
 result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]"""
-        assert out_str == test_results[0].stdout.strip()
+        assert out_str == test_results[0].stdout
         assert test_results[0].id.function_getting_tested == "sorter"
         assert test_results[0].id.iteration_id == "1_0"
         assert test_results[0].id.test_class_name is None
@@ -380,12 +380,8 @@ def test_sort():
         assert test_results[1].runtime > 0
         assert test_results[1].did_pass
         assert test_results[1].return_value == ([0, 1, 2, 3, 4, 5],)
-        out_str = """
-codeflash stdout : BubbleSorter.sorter() called
-
-
-codeflash stdout : BubbleSorter.sorter() called"""
-        assert test_results[1].stdout.strip() == out_str.strip()
+        out_str = """codeflash stdout : BubbleSorter.sorter() called\n\n\ncodeflash stdout : BubbleSorter.sorter() called"""
+        assert test_results[1].stdout == out_str
         assert compare_test_results(test_results, test_results)
         assert test_results[2].id.function_getting_tested == "BubbleSorter.__init__"
         assert test_results[2].id.test_function_name == "test_sort"
diff --git a/tests/test_instrument_tests.py b/tests/test_instrument_tests.py
index c38c8c45a..79f4bc5dd 100644
--- a/tests/test_instrument_tests.py
+++ b/tests/test_instrument_tests.py
@@ -700,8 +700,9 @@ def test_sort_parametrized(input, expected_output):
         assert test_results_perf[1].runtime > 0
         assert test_results_perf[1].did_pass
 
-        assert "codeflash stdout: Sorting list" in test_results_perf[1].stdout
-        assert "result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]" in test_results_perf[1].stdout
+        out_str = """codeflash stdout: Sorting list
+result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]"""
+        assert out_str == test_results_perf[1].stdout
 
         assert test_results_perf[2].id.function_getting_tested == "sorter"
         assert test_results_perf[2].id.iteration_id == "0_2"
@@ -1240,8 +1241,15 @@ def test_sort():
         assert test_results[0].runtime > 0
         assert test_results[0].did_pass
         assert test_results[0].return_value is None
-        assert "codeflash stdout: Sorting list" in test_results[0].stdout
-        assert "result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]" in test_results[0].stdout
+        out_str = """codeflash stdout: Sorting list
+result: [0, 1, 2, 3, 4, 5]
+
+codeflash stdout: Sorting list
+result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
+
+codeflash stdout: Sorting list
+result: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]"""
+        assert test_results[1].stdout == out_str
         assert test_results[1].id.function_getting_tested == "sorter"
         assert test_results[1].id.iteration_id == "2_2_1"
         assert test_results[1].id.test_class_name is None
diff --git a/tests/test_instrumentation_run_results_aiservice.py b/tests/test_instrumentation_run_results_aiservice.py
index 670b23fb2..888c629ef 100644
--- a/tests/test_instrumentation_run_results_aiservice.py
+++ b/tests/test_instrumentation_run_results_aiservice.py
@@ -179,6 +179,7 @@ def test_single_element_list():
             testing_time=0.1,
         )
         assert test_results[0].id.function_getting_tested == "sorter"
+        assert test_results[0].stdout == "codeflash stdout : BubbleSorter.sorter() called"
         assert test_results[0].id.test_function_name == "test_single_element_list"
         assert test_results[0].did_pass
         assert test_results[0].return_value[1]["arr"] == [42]
@@ -222,6 +223,7 @@ def sorter(self, arr):
             test_results, test_results_mutated_attr
         )  # Without codeflash capture, the init state was not verified, and the results are verified as correct even with the attribute mutated
 
+        assert test_results_mutated_attr[0].stdout == "codeflash stdout : BubbleSorter.sorter() called"
     finally:
         fto_path.write_text(original_code, "utf-8")
         test_path.unlink(missing_ok=True)
@@ -322,6 +324,7 @@ def test_single_element_list():
         assert test_results[0].id.test_function_name == "test_single_element_list"
         assert test_results[0].did_pass
         assert test_results[0].return_value[0] == {"x": 0}
+        assert test_results[0].stdout == "codeflash stdout : BubbleSorter.sorter() called"
 
         # Verify function_to_optimize result
         assert test_results[1].id.function_getting_tested == "sorter"
@@ -391,6 +394,7 @@ def sorter(self, arr):
         assert test_results_mutated_attr[0].id.function_getting_tested == "BubbleSorter.__init__"
         assert test_results_mutated_attr[0].return_value[0] == {"x": 1}
         assert test_results_mutated_attr[0].verification_type == VerificationType.INIT_STATE_FTO
+        assert test_results_mutated_attr[0].stdout == "codeflash stdout : BubbleSorter.sorter() called"
         assert not compare_test_results(
             test_results, test_results_mutated_attr
         )  # The test should fail because the instance attribute was mutated
@@ -442,6 +446,7 @@ def sorter(self, arr):
         assert test_results_new_attr[0].id.function_getting_tested == "BubbleSorter.__init__"
         assert test_results_new_attr[0].return_value[0] == {"x": 0, "y": 2}
         assert test_results_new_attr[0].verification_type == VerificationType.INIT_STATE_FTO
+        assert test_results_new_attr[0].stdout == "codeflash stdout : BubbleSorter.sorter() called"
         # assert test_results_new_attr[1].return_value[1]["self"].x == 0 TODO: add self as input
         # assert test_results_new_attr[1].return_value[1]["self"].y == 2 TODO: add self as input
         assert compare_test_results(