From fec06275efeb6a0d0595455a88b0a209cf51d6b4 Mon Sep 17 00:00:00 2001 From: Saurabh Misra Date: Wed, 11 Jun 2025 22:04:24 -0700 Subject: [PATCH 01/16] first version that does not work correctly --- codeflash/optimization/function_optimizer.py | 13 +- codeflash/result/create_pr.py | 61 +- tests/test_existing_tests_source_for.py | 645 +++++++++++++++++++ 3 files changed, 711 insertions(+), 8 deletions(-) create mode 100644 tests/test_existing_tests_source_for.py diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index f6c7661b4..26d8ede73 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -341,12 +341,6 @@ def optimize_function(self) -> Result[BestOptimization, str]: # noqa: PLR0911 optimized_function=best_optimization.candidate.source_code, ) - existing_tests = existing_tests_source_for( - self.function_to_optimize.qualified_name_with_modules_from_root(self.project_root), - function_to_all_tests, - tests_root=self.test_cfg.tests_root, - ) - original_code_combined = original_helper_code.copy() original_code_combined[explanation.file_path] = self.function_to_optimize_source_code new_code_combined = new_helper_code.copy() @@ -369,6 +363,13 @@ def optimize_function(self) -> Result[BestOptimization, str]: # noqa: PLR0911 generated_tests_str = "\n\n".join( [test.generated_original_test_source for test in generated_tests.generated_tests] ) + existing_tests = existing_tests_source_for( + self.function_to_optimize.qualified_name_with_modules_from_root(self.project_root), + function_to_all_tests, + tests_root=self.test_cfg.tests_root, + original_test_results=original_code_baseline.benchmarking_test_results, + optimized_test_results=best_optimization.winning_benchmarking_test_results, + ) if concolic_test_str: generated_tests_str += "\n\n" + concolic_test_str diff --git a/codeflash/result/create_pr.py b/codeflash/result/create_pr.py index b9e05e660..a58fc63ea 100644 --- a/codeflash/result/create_pr.py +++ b/codeflash/result/create_pr.py @@ -19,7 +19,7 @@ from codeflash.github.PrComment import FileDiffContent, PrComment if TYPE_CHECKING: - from codeflash.models.models import FunctionCalledInTest + from codeflash.models.models import FunctionCalledInTest, TestResults from codeflash.result.explanation import Explanation @@ -27,12 +27,69 @@ def existing_tests_source_for( function_qualified_name_with_modules_from_root: str, function_to_tests: dict[str, set[FunctionCalledInTest]], tests_root: Path, + original_test_results: Optional[TestResults] = None, + optimized_test_results: Optional[TestResults] = None, ) -> str: test_files = function_to_tests.get(function_qualified_name_with_modules_from_root) existing_tests_unique = set() + if test_files: + # Group test cases by test file + test_files_grouped = {} for test_file in test_files: - existing_tests_unique.add("- " + str(Path(test_file.tests_in_file.test_file).relative_to(tests_root))) + file_path = Path(test_file.tests_in_file.test_file) + relative_path = str(file_path.relative_to(tests_root)) + + if relative_path not in test_files_grouped: + test_files_grouped[relative_path] = [] + test_files_grouped[relative_path].append(test_file) + + # Create detailed report for each test file + for relative_path, tests_in_file in sorted(test_files_grouped.items()): + file_line = f"- {relative_path}" + + # Add test case details with timing information if available + if original_test_results and optimized_test_results: + test_case_details = [] + + # Use the 
same pattern as add_runtime_comments_to_generated_tests + original_runtime_by_test = original_test_results.usable_runtime_data_by_test_case() + optimized_runtime_by_test = optimized_test_results.usable_runtime_data_by_test_case() + + # Collect test function names for this file + test_functions_in_file = {test_file.tests_in_file.test_function for test_file in tests_in_file} + + # Create timing report for each test function + for test_function_name in sorted(test_functions_in_file): + # Find matching runtime data + original_runtimes = [] + optimized_runtimes = [] + + for invocation_id, runtimes in original_runtime_by_test.items(): + if invocation_id.test_function_name == test_function_name: + original_runtimes.extend(runtimes) + + for invocation_id, runtimes in optimized_runtime_by_test.items(): + if invocation_id.test_function_name == test_function_name: + optimized_runtimes.extend(runtimes) + + if original_runtimes and optimized_runtimes: + # Use minimum timing like the generated tests function does + original_time = min(original_runtimes) + optimized_time = min(optimized_runtimes) + + from codeflash.code_utils.time_utils import format_time + + original_str = format_time(original_time) + optimized_str = format_time(optimized_time) + + test_case_details.append(f" - {test_function_name}: {original_str} -> {optimized_str}") + + if test_case_details: + file_line += "\n" + "\n".join(test_case_details) + + existing_tests_unique.add(file_line) + return "\n".join(sorted(existing_tests_unique)) diff --git a/tests/test_existing_tests_source_for.py b/tests/test_existing_tests_source_for.py new file mode 100644 index 000000000..1a00c47c0 --- /dev/null +++ b/tests/test_existing_tests_source_for.py @@ -0,0 +1,645 @@ +"""Tests for the existing_tests_source_for function in result/create_pr.py.""" + +from pathlib import Path +from unittest.mock import patch + +import pytest +from codeflash.models.models import ( + CodePosition, + FunctionCalledInTest, + FunctionTestInvocation, + InvocationId, + TestResults, + TestsInFile, + TestType, VerificationType, +) +from codeflash.result.create_pr import existing_tests_source_for + + +@pytest.fixture +def sample_tests_root(tmp_path: Path) -> Path: + """Create a temporary test root directory.""" + return tmp_path / "tests" + + +@pytest.fixture +def sample_function_to_tests(sample_tests_root: Path) -> dict[str, set[FunctionCalledInTest]]: + """Create sample function to tests mapping.""" + test_file_1 = sample_tests_root / "test_module1.py" + test_file_2 = sample_tests_root / "test_module2.py" + + return { + "my_module.my_function": { + FunctionCalledInTest( + tests_in_file=TestsInFile( + test_file=test_file_1, + test_class=None, + test_function="test_basic_functionality", + test_type=TestType.EXISTING_UNIT_TEST, + ), + position=CodePosition(line_no=10, col_no=4), + ), + FunctionCalledInTest( + tests_in_file=TestsInFile( + test_file=test_file_1, + test_class="TestMyFunction", + test_function="test_edge_cases", + test_type=TestType.EXISTING_UNIT_TEST, + ), + position=CodePosition(line_no=20, col_no=8), + ), + FunctionCalledInTest( + tests_in_file=TestsInFile( + test_file=test_file_2, + test_class=None, + test_function="test_performance", + test_type=TestType.EXISTING_UNIT_TEST, + ), + position=CodePosition(line_no=15, col_no=4), + ), + } + } + + +@pytest.fixture +def sample_original_test_results() -> TestResults: + """Create sample original test results with timing information.""" + results = TestResults() + + # Test case 1: test_basic_functionality with multiple 
function calls + results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.test_module1", + test_class_name=None, + test_function_name="test_basic_functionality", + function_getting_tested="my_function", + iteration_id="1", + ), + file_name=Path("/tmp/tests/test_module1.py"), + did_pass=True, + runtime=1000, # 1000 ns + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.test_module1", + test_class_name=None, + test_function_name="test_basic_functionality", + function_getting_tested="my_function", + iteration_id="2", + ), + file_name=Path("/tmp/tests/test_module1.py"), + did_pass=True, + runtime=500, # 500 ns + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + # Test case 2: test_edge_cases + results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.test_module1", + test_class_name="TestMyFunction", + test_function_name="test_edge_cases", + function_getting_tested="my_function", + iteration_id="1", + ), + file_name=Path("/tmp/tests/test_module1.py"), + did_pass=True, + runtime=2000, # 2000 ns + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + # Test case 3: test_performance + results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.test_module2", + test_class_name=None, + test_function_name="test_performance", + function_getting_tested="my_function", + iteration_id="1", + ), + file_name=Path("/tmp/tests/test_module2.py"), + did_pass=True, + runtime=3000, # 3000 ns + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + return results + + +@pytest.fixture +def sample_optimized_test_results() -> TestResults: + """Create sample optimized test results with improved timing information.""" + results = TestResults() + + # Test case 1: test_basic_functionality with multiple function calls (improved) + results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.test_module1", + test_class_name=None, + test_function_name="test_basic_functionality", + function_getting_tested="my_function", + iteration_id="1", + ), + file_name=Path("/tmp/tests/test_module1.py"), + did_pass=True, + runtime=800, # 800 ns (improved from 1000 ns) + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.test_module1", + test_class_name=None, + test_function_name="test_basic_functionality", + function_getting_tested="my_function", + iteration_id="2", + ), + file_name=Path("/tmp/tests/test_module1.py"), + did_pass=True, + runtime=400, # 400 ns (improved from 500 ns) + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + # Test case 2: test_edge_cases (improved) + results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.test_module1", + test_class_name="TestMyFunction", + test_function_name="test_edge_cases", + function_getting_tested="my_function", + iteration_id="1", + ), + file_name=Path("/tmp/tests/test_module1.py"), + did_pass=True, + 
runtime=1500, # 1500 ns (improved from 2000 ns) + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + # Test case 3: test_performance (improved) + results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.test_module2", + test_class_name=None, + test_function_name="test_performance", + function_getting_tested="my_function", + iteration_id="1", + ), + file_name=Path("/tmp/tests/test_module2.py"), + did_pass=True, + runtime=2100, # 2100 ns (improved from 3000 ns) + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + return results + + +def test_existing_tests_source_for_without_timing_info( + sample_function_to_tests: dict[str, set[FunctionCalledInTest]], sample_tests_root: Path +): + """Test the function works without timing information (backward compatibility).""" + result = existing_tests_source_for("my_module.my_function", sample_function_to_tests, sample_tests_root) + + expected_lines = ["- test_module1.py", "- test_module2.py"] + + for line in expected_lines: + assert line in result + + # Should not contain any timing information + assert "->" not in result + assert "ns" not in result + + +def test_existing_tests_source_for_with_timing_info( + sample_function_to_tests: dict[str, set[FunctionCalledInTest]], + sample_tests_root: Path, + sample_original_test_results: TestResults, + sample_optimized_test_results: TestResults, +): + """Test the function includes timing information when provided.""" + with patch("codeflash.code_utils.time_utils.format_time") as mock_format_time: + # Mock format_time to return predictable values + mock_format_time.side_effect = lambda x: f"{x} ns" + + result = existing_tests_source_for( + "my_module.my_function", + sample_function_to_tests, + sample_tests_root, + sample_original_test_results, + sample_optimized_test_results, + ) + + # Should contain file names + assert "- test_module1.py" in result + assert "- test_module2.py" in result + + # Should contain test function names with timing (using min values now) + assert "test_basic_functionality: 500 ns -> 400 ns" in result # min(1000,500) -> min(800,400) + assert "test_edge_cases: 2000 ns -> 1500 ns" in result + assert "test_performance: 3000 ns -> 2100 ns" in result + + +def test_existing_tests_source_for_aggregates_multiple_function_calls( + sample_function_to_tests: dict[str, set[FunctionCalledInTest]], + sample_tests_root: Path, + sample_original_test_results: TestResults, + sample_optimized_test_results: TestResults, +): + """Test that multiple function calls within a test case use minimum timing.""" + with patch("codeflash.code_utils.time_utils.format_time") as mock_format_time: + mock_format_time.side_effect = lambda x: f"{x} ns" + + result = existing_tests_source_for( + "my_module.my_function", + sample_function_to_tests, + sample_tests_root, + sample_original_test_results, + sample_optimized_test_results, + ) + + # test_basic_functionality should show minimum timing: min(1000,500) -> min(800,400) + assert "test_basic_functionality: 500 ns -> 400 ns" in result + + +def test_existing_tests_source_for_only_includes_passing_tests( + sample_function_to_tests: dict[str, set[FunctionCalledInTest]], sample_tests_root: Path +): + """Test that only passing tests with runtime data are included in timing report.""" + original_results = TestResults() + optimized_results = TestResults() + + # Add a passing test with runtime + 
original_results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.test_module1", + test_class_name=None, + test_function_name="test_basic_functionality", + function_getting_tested="my_function", + iteration_id="1", + ), + file_name=Path("/tmp/tests/test_module1.py"), + did_pass=True, + runtime=1000, + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + optimized_results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.test_module1", + test_class_name=None, + test_function_name="test_basic_functionality", + function_getting_tested="my_function", + iteration_id="1", + ), + file_name=Path("/tmp/tests/test_module1.py"), + did_pass=True, + runtime=800, + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + # Add a failing test (should be excluded) + original_results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.test_module1", + test_class_name="TestMyFunction", + test_function_name="test_edge_cases", + function_getting_tested="my_function", + iteration_id="1", + ), + file_name=Path("/tmp/tests/test_module1.py"), + did_pass=False, # Failing test + runtime=2000, + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + # Add a test without runtime (should be excluded) + original_results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.test_module2", + test_class_name=None, + test_function_name="test_performance", + function_getting_tested="my_function", + iteration_id="1", + ), + file_name=Path("/tmp/tests/test_module2.py"), + did_pass=True, + runtime=None, # No runtime data + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + with patch("codeflash.code_utils.time_utils.format_time") as mock_format_time: + mock_format_time.side_effect = lambda x: f"{x} ns" + + result = existing_tests_source_for( + "my_module.my_function", sample_function_to_tests, sample_tests_root, original_results, optimized_results + ) + + # Should only include the passing test with runtime data + assert "test_basic_functionality: 1000 ns -> 800 ns" in result + # Should not include failing test or test without runtime + assert "test_edge_cases" not in result + assert "test_performance" not in result + + +def test_existing_tests_source_for_with_empty_test_mapping(sample_tests_root: Path): + """Test behavior when there are no tests for the function.""" + result = existing_tests_source_for("nonexistent.function", {}, sample_tests_root) + + assert result == "" + + +def test_existing_tests_source_for_missing_optimized_results( + sample_function_to_tests: dict[str, set[FunctionCalledInTest]], + sample_tests_root: Path, + sample_original_test_results: TestResults, +): + """Test behavior when optimized results are missing for some test cases.""" + # Create optimized results that are missing some test cases + optimized_results = TestResults() + optimized_results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.test_module1", + test_class_name=None, + test_function_name="test_basic_functionality", + function_getting_tested="my_function", + iteration_id="1", + ), + file_name=Path("/tmp/tests/test_module1.py"), + did_pass=True, + runtime=800, + test_framework="pytest", + 
test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + # Note: Missing test_edge_cases and test_performance optimized results + + with patch("codeflash.code_utils.time_utils.format_time") as mock_format_time: + mock_format_time.side_effect = lambda x: f"{x} ns" + + result = existing_tests_source_for( + "my_module.my_function", + sample_function_to_tests, + sample_tests_root, + sample_original_test_results, + optimized_results, + ) + + # Should not include test cases without both original and optimized results + assert "test_basic_functionality" not in result # Missing second function call + assert "test_edge_cases" not in result + assert "test_performance" not in result + + # Should still show file names + assert "- test_module1.py" in result + assert "- test_module2.py" in result + + +def test_existing_tests_source_for_sorted_output(sample_tests_root: Path): + """Test that output is properly sorted by file name and test function name.""" + # Create a more complex test mapping with multiple files and functions + test_file_a = sample_tests_root / "a_test_module.py" + test_file_z = sample_tests_root / "z_test_module.py" + + function_to_tests = { + "my_module.my_function": { + FunctionCalledInTest( + tests_in_file=TestsInFile( + test_file=test_file_z, + test_class=None, + test_function="z_test_function", + test_type=TestType.EXISTING_UNIT_TEST, + ), + position=CodePosition(line_no=10, col_no=4), + ), + FunctionCalledInTest( + tests_in_file=TestsInFile( + test_file=test_file_a, + test_class=None, + test_function="a_test_function", + test_type=TestType.EXISTING_UNIT_TEST, + ), + position=CodePosition(line_no=20, col_no=8), + ), + FunctionCalledInTest( + tests_in_file=TestsInFile( + test_file=test_file_a, + test_class=None, + test_function="b_test_function", + test_type=TestType.EXISTING_UNIT_TEST, + ), + position=CodePosition(line_no=30, col_no=8), + ), + } + } + + original_results = TestResults() + optimized_results = TestResults() + + # Add test results for all functions + for test_func in ["a_test_function", "b_test_function"]: + original_results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.a_test_module", + test_class_name=None, + test_function_name=test_func, + function_getting_tested="my_function", + iteration_id="1", + ), + file_name=Path("/tmp/tests/a_test_module.py"), + did_pass=True, + runtime=1000, + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + optimized_results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.a_test_module", + test_class_name=None, + test_function_name=test_func, + function_getting_tested="my_function", + iteration_id="1", + ), + file_name=Path("/tmp/tests/a_test_module.py"), + did_pass=True, + runtime=800, + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + original_results.add( + FunctionTestInvocation( + id=InvocationId( + test_module_path="tests.z_test_module", + test_class_name=None, + test_function_name="z_test_function", + function_getting_tested="my_function", + iteration_id="1", + ), + file_name=Path("/tmp/tests/z_test_module.py"), + did_pass=True, + runtime=1000, + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + optimized_results.add( + FunctionTestInvocation( + id=InvocationId( + 
test_module_path="tests.z_test_module", + test_class_name=None, + test_function_name="z_test_function", + function_getting_tested="my_function", + iteration_id="1", + ), + file_name=Path("/tmp/tests/z_test_module.py"), + did_pass=True, + runtime=800, + test_framework="pytest", + test_type=TestType.EXISTING_UNIT_TEST, + return_value=None, + timed_out=False, + loop_index=1, + ) + ) + + with patch("codeflash.code_utils.time_utils.format_time") as mock_format_time: + mock_format_time.side_effect = lambda x: f"{x} ns" + + result = existing_tests_source_for( + "my_module.my_function", function_to_tests, sample_tests_root, original_results, optimized_results + ) + + lines = result.split("\n") + + # Files should be sorted alphabetically + a_file_index = next(i for i, line in enumerate(lines) if "a_test_module.py" in line) + z_file_index = next(i for i, line in enumerate(lines) if "z_test_module.py" in line) + assert a_file_index < z_file_index + + # Test functions within a file should be sorted alphabetically + a_func_index = next(i for i, line in enumerate(lines) if "a_test_function" in line) + b_func_index = next(i for i, line in enumerate(lines) if "b_test_function" in line) + assert a_func_index < b_func_index + + + +def test_existing_tests_source_for_format_time_called_correctly( + sample_function_to_tests: dict[str, set[FunctionCalledInTest]], + sample_tests_root: Path, + sample_original_test_results: TestResults, + sample_optimized_test_results: TestResults, +): + """Test that format_time is called with correct values (min of runtime lists).""" + with patch("codeflash.code_utils.time_utils.format_time") as mock_format_time: + mock_format_time.side_effect = lambda x: f"{x} ns" + + existing_tests_source_for( + "my_module.my_function", + sample_function_to_tests, + sample_tests_root, + sample_original_test_results, + sample_optimized_test_results, + ) + + # Check that format_time was called with the minimum values + call_args = [call[0][0] for call in mock_format_time.call_args_list] + + # Should include minimum values (not aggregated) + assert 500 in call_args # test_basic_functionality original: min(1000, 500) + assert 400 in call_args # test_basic_functionality optimized: min(800, 400) + assert 2000 in call_args # test_edge_cases original + assert 1500 in call_args # test_edge_cases optimized + assert 3000 in call_args # test_performance original + assert 2100 in call_args # test_performance optimized \ No newline at end of file From 3f1cbefcbc8f8abd8472c036063f05fed3ce9419 Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Thu, 12 Jun 2025 16:41:36 -0700 Subject: [PATCH 02/16] wip need to do in a single loop --- codeflash/models/models.py | 1 + codeflash/result/create_pr.py | 112 +++++++++++++++++----------------- 2 files changed, 56 insertions(+), 57 deletions(-) diff --git a/codeflash/models/models.py b/codeflash/models/models.py index 02db2d0b6..bd4556965 100644 --- a/codeflash/models/models.py +++ b/codeflash/models/models.py @@ -557,6 +557,7 @@ def report_to_tree(report: dict[TestType, dict[str, int]], title: str) -> Tree: def usable_runtime_data_by_test_case(self) -> dict[InvocationId, list[int]]: # Efficient single traversal, directly accumulating into a dict. 
+ # can track mins here and only sums can be return in total_passed_runtime by_id: dict[InvocationId, list[int]] = {} for result in self.test_results: if result.did_pass: diff --git a/codeflash/result/create_pr.py b/codeflash/result/create_pr.py index a58fc63ea..18207837d 100644 --- a/codeflash/result/create_pr.py +++ b/codeflash/result/create_pr.py @@ -31,64 +31,62 @@ def existing_tests_source_for( optimized_test_results: Optional[TestResults] = None, ) -> str: test_files = function_to_tests.get(function_qualified_name_with_modules_from_root) + if not test_files: + return "" existing_tests_unique = set() - - if test_files: - # Group test cases by test file - test_files_grouped = {} - for test_file in test_files: - file_path = Path(test_file.tests_in_file.test_file) - relative_path = str(file_path.relative_to(tests_root)) - - if relative_path not in test_files_grouped: - test_files_grouped[relative_path] = [] - test_files_grouped[relative_path].append(test_file) - - # Create detailed report for each test file - for relative_path, tests_in_file in sorted(test_files_grouped.items()): - file_line = f"- {relative_path}" - - # Add test case details with timing information if available - if original_test_results and optimized_test_results: - test_case_details = [] - - # Use the same pattern as add_runtime_comments_to_generated_tests - original_runtime_by_test = original_test_results.usable_runtime_data_by_test_case() - optimized_runtime_by_test = optimized_test_results.usable_runtime_data_by_test_case() - - # Collect test function names for this file - test_functions_in_file = {test_file.tests_in_file.test_function for test_file in tests_in_file} - - # Create timing report for each test function - for test_function_name in sorted(test_functions_in_file): - # Find matching runtime data - original_runtimes = [] - optimized_runtimes = [] - - for invocation_id, runtimes in original_runtime_by_test.items(): - if invocation_id.test_function_name == test_function_name: - original_runtimes.extend(runtimes) - - for invocation_id, runtimes in optimized_runtime_by_test.items(): - if invocation_id.test_function_name == test_function_name: - optimized_runtimes.extend(runtimes) - - if original_runtimes and optimized_runtimes: - # Use minimum timing like the generated tests function does - original_time = min(original_runtimes) - optimized_time = min(optimized_runtimes) - - from codeflash.code_utils.time_utils import format_time - - original_str = format_time(original_time) - optimized_str = format_time(optimized_time) - - test_case_details.append(f" - {test_function_name}: {original_str} -> {optimized_str}") - - if test_case_details: - file_line += "\n" + "\n".join(test_case_details) - - existing_tests_unique.add(file_line) + # a lot of loops, need to do in a single loop + #original_runtime_by_test = original_test_results.usable_runtime_data_by_test_case() + #optimized_runtime_by_test = optimized_test_results.usable_runtime_data_by_test_case() + # Group test cases by test file + test_files_grouped = {} + for test_file in test_files: + file_path = Path(test_file.tests_in_file.test_file) + relative_path = str(file_path.relative_to(tests_root)) + + if relative_path not in test_files_grouped: + test_files_grouped[relative_path] = [] + test_files_grouped.setdefault(relative_path,[]).append(test_file) + + # Create detailed report for each test file + # for relative_path, tests_in_file in sorted(test_files_grouped.items()): + file_line = f"- {relative_path}" + + # Add test case details with timing information if 
available + #if original_test_results and optimized_test_results: + test_case_details = [] + # Collect test function names for this file + test_functions_in_file = {test_file.tests_in_file.test_function for test_file in tests_in_file} + + # Create timing report for each test function + for test_function_name in sorted(test_functions_in_file): + # Find matching runtime data + original_runtimes = [] + optimized_runtimes = [] + + for invocation_id, runtimes in original_runtime_by_test.items(): + if invocation_id.test_function_name == test_function_name: + original_runtimes.extend(runtimes) + + for invocation_id, runtimes in optimized_runtime_by_test.items(): + if invocation_id.test_function_name == test_function_name: + optimized_runtimes.extend(runtimes) + + if original_runtimes and optimized_runtimes: + # Use minimum timing like the generated tests function does + original_time = min(original_runtimes) + optimized_time = min(optimized_runtimes) + + from codeflash.code_utils.time_utils import format_time + + original_str = format_time(original_time) + optimized_str = format_time(optimized_time) + + test_case_details.append(f" - {test_function_name}: {original_str} -> {optimized_str}") + + if test_case_details: + file_line += "\n" + "\n".join(test_case_details) + + existing_tests_unique.add(file_line) return "\n".join(sorted(existing_tests_unique)) From a09d11cca459d5a4d8ae2300740192b553777a31 Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Mon, 16 Jun 2025 18:57:22 -0700 Subject: [PATCH 03/16] works --- codeflash/code_utils/edit_generated_tests.py | 11 +- codeflash/optimization/function_optimizer.py | 16 ++- codeflash/result/create_pr.py | 118 +++++++++---------- 3 files changed, 72 insertions(+), 73 deletions(-) diff --git a/codeflash/code_utils/edit_generated_tests.py b/codeflash/code_utils/edit_generated_tests.py index 4e6e31072..65b518ac3 100644 --- a/codeflash/code_utils/edit_generated_tests.py +++ b/codeflash/code_utils/edit_generated_tests.py @@ -4,7 +4,7 @@ from codeflash.cli_cmds.console import logger from codeflash.code_utils.time_utils import format_time -from codeflash.models.models import GeneratedTests, GeneratedTestsList, TestResults +from codeflash.models.models import GeneratedTests, GeneratedTestsList def remove_functions_from_generated_tests( @@ -33,12 +33,9 @@ def remove_functions_from_generated_tests( def add_runtime_comments_to_generated_tests( - generated_tests: GeneratedTestsList, original_test_results: TestResults, optimized_test_results: TestResults + generated_tests: GeneratedTestsList, original_runtimes: dict, optimized_runtimes: dict ) -> GeneratedTestsList: """Add runtime performance comments to function calls in generated tests.""" - # Create dictionaries for fast lookup of runtime data - original_runtime_by_test = original_test_results.usable_runtime_data_by_test_case() - optimized_runtime_by_test = optimized_test_results.usable_runtime_data_by_test_case() class RuntimeCommentTransformer(cst.CSTTransformer): def __init__(self) -> None: @@ -84,11 +81,11 @@ def leave_SimpleStatementLine( matching_original_times = [] matching_optimized_times = [] - for invocation_id, runtimes in original_runtime_by_test.items(): + for invocation_id, runtimes in original_runtimes.items(): if invocation_id.test_function_name == self.current_test_name: matching_original_times.extend(runtimes) - for invocation_id, runtimes in optimized_runtime_by_test.items(): + for invocation_id, runtimes in optimized_runtimes.items(): if invocation_id.test_function_name == self.current_test_name: 
matching_optimized_times.extend(runtimes) diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 26d8ede73..4d5dc6f5b 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -354,11 +354,15 @@ def optimize_function(self) -> Result[BestOptimization, str]: # noqa: PLR0911 generated_tests = remove_functions_from_generated_tests( generated_tests=generated_tests, test_functions_to_remove=test_functions_to_remove ) + original_runtime_by_test = ( + original_code_baseline.benchmarking_test_results.usable_runtime_data_by_test_case() + ) + optimized_runtime_by_test = ( + best_optimization.winning_benchmarking_test_results.usable_runtime_data_by_test_case() + ) # Add runtime comments to generated tests before creating the PR generated_tests = add_runtime_comments_to_generated_tests( - generated_tests, - original_code_baseline.benchmarking_test_results, - best_optimization.winning_benchmarking_test_results, + generated_tests, original_runtime_by_test, optimized_runtime_by_test ) generated_tests_str = "\n\n".join( [test.generated_original_test_source for test in generated_tests.generated_tests] @@ -366,9 +370,9 @@ def optimize_function(self) -> Result[BestOptimization, str]: # noqa: PLR0911 existing_tests = existing_tests_source_for( self.function_to_optimize.qualified_name_with_modules_from_root(self.project_root), function_to_all_tests, - tests_root=self.test_cfg.tests_root, - original_test_results=original_code_baseline.benchmarking_test_results, - optimized_test_results=best_optimization.winning_benchmarking_test_results, + test_cfg=self.test_cfg, + original_runtimes_all=original_runtime_by_test, + optimized_runtimes_all=optimized_runtime_by_test, ) if concolic_test_str: generated_tests_str += "\n\n" + concolic_test_str diff --git a/codeflash/result/create_pr.py b/codeflash/result/create_pr.py index 18207837d..4830c0080 100644 --- a/codeflash/result/create_pr.py +++ b/codeflash/result/create_pr.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os from pathlib import Path from typing import TYPE_CHECKING, Optional @@ -16,79 +17,76 @@ git_root_dir, ) from codeflash.code_utils.github_utils import github_pr_url +from codeflash.code_utils.time_utils import format_time from codeflash.github.PrComment import FileDiffContent, PrComment if TYPE_CHECKING: - from codeflash.models.models import FunctionCalledInTest, TestResults + from codeflash.models.models import FunctionCalledInTest from codeflash.result.explanation import Explanation + from codeflash.verification.verification_utils import TestConfig def existing_tests_source_for( function_qualified_name_with_modules_from_root: str, function_to_tests: dict[str, set[FunctionCalledInTest]], - tests_root: Path, - original_test_results: Optional[TestResults] = None, - optimized_test_results: Optional[TestResults] = None, + test_cfg: TestConfig, + original_runtimes_all: dict, + optimized_runtimes_all: dict, ) -> str: test_files = function_to_tests.get(function_qualified_name_with_modules_from_root) if not test_files: return "" - existing_tests_unique = set() - # a lot of loops, need to do in a single loop - #original_runtime_by_test = original_test_results.usable_runtime_data_by_test_case() - #optimized_runtime_by_test = optimized_test_results.usable_runtime_data_by_test_case() - # Group test cases by test file - test_files_grouped = {} - for test_file in test_files: - file_path = Path(test_file.tests_in_file.test_file) - relative_path = 
str(file_path.relative_to(tests_root)) - - if relative_path not in test_files_grouped: - test_files_grouped[relative_path] = [] - test_files_grouped.setdefault(relative_path,[]).append(test_file) - - # Create detailed report for each test file - # for relative_path, tests_in_file in sorted(test_files_grouped.items()): - file_line = f"- {relative_path}" - - # Add test case details with timing information if available - #if original_test_results and optimized_test_results: - test_case_details = [] - # Collect test function names for this file - test_functions_in_file = {test_file.tests_in_file.test_function for test_file in tests_in_file} - - # Create timing report for each test function - for test_function_name in sorted(test_functions_in_file): - # Find matching runtime data - original_runtimes = [] - optimized_runtimes = [] - - for invocation_id, runtimes in original_runtime_by_test.items(): - if invocation_id.test_function_name == test_function_name: - original_runtimes.extend(runtimes) - - for invocation_id, runtimes in optimized_runtime_by_test.items(): - if invocation_id.test_function_name == test_function_name: - optimized_runtimes.extend(runtimes) - - if original_runtimes and optimized_runtimes: - # Use minimum timing like the generated tests function does - original_time = min(original_runtimes) - optimized_time = min(optimized_runtimes) - - from codeflash.code_utils.time_utils import format_time - - original_str = format_time(original_time) - optimized_str = format_time(optimized_time) - - test_case_details.append(f" - {test_function_name}: {original_str} -> {optimized_str}") - - if test_case_details: - file_line += "\n" + "\n".join(test_case_details) - - existing_tests_unique.add(file_line) - - return "\n".join(sorted(existing_tests_unique)) + output = "" + tests_root = test_cfg.tests_root + module_root = test_cfg.project_root_path + rel_tests_root = tests_root.relative_to(module_root) + original_tests_to_runtimes = {} + optimized_tests_to_runtimes = {} + # TODO confirm that original and optimized have the same keys + all_invocation_ids = original_runtimes_all.keys() | optimized_runtimes_all.keys() + for invocation_id in all_invocation_ids: + rel_path = ( + Path(invocation_id.test_module_path.replace(".", os.sep)).with_suffix(".py").relative_to(rel_tests_root) + ) + if rel_path not in original_tests_to_runtimes: + original_tests_to_runtimes[rel_path] = {} + if rel_path not in optimized_tests_to_runtimes: + optimized_tests_to_runtimes[rel_path] = {} + qualified_name = ( + invocation_id.test_class_name + "." 
+ invocation_id.test_function_name + if invocation_id.test_class_name + else invocation_id.test_function_name + ) + if qualified_name not in original_tests_to_runtimes[rel_path]: + original_tests_to_runtimes[rel_path][qualified_name] = 0 + if qualified_name not in optimized_tests_to_runtimes[rel_path]: + optimized_tests_to_runtimes[rel_path][qualified_name] = 0 + if invocation_id in original_runtimes_all: + original_tests_to_runtimes[rel_path][qualified_name] += min(original_runtimes_all[invocation_id]) + if invocation_id in optimized_runtimes_all: + optimized_tests_to_runtimes[rel_path][qualified_name] += min(optimized_runtimes_all[invocation_id]) + # parse into string + all_rel_paths = ( + original_tests_to_runtimes.keys() + ) # both will have the same keys as some default values are assigned in the previous loop + for filename in sorted(all_rel_paths): + output += f"- {filename}\n" + all_qualified_names = original_tests_to_runtimes[ + filename + ].keys() # both will have the same keys as some default values are assigned in the previous loop + for qualified_name in sorted(all_qualified_names): + # if not present in optimized output nan + if optimized_tests_to_runtimes[filename][qualified_name] == 0: + print_optimized_runtime = "NaN" + else: + print_optimized_runtime = format_time(optimized_tests_to_runtimes[filename][qualified_name]) + if original_tests_to_runtimes[filename][qualified_name] == 0: + print_original_runtime = "NaN" + else: + print_original_runtime = format_time(original_tests_to_runtimes[filename][qualified_name]) + output += f" - {qualified_name}: {print_original_runtime} -> {print_optimized_runtime}\n" + output += "\n" + return output def check_create_pr( From 30259c0ad893111be5031d9719915d8f4fe9d562 Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Mon, 16 Jun 2025 20:44:49 -0700 Subject: [PATCH 04/16] improve runtimecomments --- codeflash/code_utils/edit_generated_tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/codeflash/code_utils/edit_generated_tests.py b/codeflash/code_utils/edit_generated_tests.py index 65b518ac3..2d713c1a3 100644 --- a/codeflash/code_utils/edit_generated_tests.py +++ b/codeflash/code_utils/edit_generated_tests.py @@ -37,6 +37,7 @@ def add_runtime_comments_to_generated_tests( ) -> GeneratedTestsList: """Add runtime performance comments to function calls in generated tests.""" + # TODO: reduce for loops to one class RuntimeCommentTransformer(cst.CSTTransformer): def __init__(self) -> None: self.in_test_function = False @@ -80,7 +81,7 @@ def leave_SimpleStatementLine( # Find matching test cases by looking for this test function name in the test results matching_original_times = [] matching_optimized_times = [] - + # TODO : will not work if there are multiple test cases with the same name, match filename + test class + test function name for invocation_id, runtimes in original_runtimes.items(): if invocation_id.test_function_name == self.current_test_name: matching_original_times.extend(runtimes) From 0d566bf4cb2689b9de6319d7e6ab20c9167762e9 Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Mon, 16 Jun 2025 22:32:25 -0700 Subject: [PATCH 05/16] text highlight doesnt work --- codeflash/result/create_pr.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/codeflash/result/create_pr.py b/codeflash/result/create_pr.py index 4830c0080..b0ee64b87 100644 --- a/codeflash/result/create_pr.py +++ b/codeflash/result/create_pr.py @@ -42,12 +42,17 @@ def existing_tests_source_for( rel_tests_root = 
tests_root.relative_to(module_root) original_tests_to_runtimes = {} optimized_tests_to_runtimes = {} + non_generated_tests = set() + for test_file in test_files: + non_generated_tests.add(Path(test_file.tests_in_file.test_file).relative_to(tests_root)) # TODO confirm that original and optimized have the same keys all_invocation_ids = original_runtimes_all.keys() | optimized_runtimes_all.keys() for invocation_id in all_invocation_ids: rel_path = ( Path(invocation_id.test_module_path.replace(".", os.sep)).with_suffix(".py").relative_to(rel_tests_root) ) + if rel_path not in non_generated_tests: + continue if rel_path not in original_tests_to_runtimes: original_tests_to_runtimes[rel_path] = {} if rel_path not in optimized_tests_to_runtimes: @@ -84,7 +89,23 @@ def existing_tests_source_for( print_original_runtime = "NaN" else: print_original_runtime = format_time(original_tests_to_runtimes[filename][qualified_name]) - output += f" - {qualified_name}: {print_original_runtime} -> {print_optimized_runtime}\n" + arrow = "\\rightarrow" + if ( + original_tests_to_runtimes[filename][qualified_name] != 0 + and optimized_tests_to_runtimes[filename][qualified_name] != 0 + ): + greater = ( + optimized_tests_to_runtimes[filename][qualified_name] + > original_tests_to_runtimes[filename][qualified_name] + ) + if greater: + output += f" - $$\\color{{red}}{qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime}$$\n" + else: + output += f" - $$\\color{{green}}{qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime}$$\n" + else: + # one of them is NaN + output += f" - $$\\color{{blue}}{qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime}$$\n" + # output += f"$$\\colorbox{{pink}}\{{ - {qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime}}}$$\n" output += "\n" return output From 30410ff60512aeaedcd47de6f7b7dbd41f09320e Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Tue, 17 Jun 2025 14:42:27 -0700 Subject: [PATCH 06/16] works --- codeflash/code_utils/edit_generated_tests.py | 76 ++++++++++++++------ codeflash/optimization/function_optimizer.py | 2 +- 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/codeflash/code_utils/edit_generated_tests.py b/codeflash/code_utils/edit_generated_tests.py index 2d713c1a3..0a149fb97 100644 --- a/codeflash/code_utils/edit_generated_tests.py +++ b/codeflash/code_utils/edit_generated_tests.py @@ -1,10 +1,13 @@ +import os import re +from pathlib import Path import libcst as cst from codeflash.cli_cmds.console import logger from codeflash.code_utils.time_utils import format_time from codeflash.models.models import GeneratedTests, GeneratedTestsList +from codeflash.verification.verification_utils import TestConfig def remove_functions_from_generated_tests( @@ -33,28 +36,36 @@ def remove_functions_from_generated_tests( def add_runtime_comments_to_generated_tests( - generated_tests: GeneratedTestsList, original_runtimes: dict, optimized_runtimes: dict + test_cfg: TestConfig, generated_tests: GeneratedTestsList, original_runtimes: dict, optimized_runtimes: dict ) -> GeneratedTestsList: """Add runtime performance comments to function calls in generated tests.""" + tests_root = test_cfg.tests_root + module_root = test_cfg.project_root_path + rel_tests_root = tests_root.relative_to(module_root) # TODO: reduce for loops to one class RuntimeCommentTransformer(cst.CSTTransformer): - def __init__(self) -> None: - self.in_test_function = False - self.current_test_name: str | None = None + def 
__init__(self, test: GeneratedTests, tests_root: Path, rel_tests_root: Path) -> None: + self.test = test + self.context_stack = [] + self.tests_root = tests_root + self.rel_tests_root = rel_tests_root + + def visit_ClassDef(self, node: cst.ClassDef) -> None: + # Track when we enter a class + self.context_stack.append(node.name.value) + + def leave_ClassDef(self, original_node: cst.ClassDef, updated_node: cst.ClassDef) -> cst.ClassDef: # noqa: ARG002 + # Pop the context when we leave a class + self.context_stack.pop() + return updated_node def visit_FunctionDef(self, node: cst.FunctionDef) -> None: - if node.name.value.startswith("test_"): - self.in_test_function = True - self.current_test_name = node.name.value - else: - self.in_test_function = False - self.current_test_name = None - - def leave_FunctionDef(self, original_node: cst.FunctionDef, updated_node: cst.FunctionDef) -> cst.FunctionDef: - if original_node.name.value.startswith("test_"): - self.in_test_function = False - self.current_test_name = None + self.context_stack.append(node.name.value) + + def leave_FunctionDef(self, original_node: cst.FunctionDef, updated_node: cst.FunctionDef) -> cst.FunctionDef: # noqa: ARG002 + # Pop the context when we leave a function + self.context_stack.pop() return updated_node def leave_SimpleStatementLine( @@ -62,9 +73,6 @@ def leave_SimpleStatementLine( original_node: cst.SimpleStatementLine, # noqa: ARG002 updated_node: cst.SimpleStatementLine, ) -> cst.SimpleStatementLine: - if not self.in_test_function or not self.current_test_name: - return updated_node - # Look for assignment statements that assign to codeflash_output # Handle both single statements and multiple statements on one line codeflash_assignment_found = False @@ -83,11 +91,37 @@ def leave_SimpleStatementLine( matching_optimized_times = [] # TODO : will not work if there are multiple test cases with the same name, match filename + test class + test function name for invocation_id, runtimes in original_runtimes.items(): - if invocation_id.test_function_name == self.current_test_name: + qualified_name = ( + invocation_id.test_class_name + "." + invocation_id.test_function_name + if invocation_id.test_class_name + else invocation_id.test_function_name + ) + rel_path = ( + Path(invocation_id.test_module_path.replace(".", os.sep)) + .with_suffix(".py") + .relative_to(self.rel_tests_root) + ) + if qualified_name == ".".join(self.context_stack) and rel_path in [ + self.test.behavior_file_path.relative_to(self.tests_root), + self.test.perf_file_path.relative_to(self.tests_root), + ]: matching_original_times.extend(runtimes) for invocation_id, runtimes in optimized_runtimes.items(): - if invocation_id.test_function_name == self.current_test_name: + qualified_name = ( + invocation_id.test_class_name + "." 
+ invocation_id.test_function_name + if invocation_id.test_class_name + else invocation_id.test_function_name + ) + rel_path = ( + Path(invocation_id.test_module_path.replace(".", os.sep)) + .with_suffix(".py") + .relative_to(self.rel_tests_root) + ) + if qualified_name == ".".join(self.context_stack) and rel_path in [ + self.test.behavior_file_path.relative_to(self.tests_root), + self.test.perf_file_path.relative_to(self.tests_root), + ]: matching_optimized_times.extend(runtimes) if matching_original_times and matching_optimized_times: @@ -116,7 +150,7 @@ def leave_SimpleStatementLine( tree = cst.parse_module(test.generated_original_test_source) # Transform the tree to add runtime comments - transformer = RuntimeCommentTransformer() + transformer = RuntimeCommentTransformer(test, tests_root, rel_tests_root) modified_tree = tree.visit(transformer) # Convert back to source code diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 4d5dc6f5b..c94759369 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -362,7 +362,7 @@ def optimize_function(self) -> Result[BestOptimization, str]: # noqa: PLR0911 ) # Add runtime comments to generated tests before creating the PR generated_tests = add_runtime_comments_to_generated_tests( - generated_tests, original_runtime_by_test, optimized_runtime_by_test + self.test_cfg, generated_tests, original_runtime_by_test, optimized_runtime_by_test ) generated_tests_str = "\n\n".join( [test.generated_original_test_source for test in generated_tests.generated_tests] From 4a68aa0854b1856f94d8e1e6473348a86d2c1fc9 Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Tue, 17 Jun 2025 15:35:40 -0700 Subject: [PATCH 07/16] minor fixes --- codeflash/code_utils/edit_generated_tests.py | 5 +++-- codeflash/result/create_pr.py | 13 +++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/codeflash/code_utils/edit_generated_tests.py b/codeflash/code_utils/edit_generated_tests.py index 0a149fb97..03e9ad2e1 100644 --- a/codeflash/code_utils/edit_generated_tests.py +++ b/codeflash/code_utils/edit_generated_tests.py @@ -7,6 +7,7 @@ from codeflash.cli_cmds.console import logger from codeflash.code_utils.time_utils import format_time from codeflash.models.models import GeneratedTests, GeneratedTestsList +from codeflash.result.critic import performance_gain from codeflash.verification.verification_utils import TestConfig @@ -127,9 +128,9 @@ def leave_SimpleStatementLine( if matching_original_times and matching_optimized_times: original_time = min(matching_original_times) optimized_time = min(matching_optimized_times) - + perf_gain = performance_gain(original_runtime_ns=original_time, optimized_runtime_ns=optimized_time) # Create the runtime comment - comment_text = f"# {format_time(original_time)} -> {format_time(optimized_time)}" + comment_text = f"# {format_time(original_time)} -> {format_time(optimized_time)} ({perf_gain:.2f}%)" # Add comment to the trailing whitespace new_trailing_whitespace = cst.TrailingWhitespace( diff --git a/codeflash/result/create_pr.py b/codeflash/result/create_pr.py index b0ee64b87..81afd303a 100644 --- a/codeflash/result/create_pr.py +++ b/codeflash/result/create_pr.py @@ -19,6 +19,7 @@ from codeflash.code_utils.github_utils import github_pr_url from codeflash.code_utils.time_utils import format_time from codeflash.github.PrComment import FileDiffContent, PrComment +from codeflash.result.critic import performance_gain if 
TYPE_CHECKING: from codeflash.models.models import FunctionCalledInTest @@ -89,7 +90,7 @@ def existing_tests_source_for( print_original_runtime = "NaN" else: print_original_runtime = format_time(original_tests_to_runtimes[filename][qualified_name]) - arrow = "\\rightarrow" + arrow = "->" if ( original_tests_to_runtimes[filename][qualified_name] != 0 and optimized_tests_to_runtimes[filename][qualified_name] != 0 @@ -98,13 +99,17 @@ def existing_tests_source_for( optimized_tests_to_runtimes[filename][qualified_name] > original_tests_to_runtimes[filename][qualified_name] ) + perf_gain = performance_gain( + original_runtime_ns=original_tests_to_runtimes[filename][qualified_name], + optimized_runtime_ns=optimized_tests_to_runtimes[filename][qualified_name], + ) if greater: - output += f" - $$\\color{{red}}{qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime}$$\n" + output += f" - {qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime} $$\\color{{red}}({perf_gain:.2f}\\\\%)$$\n" else: - output += f" - $$\\color{{green}}{qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime}$$\n" + output += f" - {qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime} $$\\color{{green}}({perf_gain:.2f}\\\\%)$$\n" else: # one of them is NaN - output += f" - $$\\color{{blue}}{qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime}$$\n" + output += f" - {qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime}\n" # output += f"$$\\colorbox{{pink}}\{{ - {qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime}}}$$\n" output += "\n" return output From e202289686e033e5a819d5c32a66528ac52c282f Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Tue, 17 Jun 2025 16:01:25 -0700 Subject: [PATCH 08/16] minor fixes --- codeflash/code_utils/edit_generated_tests.py | 13 ++++++++----- codeflash/result/create_pr.py | 20 ++++++++++---------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/codeflash/code_utils/edit_generated_tests.py b/codeflash/code_utils/edit_generated_tests.py index 03e9ad2e1..ee50af0bd 100644 --- a/codeflash/code_utils/edit_generated_tests.py +++ b/codeflash/code_utils/edit_generated_tests.py @@ -6,7 +6,7 @@ from codeflash.cli_cmds.console import logger from codeflash.code_utils.time_utils import format_time -from codeflash.models.models import GeneratedTests, GeneratedTestsList +from codeflash.models.models import GeneratedTests, GeneratedTestsList, InvocationId from codeflash.result.critic import performance_gain from codeflash.verification.verification_utils import TestConfig @@ -37,7 +37,10 @@ def remove_functions_from_generated_tests( def add_runtime_comments_to_generated_tests( - test_cfg: TestConfig, generated_tests: GeneratedTestsList, original_runtimes: dict, optimized_runtimes: dict + test_cfg: TestConfig, + generated_tests: GeneratedTestsList, + original_runtimes: dict[InvocationId, list[int]], + optimized_runtimes: dict[InvocationId, list[int]], ) -> GeneratedTestsList: """Add runtime performance comments to function calls in generated tests.""" tests_root = test_cfg.tests_root @@ -48,7 +51,7 @@ def add_runtime_comments_to_generated_tests( class RuntimeCommentTransformer(cst.CSTTransformer): def __init__(self, test: GeneratedTests, tests_root: Path, rel_tests_root: Path) -> None: self.test = test - self.context_stack = [] + self.context_stack: list[str] = [] self.tests_root = tests_root self.rel_tests_root = rel_tests_root @@ -93,7 +96,7 @@ def 
leave_SimpleStatementLine( # TODO : will not work if there are multiple test cases with the same name, match filename + test class + test function name for invocation_id, runtimes in original_runtimes.items(): qualified_name = ( - invocation_id.test_class_name + "." + invocation_id.test_function_name + invocation_id.test_class_name + "." + invocation_id.test_function_name # type: ignore[operator] if invocation_id.test_class_name else invocation_id.test_function_name ) @@ -110,7 +113,7 @@ def leave_SimpleStatementLine( for invocation_id, runtimes in optimized_runtimes.items(): qualified_name = ( - invocation_id.test_class_name + "." + invocation_id.test_function_name + invocation_id.test_class_name + "." + invocation_id.test_function_name # type: ignore[operator] if invocation_id.test_class_name else invocation_id.test_function_name ) diff --git a/codeflash/result/create_pr.py b/codeflash/result/create_pr.py index 81afd303a..8b184e5cb 100644 --- a/codeflash/result/create_pr.py +++ b/codeflash/result/create_pr.py @@ -22,7 +22,7 @@ from codeflash.result.critic import performance_gain if TYPE_CHECKING: - from codeflash.models.models import FunctionCalledInTest + from codeflash.models.models import FunctionCalledInTest, InvocationId from codeflash.result.explanation import Explanation from codeflash.verification.verification_utils import TestConfig @@ -31,8 +31,8 @@ def existing_tests_source_for( function_qualified_name_with_modules_from_root: str, function_to_tests: dict[str, set[FunctionCalledInTest]], test_cfg: TestConfig, - original_runtimes_all: dict, - optimized_runtimes_all: dict, + original_runtimes_all: dict[InvocationId, list[int]], + optimized_runtimes_all: dict[InvocationId, list[int]], ) -> str: test_files = function_to_tests.get(function_qualified_name_with_modules_from_root) if not test_files: @@ -41,8 +41,8 @@ def existing_tests_source_for( tests_root = test_cfg.tests_root module_root = test_cfg.project_root_path rel_tests_root = tests_root.relative_to(module_root) - original_tests_to_runtimes = {} - optimized_tests_to_runtimes = {} + original_tests_to_runtimes: dict[Path, dict[str, int]] = {} + optimized_tests_to_runtimes: dict[Path, dict[str, int]] = {} non_generated_tests = set() for test_file in test_files: non_generated_tests.add(Path(test_file.tests_in_file.test_file).relative_to(tests_root)) @@ -59,18 +59,18 @@ def existing_tests_source_for( if rel_path not in optimized_tests_to_runtimes: optimized_tests_to_runtimes[rel_path] = {} qualified_name = ( - invocation_id.test_class_name + "." + invocation_id.test_function_name + invocation_id.test_class_name + "." 
+ invocation_id.test_function_name # type: ignore[operator] if invocation_id.test_class_name else invocation_id.test_function_name ) if qualified_name not in original_tests_to_runtimes[rel_path]: - original_tests_to_runtimes[rel_path][qualified_name] = 0 + original_tests_to_runtimes[rel_path][qualified_name] = 0 # type: ignore[index] if qualified_name not in optimized_tests_to_runtimes[rel_path]: - optimized_tests_to_runtimes[rel_path][qualified_name] = 0 + optimized_tests_to_runtimes[rel_path][qualified_name] = 0 # type: ignore[index] if invocation_id in original_runtimes_all: - original_tests_to_runtimes[rel_path][qualified_name] += min(original_runtimes_all[invocation_id]) + original_tests_to_runtimes[rel_path][qualified_name] += min(original_runtimes_all[invocation_id]) # type: ignore[index] if invocation_id in optimized_runtimes_all: - optimized_tests_to_runtimes[rel_path][qualified_name] += min(optimized_runtimes_all[invocation_id]) + optimized_tests_to_runtimes[rel_path][qualified_name] += min(optimized_runtimes_all[invocation_id]) # type: ignore[index] # parse into string all_rel_paths = ( original_tests_to_runtimes.keys() From 7b4bdd053addca49e5491a2ada13b36dd322ea54 Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Tue, 17 Jun 2025 16:29:54 -0700 Subject: [PATCH 09/16] fix tests, correct perf calc --- codeflash/code_utils/edit_generated_tests.py | 4 +++- codeflash/result/create_pr.py | 9 ++++++--- tests/test_add_runtime_comments.py | 13 +++++++++++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/codeflash/code_utils/edit_generated_tests.py b/codeflash/code_utils/edit_generated_tests.py index ee50af0bd..94c18ab5c 100644 --- a/codeflash/code_utils/edit_generated_tests.py +++ b/codeflash/code_utils/edit_generated_tests.py @@ -131,7 +131,9 @@ def leave_SimpleStatementLine( if matching_original_times and matching_optimized_times: original_time = min(matching_original_times) optimized_time = min(matching_optimized_times) - perf_gain = performance_gain(original_runtime_ns=original_time, optimized_runtime_ns=optimized_time) + perf_gain = ( + performance_gain(original_runtime_ns=original_time, optimized_runtime_ns=optimized_time) * 100 + ) # Create the runtime comment comment_text = f"# {format_time(original_time)} -> {format_time(optimized_time)} ({perf_gain:.2f}%)" diff --git a/codeflash/result/create_pr.py b/codeflash/result/create_pr.py index 8b184e5cb..72ef6d244 100644 --- a/codeflash/result/create_pr.py +++ b/codeflash/result/create_pr.py @@ -99,9 +99,12 @@ def existing_tests_source_for( optimized_tests_to_runtimes[filename][qualified_name] > original_tests_to_runtimes[filename][qualified_name] ) - perf_gain = performance_gain( - original_runtime_ns=original_tests_to_runtimes[filename][qualified_name], - optimized_runtime_ns=optimized_tests_to_runtimes[filename][qualified_name], + perf_gain = ( + performance_gain( + original_runtime_ns=original_tests_to_runtimes[filename][qualified_name], + optimized_runtime_ns=optimized_tests_to_runtimes[filename][qualified_name], + ) + * 100 ) if greater: output += f" - {qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime} $$\\color{{red}}({perf_gain:.2f}\\\\%)$$\n" diff --git a/tests/test_add_runtime_comments.py b/tests/test_add_runtime_comments.py index 51c1ef052..6de1b461c 100644 --- a/tests/test_add_runtime_comments.py +++ b/tests/test_add_runtime_comments.py @@ -12,6 +12,7 @@ TestType, VerificationType, ) +from codeflash.verification.verification_utils import TestConfig class TestAddRuntimeComments: @@ 
-48,6 +49,18 @@ def test_basic_runtime_comment_addition(self): assert codeflash_output == [1, 2, 3] """ + """test_cfg: TestConfig, + generated_tests: GeneratedTestsList, + original_runtimes: dict[InvocationId, list[int]], + optimized_runtimes: dict[InvocationId, list[int]]""" + project_root_path = file_path.parent.resolve() + test_config = TestConfig( + tests_root="tests", + tests_project_rootdir=Path.cwd(), + project_root_path=project_root_path, + test_framework="pytest", + pytest_cmd="pytest", + ) generated_test = GeneratedTests( generated_original_test_source=test_source, instrumented_behavior_test_source="", From 2ad102972a1aa2d2dd8586bee0c71849e6ed5d36 Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Tue, 17 Jun 2025 17:00:25 -0700 Subject: [PATCH 10/16] tests wip --- tests/test_add_runtime_comments.py | 589 ++++++---------- tests/test_existing_tests_source_for.py | 901 +++++++++--------------- 2 files changed, 548 insertions(+), 942 deletions(-) diff --git a/tests/test_add_runtime_comments.py b/tests/test_add_runtime_comments.py index 6de1b461c..bbb833751 100644 --- a/tests/test_add_runtime_comments.py +++ b/tests/test_add_runtime_comments.py @@ -1,477 +1,338 @@ -"""Tests for the add_runtime_comments_to_generated_tests functionality.""" - +import os from pathlib import Path +from unittest.mock import Mock + +import pytest from codeflash.code_utils.edit_generated_tests import add_runtime_comments_to_generated_tests -from codeflash.models.models import ( - FunctionTestInvocation, - GeneratedTests, - GeneratedTestsList, - InvocationId, - TestResults, - TestType, - VerificationType, -) +from codeflash.models.models import GeneratedTests, GeneratedTestsList, InvocationId from codeflash.verification.verification_utils import TestConfig -class TestAddRuntimeComments: - """Test cases for add_runtime_comments_to_generated_tests method.""" - - def create_test_invocation( - self, test_function_name: str, runtime: int, loop_index: int = 1, iteration_id: str = "1", did_pass: bool = True - ) -> FunctionTestInvocation: - """Helper to create test invocation objects.""" - return FunctionTestInvocation( - loop_index=loop_index, - id=InvocationId( - test_module_path="test_module", - test_class_name=None, - test_function_name=test_function_name, - function_getting_tested="test_function", - iteration_id=iteration_id, - ), - file_name=Path("test.py"), - did_pass=did_pass, - runtime=runtime, - test_framework="pytest", - test_type=TestType.GENERATED_REGRESSION, - return_value=None, - timed_out=False, - verification_type=VerificationType.FUNCTION_CALL, - ) - - def test_basic_runtime_comment_addition(self): - """Test basic functionality of adding runtime comments.""" - # Create test source code - test_source = """def test_bubble_sort(): - codeflash_output = bubble_sort([3, 1, 2]) - assert codeflash_output == [1, 2, 3] -""" - - """test_cfg: TestConfig, - generated_tests: GeneratedTestsList, - original_runtimes: dict[InvocationId, list[int]], - optimized_runtimes: dict[InvocationId, list[int]]""" - project_root_path = file_path.parent.resolve() - test_config = TestConfig( - tests_root="tests", - tests_project_rootdir=Path.cwd(), - project_root_path=project_root_path, - test_framework="pytest", - pytest_cmd="pytest", - ) - generated_test = GeneratedTests( - generated_original_test_source=test_source, - instrumented_behavior_test_source="", - instrumented_perf_test_source="", - behavior_file_path=Path("test_behavior.py"), - perf_file_path=Path("test_perf.py"), - ) - - generated_tests = 
GeneratedTestsList(generated_tests=[generated_test]) - - # Create test results - original_test_results = TestResults() - optimized_test_results = TestResults() - - # Add test invocations with different runtimes - original_invocation = self.create_test_invocation("test_bubble_sort", 500_000) # 500μs - optimized_invocation = self.create_test_invocation("test_bubble_sort", 300_000) # 300μs +@pytest.fixture +def test_config(): + """Create a mock TestConfig for testing.""" + config = Mock(spec=TestConfig) + config.project_root_path = Path("/project") + config.tests_root = Path("/project/tests") + return config - original_test_results.add(original_invocation) - optimized_test_results.add(optimized_invocation) - # Test the functionality - result = add_runtime_comments_to_generated_tests(generated_tests, original_test_results, optimized_test_results) +@pytest.fixture +def sample_invocation_id(): + """Create a sample InvocationId for testing.""" + return InvocationId( + test_module_path="tests.test_module", + test_class_name="TestClass", + test_function_name="test_function", + ) - # Check that comments were added - modified_source = result.generated_tests[0].generated_original_test_source - assert "# 500μs -> 300μs" in modified_source - assert "codeflash_output = bubble_sort([3, 1, 2]) # 500μs -> 300μs" in modified_source - def test_multiple_test_functions(self): - """Test handling multiple test functions in the same file.""" - test_source = """def test_bubble_sort(): - codeflash_output = bubble_sort([3, 1, 2]) - assert codeflash_output == [1, 2, 3] +@pytest.fixture +def sample_invocation_id_no_class(): + """Create a sample InvocationId without class for testing.""" + return InvocationId( + test_module_path="tests.test_module", + test_class_name=None, + test_function_name="test_function", + ) -def test_quick_sort(): - codeflash_output = quick_sort([5, 2, 8]) - assert codeflash_output == [2, 5, 8] -def helper_function(): - return "not a test" -""" +class TestAddRuntimeCommentsToGeneratedTests: + def test_add_runtime_comments_simple_function(self, test_config): + """Test adding runtime comments to a simple test function.""" + test_source = '''def test_function(): + codeflash_output = some_function() + assert codeflash_output == expected +''' generated_test = GeneratedTests( generated_original_test_source=test_source, instrumented_behavior_test_source="", instrumented_perf_test_source="", - behavior_file_path=Path("test_behavior.py"), - perf_file_path=Path("test_perf.py"), + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py"), ) generated_tests = GeneratedTestsList(generated_tests=[generated_test]) - # Create test results for both functions - original_test_results = TestResults() - optimized_test_results = TestResults() - - # Add test invocations for both test functions - original_test_results.add(self.create_test_invocation("test_bubble_sort", 500_000)) - original_test_results.add(self.create_test_invocation("test_quick_sort", 800_000)) - - optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 300_000)) - optimized_test_results.add(self.create_test_invocation("test_quick_sort", 600_000)) - - # Test the functionality - result = add_runtime_comments_to_generated_tests(generated_tests, original_test_results, optimized_test_results) - - modified_source = result.generated_tests[0].generated_original_test_source - - # Check that comments were added to both test functions - assert "# 500μs -> 300μs" in modified_source - 
assert "# 800μs -> 600μs" in modified_source - # Helper function should not have comments - assert ( - "helper_function():" in modified_source - and "# " not in modified_source.split("helper_function():")[1].split("\n")[0] + invocation_id = InvocationId( + test_module_path="tests.test_module", + test_class_name=None, + test_function_name="test_function", ) - def test_different_time_formats(self): - """Test that different time ranges are formatted correctly with new precision rules.""" - test_cases = [ - (999, 500, "999ns -> 500ns"), # nanoseconds - (25_000, 18_000, "25.0μs -> 18.0μs"), # microseconds with precision - (500_000, 300_000, "500μs -> 300μs"), # microseconds full integers - (1_500_000, 800_000, "1.50ms -> 800μs"), # milliseconds with precision - (365_000_000, 290_000_000, "365ms -> 290ms"), # milliseconds full integers - (2_000_000_000, 1_500_000_000, "2.00s -> 1.50s"), # seconds with precision - ] - - for original_time, optimized_time, expected_comment in test_cases: - test_source = """def test_function(): - codeflash_output = some_function() - assert codeflash_output is not None -""" - - generated_test = GeneratedTests( - generated_original_test_source=test_source, - instrumented_behavior_test_source="", - instrumented_perf_test_source="", - behavior_file_path=Path("test_behavior.py"), - perf_file_path=Path("test_perf.py"), - ) - - generated_tests = GeneratedTestsList(generated_tests=[generated_test]) - - # Create test results - original_test_results = TestResults() - optimized_test_results = TestResults() + original_runtimes = {invocation_id: [1000000000, 1200000000]} # 1s, 1.2s in nanoseconds + optimized_runtimes = {invocation_id: [500000000, 600000000]} # 0.5s, 0.6s in nanoseconds - original_test_results.add(self.create_test_invocation("test_function", original_time)) - optimized_test_results.add(self.create_test_invocation("test_function", optimized_time)) + result = add_runtime_comments_to_generated_tests( + test_config, generated_tests, original_runtimes, optimized_runtimes + ) - # Test the functionality - result = add_runtime_comments_to_generated_tests( - generated_tests, original_test_results, optimized_test_results - ) + expected_source = '''def test_function(): + codeflash_output = some_function() # 1.00s -> 500.00ms (50.00%) + assert codeflash_output == expected +''' - modified_source = result.generated_tests[0].generated_original_test_source - assert f"# {expected_comment}" in modified_source + assert len(result.generated_tests) == 1 + assert result.generated_tests[0].generated_original_test_source == expected_source - def test_missing_test_results(self): - """Test behavior when test results are missing for a test function.""" - test_source = """def test_bubble_sort(): - codeflash_output = bubble_sort([3, 1, 2]) - assert codeflash_output == [1, 2, 3] -""" + def test_add_runtime_comments_class_method(self, test_config): + """Test adding runtime comments to a test method within a class.""" + test_source = '''class TestClass: + def test_function(self): + codeflash_output = some_function() + assert codeflash_output == expected +''' generated_test = GeneratedTests( generated_original_test_source=test_source, instrumented_behavior_test_source="", instrumented_perf_test_source="", - behavior_file_path=Path("test_behavior.py"), - perf_file_path=Path("test_perf.py"), + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py"), ) generated_tests = GeneratedTestsList(generated_tests=[generated_test]) - # Create 
empty test results - original_test_results = TestResults() - optimized_test_results = TestResults() - - # Test the functionality - result = add_runtime_comments_to_generated_tests(generated_tests, original_test_results, optimized_test_results) - - # Check that no comments were added - modified_source = result.generated_tests[0].generated_original_test_source - assert modified_source == test_source # Should be unchanged - - def test_partial_test_results(self): - """Test behavior when only one set of test results is available.""" - test_source = """def test_bubble_sort(): - codeflash_output = bubble_sort([3, 1, 2]) - assert codeflash_output == [1, 2, 3] -""" - - generated_test = GeneratedTests( - generated_original_test_source=test_source, - instrumented_behavior_test_source="", - instrumented_perf_test_source="", - behavior_file_path=Path("test_behavior.py"), - perf_file_path=Path("test_perf.py"), + invocation_id = InvocationId( + test_module_path="tests.test_module", + test_class_name="TestClass", + test_function_name="test_function", ) - generated_tests = GeneratedTestsList(generated_tests=[generated_test]) - - # Create test results with only original data - original_test_results = TestResults() - optimized_test_results = TestResults() + original_runtimes = {invocation_id: [2000000000]} # 2s in nanoseconds + optimized_runtimes = {invocation_id: [1000000000]} # 1s in nanoseconds - original_test_results.add(self.create_test_invocation("test_bubble_sort", 500_000)) - # No optimized results + result = add_runtime_comments_to_generated_tests( + test_config, generated_tests, original_runtimes, optimized_runtimes + ) - # Test the functionality - result = add_runtime_comments_to_generated_tests(generated_tests, original_test_results, optimized_test_results) + expected_source = '''class TestClass: + def test_function(self): + codeflash_output = some_function() # 2.00s -> 1.00s (50.00%) + assert codeflash_output == expected +''' - # Check that no comments were added - modified_source = result.generated_tests[0].generated_original_test_source - assert modified_source == test_source # Should be unchanged + assert len(result.generated_tests) == 1 + assert result.generated_tests[0].generated_original_test_source == expected_source - def test_multiple_runtimes_uses_minimum(self): - """Test that when multiple runtimes exist, the minimum is used.""" - test_source = """def test_bubble_sort(): - codeflash_output = bubble_sort([3, 1, 2]) - assert codeflash_output == [1, 2, 3] -""" + def test_add_runtime_comments_multiple_assignments(self, test_config): + """Test adding runtime comments when there are multiple codeflash_output assignments.""" + test_source = '''def test_function(): + setup_data = prepare_test() + codeflash_output = some_function() + assert codeflash_output == expected + codeflash_output = another_function() + assert codeflash_output == expected2 +''' generated_test = GeneratedTests( generated_original_test_source=test_source, instrumented_behavior_test_source="", instrumented_perf_test_source="", - behavior_file_path=Path("test_behavior.py"), - perf_file_path=Path("test_perf.py"), + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py"), ) generated_tests = GeneratedTestsList(generated_tests=[generated_test]) - # Create test results with multiple loop iterations - original_test_results = TestResults() - optimized_test_results = TestResults() + invocation_id = InvocationId( + test_module_path="tests.test_module", + 
test_class_name=None, + test_function_name="test_function", + ) - # Add multiple runs with different runtimes - original_test_results.add(self.create_test_invocation("test_bubble_sort", 600_000, loop_index=1)) - original_test_results.add(self.create_test_invocation("test_bubble_sort", 500_000, loop_index=2)) - original_test_results.add(self.create_test_invocation("test_bubble_sort", 550_000, loop_index=3)) + original_runtimes = {invocation_id: [1500000000]} # 1.5s in nanoseconds + optimized_runtimes = {invocation_id: [750000000]} # 0.75s in nanoseconds - optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 350_000, loop_index=1)) - optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 300_000, loop_index=2)) - optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 320_000, loop_index=3)) + result = add_runtime_comments_to_generated_tests( + test_config, generated_tests, original_runtimes, optimized_runtimes + ) - # Test the functionality - result = add_runtime_comments_to_generated_tests(generated_tests, original_test_results, optimized_test_results) + expected_source = '''def test_function(): + setup_data = prepare_test() + codeflash_output = some_function() # 1.50s -> 750.00ms (50.00%) + assert codeflash_output == expected + codeflash_output = another_function() # 1.50s -> 750.00ms (50.00%) + assert codeflash_output == expected2 +''' - # Check that minimum times were used (500μs -> 300μs) - modified_source = result.generated_tests[0].generated_original_test_source - assert "# 500μs -> 300μs" in modified_source + assert len(result.generated_tests) == 1 + assert result.generated_tests[0].generated_original_test_source == expected_source - def test_no_codeflash_output_assignment(self): - """Test behavior when test doesn't have codeflash_output assignment.""" - test_source = """def test_bubble_sort(): - result = bubble_sort([3, 1, 2]) - assert result == [1, 2, 3] -""" + def test_add_runtime_comments_no_matching_runtimes(self, test_config): + """Test that source remains unchanged when no matching runtimes are found.""" + test_source = '''def test_function(): + codeflash_output = some_function() + assert codeflash_output == expected +''' generated_test = GeneratedTests( generated_original_test_source=test_source, instrumented_behavior_test_source="", instrumented_perf_test_source="", - behavior_file_path=Path("test_behavior.py"), - perf_file_path=Path("test_perf.py"), + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py"), ) generated_tests = GeneratedTestsList(generated_tests=[generated_test]) - # Create test results - original_test_results = TestResults() - optimized_test_results = TestResults() + # Different invocation ID that won't match + invocation_id = InvocationId( + test_module_path="tests.other_module", + test_class_name=None, + test_function_name="other_function", + ) - original_test_results.add(self.create_test_invocation("test_bubble_sort", 500_000)) - optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 300_000)) + original_runtimes = {invocation_id: [1000000000]} + optimized_runtimes = {invocation_id: [500000000]} - # Test the functionality - result = add_runtime_comments_to_generated_tests(generated_tests, original_test_results, optimized_test_results) + result = add_runtime_comments_to_generated_tests( + test_config, generated_tests, original_runtimes, optimized_runtimes + ) - # Check that no comments were added (no 
codeflash_output assignment) - modified_source = result.generated_tests[0].generated_original_test_source - assert modified_source == test_source # Should be unchanged + # Source should remain unchanged + assert len(result.generated_tests) == 1 + assert result.generated_tests[0].generated_original_test_source == test_source - def test_invalid_python_code_handling(self): - """Test behavior when test source code is invalid Python.""" - test_source = """def test_bubble_sort(: - codeflash_output = bubble_sort([3, 1, 2]) - assert codeflash_output == [1, 2, 3] -""" # Invalid syntax: extra colon + def test_add_runtime_comments_no_codeflash_output(self, test_config): + """Test that source remains unchanged when there's no codeflash_output assignment.""" + test_source = '''def test_function(): + result = some_function() + assert result == expected +''' generated_test = GeneratedTests( generated_original_test_source=test_source, instrumented_behavior_test_source="", instrumented_perf_test_source="", - behavior_file_path=Path("test_behavior.py"), - perf_file_path=Path("test_perf.py"), + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py"), ) generated_tests = GeneratedTestsList(generated_tests=[generated_test]) - # Create test results - original_test_results = TestResults() - optimized_test_results = TestResults() + invocation_id = InvocationId( + test_module_path="tests.test_module", + test_class_name=None, + test_function_name="test_function", + ) - original_test_results.add(self.create_test_invocation("test_bubble_sort", 500_000)) - optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 300_000)) + original_runtimes = {invocation_id: [1000000000]} + optimized_runtimes = {invocation_id: [500000000]} - # Test the functionality - should handle parse error gracefully - result = add_runtime_comments_to_generated_tests(generated_tests, original_test_results, optimized_test_results) + result = add_runtime_comments_to_generated_tests( + test_config, generated_tests, original_runtimes, optimized_runtimes + ) - # Check that original test is preserved when parsing fails - modified_source = result.generated_tests[0].generated_original_test_source - assert modified_source == test_source # Should be unchanged due to parse error + # Source should remain unchanged + assert len(result.generated_tests) == 1 + assert result.generated_tests[0].generated_original_test_source == test_source - def test_multiple_generated_tests(self): - """Test handling multiple generated test objects.""" - test_source_1 = """def test_bubble_sort(): - codeflash_output = bubble_sort([3, 1, 2]) - assert codeflash_output == [1, 2, 3] -""" + def test_add_runtime_comments_multiple_tests(self, test_config): + """Test adding runtime comments to multiple generated tests.""" + test_source1 = '''def test_function1(): + codeflash_output = some_function() + assert codeflash_output == expected +''' - test_source_2 = """def test_quick_sort(): - codeflash_output = quick_sort([5, 2, 8]) - assert codeflash_output == [2, 5, 8] -""" + test_source2 = '''def test_function2(): + codeflash_output = another_function() + assert codeflash_output == expected +''' - generated_test_1 = GeneratedTests( - generated_original_test_source=test_source_1, + generated_test1 = GeneratedTests( + generated_original_test_source=test_source1, instrumented_behavior_test_source="", instrumented_perf_test_source="", - behavior_file_path=Path("test_behavior_1.py"), - 
perf_file_path=Path("test_perf_1.py"), + behavior_file_path=Path("/project/tests/test_module1.py"), + perf_file_path=Path("/project/tests/test_module1_perf.py"), ) - generated_test_2 = GeneratedTests( - generated_original_test_source=test_source_2, + generated_test2 = GeneratedTests( + generated_original_test_source=test_source2, instrumented_behavior_test_source="", instrumented_perf_test_source="", - behavior_file_path=Path("test_behavior_2.py"), - perf_file_path=Path("test_perf_2.py"), + behavior_file_path=Path("/project/tests/test_module2.py"), + perf_file_path=Path("/project/tests/test_module2_perf.py"), ) - generated_tests = GeneratedTestsList(generated_tests=[generated_test_1, generated_test_2]) - - # Create test results - original_test_results = TestResults() - optimized_test_results = TestResults() - - original_test_results.add(self.create_test_invocation("test_bubble_sort", 500_000)) - original_test_results.add(self.create_test_invocation("test_quick_sort", 800_000)) - - optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 300_000)) - optimized_test_results.add(self.create_test_invocation("test_quick_sort", 600_000)) + generated_tests = GeneratedTestsList(generated_tests=[generated_test1, generated_test2]) - # Test the functionality - result = add_runtime_comments_to_generated_tests(generated_tests, original_test_results, optimized_test_results) - - # Check that comments were added to both test files - modified_source_1 = result.generated_tests[0].generated_original_test_source - modified_source_2 = result.generated_tests[1].generated_original_test_source - - assert "# 500μs -> 300μs" in modified_source_1 - assert "# 800μs -> 600μs" in modified_source_2 - - def test_preserved_test_attributes(self): - """Test that other test attributes are preserved during modification.""" - test_source = """def test_bubble_sort(): - codeflash_output = bubble_sort([3, 1, 2]) - assert codeflash_output == [1, 2, 3] -""" - - original_behavior_source = "behavior test source" - original_perf_source = "perf test source" - original_behavior_path = Path("test_behavior.py") - original_perf_path = Path("test_perf.py") - - generated_test = GeneratedTests( - generated_original_test_source=test_source, - instrumented_behavior_test_source=original_behavior_source, - instrumented_perf_test_source=original_perf_source, - behavior_file_path=original_behavior_path, - perf_file_path=original_perf_path, + invocation_id1 = InvocationId( + test_module_path="tests.test_module1", + test_class_name=None, + test_function_name="test_function1", ) - generated_tests = GeneratedTestsList(generated_tests=[generated_test]) - - # Create test results - original_test_results = TestResults() - optimized_test_results = TestResults() + invocation_id2 = InvocationId( + test_module_path="tests.test_module2", + test_class_name=None, + test_function_name="test_function2", + ) - original_test_results.add(self.create_test_invocation("test_bubble_sort", 500_000)) - optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 300_000)) + original_runtimes = { + invocation_id1: [1000000000], # 1s + invocation_id2: [2000000000], # 2s + } + optimized_runtimes = { + invocation_id1: [500000000], # 0.5s + invocation_id2: [800000000], # 0.8s + } + + result = add_runtime_comments_to_generated_tests( + test_config, generated_tests, original_runtimes, optimized_runtimes + ) - # Test the functionality - result = add_runtime_comments_to_generated_tests(generated_tests, original_test_results, optimized_test_results) + 
expected_source1 = '''def test_function1(): + codeflash_output = some_function() # 1.00s -> 500.00ms (50.00%) + assert codeflash_output == expected +''' - # Check that other attributes are preserved - modified_test = result.generated_tests[0] - assert modified_test.instrumented_behavior_test_source == original_behavior_source - assert modified_test.instrumented_perf_test_source == original_perf_source - assert modified_test.behavior_file_path == original_behavior_path - assert modified_test.perf_file_path == original_perf_path + expected_source2 = '''def test_function2(): + codeflash_output = another_function() # 2.00s -> 800.00ms (60.00%) + assert codeflash_output == expected +''' - # Check that only the generated_original_test_source was modified - assert "# 500μs -> 300μs" in modified_test.generated_original_test_source + assert len(result.generated_tests) == 2 + assert result.generated_tests[0].generated_original_test_source == expected_source1 + assert result.generated_tests[1].generated_original_test_source == expected_source2 - def test_multistatement_line_handling(self): - """Test that runtime comments work correctly with multiple statements on one line.""" - test_source = """def test_mutation_of_input(): - # Test that the input list is mutated in-place and returned - arr = [3, 1, 2] - codeflash_output = sorter(arr); result = codeflash_output - assert result == [1, 2, 3] - assert arr == [1, 2, 3] # Input should be mutated -""" + def test_add_runtime_comments_performance_regression(self, test_config): + """Test adding runtime comments when optimized version is slower (negative performance gain).""" + test_source = '''def test_function(): + codeflash_output = some_function() + assert codeflash_output == expected +''' generated_test = GeneratedTests( generated_original_test_source=test_source, instrumented_behavior_test_source="", instrumented_perf_test_source="", - behavior_file_path=Path("test_behavior.py"), - perf_file_path=Path("test_perf.py"), + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py"), ) generated_tests = GeneratedTestsList(generated_tests=[generated_test]) - # Create test results - original_test_results = TestResults() - optimized_test_results = TestResults() - - original_test_results.add(self.create_test_invocation("test_mutation_of_input", 19_000)) # 19μs - optimized_test_results.add(self.create_test_invocation("test_mutation_of_input", 14_000)) # 14μs + invocation_id = InvocationId( + test_module_path="tests.test_module", + test_class_name=None, + test_function_name="test_function", + ) - # Test the functionality - result = add_runtime_comments_to_generated_tests(generated_tests, original_test_results, optimized_test_results) + original_runtimes = {invocation_id: [1000000000]} # 1s + optimized_runtimes = {invocation_id: [1500000000]} # 1.5s (slower!) 
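# --- Illustrative aside (not part of the patch): the "(X.XX%)" suffix asserted in these
# expected strings, and the improvement column in the PR table, are assumed to follow the
# performance_gain(...) * 100 convention introduced earlier in this series, i.e. relative
# speedup measured against the optimized runtime. A minimal, self-contained sketch with a
# hypothetical helper name:
def relative_gain_percent(original_ns: int, optimized_ns: int) -> float:
    """Return (original - optimized) / optimized, expressed as a percentage."""
    return (original_ns - optimized_ns) / optimized_ns * 100

# 1.00s -> 500ms gives 100.00%, matching the comments asserted in the corrected tests;
# an optimization that is slower than the original yields a negative percentage.
assert round(relative_gain_percent(1_000_000_000, 500_000_000), 2) == 100.00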
- # Check that comments were added to the correct line - modified_source = result.generated_tests[0].generated_original_test_source - assert "# 19.0μs -> 14.0μs" in modified_source + result = add_runtime_comments_to_generated_tests( + test_config, generated_tests, original_runtimes, optimized_runtimes + ) - # Verify the comment is on the line with codeflash_output assignment - lines = modified_source.split("\n") - codeflash_line = None - for line in lines: - if "codeflash_output = sorter(arr)" in line: - codeflash_line = line - break + expected_source = '''def test_function(): + codeflash_output = some_function() # 1.00s -> 1.50s (-50.00%) + assert codeflash_output == expected +''' - assert codeflash_line is not None, "Could not find codeflash_output assignment line" - assert "# 19.0μs -> 14.0μs" in codeflash_line, f"Comment not found in the correct line: {codeflash_line}" + assert len(result.generated_tests) == 1 + assert result.generated_tests[0].generated_original_test_source == expected_source diff --git a/tests/test_existing_tests_source_for.py b/tests/test_existing_tests_source_for.py index 1a00c47c0..27495939c 100644 --- a/tests/test_existing_tests_source_for.py +++ b/tests/test_existing_tests_source_for.py @@ -1,645 +1,390 @@ -"""Tests for the existing_tests_source_for function in result/create_pr.py.""" +from __future__ import annotations +import os from pathlib import Path -from unittest.mock import patch +from typing import NamedTuple import pytest -from codeflash.models.models import ( - CodePosition, - FunctionCalledInTest, - FunctionTestInvocation, - InvocationId, - TestResults, - TestsInFile, - TestType, VerificationType, -) + from codeflash.result.create_pr import existing_tests_source_for -@pytest.fixture -def sample_tests_root(tmp_path: Path) -> Path: - """Create a temporary test root directory.""" - return tmp_path / "tests" +class MockInvocationId(NamedTuple): + test_module_path: str + test_class_name: str | None + test_function_name: str -@pytest.fixture -def sample_function_to_tests(sample_tests_root: Path) -> dict[str, set[FunctionCalledInTest]]: - """Create sample function to tests mapping.""" - test_file_1 = sample_tests_root / "test_module1.py" - test_file_2 = sample_tests_root / "test_module2.py" +class MockTestsInFile(NamedTuple): + test_file: str - return { - "my_module.my_function": { - FunctionCalledInTest( - tests_in_file=TestsInFile( - test_file=test_file_1, - test_class=None, - test_function="test_basic_functionality", - test_type=TestType.EXISTING_UNIT_TEST, - ), - position=CodePosition(line_no=10, col_no=4), - ), - FunctionCalledInTest( - tests_in_file=TestsInFile( - test_file=test_file_1, - test_class="TestMyFunction", - test_function="test_edge_cases", - test_type=TestType.EXISTING_UNIT_TEST, - ), - position=CodePosition(line_no=20, col_no=8), - ), - FunctionCalledInTest( - tests_in_file=TestsInFile( - test_file=test_file_2, - test_class=None, - test_function="test_performance", - test_type=TestType.EXISTING_UNIT_TEST, - ), - position=CodePosition(line_no=15, col_no=4), - ), - } - } - - -@pytest.fixture -def sample_original_test_results() -> TestResults: - """Create sample original test results with timing information.""" - results = TestResults() - - # Test case 1: test_basic_functionality with multiple function calls - results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.test_module1", - test_class_name=None, - test_function_name="test_basic_functionality", - function_getting_tested="my_function", - iteration_id="1", - 
), - file_name=Path("/tmp/tests/test_module1.py"), - did_pass=True, - runtime=1000, # 1000 ns - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, + +class MockFunctionCalledInTest(NamedTuple): + tests_in_file: MockTestsInFile + + +class MockTestConfig(NamedTuple): + tests_root: Path + project_root_path: Path + + +class TestExistingTestsSourceFor: + """Test cases for existing_tests_source_for function.""" + + def test_no_test_files_found(self): + """Test when no test files are found for the function.""" + function_qualified_name = "module.function_name" + function_to_tests = {} + test_cfg = MockTestConfig( + tests_root=Path("/project/tests"), + project_root_path=Path("/project") ) - ) - - results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.test_module1", - test_class_name=None, - test_function_name="test_basic_functionality", - function_getting_tested="my_function", - iteration_id="2", - ), - file_name=Path("/tmp/tests/test_module1.py"), - did_pass=True, - runtime=500, # 500 ns - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, + original_runtimes = {} + optimized_runtimes = {} + + result = existing_tests_source_for( + function_qualified_name, + function_to_tests, + test_cfg, + original_runtimes, + optimized_runtimes ) - ) - - # Test case 2: test_edge_cases - results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.test_module1", - test_class_name="TestMyFunction", - test_function_name="test_edge_cases", - function_getting_tested="my_function", - iteration_id="1", - ), - file_name=Path("/tmp/tests/test_module1.py"), - did_pass=True, - runtime=2000, # 2000 ns - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, + + assert result == "" + + def test_single_test_file_with_function_test(self): + """Test with a single test file containing one test function.""" + function_qualified_name = "module.function_name" + test_file_path = "/project/tests/test_module.py" + + function_to_tests = { + function_qualified_name: { + MockFunctionCalledInTest( + tests_in_file=MockTestsInFile(test_file=test_file_path) + ) + } + } + + test_cfg = MockTestConfig( + tests_root=Path("/project/tests"), + project_root_path=Path("/project") ) - ) - - # Test case 3: test_performance - results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.test_module2", - test_class_name=None, - test_function_name="test_performance", - function_getting_tested="my_function", - iteration_id="1", - ), - file_name=Path("/tmp/tests/test_module2.py"), - did_pass=True, - runtime=3000, # 3000 ns - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, + + invocation_id = MockInvocationId( + test_module_path="tests.test_module", + test_class_name=None, + test_function_name="test_function" ) - ) - - return results - - -@pytest.fixture -def sample_optimized_test_results() -> TestResults: - """Create sample optimized test results with improved timing information.""" - results = TestResults() - - # Test case 1: test_basic_functionality with multiple function calls (improved) - results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.test_module1", - test_class_name=None, - test_function_name="test_basic_functionality", - function_getting_tested="my_function", - 
iteration_id="1", - ), - file_name=Path("/tmp/tests/test_module1.py"), - did_pass=True, - runtime=800, # 800 ns (improved from 1000 ns) - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, + + original_runtimes = {invocation_id: [1000000, 1100000, 900000]} # 1ms, 1.1ms, 0.9ms + optimized_runtimes = {invocation_id: [500000, 600000, 400000]} # 0.5ms, 0.6ms, 0.4ms + + result = existing_tests_source_for( + function_qualified_name, + function_to_tests, + test_cfg, + original_runtimes, + optimized_runtimes ) - ) - - results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.test_module1", - test_class_name=None, - test_function_name="test_basic_functionality", - function_getting_tested="my_function", - iteration_id="2", - ), - file_name=Path("/tmp/tests/test_module1.py"), - did_pass=True, - runtime=400, # 400 ns (improved from 500 ns) - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, + + expected = """- test_module.py + - test_function: 900μs -> 400μs $\\color{green}(55.56\\%)$ + +""" + assert result == expected + + def test_single_test_file_with_class_test(self): + """Test with a single test file containing a test method in a class.""" + function_qualified_name = "module.function_name" + test_file_path = "/project/tests/test_module.py" + + function_to_tests = { + function_qualified_name: { + MockFunctionCalledInTest( + tests_in_file=MockTestsInFile(test_file=test_file_path) + ) + } + } + + test_cfg = MockTestConfig( + tests_root=Path("/project/tests"), + project_root_path=Path("/project") ) - ) - - # Test case 2: test_edge_cases (improved) - results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.test_module1", - test_class_name="TestMyFunction", - test_function_name="test_edge_cases", - function_getting_tested="my_function", - iteration_id="1", - ), - file_name=Path("/tmp/tests/test_module1.py"), - did_pass=True, - runtime=1500, # 1500 ns (improved from 2000 ns) - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, + + invocation_id = MockInvocationId( + test_module_path="tests.test_module", + test_class_name="TestClass", + test_function_name="test_method" ) - ) - - # Test case 3: test_performance (improved) - results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.test_module2", - test_class_name=None, - test_function_name="test_performance", - function_getting_tested="my_function", - iteration_id="1", - ), - file_name=Path("/tmp/tests/test_module2.py"), - did_pass=True, - runtime=2100, # 2100 ns (improved from 3000 ns) - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, + + original_runtimes = {invocation_id: [2000000]} # 2ms + optimized_runtimes = {invocation_id: [3000000]} # 3ms (slower) + + result = existing_tests_source_for( + function_qualified_name, + function_to_tests, + test_cfg, + original_runtimes, + optimized_runtimes ) - ) - return results + expected = """- test_module.py + - TestClass.test_method: 2.00ms -> 3.00ms $\\color{red}(-50.00\\%)$ +""" + assert result == expected -def test_existing_tests_source_for_without_timing_info( - sample_function_to_tests: dict[str, set[FunctionCalledInTest]], sample_tests_root: Path -): - """Test the function works without timing information (backward compatibility).""" - result = 
existing_tests_source_for("my_module.my_function", sample_function_to_tests, sample_tests_root) + def test_multiple_test_files_and_methods(self): + """Test with multiple test files and multiple test methods.""" + function_qualified_name = "module.function_name" + test_file_path1 = "/project/tests/test_module1.py" + test_file_path2 = "/project/tests/test_module2.py" - expected_lines = ["- test_module1.py", "- test_module2.py"] + function_to_tests = { + function_qualified_name: { + MockFunctionCalledInTest( + tests_in_file=MockTestsInFile(test_file=test_file_path1) + ), + MockFunctionCalledInTest( + tests_in_file=MockTestsInFile(test_file=test_file_path2) + ) + } + } + + test_cfg = MockTestConfig( + tests_root=Path("/project/tests"), + project_root_path=Path("/project") + ) - for line in expected_lines: - assert line in result + invocation_id1 = MockInvocationId( + test_module_path="tests.test_module1", + test_class_name=None, + test_function_name="test_function1" + ) - # Should not contain any timing information - assert "->" not in result - assert "ns" not in result + invocation_id2 = MockInvocationId( + test_module_path="tests.test_module1", + test_class_name="TestClass", + test_function_name="test_method1" + ) + invocation_id3 = MockInvocationId( + test_module_path="tests.test_module2", + test_class_name=None, + test_function_name="test_function2" + ) -def test_existing_tests_source_for_with_timing_info( - sample_function_to_tests: dict[str, set[FunctionCalledInTest]], - sample_tests_root: Path, - sample_original_test_results: TestResults, - sample_optimized_test_results: TestResults, -): - """Test the function includes timing information when provided.""" - with patch("codeflash.code_utils.time_utils.format_time") as mock_format_time: - # Mock format_time to return predictable values - mock_format_time.side_effect = lambda x: f"{x} ns" + original_runtimes = { + invocation_id1: [1000000], # 1ms + invocation_id2: [2000000], # 2ms + invocation_id3: [500000] # 0.5ms + } + optimized_runtimes = { + invocation_id1: [800000], # 0.8ms + invocation_id2: [1500000], # 1.5ms + invocation_id3: [400000] # 0.4ms + } result = existing_tests_source_for( - "my_module.my_function", - sample_function_to_tests, - sample_tests_root, - sample_original_test_results, - sample_optimized_test_results, + function_qualified_name, + function_to_tests, + test_cfg, + original_runtimes, + optimized_runtimes ) - # Should contain file names - assert "- test_module1.py" in result - assert "- test_module2.py" in result + expected = """- test_module1.py + - TestClass.test_method1: 2.00ms -> 1.50ms $\\color{green}(25.00\\%)$ + - test_function1: 1.00ms -> 800μs $\\color{green}(20.00\\%)$ - # Should contain test function names with timing (using min values now) - assert "test_basic_functionality: 500 ns -> 400 ns" in result # min(1000,500) -> min(800,400) - assert "test_edge_cases: 2000 ns -> 1500 ns" in result - assert "test_performance: 3000 ns -> 2100 ns" in result +- test_module2.py + - test_function2: 500μs -> 400μs $\\color{green}(20.00\\%)$ +""" + assert result == expected -def test_existing_tests_source_for_aggregates_multiple_function_calls( - sample_function_to_tests: dict[str, set[FunctionCalledInTest]], - sample_tests_root: Path, - sample_original_test_results: TestResults, - sample_optimized_test_results: TestResults, -): - """Test that multiple function calls within a test case use minimum timing.""" - with patch("codeflash.code_utils.time_utils.format_time") as mock_format_time: - mock_format_time.side_effect = 
lambda x: f"{x} ns" + def test_missing_runtime_data(self): + """Test when runtime data is missing for some tests.""" + function_qualified_name = "module.function_name" + test_file_path = "/project/tests/test_module.py" - result = existing_tests_source_for( - "my_module.my_function", - sample_function_to_tests, - sample_tests_root, - sample_original_test_results, - sample_optimized_test_results, - ) + function_to_tests = { + function_qualified_name: { + MockFunctionCalledInTest( + tests_in_file=MockTestsInFile(test_file=test_file_path) + ) + } + } - # test_basic_functionality should show minimum timing: min(1000,500) -> min(800,400) - assert "test_basic_functionality: 500 ns -> 400 ns" in result - - -def test_existing_tests_source_for_only_includes_passing_tests( - sample_function_to_tests: dict[str, set[FunctionCalledInTest]], sample_tests_root: Path -): - """Test that only passing tests with runtime data are included in timing report.""" - original_results = TestResults() - optimized_results = TestResults() - - # Add a passing test with runtime - original_results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.test_module1", - test_class_name=None, - test_function_name="test_basic_functionality", - function_getting_tested="my_function", - iteration_id="1", - ), - file_name=Path("/tmp/tests/test_module1.py"), - did_pass=True, - runtime=1000, - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, - ) - ) - - optimized_results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.test_module1", - test_class_name=None, - test_function_name="test_basic_functionality", - function_getting_tested="my_function", - iteration_id="1", - ), - file_name=Path("/tmp/tests/test_module1.py"), - did_pass=True, - runtime=800, - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, + test_cfg = MockTestConfig( + tests_root=Path("/project/tests"), + project_root_path=Path("/project") ) - ) - - # Add a failing test (should be excluded) - original_results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.test_module1", - test_class_name="TestMyFunction", - test_function_name="test_edge_cases", - function_getting_tested="my_function", - iteration_id="1", - ), - file_name=Path("/tmp/tests/test_module1.py"), - did_pass=False, # Failing test - runtime=2000, - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, + + invocation_id1 = MockInvocationId( + test_module_path="tests.test_module", + test_class_name=None, + test_function_name="test_with_original_only" ) - ) - - # Add a test without runtime (should be excluded) - original_results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.test_module2", - test_class_name=None, - test_function_name="test_performance", - function_getting_tested="my_function", - iteration_id="1", - ), - file_name=Path("/tmp/tests/test_module2.py"), - did_pass=True, - runtime=None, # No runtime data - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, + + invocation_id2 = MockInvocationId( + test_module_path="tests.test_module", + test_class_name=None, + test_function_name="test_with_optimized_only" ) - ) - with patch("codeflash.code_utils.time_utils.format_time") as mock_format_time: - mock_format_time.side_effect = 
lambda x: f"{x} ns" + original_runtimes = {invocation_id1: [1000000]} # Only original + optimized_runtimes = {invocation_id2: [500000]} # Only optimized result = existing_tests_source_for( - "my_module.my_function", sample_function_to_tests, sample_tests_root, original_results, optimized_results + function_qualified_name, + function_to_tests, + test_cfg, + original_runtimes, + optimized_runtimes + ) + + expected = """- test_module.py + - test_with_optimized_only: NaN -> 500μs + - test_with_original_only: 1.00ms -> NaN + +""" + assert result == expected + + def test_nested_test_directory(self): + """Test with nested test directories.""" + function_qualified_name = "module.function_name" + test_file_path = "/project/tests/unit/test_module.py" + + function_to_tests = { + function_qualified_name: { + MockFunctionCalledInTest( + tests_in_file=MockTestsInFile(test_file=test_file_path) + ) + } + } + + test_cfg = MockTestConfig( + tests_root=Path("/project/tests"), + project_root_path=Path("/project") ) - # Should only include the passing test with runtime data - assert "test_basic_functionality: 1000 ns -> 800 ns" in result - # Should not include failing test or test without runtime - assert "test_edge_cases" not in result - assert "test_performance" not in result - - -def test_existing_tests_source_for_with_empty_test_mapping(sample_tests_root: Path): - """Test behavior when there are no tests for the function.""" - result = existing_tests_source_for("nonexistent.function", {}, sample_tests_root) - - assert result == "" - - -def test_existing_tests_source_for_missing_optimized_results( - sample_function_to_tests: dict[str, set[FunctionCalledInTest]], - sample_tests_root: Path, - sample_original_test_results: TestResults, -): - """Test behavior when optimized results are missing for some test cases.""" - # Create optimized results that are missing some test cases - optimized_results = TestResults() - optimized_results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.test_module1", - test_class_name=None, - test_function_name="test_basic_functionality", - function_getting_tested="my_function", - iteration_id="1", - ), - file_name=Path("/tmp/tests/test_module1.py"), - did_pass=True, - runtime=800, - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, + invocation_id = MockInvocationId( + test_module_path="tests.unit.test_module", + test_class_name=None, + test_function_name="test_function" ) - ) - # Note: Missing test_edge_cases and test_performance optimized results - with patch("codeflash.code_utils.time_utils.format_time") as mock_format_time: - mock_format_time.side_effect = lambda x: f"{x} ns" + original_runtimes = {invocation_id: [1000000]} + optimized_runtimes = {invocation_id: [800000]} result = existing_tests_source_for( - "my_module.my_function", - sample_function_to_tests, - sample_tests_root, - sample_original_test_results, - optimized_results, + function_qualified_name, + function_to_tests, + test_cfg, + original_runtimes, + optimized_runtimes ) - # Should not include test cases without both original and optimized results - assert "test_basic_functionality" not in result # Missing second function call - assert "test_edge_cases" not in result - assert "test_performance" not in result - - # Should still show file names - assert "- test_module1.py" in result - assert "- test_module2.py" in result - - -def test_existing_tests_source_for_sorted_output(sample_tests_root: Path): - """Test that 
output is properly sorted by file name and test function name.""" - # Create a more complex test mapping with multiple files and functions - test_file_a = sample_tests_root / "a_test_module.py" - test_file_z = sample_tests_root / "z_test_module.py" - - function_to_tests = { - "my_module.my_function": { - FunctionCalledInTest( - tests_in_file=TestsInFile( - test_file=test_file_z, - test_class=None, - test_function="z_test_function", - test_type=TestType.EXISTING_UNIT_TEST, - ), - position=CodePosition(line_no=10, col_no=4), - ), - FunctionCalledInTest( - tests_in_file=TestsInFile( - test_file=test_file_a, - test_class=None, - test_function="a_test_function", - test_type=TestType.EXISTING_UNIT_TEST, - ), - position=CodePosition(line_no=20, col_no=8), - ), - FunctionCalledInTest( - tests_in_file=TestsInFile( - test_file=test_file_a, - test_class=None, - test_function="b_test_function", - test_type=TestType.EXISTING_UNIT_TEST, - ), - position=CodePosition(line_no=30, col_no=8), - ), + expected = """- unit/test_module.py + - test_function: 1.00ms -> 800μs $\\color{green}(20.00\\%)$ + +""" + assert result == expected + + def test_multiple_invocations_same_test(self): + """Test when the same test has multiple invocations (runtimes are summed).""" + function_qualified_name = "module.function_name" + test_file_path = "/project/tests/test_module.py" + + function_to_tests = { + function_qualified_name: { + MockFunctionCalledInTest( + tests_in_file=MockTestsInFile(test_file=test_file_path) + ) + } } - } - - original_results = TestResults() - optimized_results = TestResults() - - # Add test results for all functions - for test_func in ["a_test_function", "b_test_function"]: - original_results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.a_test_module", - test_class_name=None, - test_function_name=test_func, - function_getting_tested="my_function", - iteration_id="1", - ), - file_name=Path("/tmp/tests/a_test_module.py"), - did_pass=True, - runtime=1000, - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, - ) - ) - optimized_results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.a_test_module", - test_class_name=None, - test_function_name=test_func, - function_getting_tested="my_function", - iteration_id="1", - ), - file_name=Path("/tmp/tests/a_test_module.py"), - did_pass=True, - runtime=800, - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, - ) + test_cfg = MockTestConfig( + tests_root=Path("/project/tests"), + project_root_path=Path("/project") ) - original_results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.z_test_module", - test_class_name=None, - test_function_name="z_test_function", - function_getting_tested="my_function", - iteration_id="1", - ), - file_name=Path("/tmp/tests/z_test_module.py"), - did_pass=True, - runtime=1000, - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, + # Same test function with multiple invocations + invocation_id1 = MockInvocationId( + test_module_path="tests.test_module", + test_class_name=None, + test_function_name="test_function" ) - ) - - optimized_results.add( - FunctionTestInvocation( - id=InvocationId( - test_module_path="tests.z_test_module", - test_class_name=None, - test_function_name="z_test_function", - function_getting_tested="my_function", - 
iteration_id="1", - ), - file_name=Path("/tmp/tests/z_test_module.py"), - did_pass=True, - runtime=800, - test_framework="pytest", - test_type=TestType.EXISTING_UNIT_TEST, - return_value=None, - timed_out=False, - loop_index=1, + + invocation_id2 = MockInvocationId( + test_module_path="tests.test_module", + test_class_name=None, + test_function_name="test_function" ) - ) - with patch("codeflash.code_utils.time_utils.format_time") as mock_format_time: - mock_format_time.side_effect = lambda x: f"{x} ns" + original_runtimes = { + invocation_id1: [1000000, 1200000], # min: 1ms + invocation_id2: [800000, 900000] # min: 0.8ms + } + optimized_runtimes = { + invocation_id1: [600000, 700000], # min: 0.6ms + invocation_id2: [400000, 500000] # min: 0.4ms + } result = existing_tests_source_for( - "my_module.my_function", function_to_tests, sample_tests_root, original_results, optimized_results + function_qualified_name, + function_to_tests, + test_cfg, + original_runtimes, + optimized_runtimes ) - lines = result.split("\n") - - # Files should be sorted alphabetically - a_file_index = next(i for i, line in enumerate(lines) if "a_test_module.py" in line) - z_file_index = next(i for i, line in enumerate(lines) if "z_test_module.py" in line) - assert a_file_index < z_file_index - - # Test functions within a file should be sorted alphabetically - a_func_index = next(i for i, line in enumerate(lines) if "a_test_function" in line) - b_func_index = next(i for i, line in enumerate(lines) if "b_test_function" in line) - assert a_func_index < b_func_index + # Total original: 1ms + 0.8ms = 1.8ms + # Total optimized: 0.6ms + 0.4ms = 1ms + expected = """- test_module.py + - test_function: 1.80ms -> 1.00ms $\\color{green}(44.44\\%)$ + +""" + assert result == expected + + def test_zero_runtime_values(self): + """Test handling of zero runtime values.""" + function_qualified_name = "module.function_name" + test_file_path = "/project/tests/test_module.py" + + function_to_tests = { + function_qualified_name: { + MockFunctionCalledInTest( + tests_in_file=MockTestsInFile(test_file=test_file_path) + ) + } + } + test_cfg = MockTestConfig( + tests_root=Path("/project/tests"), + project_root_path=Path("/project") + ) + invocation_id = MockInvocationId( + test_module_path="tests.test_module", + test_class_name=None, + test_function_name="test_function" + ) -def test_existing_tests_source_for_format_time_called_correctly( - sample_function_to_tests: dict[str, set[FunctionCalledInTest]], - sample_tests_root: Path, - sample_original_test_results: TestResults, - sample_optimized_test_results: TestResults, -): - """Test that format_time is called with correct values (min of runtime lists).""" - with patch("codeflash.code_utils.time_utils.format_time") as mock_format_time: - mock_format_time.side_effect = lambda x: f"{x} ns" + original_runtimes = {invocation_id: [0]} + optimized_runtimes = {invocation_id: [0]} - existing_tests_source_for( - "my_module.my_function", - sample_function_to_tests, - sample_tests_root, - sample_original_test_results, - sample_optimized_test_results, + result = existing_tests_source_for( + function_qualified_name, + function_to_tests, + test_cfg, + original_runtimes, + optimized_runtimes ) - # Check that format_time was called with the minimum values - call_args = [call[0][0] for call in mock_format_time.call_args_list] + expected = """- test_module.py + - test_function: NaN -> NaN - # Should include minimum values (not aggregated) - assert 500 in call_args # test_basic_functionality original: min(1000, 
500) - assert 400 in call_args # test_basic_functionality optimized: min(800, 400) - assert 2000 in call_args # test_edge_cases original - assert 1500 in call_args # test_edge_cases optimized - assert 3000 in call_args # test_performance original - assert 2100 in call_args # test_performance optimized \ No newline at end of file +""" + assert result == expected From d2289e54a80d0ac86f8083f995a658f29f96a662 Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Tue, 17 Jun 2025 20:15:24 -0700 Subject: [PATCH 11/16] tests work now, ready to merge --- codeflash/result/create_pr.py | 33 +- tests/test_add_runtime_comments.py | 73 ++-- tests/test_existing_tests_source_for.py | 479 +++++++++++------------- 3 files changed, 295 insertions(+), 290 deletions(-) diff --git a/codeflash/result/create_pr.py b/codeflash/result/create_pr.py index 72ef6d244..518781a9e 100644 --- a/codeflash/result/create_pr.py +++ b/codeflash/result/create_pr.py @@ -17,6 +17,7 @@ git_root_dir, ) from codeflash.code_utils.github_utils import github_pr_url +from codeflash.code_utils.tabulate import tabulate from codeflash.code_utils.time_utils import format_time from codeflash.github.PrComment import FileDiffContent, PrComment from codeflash.result.critic import performance_gain @@ -38,6 +39,8 @@ def existing_tests_source_for( if not test_files: return "" output = "" + rows = [] + headers = ["Test File::Test Function", "Original ⏱️", "Optimized ⏱️", "Improvement"] tests_root = test_cfg.tests_root module_root = test_cfg.project_root_path rel_tests_root = tests_root.relative_to(module_root) @@ -76,7 +79,6 @@ def existing_tests_source_for( original_tests_to_runtimes.keys() ) # both will have the same keys as some default values are assigned in the previous loop for filename in sorted(all_rel_paths): - output += f"- {filename}\n" all_qualified_names = original_tests_to_runtimes[ filename ].keys() # both will have the same keys as some default values are assigned in the previous loop @@ -90,7 +92,6 @@ def existing_tests_source_for( print_original_runtime = "NaN" else: print_original_runtime = format_time(original_tests_to_runtimes[filename][qualified_name]) - arrow = "->" if ( original_tests_to_runtimes[filename][qualified_name] != 0 and optimized_tests_to_runtimes[filename][qualified_name] != 0 @@ -107,14 +108,32 @@ def existing_tests_source_for( * 100 ) if greater: - output += f" - {qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime} $$\\color{{red}}({perf_gain:.2f}\\\\%)$$\n" + rows.append( + [ + f"`{filename}::{qualified_name}`", + f"{print_original_runtime}", + f"{print_optimized_runtime}", + f"⚠️{perf_gain:.2f}%", + ] + ) else: - output += f" - {qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime} $$\\color{{green}}({perf_gain:.2f}\\\\%)$$\n" + rows.append( + [ + f"`{filename}::{qualified_name}`", + f"{print_original_runtime}", + f"{print_optimized_runtime}", + f"✅{perf_gain:.2f}%", + ] + ) else: # one of them is NaN - output += f" - {qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime}\n" - # output += f"$$\\colorbox{{pink}}\{{ - {qualified_name}: {print_original_runtime} {arrow} {print_optimized_runtime}}}$$\n" - output += "\n" + rows.append( + [f"`{filename}::{qualified_name}`", f"{print_original_runtime}", f"{print_optimized_runtime}", "❌"] + ) + output += tabulate( + headers=headers, tabular_data=rows, tablefmt="pipe", colglobalalign=None, preserve_whitespace=True + ) + output += "\n" return output diff --git a/tests/test_add_runtime_comments.py 
b/tests/test_add_runtime_comments.py index bbb833751..c553845e8 100644 --- a/tests/test_add_runtime_comments.py +++ b/tests/test_add_runtime_comments.py @@ -14,28 +14,34 @@ def test_config(): """Create a mock TestConfig for testing.""" config = Mock(spec=TestConfig) config.project_root_path = Path("/project") + config.test_framework= "pytest" + config.tests_project_rootdir = Path("/project/tests") config.tests_root = Path("/project/tests") return config -@pytest.fixture -def sample_invocation_id(): - """Create a sample InvocationId for testing.""" - return InvocationId( - test_module_path="tests.test_module", - test_class_name="TestClass", - test_function_name="test_function", - ) - - -@pytest.fixture -def sample_invocation_id_no_class(): - """Create a sample InvocationId without class for testing.""" - return InvocationId( - test_module_path="tests.test_module", - test_class_name=None, - test_function_name="test_function", - ) +# @pytest.fixture +# def sample_invocation_id(): +# """Create a sample InvocationId for testing.""" +# return InvocationId( +# test_module_path="test_module_path", +# test_class_name="test_class_name", +# test_function_name="test_function_name", +# function_getting_tested="function_getting_tested", +# iteration_id="0", +# ) +# +# +# @pytest.fixture +# def sample_invocation_id_no_class(): +# """Create a sample InvocationId without class for testing.""" +# return InvocationId( +# test_module_path="test_module_path", +# test_class_name=None, +# test_function_name="test_function_name", +# function_getting_tested="function_getting_tested", +# iteration_id="0", +# ) class TestAddRuntimeCommentsToGeneratedTests: @@ -60,6 +66,8 @@ def test_add_runtime_comments_simple_function(self, test_config): test_module_path="tests.test_module", test_class_name=None, test_function_name="test_function", + function_getting_tested="some_function", + iteration_id="0", ) original_runtimes = {invocation_id: [1000000000, 1200000000]} # 1s, 1.2s in nanoseconds @@ -70,7 +78,7 @@ def test_add_runtime_comments_simple_function(self, test_config): ) expected_source = '''def test_function(): - codeflash_output = some_function() # 1.00s -> 500.00ms (50.00%) + codeflash_output = some_function() # 1.00s -> 500ms (100.00%) assert codeflash_output == expected ''' @@ -99,6 +107,9 @@ def test_function(self): test_module_path="tests.test_module", test_class_name="TestClass", test_function_name="test_function", + function_getting_tested="some_function", + iteration_id="0", + ) original_runtimes = {invocation_id: [2000000000]} # 2s in nanoseconds @@ -110,7 +121,7 @@ def test_function(self): expected_source = '''class TestClass: def test_function(self): - codeflash_output = some_function() # 2.00s -> 1.00s (50.00%) + codeflash_output = some_function() # 2.00s -> 1.00s (100.00%) assert codeflash_output == expected ''' @@ -141,6 +152,8 @@ def test_add_runtime_comments_multiple_assignments(self, test_config): test_module_path="tests.test_module", test_class_name=None, test_function_name="test_function", + function_getting_tested="some_function", + iteration_id="0", ) original_runtimes = {invocation_id: [1500000000]} # 1.5s in nanoseconds @@ -152,9 +165,9 @@ def test_add_runtime_comments_multiple_assignments(self, test_config): expected_source = '''def test_function(): setup_data = prepare_test() - codeflash_output = some_function() # 1.50s -> 750.00ms (50.00%) + codeflash_output = some_function() # 1.50s -> 750ms (100.00%) assert codeflash_output == expected - codeflash_output = another_function() # 1.50s -> 
750.00ms (50.00%) + codeflash_output = another_function() # 1.50s -> 750ms (100.00%) assert codeflash_output == expected2 ''' @@ -183,6 +196,8 @@ def test_add_runtime_comments_no_matching_runtimes(self, test_config): test_module_path="tests.other_module", test_class_name=None, test_function_name="other_function", + function_getting_tested="some_other_function", + iteration_id="0", ) original_runtimes = {invocation_id: [1000000000]} @@ -217,6 +232,8 @@ def test_add_runtime_comments_no_codeflash_output(self, test_config): test_module_path="tests.test_module", test_class_name=None, test_function_name="test_function", + function_getting_tested="some_function", + iteration_id="0", ) original_runtimes = {invocation_id: [1000000000]} @@ -264,12 +281,16 @@ def test_add_runtime_comments_multiple_tests(self, test_config): test_module_path="tests.test_module1", test_class_name=None, test_function_name="test_function1", + function_getting_tested="some_function", + iteration_id="0", ) invocation_id2 = InvocationId( test_module_path="tests.test_module2", test_class_name=None, test_function_name="test_function2", + function_getting_tested="another_function", + iteration_id = "0", ) original_runtimes = { @@ -286,12 +307,12 @@ def test_add_runtime_comments_multiple_tests(self, test_config): ) expected_source1 = '''def test_function1(): - codeflash_output = some_function() # 1.00s -> 500.00ms (50.00%) + codeflash_output = some_function() # 1.00s -> 500ms (100.00%) assert codeflash_output == expected ''' expected_source2 = '''def test_function2(): - codeflash_output = another_function() # 2.00s -> 800.00ms (60.00%) + codeflash_output = another_function() # 2.00s -> 800ms (150.00%) assert codeflash_output == expected ''' @@ -320,6 +341,8 @@ def test_add_runtime_comments_performance_regression(self, test_config): test_module_path="tests.test_module", test_class_name=None, test_function_name="test_function", + function_getting_tested="some_function", + iteration_id="0", ) original_runtimes = {invocation_id: [1000000000]} # 1s @@ -330,7 +353,7 @@ def test_add_runtime_comments_performance_regression(self, test_config): ) expected_source = '''def test_function(): - codeflash_output = some_function() # 1.00s -> 1.50s (-50.00%) + codeflash_output = some_function() # 1.00s -> 1.50s (-33.33%) assert codeflash_output == expected ''' diff --git a/tests/test_existing_tests_source_for.py b/tests/test_existing_tests_source_for.py index 27495939c..945de6d84 100644 --- a/tests/test_existing_tests_source_for.py +++ b/tests/test_existing_tests_source_for.py @@ -1,390 +1,353 @@ -from __future__ import annotations - import os from pathlib import Path -from typing import NamedTuple +from unittest.mock import Mock import pytest from codeflash.result.create_pr import existing_tests_source_for -class MockInvocationId(NamedTuple): - test_module_path: str - test_class_name: str | None - test_function_name: str - - -class MockTestsInFile(NamedTuple): - test_file: str - - -class MockFunctionCalledInTest(NamedTuple): - tests_in_file: MockTestsInFile - - -class MockTestConfig(NamedTuple): - tests_root: Path - project_root_path: Path - - class TestExistingTestsSourceFor: """Test cases for existing_tests_source_for function.""" - def test_no_test_files_found(self): - """Test when no test files are found for the function.""" - function_qualified_name = "module.function_name" + def setup_method(self): + """Set up test fixtures.""" + # Mock test config + self.test_cfg = Mock() + self.test_cfg.tests_root = Path("/project/tests") + 
self.test_cfg.project_root_path = Path("/project") + + # Mock invocation ID + self.mock_invocation_id = Mock() + self.mock_invocation_id.test_module_path = "tests.test_module" + self.mock_invocation_id.test_class_name = "TestClass" + self.mock_invocation_id.test_function_name = "test_function" + + # Mock function called in test + self.mock_function_called_in_test = Mock() + self.mock_function_called_in_test.tests_in_file = Mock() + self.mock_function_called_in_test.tests_in_file.test_file = "/project/tests/test_module.py" + + def test_no_test_files_returns_empty_string(self): + """Test that function returns empty string when no test files exist.""" function_to_tests = {} - test_cfg = MockTestConfig( - tests_root=Path("/project/tests"), - project_root_path=Path("/project") - ) original_runtimes = {} optimized_runtimes = {} result = existing_tests_source_for( - function_qualified_name, + "module.function", function_to_tests, - test_cfg, + self.test_cfg, original_runtimes, optimized_runtimes ) assert result == "" - def test_single_test_file_with_function_test(self): - """Test with a single test file containing one test function.""" - function_qualified_name = "module.function_name" - test_file_path = "/project/tests/test_module.py" - + def test_single_test_with_improvement(self): + """Test single test showing performance improvement.""" function_to_tests = { - function_qualified_name: { - MockFunctionCalledInTest( - tests_in_file=MockTestsInFile(test_file=test_file_path) - ) - } + "module.function": {self.mock_function_called_in_test} + } + original_runtimes = { + self.mock_invocation_id: [1000000] # 1ms in nanoseconds + } + optimized_runtimes = { + self.mock_invocation_id: [500000] # 0.5ms in nanoseconds } - - test_cfg = MockTestConfig( - tests_root=Path("/project/tests"), - project_root_path=Path("/project") - ) - - invocation_id = MockInvocationId( - test_module_path="tests.test_module", - test_class_name=None, - test_function_name="test_function" - ) - - original_runtimes = {invocation_id: [1000000, 1100000, 900000]} # 1ms, 1.1ms, 0.9ms - optimized_runtimes = {invocation_id: [500000, 600000, 400000]} # 0.5ms, 0.6ms, 0.4ms result = existing_tests_source_for( - function_qualified_name, + "module.function", function_to_tests, - test_cfg, + self.test_cfg, original_runtimes, optimized_runtimes ) - expected = """- test_module.py - - test_function: 900μs -> 400μs $\\color{green}(55.56\\%)$ - + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | +|:------------------------------------------|:--------------|:---------------|:--------------| +| `test_module.py::TestClass.test_function` | 1.00ms | 500μs | ✅100.00% | """ - assert result == expected - def test_single_test_file_with_class_test(self): - """Test with a single test file containing a test method in a class.""" - function_qualified_name = "module.function_name" - test_file_path = "/project/tests/test_module.py" + assert result == expected + def test_single_test_with_regression(self): + """Test single test showing performance regression.""" function_to_tests = { - function_qualified_name: { - MockFunctionCalledInTest( - tests_in_file=MockTestsInFile(test_file=test_file_path) - ) - } + "module.function": {self.mock_function_called_in_test} + } + original_runtimes = { + self.mock_invocation_id: [500000] # 0.5ms in nanoseconds + } + optimized_runtimes = { + self.mock_invocation_id: [1000000] # 1ms in nanoseconds } - - test_cfg = MockTestConfig( - tests_root=Path("/project/tests"), - 
project_root_path=Path("/project") - ) - - invocation_id = MockInvocationId( - test_module_path="tests.test_module", - test_class_name="TestClass", - test_function_name="test_method" - ) - - original_runtimes = {invocation_id: [2000000]} # 2ms - optimized_runtimes = {invocation_id: [3000000]} # 3ms (slower) result = existing_tests_source_for( - function_qualified_name, + "module.function", function_to_tests, - test_cfg, + self.test_cfg, original_runtimes, optimized_runtimes ) - expected = """- test_module.py - - TestClass.test_method: 2.00ms -> 3.00ms $\\color{red}(-50.00\\%)$ - + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | +|:------------------------------------------|:--------------|:---------------|:--------------| +| `test_module.py::TestClass.test_function` | 500μs | 1.00ms | ⚠️-50.00% | """ + assert result == expected - def test_multiple_test_files_and_methods(self): - """Test with multiple test files and multiple test methods.""" - function_qualified_name = "module.function_name" - test_file_path1 = "/project/tests/test_module1.py" - test_file_path2 = "/project/tests/test_module2.py" + def test_test_without_class_name(self): + """Test function without class name (standalone test function).""" + mock_invocation_no_class = Mock() + mock_invocation_no_class.test_module_path = "tests.test_module" + mock_invocation_no_class.test_class_name = None + mock_invocation_no_class.test_function_name = "test_standalone" function_to_tests = { - function_qualified_name: { - MockFunctionCalledInTest( - tests_in_file=MockTestsInFile(test_file=test_file_path1) - ), - MockFunctionCalledInTest( - tests_in_file=MockTestsInFile(test_file=test_file_path2) - ) - } + "module.function": {self.mock_function_called_in_test} } - - test_cfg = MockTestConfig( - tests_root=Path("/project/tests"), - project_root_path=Path("/project") - ) - - invocation_id1 = MockInvocationId( - test_module_path="tests.test_module1", - test_class_name=None, - test_function_name="test_function1" - ) - - invocation_id2 = MockInvocationId( - test_module_path="tests.test_module1", - test_class_name="TestClass", - test_function_name="test_method1" - ) - - invocation_id3 = MockInvocationId( - test_module_path="tests.test_module2", - test_class_name=None, - test_function_name="test_function2" - ) - original_runtimes = { - invocation_id1: [1000000], # 1ms - invocation_id2: [2000000], # 2ms - invocation_id3: [500000] # 0.5ms + mock_invocation_no_class: [1000000] } optimized_runtimes = { - invocation_id1: [800000], # 0.8ms - invocation_id2: [1500000], # 1.5ms - invocation_id3: [400000] # 0.4ms + mock_invocation_no_class: [800000] } result = existing_tests_source_for( - function_qualified_name, + "module.function", function_to_tests, - test_cfg, + self.test_cfg, original_runtimes, optimized_runtimes ) - expected = """- test_module1.py - - TestClass.test_method1: 2.00ms -> 1.50ms $\\color{green}(25.00\\%)$ - - test_function1: 1.00ms -> 800μs $\\color{green}(20.00\\%)$ - -- test_module2.py - - test_function2: 500μs -> 400μs $\\color{green}(20.00\\%)$ - + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | +|:----------------------------------|:--------------|:---------------|:--------------| +| `test_module.py::test_standalone` | 1.00ms | 800μs | ✅25.00% | """ - assert result == expected - def test_missing_runtime_data(self): - """Test when runtime data is missing for some tests.""" - function_qualified_name = "module.function_name" - test_file_path = 
"/project/tests/test_module.py" + assert result == expected + def test_missing_original_runtime(self): + """Test when original runtime is missing (shows NaN).""" function_to_tests = { - function_qualified_name: { - MockFunctionCalledInTest( - tests_in_file=MockTestsInFile(test_file=test_file_path) - ) - } + "module.function": {self.mock_function_called_in_test} + } + original_runtimes = {} + optimized_runtimes = { + self.mock_invocation_id: [500000] } - test_cfg = MockTestConfig( - tests_root=Path("/project/tests"), - project_root_path=Path("/project") + result = existing_tests_source_for( + "module.function", + function_to_tests, + self.test_cfg, + original_runtimes, + optimized_runtimes ) - invocation_id1 = MockInvocationId( - test_module_path="tests.test_module", - test_class_name=None, - test_function_name="test_with_original_only" - ) + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | +|:------------------------------------------|--------------:|:---------------|:--------------| +| `test_module.py::TestClass.test_function` | nan | 500μs | ❌ | +""" - invocation_id2 = MockInvocationId( - test_module_path="tests.test_module", - test_class_name=None, - test_function_name="test_with_optimized_only" - ) + assert result == expected - original_runtimes = {invocation_id1: [1000000]} # Only original - optimized_runtimes = {invocation_id2: [500000]} # Only optimized + def test_missing_optimized_runtime(self): + """Test when optimized runtime is missing (shows NaN).""" + function_to_tests = { + "module.function": {self.mock_function_called_in_test} + } + original_runtimes = { + self.mock_invocation_id: [1000000] + } + optimized_runtimes = {} result = existing_tests_source_for( - function_qualified_name, + "module.function", function_to_tests, - test_cfg, + self.test_cfg, original_runtimes, optimized_runtimes ) - expected = """- test_module.py - - test_with_optimized_only: NaN -> 500μs - - test_with_original_only: 1.00ms -> NaN - + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | +|:------------------------------------------|:--------------|---------------:|:--------------| +| `test_module.py::TestClass.test_function` | 1.00ms | nan | ❌ | """ + assert result == expected - def test_nested_test_directory(self): - """Test with nested test directories.""" - function_qualified_name = "module.function_name" - test_file_path = "/project/tests/unit/test_module.py" + def test_multiple_tests_sorted_output(self): + """Test multiple tests with sorted output by filename and function name.""" + # Create second test file + mock_function_called_2 = Mock() + mock_function_called_2.tests_in_file = Mock() + mock_function_called_2.tests_in_file.test_file = "/project/tests/test_another.py" + + mock_invocation_2 = Mock() + mock_invocation_2.test_module_path = "tests.test_another" + mock_invocation_2.test_class_name = "TestAnother" + mock_invocation_2.test_function_name = "test_another_function" function_to_tests = { - function_qualified_name: { - MockFunctionCalledInTest( - tests_in_file=MockTestsInFile(test_file=test_file_path) - ) - } + "module.function": {self.mock_function_called_in_test, mock_function_called_2} + } + original_runtimes = { + self.mock_invocation_id: [1000000], + mock_invocation_2: [2000000] + } + optimized_runtimes = { + self.mock_invocation_id: [800000], + mock_invocation_2: [1500000] } - - test_cfg = MockTestConfig( - tests_root=Path("/project/tests"), - project_root_path=Path("/project") - ) - - invocation_id = 
MockInvocationId( - test_module_path="tests.unit.test_module", - test_class_name=None, - test_function_name="test_function" - ) - - original_runtimes = {invocation_id: [1000000]} - optimized_runtimes = {invocation_id: [800000]} result = existing_tests_source_for( - function_qualified_name, + "module.function", function_to_tests, - test_cfg, + self.test_cfg, original_runtimes, optimized_runtimes ) - expected = """- unit/test_module.py - - test_function: 1.00ms -> 800μs $\\color{green}(20.00\\%)$ - + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | +|:-----------------------------------------------------|:--------------|:---------------|:--------------| +| `test_another.py::TestAnother.test_another_function` | 2.00ms | 1.50ms | ✅33.33% | +| `test_module.py::TestClass.test_function` | 1.00ms | 800μs | ✅25.00% | """ - assert result == expected - def test_multiple_invocations_same_test(self): - """Test when the same test has multiple invocations (runtimes are summed).""" - function_qualified_name = "module.function_name" - test_file_path = "/project/tests/test_module.py" + assert result == expected + def test_multiple_runtimes_uses_minimum(self): + """Test that function uses minimum runtime when multiple measurements exist.""" function_to_tests = { - function_qualified_name: { - MockFunctionCalledInTest( - tests_in_file=MockTestsInFile(test_file=test_file_path) - ) - } + "module.function": {self.mock_function_called_in_test} + } + original_runtimes = { + self.mock_invocation_id: [1000000, 1200000, 800000] # min: 800000 + } + optimized_runtimes = { + self.mock_invocation_id: [600000, 700000, 500000] # min: 500000 } - test_cfg = MockTestConfig( - tests_root=Path("/project/tests"), - project_root_path=Path("/project") + result = existing_tests_source_for( + "module.function", + function_to_tests, + self.test_cfg, + original_runtimes, + optimized_runtimes ) - # Same test function with multiple invocations - invocation_id1 = MockInvocationId( - test_module_path="tests.test_module", - test_class_name=None, - test_function_name="test_function" - ) + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | +|:------------------------------------------|:--------------|:---------------|:--------------| +| `test_module.py::TestClass.test_function` | 800μs | 500μs | ✅60.00% | +""" - invocation_id2 = MockInvocationId( - test_module_path="tests.test_module", - test_class_name=None, - test_function_name="test_function" - ) + assert result == expected + + def test_complex_module_path_conversion(self): + """Test conversion of complex module paths to file paths.""" + mock_invocation_complex = Mock() + mock_invocation_complex.test_module_path = "tests.integration.test_complex_module" + mock_invocation_complex.test_class_name = "TestComplex" + mock_invocation_complex.test_function_name = "test_complex_function" + mock_function_complex = Mock() + mock_function_complex.tests_in_file = Mock() + mock_function_complex.tests_in_file.test_file = f"/project/tests/integration/test_complex_module.py" + + function_to_tests = { + "module.function": {mock_function_complex} + } original_runtimes = { - invocation_id1: [1000000, 1200000], # min: 1ms - invocation_id2: [800000, 900000] # min: 0.8ms + mock_invocation_complex: [1000000] } optimized_runtimes = { - invocation_id1: [600000, 700000], # min: 0.6ms - invocation_id2: [400000, 500000] # min: 0.4ms + mock_invocation_complex: [750000] } result = existing_tests_source_for( - function_qualified_name, + 
"module.function", function_to_tests, - test_cfg, + self.test_cfg, original_runtimes, optimized_runtimes ) - # Total original: 1ms + 0.8ms = 1.8ms - # Total optimized: 0.6ms + 0.4ms = 1ms - expected = """- test_module.py - - test_function: 1.80ms -> 1.00ms $\\color{green}(44.44\\%)$ - + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | +|:------------------------------------------------------------------------|:--------------|:---------------|:--------------| +| `integration/test_complex_module.py::TestComplex.test_complex_function` | 1.00ms | 750μs | ✅33.33% | """ + assert result == expected def test_zero_runtime_values(self): """Test handling of zero runtime values.""" - function_qualified_name = "module.function_name" - test_file_path = "/project/tests/test_module.py" - function_to_tests = { - function_qualified_name: { - MockFunctionCalledInTest( - tests_in_file=MockTestsInFile(test_file=test_file_path) - ) - } + "module.function": {self.mock_function_called_in_test} + } + original_runtimes = { + self.mock_invocation_id: [0] + } + optimized_runtimes = { + self.mock_invocation_id: [0] } - test_cfg = MockTestConfig( - tests_root=Path("/project/tests"), - project_root_path=Path("/project") + result = existing_tests_source_for( + "module.function", + function_to_tests, + self.test_cfg, + original_runtimes, + optimized_runtimes ) - invocation_id = MockInvocationId( - test_module_path="tests.test_module", - test_class_name=None, - test_function_name="test_function" - ) + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | +|:------------------------------------------|--------------:|---------------:|:--------------| +| `test_module.py::TestClass.test_function` | nan | nan | ❌ | +""" + + assert result == expected + + def test_filters_out_generated_tests(self): + """Test that generated tests are filtered out and only non-generated tests are included.""" + # Create a test that would be filtered out (not in non_generated_tests) + mock_generated_test = Mock() + mock_generated_test.tests_in_file = Mock() + mock_generated_test.tests_in_file.test_file = "/project/tests/generated_test.py" - original_runtimes = {invocation_id: [0]} - optimized_runtimes = {invocation_id: [0]} + mock_generated_invocation = Mock() + mock_generated_invocation.test_module_path = "tests.generated_test" + mock_generated_invocation.test_class_name = "TestGenerated" + mock_generated_invocation.test_function_name = "test_generated" + + function_to_tests = { + "module.function": {self.mock_function_called_in_test} + } + original_runtimes = { + self.mock_invocation_id: [1000000], + mock_generated_invocation: [500000] # This should be filtered out + } + optimized_runtimes = { + self.mock_invocation_id: [800000], + mock_generated_invocation: [400000] # This should be filtered out + } result = existing_tests_source_for( - function_qualified_name, + "module.function", function_to_tests, - test_cfg, + self.test_cfg, original_runtimes, optimized_runtimes ) - expected = """- test_module.py - - test_function: NaN -> NaN - + # Should only include the non-generated test + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | +|:------------------------------------------|:--------------|:---------------|:--------------| +| `test_module.py::TestClass.test_function` | 1.00ms | 800μs | ✅25.00% | """ + assert result == expected + + From df6efe9dcb1358250336c29332a168cc0eb50269 Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Tue, 17 Jun 2025 
20:20:12 -0700 Subject: [PATCH 12/16] mypy fix --- codeflash/result/create_pr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/codeflash/result/create_pr.py b/codeflash/result/create_pr.py index 518781a9e..3c828d744 100644 --- a/codeflash/result/create_pr.py +++ b/codeflash/result/create_pr.py @@ -130,11 +130,11 @@ def existing_tests_source_for( rows.append( [f"`{filename}::{qualified_name}`", f"{print_original_runtime}", f"{print_optimized_runtime}", "❌"] ) - output += tabulate( + output += tabulate( # type: ignore[no-untyped-call] headers=headers, tabular_data=rows, tablefmt="pipe", colglobalalign=None, preserve_whitespace=True ) output += "\n" - return output + return output # type: ignore[no-any-return] def check_create_pr( From 9ad40e24edc0579d96dd8bedd51264a2e65a8434 Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Tue, 17 Jun 2025 20:23:48 -0700 Subject: [PATCH 13/16] mypy fix --- codeflash/result/create_pr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/codeflash/result/create_pr.py b/codeflash/result/create_pr.py index 3c828d744..190d20e80 100644 --- a/codeflash/result/create_pr.py +++ b/codeflash/result/create_pr.py @@ -38,7 +38,7 @@ def existing_tests_source_for( test_files = function_to_tests.get(function_qualified_name_with_modules_from_root) if not test_files: return "" - output = "" + output: str = "" rows = [] headers = ["Test File::Test Function", "Original ⏱️", "Optimized ⏱️", "Improvement"] tests_root = test_cfg.tests_root @@ -134,7 +134,7 @@ def existing_tests_source_for( headers=headers, tabular_data=rows, tablefmt="pipe", colglobalalign=None, preserve_whitespace=True ) output += "\n" - return output # type: ignore[no-any-return] + return output def check_create_pr( From 4806815e18d38275c8f9f05e2236d3015eea531b Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Wed, 18 Jun 2025 14:40:48 -0700 Subject: [PATCH 14/16] ready to merge --- codeflash/code_utils/edit_generated_tests.py | 6 +- codeflash/code_utils/time_utils.py | 12 + codeflash/result/create_pr.py | 25 +- tests/test_add_runtime_comments.py | 520 +++++++++++++++++-- tests/test_existing_tests_source_for.py | 59 +-- tests/test_humanize_time.py | 102 +++- 6 files changed, 637 insertions(+), 87 deletions(-) diff --git a/codeflash/code_utils/edit_generated_tests.py b/codeflash/code_utils/edit_generated_tests.py index 94c18ab5c..afb33317c 100644 --- a/codeflash/code_utils/edit_generated_tests.py +++ b/codeflash/code_utils/edit_generated_tests.py @@ -5,7 +5,7 @@ import libcst as cst from codeflash.cli_cmds.console import logger -from codeflash.code_utils.time_utils import format_time +from codeflash.code_utils.time_utils import format_perf, format_time from codeflash.models.models import GeneratedTests, GeneratedTestsList, InvocationId from codeflash.result.critic import performance_gain from codeflash.verification.verification_utils import TestConfig @@ -131,11 +131,11 @@ def leave_SimpleStatementLine( if matching_original_times and matching_optimized_times: original_time = min(matching_original_times) optimized_time = min(matching_optimized_times) - perf_gain = ( + perf_gain = format_perf( performance_gain(original_runtime_ns=original_time, optimized_runtime_ns=optimized_time) * 100 ) # Create the runtime comment - comment_text = f"# {format_time(original_time)} -> {format_time(optimized_time)} ({perf_gain:.2f}%)" + comment_text = f"# {format_time(original_time)} -> {format_time(optimized_time)} ({perf_gain}%)" # Add comment to the trailing whitespace 
new_trailing_whitespace = cst.TrailingWhitespace( diff --git a/codeflash/code_utils/time_utils.py b/codeflash/code_utils/time_utils.py index 89273fe2d..4e32eedab 100644 --- a/codeflash/code_utils/time_utils.py +++ b/codeflash/code_utils/time_utils.py @@ -85,3 +85,15 @@ def format_time(nanoseconds: int) -> str: # This should never be reached, but included for completeness return f"{nanoseconds}ns" + + +def format_perf(percentage: float) -> str: + """Format percentage into a human-readable string with 3 significant digits when needed.""" + percentage_abs = abs(percentage) + if percentage_abs >= 100: + return f"{percentage:.0f}" + if percentage_abs >= 10: + return f"{percentage:.1f}" + if percentage_abs >= 1: + return f"{percentage:.2f}" + return f"{percentage:.3f}" diff --git a/codeflash/result/create_pr.py b/codeflash/result/create_pr.py index 190d20e80..a08875a4f 100644 --- a/codeflash/result/create_pr.py +++ b/codeflash/result/create_pr.py @@ -18,7 +18,7 @@ ) from codeflash.code_utils.github_utils import github_pr_url from codeflash.code_utils.tabulate import tabulate -from codeflash.code_utils.time_utils import format_time +from codeflash.code_utils.time_utils import format_perf, format_time from codeflash.github.PrComment import FileDiffContent, PrComment from codeflash.result.critic import performance_gain @@ -40,7 +40,7 @@ def existing_tests_source_for( return "" output: str = "" rows = [] - headers = ["Test File::Test Function", "Original ⏱️", "Optimized ⏱️", "Improvement"] + headers = ["Test File::Test Function", "Original ⏱️", "Optimized ⏱️", "Speedup"] tests_root = test_cfg.tests_root module_root = test_cfg.project_root_path rel_tests_root = tests_root.relative_to(module_root) @@ -84,23 +84,17 @@ def existing_tests_source_for( ].keys() # both will have the same keys as some default values are assigned in the previous loop for qualified_name in sorted(all_qualified_names): # if not present in optimized output nan - if optimized_tests_to_runtimes[filename][qualified_name] == 0: - print_optimized_runtime = "NaN" - else: - print_optimized_runtime = format_time(optimized_tests_to_runtimes[filename][qualified_name]) - if original_tests_to_runtimes[filename][qualified_name] == 0: - print_original_runtime = "NaN" - else: - print_original_runtime = format_time(original_tests_to_runtimes[filename][qualified_name]) if ( original_tests_to_runtimes[filename][qualified_name] != 0 and optimized_tests_to_runtimes[filename][qualified_name] != 0 ): + print_optimized_runtime = format_time(optimized_tests_to_runtimes[filename][qualified_name]) + print_original_runtime = format_time(original_tests_to_runtimes[filename][qualified_name]) greater = ( optimized_tests_to_runtimes[filename][qualified_name] > original_tests_to_runtimes[filename][qualified_name] ) - perf_gain = ( + perf_gain = format_perf( performance_gain( original_runtime_ns=original_tests_to_runtimes[filename][qualified_name], optimized_runtime_ns=optimized_tests_to_runtimes[filename][qualified_name], @@ -113,7 +107,7 @@ def existing_tests_source_for( f"`{filename}::{qualified_name}`", f"{print_original_runtime}", f"{print_optimized_runtime}", - f"⚠️{perf_gain:.2f}%", + f"⚠️{perf_gain}%", ] ) else: @@ -122,14 +116,9 @@ def existing_tests_source_for( f"`{filename}::{qualified_name}`", f"{print_original_runtime}", f"{print_optimized_runtime}", - f"✅{perf_gain:.2f}%", + f"✅{perf_gain}%", ] ) - else: - # one of them is NaN - rows.append( - [f"`{filename}::{qualified_name}`", f"{print_original_runtime}", f"{print_optimized_runtime}", "❌"] - ) 
output += tabulate( # type: ignore[no-untyped-call] headers=headers, tabular_data=rows, tablefmt="pipe", colglobalalign=None, preserve_whitespace=True ) diff --git a/tests/test_add_runtime_comments.py b/tests/test_add_runtime_comments.py index c553845e8..6a579bb85 100644 --- a/tests/test_add_runtime_comments.py +++ b/tests/test_add_runtime_comments.py @@ -5,10 +5,10 @@ import pytest from codeflash.code_utils.edit_generated_tests import add_runtime_comments_to_generated_tests -from codeflash.models.models import GeneratedTests, GeneratedTestsList, InvocationId +from codeflash.models.models import GeneratedTests, GeneratedTestsList, InvocationId, FunctionTestInvocation, TestType, \ + VerificationType, TestResults from codeflash.verification.verification_utils import TestConfig - @pytest.fixture def test_config(): """Create a mock TestConfig for testing.""" @@ -19,32 +19,484 @@ def test_config(): config.tests_root = Path("/project/tests") return config +class TestAddRuntimeComments: + """Test cases for add_runtime_comments_to_generated_tests method.""" + + def create_test_invocation( + self, test_function_name: str, runtime: int, loop_index: int = 1, iteration_id: str = "1", did_pass: bool = True + ) -> FunctionTestInvocation: + """Helper to create test invocation objects.""" + return FunctionTestInvocation( + loop_index=loop_index, + id=InvocationId( + test_module_path="tests.test_module", + test_class_name=None, + test_function_name=test_function_name, + function_getting_tested="test_function", + iteration_id=iteration_id, + ), + file_name=Path("tests/test.py"), + did_pass=did_pass, + runtime=runtime, + test_framework="pytest", + test_type=TestType.GENERATED_REGRESSION, + return_value=None, + timed_out=False, + verification_type=VerificationType.FUNCTION_CALL, + ) + + def test_basic_runtime_comment_addition(self, test_config): + """Test basic functionality of adding runtime comments.""" + # Create test source code + test_source = """def test_bubble_sort(): + codeflash_output = bubble_sort([3, 1, 2]) + assert codeflash_output == [1, 2, 3] +""" + + generated_test = GeneratedTests( + generated_original_test_source=test_source, + instrumented_behavior_test_source="", + instrumented_perf_test_source="", + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py"), + ) + """add_runtime_comments_to_generated_tests( + test_config, generated_tests, original_runtimes, optimized_runtimes + )""" + generated_tests = GeneratedTestsList(generated_tests=[generated_test]) + + # Create test results + original_test_results = TestResults() + optimized_test_results = TestResults() + + # Add test invocations with different runtimes + original_invocation = self.create_test_invocation("test_bubble_sort", 500_000) # 500μs + optimized_invocation = self.create_test_invocation("test_bubble_sort", 300_000) # 300μs + + original_test_results.add(original_invocation) + optimized_test_results.add(optimized_invocation) + original_runtimes = original_test_results.usable_runtime_data_by_test_case() + optimized_runtimes = optimized_test_results.usable_runtime_data_by_test_case() + # Test the functionality + result = add_runtime_comments_to_generated_tests(test_config, generated_tests, original_runtimes, optimized_runtimes) + + # Check that comments were added + modified_source = result.generated_tests[0].generated_original_test_source + assert "# 500μs -> 300μs" in modified_source + assert "codeflash_output = bubble_sort([3, 1, 2]) # 500μs -> 300μs" in modified_source 
+ + def test_multiple_test_functions(self, test_config): + """Test handling multiple test functions in the same file.""" + test_source = """def test_bubble_sort(): + codeflash_output = bubble_sort([3, 1, 2]) + assert codeflash_output == [1, 2, 3] + +def test_quick_sort(): + codeflash_output = quick_sort([5, 2, 8]) + assert codeflash_output == [2, 5, 8] + +def helper_function(): + return "not a test" +""" + + generated_test = GeneratedTests( + generated_original_test_source=test_source, + instrumented_behavior_test_source="", + instrumented_perf_test_source="", + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py") + ) + + generated_tests = GeneratedTestsList(generated_tests=[generated_test]) + + # Create test results for both functions + original_test_results = TestResults() + optimized_test_results = TestResults() + + # Add test invocations for both test functions + original_test_results.add(self.create_test_invocation("test_bubble_sort", 500_000)) + original_test_results.add(self.create_test_invocation("test_quick_sort", 800_000)) + + optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 300_000)) + optimized_test_results.add(self.create_test_invocation("test_quick_sort", 600_000)) + + original_runtimes = original_test_results.usable_runtime_data_by_test_case() + optimized_runtimes = optimized_test_results.usable_runtime_data_by_test_case() + + # Test the functionality + result = add_runtime_comments_to_generated_tests(test_config, generated_tests, original_runtimes, optimized_runtimes) + + modified_source = result.generated_tests[0].generated_original_test_source + + # Check that comments were added to both test functions + assert "# 500μs -> 300μs" in modified_source + assert "# 800μs -> 600μs" in modified_source + # Helper function should not have comments + assert ( + "helper_function():" in modified_source + and "# " not in modified_source.split("helper_function():")[1].split("\n")[0] + ) + + def test_different_time_formats(self, test_config): + """Test that different time ranges are formatted correctly with new precision rules.""" + test_cases = [ + (999, 500, "999ns -> 500ns"), # nanoseconds + (25_000, 18_000, "25.0μs -> 18.0μs"), # microseconds with precision + (500_000, 300_000, "500μs -> 300μs"), # microseconds full integers + (1_500_000, 800_000, "1.50ms -> 800μs"), # milliseconds with precision + (365_000_000, 290_000_000, "365ms -> 290ms"), # milliseconds full integers + (2_000_000_000, 1_500_000_000, "2.00s -> 1.50s"), # seconds with precision + ] + + for original_time, optimized_time, expected_comment in test_cases: + test_source = """def test_function(): + codeflash_output = some_function() + assert codeflash_output is not None +""" + + generated_test = GeneratedTests( + generated_original_test_source=test_source, + instrumented_behavior_test_source="", + instrumented_perf_test_source="", + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py") + ) + + generated_tests = GeneratedTestsList(generated_tests=[generated_test]) + + # Create test results + original_test_results = TestResults() + optimized_test_results = TestResults() + + original_test_results.add(self.create_test_invocation("test_function", original_time)) + optimized_test_results.add(self.create_test_invocation("test_function", optimized_time)) + + original_runtimes = original_test_results.usable_runtime_data_by_test_case() + optimized_runtimes = 
optimized_test_results.usable_runtime_data_by_test_case() + # Test the functionality + result = add_runtime_comments_to_generated_tests( + test_config, generated_tests, original_runtimes, optimized_runtimes + ) + + modified_source = result.generated_tests[0].generated_original_test_source + assert f"# {expected_comment}" in modified_source + + def test_missing_test_results(self, test_config): + """Test behavior when test results are missing for a test function.""" + test_source = """def test_bubble_sort(): + codeflash_output = bubble_sort([3, 1, 2]) + assert codeflash_output == [1, 2, 3] +""" + + generated_test = GeneratedTests( + generated_original_test_source=test_source, + instrumented_behavior_test_source="", + instrumented_perf_test_source="", + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py") + ) + + generated_tests = GeneratedTestsList(generated_tests=[generated_test]) + + # Create empty test results + original_test_results = TestResults() + optimized_test_results = TestResults() + + original_runtimes = original_test_results.usable_runtime_data_by_test_case() + optimized_runtimes = optimized_test_results.usable_runtime_data_by_test_case() + + # Test the functionality + result = add_runtime_comments_to_generated_tests(test_config, generated_tests, original_runtimes, optimized_runtimes) + + # Check that no comments were added + modified_source = result.generated_tests[0].generated_original_test_source + assert modified_source == test_source # Should be unchanged + + def test_partial_test_results(self, test_config): + """Test behavior when only one set of test results is available.""" + test_source = """def test_bubble_sort(): + codeflash_output = bubble_sort([3, 1, 2]) + assert codeflash_output == [1, 2, 3] +""" + + generated_test = GeneratedTests( + generated_original_test_source=test_source, + instrumented_behavior_test_source="", + instrumented_perf_test_source="", + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py") + ) + + generated_tests = GeneratedTestsList(generated_tests=[generated_test]) + + # Create test results with only original data + original_test_results = TestResults() + optimized_test_results = TestResults() + + original_test_results.add(self.create_test_invocation("test_bubble_sort", 500_000)) + # No optimized results + original_runtimes = original_test_results.usable_runtime_data_by_test_case() + optimized_runtimes = optimized_test_results.usable_runtime_data_by_test_case() + # Test the functionality + result = add_runtime_comments_to_generated_tests(test_config, generated_tests, original_runtimes, optimized_runtimes) + + # Check that no comments were added + modified_source = result.generated_tests[0].generated_original_test_source + assert modified_source == test_source # Should be unchanged + + def test_multiple_runtimes_uses_minimum(self, test_config): + """Test that when multiple runtimes exist, the minimum is used.""" + test_source = """def test_bubble_sort(): + codeflash_output = bubble_sort([3, 1, 2]) + assert codeflash_output == [1, 2, 3] +""" + + generated_test = GeneratedTests( + generated_original_test_source=test_source, + instrumented_behavior_test_source="", + instrumented_perf_test_source="", + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py") + ) + + generated_tests = GeneratedTestsList(generated_tests=[generated_test]) + + # Create test results 
with multiple loop iterations + original_test_results = TestResults() + optimized_test_results = TestResults() + + # Add multiple runs with different runtimes + original_test_results.add(self.create_test_invocation("test_bubble_sort", 600_000, loop_index=1)) + original_test_results.add(self.create_test_invocation("test_bubble_sort", 500_000, loop_index=2)) + original_test_results.add(self.create_test_invocation("test_bubble_sort", 550_000, loop_index=3)) + + optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 350_000, loop_index=1)) + optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 300_000, loop_index=2)) + optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 320_000, loop_index=3)) + + original_runtimes = original_test_results.usable_runtime_data_by_test_case() + optimized_runtimes = optimized_test_results.usable_runtime_data_by_test_case() + # Test the functionality + result = add_runtime_comments_to_generated_tests(test_config, generated_tests, original_runtimes, optimized_runtimes) + + # Check that minimum times were used (500μs -> 300μs) + modified_source = result.generated_tests[0].generated_original_test_source + assert "# 500μs -> 300μs" in modified_source + + def test_no_codeflash_output_assignment(self, test_config): + """Test behavior when test doesn't have codeflash_output assignment.""" + test_source = """def test_bubble_sort(): + result = bubble_sort([3, 1, 2]) + assert result == [1, 2, 3] +""" + + generated_test = GeneratedTests( + generated_original_test_source=test_source, + instrumented_behavior_test_source="", + instrumented_perf_test_source="", + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py") + ) + + generated_tests = GeneratedTestsList(generated_tests=[generated_test]) + + # Create test results + original_test_results = TestResults() + optimized_test_results = TestResults() + + original_test_results.add(self.create_test_invocation("test_bubble_sort", 500_000)) + optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 300_000)) + + original_runtimes = original_test_results.usable_runtime_data_by_test_case() + optimized_runtimes = optimized_test_results.usable_runtime_data_by_test_case() + + # Test the functionality + result = add_runtime_comments_to_generated_tests(test_config, generated_tests, original_runtimes, optimized_runtimes) + + # Check that no comments were added (no codeflash_output assignment) + modified_source = result.generated_tests[0].generated_original_test_source + assert modified_source == test_source # Should be unchanged + + def test_invalid_python_code_handling(self, test_config): + """Test behavior when test source code is invalid Python.""" + test_source = """def test_bubble_sort(: + codeflash_output = bubble_sort([3, 1, 2]) + assert codeflash_output == [1, 2, 3] +""" # Invalid syntax: extra colon + + generated_test = GeneratedTests( + generated_original_test_source=test_source, + instrumented_behavior_test_source="", + instrumented_perf_test_source="", + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py") + ) + + generated_tests = GeneratedTestsList(generated_tests=[generated_test]) + + # Create test results + original_test_results = TestResults() + optimized_test_results = TestResults() + + original_test_results.add(self.create_test_invocation("test_bubble_sort", 500_000)) + 
optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 300_000)) + + original_runtimes = original_test_results.usable_runtime_data_by_test_case() + optimized_runtimes = optimized_test_results.usable_runtime_data_by_test_case() + + # Test the functionality - should handle parse error gracefully + result = add_runtime_comments_to_generated_tests(test_config, generated_tests, original_runtimes, optimized_runtimes) + + # Check that original test is preserved when parsing fails + modified_source = result.generated_tests[0].generated_original_test_source + assert modified_source == test_source # Should be unchanged due to parse error + + def test_multiple_generated_tests(self, test_config): + """Test handling multiple generated test objects.""" + test_source_1 = """def test_bubble_sort(): + codeflash_output = bubble_sort([3, 1, 2]) + assert codeflash_output == [1, 2, 3] +""" + + test_source_2 = """def test_quick_sort(): + codeflash_output = quick_sort([5, 2, 8]) + assert codeflash_output == [2, 5, 8] +""" + + generated_test_1 = GeneratedTests( + generated_original_test_source=test_source_1, + instrumented_behavior_test_source="", + instrumented_perf_test_source="", + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py") + ) + + generated_test_2 = GeneratedTests( + generated_original_test_source=test_source_2, + instrumented_behavior_test_source="", + instrumented_perf_test_source="", + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py") + ) + + generated_tests = GeneratedTestsList(generated_tests=[generated_test_1, generated_test_2]) + + # Create test results + original_test_results = TestResults() + optimized_test_results = TestResults() + + original_test_results.add(self.create_test_invocation("test_bubble_sort", 500_000)) + original_test_results.add(self.create_test_invocation("test_quick_sort", 800_000)) + + optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 300_000)) + optimized_test_results.add(self.create_test_invocation("test_quick_sort", 600_000)) + + original_runtimes = original_test_results.usable_runtime_data_by_test_case() + optimized_runtimes = optimized_test_results.usable_runtime_data_by_test_case() + + # Test the functionality + result = add_runtime_comments_to_generated_tests(test_config, generated_tests, original_runtimes, optimized_runtimes) + + # Check that comments were added to both test files + modified_source_1 = result.generated_tests[0].generated_original_test_source + modified_source_2 = result.generated_tests[1].generated_original_test_source + + assert "# 500μs -> 300μs" in modified_source_1 + assert "# 800μs -> 600μs" in modified_source_2 + + def test_preserved_test_attributes(self, test_config): + """Test that other test attributes are preserved during modification.""" + test_source = """def test_bubble_sort(): + codeflash_output = bubble_sort([3, 1, 2]) + assert codeflash_output == [1, 2, 3] +""" + + original_behavior_source = "behavior test source" + original_perf_source = "perf test source" + original_behavior_path = Path("/project/tests/test_module.py") + original_perf_path = Path("/project/tests/test_module_perf.py") + + generated_test = GeneratedTests( + generated_original_test_source=test_source, + instrumented_behavior_test_source=original_behavior_source, + instrumented_perf_test_source=original_perf_source, + behavior_file_path=original_behavior_path, + 
perf_file_path=original_perf_path + ) + + generated_tests = GeneratedTestsList(generated_tests=[generated_test]) + + # Create test results + original_test_results = TestResults() + optimized_test_results = TestResults() + + original_test_results.add(self.create_test_invocation("test_bubble_sort", 500_000)) + optimized_test_results.add(self.create_test_invocation("test_bubble_sort", 300_000)) + + original_runtimes = original_test_results.usable_runtime_data_by_test_case() + optimized_runtimes = optimized_test_results.usable_runtime_data_by_test_case() + # Test the functionality + result = add_runtime_comments_to_generated_tests(test_config, generated_tests, original_runtimes, optimized_runtimes) + + # Check that other attributes are preserved + modified_test = result.generated_tests[0] + assert modified_test.instrumented_behavior_test_source == original_behavior_source + assert modified_test.instrumented_perf_test_source == original_perf_source + assert modified_test.behavior_file_path == original_behavior_path + assert modified_test.perf_file_path == original_perf_path + + # Check that only the generated_original_test_source was modified + assert "# 500μs -> 300μs" in modified_test.generated_original_test_source + + def test_multistatement_line_handling(self, test_config): + """Test that runtime comments work correctly with multiple statements on one line.""" + test_source = """def test_mutation_of_input(): + # Test that the input list is mutated in-place and returned + arr = [3, 1, 2] + codeflash_output = sorter(arr); result = codeflash_output + assert result == [1, 2, 3] + assert arr == [1, 2, 3] # Input should be mutated +""" + + generated_test = GeneratedTests( + generated_original_test_source=test_source, + instrumented_behavior_test_source="", + instrumented_perf_test_source="", + behavior_file_path=Path("/project/tests/test_module.py"), + perf_file_path=Path("/project/tests/test_module_perf.py") + ) + + generated_tests = GeneratedTestsList(generated_tests=[generated_test]) + + # Create test results + original_test_results = TestResults() + optimized_test_results = TestResults() + + original_test_results.add(self.create_test_invocation("test_mutation_of_input", 19_000)) # 19μs + optimized_test_results.add(self.create_test_invocation("test_mutation_of_input", 14_000)) # 14μs + + original_runtimes = original_test_results.usable_runtime_data_by_test_case() + optimized_runtimes = optimized_test_results.usable_runtime_data_by_test_case() + + # Test the functionality + result = add_runtime_comments_to_generated_tests(test_config, generated_tests, original_runtimes, optimized_runtimes) + + # Check that comments were added to the correct line + modified_source = result.generated_tests[0].generated_original_test_source + assert "# 19.0μs -> 14.0μs" in modified_source + + # Verify the comment is on the line with codeflash_output assignment + lines = modified_source.split("\n") + codeflash_line = None + for line in lines: + if "codeflash_output = sorter(arr)" in line: + codeflash_line = line + break + + assert codeflash_line is not None, "Could not find codeflash_output assignment line" + assert "# 19.0μs -> 14.0μs" in codeflash_line, f"Comment not found in the correct line: {codeflash_line}" + -# @pytest.fixture -# def sample_invocation_id(): -# """Create a sample InvocationId for testing.""" -# return InvocationId( -# test_module_path="test_module_path", -# test_class_name="test_class_name", -# test_function_name="test_function_name", -# function_getting_tested="function_getting_tested", -# 
iteration_id="0", -# ) -# -# -# @pytest.fixture -# def sample_invocation_id_no_class(): -# """Create a sample InvocationId without class for testing.""" -# return InvocationId( -# test_module_path="test_module_path", -# test_class_name=None, -# test_function_name="test_function_name", -# function_getting_tested="function_getting_tested", -# iteration_id="0", -# ) - - -class TestAddRuntimeCommentsToGeneratedTests: def test_add_runtime_comments_simple_function(self, test_config): """Test adding runtime comments to a simple test function.""" test_source = '''def test_function(): @@ -78,7 +530,7 @@ def test_add_runtime_comments_simple_function(self, test_config): ) expected_source = '''def test_function(): - codeflash_output = some_function() # 1.00s -> 500ms (100.00%) + codeflash_output = some_function() # 1.00s -> 500ms (100%) assert codeflash_output == expected ''' @@ -121,7 +573,7 @@ def test_function(self): expected_source = '''class TestClass: def test_function(self): - codeflash_output = some_function() # 2.00s -> 1.00s (100.00%) + codeflash_output = some_function() # 2.00s -> 1.00s (100%) assert codeflash_output == expected ''' @@ -165,9 +617,9 @@ def test_add_runtime_comments_multiple_assignments(self, test_config): expected_source = '''def test_function(): setup_data = prepare_test() - codeflash_output = some_function() # 1.50s -> 750ms (100.00%) + codeflash_output = some_function() # 1.50s -> 750ms (100%) assert codeflash_output == expected - codeflash_output = another_function() # 1.50s -> 750ms (100.00%) + codeflash_output = another_function() # 1.50s -> 750ms (100%) assert codeflash_output == expected2 ''' @@ -307,12 +759,12 @@ def test_add_runtime_comments_multiple_tests(self, test_config): ) expected_source1 = '''def test_function1(): - codeflash_output = some_function() # 1.00s -> 500ms (100.00%) + codeflash_output = some_function() # 1.00s -> 500ms (100%) assert codeflash_output == expected ''' expected_source2 = '''def test_function2(): - codeflash_output = another_function() # 2.00s -> 800ms (150.00%) + codeflash_output = another_function() # 2.00s -> 800ms (150%) assert codeflash_output == expected ''' @@ -353,7 +805,7 @@ def test_add_runtime_comments_performance_regression(self, test_config): ) expected_source = '''def test_function(): - codeflash_output = some_function() # 1.00s -> 1.50s (-33.33%) + codeflash_output = some_function() # 1.00s -> 1.50s (-33.3%) assert codeflash_output == expected ''' diff --git a/tests/test_existing_tests_source_for.py b/tests/test_existing_tests_source_for.py index 945de6d84..8940b20d2 100644 --- a/tests/test_existing_tests_source_for.py +++ b/tests/test_existing_tests_source_for.py @@ -64,9 +64,9 @@ def test_single_test_with_improvement(self): optimized_runtimes ) - expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | -|:------------------------------------------|:--------------|:---------------|:--------------| -| `test_module.py::TestClass.test_function` | 1.00ms | 500μs | ✅100.00% | + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Speedup | +|:------------------------------------------|:--------------|:---------------|:----------| +| `test_module.py::TestClass.test_function` | 1.00ms | 500μs | ✅100% | """ assert result == expected @@ -91,9 +91,9 @@ def test_single_test_with_regression(self): optimized_runtimes ) - expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | 
-|:------------------------------------------|:--------------|:---------------|:--------------| -| `test_module.py::TestClass.test_function` | 500μs | 1.00ms | ⚠️-50.00% | + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Speedup | +|:------------------------------------------|:--------------|:---------------|:----------| +| `test_module.py::TestClass.test_function` | 500μs | 1.00ms | ⚠️-50.0% | """ assert result == expected @@ -123,9 +123,9 @@ def test_test_without_class_name(self): optimized_runtimes ) - expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | -|:----------------------------------|:--------------|:---------------|:--------------| -| `test_module.py::test_standalone` | 1.00ms | 800μs | ✅25.00% | + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Speedup | +|:----------------------------------|:--------------|:---------------|:----------| +| `test_module.py::test_standalone` | 1.00ms | 800μs | ✅25.0% | """ assert result == expected @@ -148,9 +148,8 @@ def test_missing_original_runtime(self): optimized_runtimes ) - expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | -|:------------------------------------------|--------------:|:---------------|:--------------| -| `test_module.py::TestClass.test_function` | nan | 500μs | ❌ | + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Speedup | +|----------------------------|---------------|----------------|-----------| """ assert result == expected @@ -173,9 +172,8 @@ def test_missing_optimized_runtime(self): optimized_runtimes ) - expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | -|:------------------------------------------|:--------------|---------------:|:--------------| -| `test_module.py::TestClass.test_function` | 1.00ms | nan | ❌ | + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Speedup | +|----------------------------|---------------|----------------|-----------| """ assert result == expected @@ -212,10 +210,10 @@ def test_multiple_tests_sorted_output(self): optimized_runtimes ) - expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | -|:-----------------------------------------------------|:--------------|:---------------|:--------------| -| `test_another.py::TestAnother.test_another_function` | 2.00ms | 1.50ms | ✅33.33% | -| `test_module.py::TestClass.test_function` | 1.00ms | 800μs | ✅25.00% | + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Speedup | +|:-----------------------------------------------------|:--------------|:---------------|:----------| +| `test_another.py::TestAnother.test_another_function` | 2.00ms | 1.50ms | ✅33.3% | +| `test_module.py::TestClass.test_function` | 1.00ms | 800μs | ✅25.0% | """ assert result == expected @@ -240,9 +238,9 @@ def test_multiple_runtimes_uses_minimum(self): optimized_runtimes ) - expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | -|:------------------------------------------|:--------------|:---------------|:--------------| -| `test_module.py::TestClass.test_function` | 800μs | 500μs | ✅60.00% | + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Speedup | +|:------------------------------------------|:--------------|:---------------|:----------| +| `test_module.py::TestClass.test_function` | 800μs | 500μs | ✅60.0% | """ assert result == expected @@ -276,9 +274,9 @@ def 
test_complex_module_path_conversion(self): optimized_runtimes ) - expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | -|:------------------------------------------------------------------------|:--------------|:---------------|:--------------| -| `integration/test_complex_module.py::TestComplex.test_complex_function` | 1.00ms | 750μs | ✅33.33% | + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Speedup | +|:------------------------------------------------------------------------|:--------------|:---------------|:----------| +| `integration/test_complex_module.py::TestComplex.test_complex_function` | 1.00ms | 750μs | ✅33.3% | """ assert result == expected @@ -303,9 +301,8 @@ def test_zero_runtime_values(self): optimized_runtimes ) - expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | -|:------------------------------------------|--------------:|---------------:|:--------------| -| `test_module.py::TestClass.test_function` | nan | nan | ❌ | + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Speedup | +|----------------------------|---------------|----------------|-----------| """ assert result == expected @@ -343,9 +340,9 @@ def test_filters_out_generated_tests(self): ) # Should only include the non-generated test - expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Improvement | -|:------------------------------------------|:--------------|:---------------|:--------------| -| `test_module.py::TestClass.test_function` | 1.00ms | 800μs | ✅25.00% | + expected = """| Test File::Test Function | Original ⏱️ | Optimized ⏱️ | Speedup | +|:------------------------------------------|:--------------|:---------------|:----------| +| `test_module.py::TestClass.test_function` | 1.00ms | 800μs | ✅25.0% | """ assert result == expected diff --git a/tests/test_humanize_time.py b/tests/test_humanize_time.py index 4021b077e..ecc5e16d7 100644 --- a/tests/test_humanize_time.py +++ b/tests/test_humanize_time.py @@ -1,4 +1,5 @@ from codeflash.code_utils.time_utils import humanize_runtime, format_time +from codeflash.code_utils.time_utils import format_perf import pytest @@ -172,4 +173,103 @@ def test_negative_values(self): # This test depends on whether your function should handle negative values # You might want to modify based on expected behavior with pytest.raises((ValueError, TypeError)) or pytest.warns(): - format_time(-1000) \ No newline at end of file + format_time(-1000) + + +class TestFormatPerf: + """Test cases for the format_perf function.""" + + def test_format_perf_large_values_above_100(self): + """Test formatting for values above 100 (no decimal places).""" + assert format_perf(150.789) == "151" + assert format_perf(999.999) == "1000" + assert format_perf(100.1) == "100" + assert format_perf(500) == "500" + assert format_perf(1000.5) == "1000" + + def test_format_perf_medium_values_10_to_100(self): + """Test formatting for values between 10 and 100 (1 decimal place).""" + assert format_perf(99.99) == "100.0" + assert format_perf(50.789) == "50.8" + assert format_perf(10.1) == "10.1" + assert format_perf(25.0) == "25.0" + assert format_perf(33.333) == "33.3" + + def test_format_perf_small_values_1_to_10(self): + """Test formatting for values between 1 and 10 (2 decimal places).""" + assert format_perf(9.999) == "10.00" + assert format_perf(5.789) == "5.79" + assert format_perf(1.1) == "1.10" + assert format_perf(2.0) == "2.00" + assert format_perf(7.123) == "7.12" + 
+ def test_format_perf_very_small_values_below_1(self): + """Test formatting for values below 1 (3 decimal places).""" + assert format_perf(0.999) == "0.999" + assert format_perf(0.5) == "0.500" + assert format_perf(0.123) == "0.123" + assert format_perf(0.001) == "0.001" + assert format_perf(0.0) == "0.000" + + def test_format_perf_negative_values(self): + """Test formatting for negative values (uses absolute value for comparison).""" + assert format_perf(-150.789) == "-151" + assert format_perf(-50.789) == "-50.8" + assert format_perf(-5.789) == "-5.79" + assert format_perf(-0.999) == "-0.999" + assert format_perf(-0.0) == "-0.000" + + def test_format_perf_boundary_values(self): + """Test formatting for exact boundary values.""" + assert format_perf(100.0) == "100" + assert format_perf(10.0) == "10.0" + assert format_perf(1.0) == "1.00" + assert format_perf(-100.0) == "-100" + assert format_perf(-10.0) == "-10.0" + assert format_perf(-1.0) == "-1.00" + + def test_format_perf_integer_inputs(self): + """Test formatting with integer inputs.""" + assert format_perf(150) == "150" + assert format_perf(50) == "50.0" + assert format_perf(5) == "5.00" + assert format_perf(0) == "0.000" + assert format_perf(-150) == "-150" + assert format_perf(-50) == "-50.0" + assert format_perf(-5) == "-5.00" + + def test_format_perf_float_inputs(self): + """Test formatting with float inputs.""" + assert format_perf(123.456) == "123" + assert format_perf(12.3456) == "12.3" + assert format_perf(1.23456) == "1.23" + assert format_perf(0.123456) == "0.123" + + def test_format_perf_edge_cases(self): + """Test formatting for edge cases and special values.""" + # Very large numbers + assert format_perf(999999.99) == "1000000" + assert format_perf(1000000) == "1000000" + + # Very small positive numbers + assert format_perf(0.0001) == "0.000" + assert format_perf(0.00001) == "0.000" + + # Numbers very close to boundaries + assert format_perf(99.9999) == "100.0" + assert format_perf(9.9999) == "10.00" + assert format_perf(0.9999) == "1.000" + + def test_format_perf_rounding_behavior(self): + """Test that rounding behavior is consistent.""" + # Test rounding up + assert format_perf(100.5) == "100" + assert format_perf(10.55) == "10.6" + assert format_perf(1.555) == "1.55" + assert format_perf(0.1555) == "0.155" + + # Test rounding down + assert format_perf(100.4) == "100" + assert format_perf(10.54) == "10.5" + assert format_perf(1.554) == "1.55" + assert format_perf(0.1554) == "0.155" \ No newline at end of file From 36b9c753c209d49d759950d924a3649488ce6674 Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Wed, 18 Jun 2025 14:48:30 -0700 Subject: [PATCH 15/16] non zero runtimes for comments --- codeflash/code_utils/edit_generated_tests.py | 30 +++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/codeflash/code_utils/edit_generated_tests.py b/codeflash/code_utils/edit_generated_tests.py index afb33317c..7996bce3d 100644 --- a/codeflash/code_utils/edit_generated_tests.py +++ b/codeflash/code_utils/edit_generated_tests.py @@ -131,20 +131,22 @@ def leave_SimpleStatementLine( if matching_original_times and matching_optimized_times: original_time = min(matching_original_times) optimized_time = min(matching_optimized_times) - perf_gain = format_perf( - performance_gain(original_runtime_ns=original_time, optimized_runtime_ns=optimized_time) * 100 - ) - # Create the runtime comment - comment_text = f"# {format_time(original_time)} -> {format_time(optimized_time)} ({perf_gain}%)" - - # Add comment to 
the trailing whitespace - new_trailing_whitespace = cst.TrailingWhitespace( - whitespace=cst.SimpleWhitespace(" "), - comment=cst.Comment(comment_text), - newline=updated_node.trailing_whitespace.newline, - ) - - return updated_node.with_changes(trailing_whitespace=new_trailing_whitespace) + if original_time != 0 and optimized_time != 0: + perf_gain = format_perf( + performance_gain(original_runtime_ns=original_time, optimized_runtime_ns=optimized_time) + * 100 + ) + # Create the runtime comment + comment_text = f"# {format_time(original_time)} -> {format_time(optimized_time)} ({perf_gain}%)" + + # Add comment to the trailing whitespace + new_trailing_whitespace = cst.TrailingWhitespace( + whitespace=cst.SimpleWhitespace(" "), + comment=cst.Comment(comment_text), + newline=updated_node.trailing_whitespace.newline, + ) + + return updated_node.with_changes(trailing_whitespace=new_trailing_whitespace) return updated_node From 70360c6ab537c93b8208a8f5885d03b4be84fa5f Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Wed, 18 Jun 2025 15:31:08 -0700 Subject: [PATCH 16/16] absolute value of percentage --- codeflash/code_utils/edit_generated_tests.py | 11 ++++++++--- tests/test_add_runtime_comments.py | 14 +++++++------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/codeflash/code_utils/edit_generated_tests.py b/codeflash/code_utils/edit_generated_tests.py index 7996bce3d..547dbc92b 100644 --- a/codeflash/code_utils/edit_generated_tests.py +++ b/codeflash/code_utils/edit_generated_tests.py @@ -133,11 +133,16 @@ def leave_SimpleStatementLine( optimized_time = min(matching_optimized_times) if original_time != 0 and optimized_time != 0: perf_gain = format_perf( - performance_gain(original_runtime_ns=original_time, optimized_runtime_ns=optimized_time) - * 100 + abs( + performance_gain(original_runtime_ns=original_time, optimized_runtime_ns=optimized_time) + * 100 + ) ) + status = "slower" if optimized_time > original_time else "faster" # Create the runtime comment - comment_text = f"# {format_time(original_time)} -> {format_time(optimized_time)} ({perf_gain}%)" + comment_text = ( + f"# {format_time(original_time)} -> {format_time(optimized_time)} ({perf_gain}% {status})" + ) # Add comment to the trailing whitespace new_trailing_whitespace = cst.TrailingWhitespace( diff --git a/tests/test_add_runtime_comments.py b/tests/test_add_runtime_comments.py index 6a579bb85..66a77b0d0 100644 --- a/tests/test_add_runtime_comments.py +++ b/tests/test_add_runtime_comments.py @@ -530,7 +530,7 @@ def test_add_runtime_comments_simple_function(self, test_config): ) expected_source = '''def test_function(): - codeflash_output = some_function() # 1.00s -> 500ms (100%) + codeflash_output = some_function() # 1.00s -> 500ms (100% faster) assert codeflash_output == expected ''' @@ -573,7 +573,7 @@ def test_function(self): expected_source = '''class TestClass: def test_function(self): - codeflash_output = some_function() # 2.00s -> 1.00s (100%) + codeflash_output = some_function() # 2.00s -> 1.00s (100% faster) assert codeflash_output == expected ''' @@ -617,9 +617,9 @@ def test_add_runtime_comments_multiple_assignments(self, test_config): expected_source = '''def test_function(): setup_data = prepare_test() - codeflash_output = some_function() # 1.50s -> 750ms (100%) + codeflash_output = some_function() # 1.50s -> 750ms (100% faster) assert codeflash_output == expected - codeflash_output = another_function() # 1.50s -> 750ms (100%) + codeflash_output = another_function() # 1.50s -> 750ms (100% faster) assert 
codeflash_output == expected2 ''' @@ -759,12 +759,12 @@ def test_add_runtime_comments_multiple_tests(self, test_config): ) expected_source1 = '''def test_function1(): - codeflash_output = some_function() # 1.00s -> 500ms (100%) + codeflash_output = some_function() # 1.00s -> 500ms (100% faster) assert codeflash_output == expected ''' expected_source2 = '''def test_function2(): - codeflash_output = another_function() # 2.00s -> 800ms (150%) + codeflash_output = another_function() # 2.00s -> 800ms (150% faster) assert codeflash_output == expected ''' @@ -805,7 +805,7 @@ def test_add_runtime_comments_performance_regression(self, test_config): ) expected_source = '''def test_function(): - codeflash_output = some_function() # 1.00s -> 1.50s (-33.3%) + codeflash_output = some_function() # 1.00s -> 1.50s (33.3% slower) assert codeflash_output == expected '''