2 changes: 1 addition & 1 deletion .github/workflows/end-to-end-test-init-optim.yaml

```diff
@@ -20,7 +20,7 @@ jobs:
       COLUMNS: 110
       MAX_RETRIES: 3
       RETRY_DELAY: 5
-      EXPECTED_IMPROVEMENT_PCT: 300
+      EXPECTED_IMPROVEMENT_PCT: 30
       CODEFLASH_END_TO_END: 1
     steps:
       - name: 🛎️ Checkout
```
1 change: 1 addition & 0 deletions codeflash/api/aiservice.py

```diff
@@ -248,6 +248,7 @@ def generate_regression_tests(
         test_timeout: int,
         trace_id: str,
         test_index: int,
+        single_prompt: bool=False,
     ) -> tuple[str, str, str] | None:
         """Generate regression tests for the given function by making a request to the Django endpoint.
```
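The new `single_prompt` keyword defaults to `False`, so existing call sites are unaffected. The request body of `generate_regression_tests` is not visible in this diff; the sketch below is only a hypothetical illustration of forwarding a defaulted flag into a payload, and the field names are assumptions, not the actual API.

```python
# Hypothetical sketch: the real payload fields are not shown in this diff.
# This only illustrates how a defaulted flag is typically threaded into a request body.
def build_testgen_payload(
    trace_id: str,
    test_index: int,
    test_timeout: int,
    single_prompt: bool = False,  # new flag; old callers omit it and get False
) -> dict:
    return {
        "trace_id": trace_id,
        "test_index": test_index,
        "test_timeout": test_timeout,
        "single_prompt": single_prompt,
    }
```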
2 changes: 1 addition & 1 deletion codeflash/code_utils/config_consts.py

```diff
@@ -5,6 +5,6 @@
 MIN_IMPROVEMENT_THRESHOLD = 0.05
 MAX_TEST_FUNCTION_RUNS = 50
 MAX_CUMULATIVE_TEST_RUNTIME_NANOSECONDS = 100e6  # 100ms
-N_TESTS_TO_GENERATE = 2
+N_TESTS_TO_GENERATE = 4
 TOTAL_LOOPING_TIME = 10.0  # 10 second candidate benchmarking budget
 COVERAGE_THRESHOLD = 60.0
```
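Doubling `N_TESTS_TO_GENERATE` from 2 to 4 pairs with the new worker arithmetic in the function_optimizer.py hunk below, where the thread pool is sized at `2*N + 2` (or `2*N + 3` when the experiment runs). A standalone restatement of that arithmetic:

```python
# Worker-count arithmetic from generate_tests_and_optimizations, restated standalone.
# With N_TESTS_TO_GENERATE = 4 this yields 10 workers normally, 11 under the experiment.
N_TESTS_TO_GENERATE = 4

def max_workers(run_experiment: bool) -> int:
    return 2 * N_TESTS_TO_GENERATE + 2 if not run_experiment else 2 * N_TESTS_TO_GENERATE + 3

assert max_workers(run_experiment=False) == 10
assert max_workers(run_experiment=True) == 11
```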
32 changes: 29 additions & 3 deletions codeflash/optimization/function_optimizer.py

```diff
@@ -162,6 +162,7 @@ def optimize_function(self) -> Result[BestOptimization, str]:
             f"Generating new tests and optimizations for function {self.function_to_optimize.function_name}",
             transient=True,
         ):
+            # TODO: do a/b testing with same codegen but different testgen
            generated_results = self.generate_tests_and_optimizations(
                 testgen_context_code=code_context.testgen_context_code,
                 read_writable_code=code_context.read_writable_code,
@@ -745,7 +746,8 @@ def generate_tests_and_optimizations(
         run_experiment: bool = False,
     ) -> Result[tuple[GeneratedTestsList, dict[str, list[FunctionCalledInTest]], OptimizationSet], str]:
         assert len(generated_test_paths) == N_TESTS_TO_GENERATE
-        max_workers = N_TESTS_TO_GENERATE + 2 if not run_experiment else N_TESTS_TO_GENERATE + 3
+        max_workers = 2*N_TESTS_TO_GENERATE + 2 if not run_experiment else 2*N_TESTS_TO_GENERATE + 3
+        self.local_aiservice_client = LocalAiServiceClient()
         console.rule()
         with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
             # Submit the test generation task as future
@@ -755,6 +757,7 @@
                 [definition.fully_qualified_name for definition in helper_functions],
                 generated_test_paths,
                 generated_perf_test_paths,
+                run_experiment=True,
             )
             future_optimization_candidates = executor.submit(
                 self.aiservice_client.optimize_python_code,
@@ -1208,8 +1211,9 @@ def generate_and_instrument_tests(
         helper_function_names: list[str],
         generated_test_paths: list[Path],
         generated_perf_test_paths: list[Path],
+        run_experiment: bool
     ) -> list[concurrent.futures.Future]:
-        return [
+        original = [
             executor.submit(
                 generate_tests,
                 self.aiservice_client,
@@ -1223,11 +1227,33 @@
                 test_index,
                 test_path,
                 test_perf_path,
+                single_prompt=False,
             )
             for test_index, (test_path, test_perf_path) in enumerate(
-                zip(generated_test_paths, generated_perf_test_paths)
+                zip(generated_test_paths[:2], generated_perf_test_paths[:2])
             )
         ]
+        if run_experiment:
+            original+=[
+                executor.submit(
+                    generate_tests,
+                    self.local_aiservice_client,
+                    source_code_being_tested,
+                    self.function_to_optimize,
+                    helper_function_names,
+                    Path(self.original_module_path),
+                    self.test_cfg,
+                    INDIVIDUAL_TESTCASE_TIMEOUT,
+                    self.function_trace_id,
+                    test_index,
+                    test_path,
+                    test_perf_path,
+                    single_prompt=True,
+                )
+                for test_index, (test_path, test_perf_path) in enumerate(
+                    zip(generated_test_paths[2:], generated_perf_test_paths[2:])
+                )]
+        return original

     def cleanup_generated_files(self) -> None:
         paths_to_cleanup = (
```
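The experiment branch splits the four generated test paths in half: the first two go to the regular `aiservice_client` with `single_prompt=False`, the last two to the new `local_aiservice_client` with `single_prompt=True`. A standalone sketch of that split; the `control`/`experiment` names are illustrative, and the real code submits `generate_tests` jobs to a `ThreadPoolExecutor` rather than building tuples:

```python
# Standalone sketch of the A/B split done in generate_and_instrument_tests.
from pathlib import Path

test_paths = [Path(f"test_{i}.py") for i in range(4)]  # N_TESTS_TO_GENERATE = 4
perf_paths = [Path(f"perf_{i}.py") for i in range(4)]

control = [
    ("aiservice_client", idx, t, p, False)  # single_prompt=False
    for idx, (t, p) in enumerate(zip(test_paths[:2], perf_paths[:2]))
]
experiment = [
    ("local_aiservice_client", idx, t, p, True)  # single_prompt=True
    for idx, (t, p) in enumerate(zip(test_paths[2:], perf_paths[2:]))
]

# Note: enumerate restarts at 0 in the second comprehension, so the experiment
# jobs reuse test_index values 0 and 1 from the control batch, and the [:2]/[2:]
# slices hardcode the old batch size of 2 rather than deriving it from the constant.
assert [j[1] for j in control] == [0, 1] and [j[1] for j in experiment] == [0, 1]
```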
2 changes: 2 additions & 0 deletions codeflash/verification/verifier.py

```diff
@@ -26,6 +26,7 @@ def generate_tests(
     test_index: int,
     test_path: Path,
     test_perf_path: Path,
+    single_prompt: bool=False,
 ) -> tuple[str, str, Path] | None:
     # TODO: Sometimes this recreates the original Class definition. This overrides and messes up the original
     # class import. Remove the recreation of the class definition
@@ -40,6 +41,7 @@
         test_timeout=test_timeout,
         trace_id=function_trace_id,
         test_index=test_index,
+        single_prompt=single_prompt,
     )
     if response and isinstance(response, tuple) and len(response) == 3:
         generated_test_source, instrumented_behavior_test_source, instrumented_perf_test_source = response
```
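`verifier.generate_tests` simply threads the flag through to `aiservice.generate_regression_tests` unchanged, with the same `False` default at every layer. A minimal pass-through sketch; the bodies here are stand-ins, not the real implementations:

```python
# Minimal sketch of the pass-through chain: function_optimizer submits the job,
# verifier.generate_tests forwards the flag, aiservice receives it.
def generate_regression_tests(trace_id: str, single_prompt: bool = False) -> None:
    print(f"{trace_id}: single_prompt={single_prompt}")

def generate_tests(trace_id: str, single_prompt: bool = False) -> None:
    generate_regression_tests(trace_id=trace_id, single_prompt=single_prompt)

generate_tests("trace-123")                      # -> single_prompt=False (default path)
generate_tests("trace-123", single_prompt=True)  # -> experiment branch
```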
2 changes: 1 addition & 1 deletion tests/scripts/end_to_end_test_init_optimization.py

```diff
@@ -9,7 +9,7 @@ def run_test(expected_improvement_pct: int) -> bool:
         file_path="remove_control_chars.py",
         function_name="CharacterRemover.remove_control_characters",
         test_framework="pytest",
-        min_improvement_x=1.0,
+        min_improvement_x=0.3,
         coverage_expectations=[
             CoverageExpectation(
                 function_name="CharacterRemover.remove_control_characters", expected_coverage=100.0, expected_lines=[14]
```
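With this change the two thresholds finally agree: `EXPECTED_IMPROVEMENT_PCT: 30` in the workflow and `min_improvement_x=0.3` here both demand a 30% speedup, whereas the old pairing (300% vs a 1.0x multiplier, i.e. 100%) was inconsistent. A sketch of the conversion, assuming the gain is measured as `(original / optimized) - 1`; the exact formula is not shown in this diff:

```python
# The two thresholds express the same bar in different units:
# a percentage in the workflow env, a fractional multiplier in the test script.
EXPECTED_IMPROVEMENT_PCT = 30
min_improvement_x = EXPECTED_IMPROVEMENT_PCT / 100  # 0.3

def is_fast_enough(original_ns: float, optimized_ns: float) -> bool:
    # Assumed gain metric: (original / optimized) - 1, so 0.3 means 30% faster.
    return (original_ns / optimized_ns) - 1 >= min_improvement_x

assert is_fast_enough(130, 100)      # 30% gain: passes at the boundary
assert not is_fast_enough(120, 100)  # only 20% gain: fails
```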