diff --git a/.github/workflows/end-to-end-test-init-optim.yaml b/.github/workflows/end-to-end-test-init-optim.yaml
index 2ef2e97c5..fd4aff1e0 100644
--- a/.github/workflows/end-to-end-test-init-optim.yaml
+++ b/.github/workflows/end-to-end-test-init-optim.yaml
@@ -20,7 +20,7 @@ jobs:
       COLUMNS: 110
       MAX_RETRIES: 3
       RETRY_DELAY: 5
-      EXPECTED_IMPROVEMENT_PCT: 300
+      EXPECTED_IMPROVEMENT_PCT: 30
       CODEFLASH_END_TO_END: 1
     steps:
       - name: 🛎️ Checkout
diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py
index fddc5c18a..ac9a2ffcd 100644
--- a/codeflash/api/aiservice.py
+++ b/codeflash/api/aiservice.py
@@ -248,6 +248,7 @@ def generate_regression_tests(
         test_timeout: int,
         trace_id: str,
         test_index: int,
+        single_prompt: bool=False,
     ) -> tuple[str, str, str] | None:
         """Generate regression tests for the given function by making a request to the Django endpoint.
 
diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py
index cb9b0c7f2..6f1b77e2c 100644
--- a/codeflash/code_utils/config_consts.py
+++ b/codeflash/code_utils/config_consts.py
@@ -5,6 +5,6 @@
 MIN_IMPROVEMENT_THRESHOLD = 0.05
 MAX_TEST_FUNCTION_RUNS = 50
 MAX_CUMULATIVE_TEST_RUNTIME_NANOSECONDS = 100e6  # 100ms
-N_TESTS_TO_GENERATE = 2
+N_TESTS_TO_GENERATE = 4
 TOTAL_LOOPING_TIME = 10.0  # 10 second candidate benchmarking budget
 COVERAGE_THRESHOLD = 60.0
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
index 56124a9cb..e5ff59633 100644
--- a/codeflash/optimization/function_optimizer.py
+++ b/codeflash/optimization/function_optimizer.py
@@ -162,6 +162,7 @@ def optimize_function(self) -> Result[BestOptimization, str]:
             f"Generating new tests and optimizations for function {self.function_to_optimize.function_name}",
             transient=True,
         ):
+            # TODO: do a/b testing with same codegen but different testgen
             generated_results = self.generate_tests_and_optimizations(
                 testgen_context_code=code_context.testgen_context_code,
                 read_writable_code=code_context.read_writable_code,
@@ -745,7 +746,8 @@ def generate_tests_and_optimizations(
         run_experiment: bool = False,
     ) -> Result[tuple[GeneratedTestsList, dict[str, list[FunctionCalledInTest]], OptimizationSet], str]:
         assert len(generated_test_paths) == N_TESTS_TO_GENERATE
-        max_workers = N_TESTS_TO_GENERATE + 2 if not run_experiment else N_TESTS_TO_GENERATE + 3
+        max_workers = 2*N_TESTS_TO_GENERATE + 2 if not run_experiment else 2*N_TESTS_TO_GENERATE + 3
+        self.local_aiservice_client = LocalAiServiceClient()
         console.rule()
         with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
             # Submit the test generation task as future
@@ -755,6 +757,7 @@
                 [definition.fully_qualified_name for definition in helper_functions],
                 generated_test_paths,
                 generated_perf_test_paths,
+                run_experiment=True,
             )
             future_optimization_candidates = executor.submit(
                 self.aiservice_client.optimize_python_code,
@@ -1208,8 +1211,9 @@ def generate_and_instrument_tests(
         helper_function_names: list[str],
         generated_test_paths: list[Path],
         generated_perf_test_paths: list[Path],
+        run_experiment: bool
     ) -> list[concurrent.futures.Future]:
-        return [
+        original = [
             executor.submit(
                 generate_tests,
                 self.aiservice_client,
@@ -1223,11 +1227,33 @@
                 test_index,
                 test_path,
                 test_perf_path,
+                single_prompt=False,
             )
             for test_index, (test_path, test_perf_path) in enumerate(
-                zip(generated_test_paths, generated_perf_test_paths)
+                zip(generated_test_paths[:2], generated_perf_test_paths[:2])
             )
         ]
+        if run_experiment:
+            original+=[
+                executor.submit(
+                    generate_tests,
+                    self.local_aiservice_client,
+                    source_code_being_tested,
+                    self.function_to_optimize,
+                    helper_function_names,
+                    Path(self.original_module_path),
+                    self.test_cfg,
+                    INDIVIDUAL_TESTCASE_TIMEOUT,
+                    self.function_trace_id,
+                    test_index,
+                    test_path,
+                    test_perf_path,
+                    single_prompt=True,
+                )
+                for test_index, (test_path, test_perf_path) in enumerate(
+                    zip(generated_test_paths[2:], generated_perf_test_paths[2:])
+                )]
+        return original
 
     def cleanup_generated_files(self) -> None:
         paths_to_cleanup = (
diff --git a/codeflash/verification/verifier.py b/codeflash/verification/verifier.py
index aba8f956e..96cf62fe8 100644
--- a/codeflash/verification/verifier.py
+++ b/codeflash/verification/verifier.py
@@ -26,6 +26,7 @@ def generate_tests(
     test_index: int,
     test_path: Path,
     test_perf_path: Path,
+    single_prompt: bool=False,
 ) -> tuple[str, str, Path] | None:
     # TODO: Sometimes this recreates the original Class definition. This overrides and messes up the original
     # class import. Remove the recreation of the class definition
@@ -40,6 +41,7 @@ def generate_tests(
         test_timeout=test_timeout,
         trace_id=function_trace_id,
         test_index=test_index,
+        single_prompt=single_prompt,
     )
     if response and isinstance(response, tuple) and len(response) == 3:
         generated_test_source, instrumented_behavior_test_source, instrumented_perf_test_source = response
diff --git a/tests/scripts/end_to_end_test_init_optimization.py b/tests/scripts/end_to_end_test_init_optimization.py
index a19be5d82..f429e246a 100644
--- a/tests/scripts/end_to_end_test_init_optimization.py
+++ b/tests/scripts/end_to_end_test_init_optimization.py
@@ -9,7 +9,7 @@ def run_test(expected_improvement_pct: int) -> bool:
         file_path="remove_control_chars.py",
         function_name="CharacterRemover.remove_control_characters",
         test_framework="pytest",
-        min_improvement_x=1.0,
+        min_improvement_x=0.3,
         coverage_expectations=[
             CoverageExpectation(
                 function_name="CharacterRemover.remove_control_characters", expected_coverage=100.0, expected_lines=[14]