From b8b4182353b8cf14d8f0fa1788fba39616ef49c3 Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Mon, 5 May 2025 18:34:00 -0700 Subject: [PATCH 1/6] draft pr starting point --- codeflash/optimization/function_optimizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 786c4afb4..2f733875f 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -162,6 +162,7 @@ def optimize_function(self) -> Result[BestOptimization, str]: f"Generating new tests and optimizations for function {self.function_to_optimize.function_name}", transient=True, ): + #TODO: do a/b testing with same codegen but different testgen generated_results = self.generate_tests_and_optimizations( testgen_context_code=code_context.testgen_context_code, read_writable_code=code_context.read_writable_code, From 4736fd781f3a60c0528e5cd6fbe427e9c69c2468 Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Wed, 7 May 2025 16:14:05 -0700 Subject: [PATCH 2/6] hardcoded a/b testing for now --- codeflash/api/aiservice.py | 6 +++- codeflash/optimization/function_optimizer.py | 33 +++++++++++++++++--- codeflash/verification/verifier.py | 2 ++ 3 files changed, 36 insertions(+), 5 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index fddc5c18a..afbc00092 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -248,6 +248,7 @@ def generate_regression_tests( test_timeout: int, trace_id: str, test_index: int, + single_prompt: bool=False, ) -> tuple[str, str, str] | None: """Generate regression tests for the given function by making a request to the Django endpoint. @@ -284,7 +285,10 @@ def generate_regression_tests( "codeflash_version": codeflash_version, } try: - response = self.make_ai_service_request("/testgen", payload=payload, timeout=600) + if single_prompt: + response = self.make_ai_service_request("/testgen-single-prompt", payload=payload, timeout=600) + else: + response = self.make_ai_service_request("/testgen", payload=payload, timeout=600) except requests.exceptions.RequestException as e: logger.exception(f"Error generating tests: {e}") ph("cli-testgen-error-caught", {"error": str(e)}) diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 2f733875f..870e7b560 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -162,7 +162,7 @@ def optimize_function(self) -> Result[BestOptimization, str]: f"Generating new tests and optimizations for function {self.function_to_optimize.function_name}", transient=True, ): - #TODO: do a/b testing with same codegen but different testgen + # TODO: do a/b testing with same codegen but different testgen generated_results = self.generate_tests_and_optimizations( testgen_context_code=code_context.testgen_context_code, read_writable_code=code_context.read_writable_code, @@ -760,7 +760,8 @@ def generate_tests_and_optimizations( run_experiment: bool = False, ) -> Result[tuple[GeneratedTestsList, dict[str, list[FunctionCalledInTest]], OptimizationSet], str]: assert len(generated_test_paths) == N_TESTS_TO_GENERATE - max_workers = N_TESTS_TO_GENERATE + 2 if not run_experiment else N_TESTS_TO_GENERATE + 3 + max_workers = 2*N_TESTS_TO_GENERATE + 2 if not run_experiment else 2*N_TESTS_TO_GENERATE + 3 + self.local_aiservice_client = LocalAiServiceClient() console.rule() with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: # Submit the test generation task as future @@ -770,6 +771,7 @@ def generate_tests_and_optimizations( [definition.fully_qualified_name for definition in helper_functions], generated_test_paths, generated_perf_test_paths, + run_experiment=True, ) future_optimization_candidates = executor.submit( self.aiservice_client.optimize_python_code, @@ -1223,8 +1225,9 @@ def generate_and_instrument_tests( helper_function_names: list[str], generated_test_paths: list[Path], generated_perf_test_paths: list[Path], + run_experiment: bool ) -> list[concurrent.futures.Future]: - return [ + original = [ executor.submit( generate_tests, self.aiservice_client, @@ -1234,12 +1237,34 @@ def generate_and_instrument_tests( Path(self.original_module_path), self.test_cfg, INDIVIDUAL_TESTCASE_TIMEOUT, - self.function_trace_id, + self.function_trace_id,#[:-4]+"TST0" if run_experiment else self.function_trace_id, test_index, test_path, test_perf_path, + single_prompt=False, ) for test_index, (test_path, test_perf_path) in enumerate( zip(generated_test_paths, generated_perf_test_paths) ) ] + if run_experiment: + original+=[ + executor.submit( + generate_tests, + self.local_aiservice_client, + source_code_being_tested, + self.function_to_optimize, + helper_function_names, + Path(self.original_module_path), + self.test_cfg, + INDIVIDUAL_TESTCASE_TIMEOUT, + self.function_trace_id,#[:-4]+"TST1", + test_index, + test_path, + test_perf_path, + single_prompt=True, + ) + for test_index, (test_path, test_perf_path) in enumerate( + zip(generated_test_paths, generated_perf_test_paths) + )] + return original diff --git a/codeflash/verification/verifier.py b/codeflash/verification/verifier.py index aba8f956e..96cf62fe8 100644 --- a/codeflash/verification/verifier.py +++ b/codeflash/verification/verifier.py @@ -26,6 +26,7 @@ def generate_tests( test_index: int, test_path: Path, test_perf_path: Path, + single_prompt: bool=False, ) -> tuple[str, str, Path] | None: # TODO: Sometimes this recreates the original Class definition. This overrides and messes up the original # class import. Remove the recreation of the class definition @@ -40,6 +41,7 @@ def generate_tests( test_timeout=test_timeout, trace_id=function_trace_id, test_index=test_index, + single_prompt=single_prompt, ) if response and isinstance(response, tuple) and len(response) == 3: generated_test_source, instrumented_behavior_test_source, instrumented_perf_test_source = response From d914a0f9274bb0e157b7b9050dc01b8f7a06feda Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Wed, 7 May 2025 17:22:06 -0700 Subject: [PATCH 3/6] easier to retrieve data --- codeflash/code_utils/config_consts.py | 2 +- codeflash/optimization/function_optimizer.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index cb9b0c7f2..6f1b77e2c 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -5,6 +5,6 @@ MIN_IMPROVEMENT_THRESHOLD = 0.05 MAX_TEST_FUNCTION_RUNS = 50 MAX_CUMULATIVE_TEST_RUNTIME_NANOSECONDS = 100e6 # 100ms -N_TESTS_TO_GENERATE = 2 +N_TESTS_TO_GENERATE = 4 TOTAL_LOOPING_TIME = 10.0 # 10 second candidate benchmarking budget COVERAGE_THRESHOLD = 60.0 diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 870e7b560..2e2171661 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -1237,14 +1237,14 @@ def generate_and_instrument_tests( Path(self.original_module_path), self.test_cfg, INDIVIDUAL_TESTCASE_TIMEOUT, - self.function_trace_id,#[:-4]+"TST0" if run_experiment else self.function_trace_id, + self.function_trace_id[:-4]+"TST0" if run_experiment else self.function_trace_id, test_index, test_path, test_perf_path, single_prompt=False, ) for test_index, (test_path, test_perf_path) in enumerate( - zip(generated_test_paths, generated_perf_test_paths) + zip(generated_test_paths[:2], generated_perf_test_paths[:2]) ) ] if run_experiment: @@ -1258,13 +1258,13 @@ def generate_and_instrument_tests( Path(self.original_module_path), self.test_cfg, INDIVIDUAL_TESTCASE_TIMEOUT, - self.function_trace_id,#[:-4]+"TST1", + self.function_trace_id[:-4]+"TST1", test_index, test_path, test_perf_path, single_prompt=True, ) for test_index, (test_path, test_perf_path) in enumerate( - zip(generated_test_paths, generated_perf_test_paths) + zip(generated_test_paths[2:], generated_perf_test_paths[2:]) )] return original From c955b51d7e6a14d34b7520ac0fb90cc7de797ce5 Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Wed, 7 May 2025 17:32:01 -0700 Subject: [PATCH 4/6] cant disturb prod --- codeflash/optimization/function_optimizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 2e2171661..5f40d2e11 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -1237,7 +1237,7 @@ def generate_and_instrument_tests( Path(self.original_module_path), self.test_cfg, INDIVIDUAL_TESTCASE_TIMEOUT, - self.function_trace_id[:-4]+"TST0" if run_experiment else self.function_trace_id, + self.function_trace_id, test_index, test_path, test_perf_path, @@ -1258,7 +1258,7 @@ def generate_and_instrument_tests( Path(self.original_module_path), self.test_cfg, INDIVIDUAL_TESTCASE_TIMEOUT, - self.function_trace_id[:-4]+"TST1", + self.function_trace_id, test_index, test_path, test_perf_path, From e444d24ecfd9c8db3fa4ae82c23bfc50d1c685f8 Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Thu, 8 May 2025 20:05:29 -0700 Subject: [PATCH 5/6] use single api endpoint --- codeflash/api/aiservice.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index afbc00092..ac9a2ffcd 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -285,10 +285,7 @@ def generate_regression_tests( "codeflash_version": codeflash_version, } try: - if single_prompt: - response = self.make_ai_service_request("/testgen-single-prompt", payload=payload, timeout=600) - else: - response = self.make_ai_service_request("/testgen", payload=payload, timeout=600) + response = self.make_ai_service_request("/testgen", payload=payload, timeout=600) except requests.exceptions.RequestException as e: logger.exception(f"Error generating tests: {e}") ph("cli-testgen-error-caught", {"error": str(e)}) From bfff678a1c96f93d803e5c91965466068944bfdb Mon Sep 17 00:00:00 2001 From: aseembits93 Date: Mon, 12 May 2025 14:03:43 -0700 Subject: [PATCH 6/6] recalibrating expected improvement with new tests --- .github/workflows/end-to-end-test-init-optim.yaml | 2 +- tests/scripts/end_to_end_test_init_optimization.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/end-to-end-test-init-optim.yaml b/.github/workflows/end-to-end-test-init-optim.yaml index 2ef2e97c5..fd4aff1e0 100644 --- a/.github/workflows/end-to-end-test-init-optim.yaml +++ b/.github/workflows/end-to-end-test-init-optim.yaml @@ -20,7 +20,7 @@ jobs: COLUMNS: 110 MAX_RETRIES: 3 RETRY_DELAY: 5 - EXPECTED_IMPROVEMENT_PCT: 300 + EXPECTED_IMPROVEMENT_PCT: 30 CODEFLASH_END_TO_END: 1 steps: - name: 🛎️ Checkout diff --git a/tests/scripts/end_to_end_test_init_optimization.py b/tests/scripts/end_to_end_test_init_optimization.py index a19be5d82..f429e246a 100644 --- a/tests/scripts/end_to_end_test_init_optimization.py +++ b/tests/scripts/end_to_end_test_init_optimization.py @@ -9,7 +9,7 @@ def run_test(expected_improvement_pct: int) -> bool: file_path="remove_control_chars.py", function_name="CharacterRemover.remove_control_characters", test_framework="pytest", - min_improvement_x=1.0, + min_improvement_x=0.3, coverage_expectations=[ CoverageExpectation( function_name="CharacterRemover.remove_control_characters", expected_coverage=100.0, expected_lines=[14]