From b8b4182353b8cf14d8f0fa1788fba39616ef49c3 Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Mon, 5 May 2025 18:34:00 -0700
Subject: [PATCH 1/6] draft PR starting point

---
 codeflash/optimization/function_optimizer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
index 786c4afb4..2f733875f 100644
--- a/codeflash/optimization/function_optimizer.py
+++ b/codeflash/optimization/function_optimizer.py
@@ -162,6 +162,7 @@ def optimize_function(self) -> Result[BestOptimization, str]:
             f"Generating new tests and optimizations for function {self.function_to_optimize.function_name}",
             transient=True,
         ):
+            #TODO: do a/b testing with same codegen but different testgen
             generated_results = self.generate_tests_and_optimizations(
                 testgen_context_code=code_context.testgen_context_code,
                 read_writable_code=code_context.read_writable_code,

From 4736fd781f3a60c0528e5cd6fbe427e9c69c2468 Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Wed, 7 May 2025 16:14:05 -0700
Subject: [PATCH 2/6] hardcoded a/b testing for now

---
 codeflash/api/aiservice.py                   |  6 +++-
 codeflash/optimization/function_optimizer.py | 33 +++++++++++++++++---
 codeflash/verification/verifier.py           |  2 ++
 3 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py
index fddc5c18a..afbc00092 100644
--- a/codeflash/api/aiservice.py
+++ b/codeflash/api/aiservice.py
@@ -248,6 +248,7 @@ def generate_regression_tests(
         test_timeout: int,
         trace_id: str,
         test_index: int,
+        single_prompt: bool=False,
     ) -> tuple[str, str, str] | None:
         """Generate regression tests for the given function by making a request to the Django endpoint.
 
@@ -284,7 +285,10 @@ def generate_regression_tests(
             "codeflash_version": codeflash_version,
         }
         try:
-            response = self.make_ai_service_request("/testgen", payload=payload, timeout=600)
+            if single_prompt:
+                response = self.make_ai_service_request("/testgen-single-prompt", payload=payload, timeout=600)
+            else:
+                response = self.make_ai_service_request("/testgen", payload=payload, timeout=600)
         except requests.exceptions.RequestException as e:
             logger.exception(f"Error generating tests: {e}")
             ph("cli-testgen-error-caught", {"error": str(e)})
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
index 2f733875f..870e7b560 100644
--- a/codeflash/optimization/function_optimizer.py
+++ b/codeflash/optimization/function_optimizer.py
@@ -162,7 +162,7 @@ def optimize_function(self) -> Result[BestOptimization, str]:
             f"Generating new tests and optimizations for function {self.function_to_optimize.function_name}",
             transient=True,
         ):
-            #TODO: do a/b testing with same codegen but different testgen
+            # TODO: do a/b testing with same codegen but different testgen
             generated_results = self.generate_tests_and_optimizations(
                 testgen_context_code=code_context.testgen_context_code,
                 read_writable_code=code_context.read_writable_code,
@@ -760,7 +760,8 @@ def generate_tests_and_optimizations(
         run_experiment: bool = False,
     ) -> Result[tuple[GeneratedTestsList, dict[str, list[FunctionCalledInTest]], OptimizationSet], str]:
         assert len(generated_test_paths) == N_TESTS_TO_GENERATE
-        max_workers = N_TESTS_TO_GENERATE + 2 if not run_experiment else N_TESTS_TO_GENERATE + 3
+        max_workers = 2*N_TESTS_TO_GENERATE + 2 if not run_experiment else 2*N_TESTS_TO_GENERATE + 3
+        self.local_aiservice_client = LocalAiServiceClient()
         console.rule()
         with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
             # Submit the test generation task as future
@@ -770,6 +771,7 @@ def generate_tests_and_optimizations(
                 [definition.fully_qualified_name for definition in helper_functions],
                 generated_test_paths,
                 generated_perf_test_paths,
+                run_experiment=True,
             )
             future_optimization_candidates = executor.submit(
                 self.aiservice_client.optimize_python_code,
@@ -1223,8 +1225,9 @@ def generate_and_instrument_tests(
         helper_function_names: list[str],
         generated_test_paths: list[Path],
         generated_perf_test_paths: list[Path],
+        run_experiment: bool
     ) -> list[concurrent.futures.Future]:
-        return [
+        original = [
             executor.submit(
                 generate_tests,
                 self.aiservice_client,
@@ -1234,12 +1237,34 @@ def generate_and_instrument_tests(
                 Path(self.original_module_path),
                 self.test_cfg,
                 INDIVIDUAL_TESTCASE_TIMEOUT,
-                self.function_trace_id,
+                self.function_trace_id,#[:-4]+"TST0" if run_experiment else self.function_trace_id,
                 test_index,
                 test_path,
                 test_perf_path,
+                single_prompt=False,
             )
             for test_index, (test_path, test_perf_path) in enumerate(
                 zip(generated_test_paths, generated_perf_test_paths)
             )
         ]
+        if run_experiment:
+            original+=[
+                executor.submit(
+                    generate_tests,
+                    self.local_aiservice_client,
+                    source_code_being_tested,
+                    self.function_to_optimize,
+                    helper_function_names,
+                    Path(self.original_module_path),
+                    self.test_cfg,
+                    INDIVIDUAL_TESTCASE_TIMEOUT,
+                    self.function_trace_id,#[:-4]+"TST1",
+                    test_index,
+                    test_path,
+                    test_perf_path,
+                    single_prompt=True,
+                )
+                for test_index, (test_path, test_perf_path) in enumerate(
+                    zip(generated_test_paths, generated_perf_test_paths)
+                )]
+        return original
diff --git a/codeflash/verification/verifier.py b/codeflash/verification/verifier.py
index aba8f956e..96cf62fe8 100644
--- a/codeflash/verification/verifier.py
+++ b/codeflash/verification/verifier.py
@@ -26,6 +26,7 @@ def generate_tests(
     test_index: int,
     test_path: Path,
     test_perf_path: Path,
+    single_prompt: bool=False,
 ) -> tuple[str, str, Path] | None:
     # TODO: Sometimes this recreates the original Class definition. This overrides and messes up the original
     #  class import. Remove the recreation of the class definition
@@ -40,6 +41,7 @@ def generate_tests(
         test_timeout=test_timeout,
         trace_id=function_trace_id,
         test_index=test_index,
+        single_prompt=single_prompt,
     )
     if response and isinstance(response, tuple) and len(response) == 3:
         generated_test_source, instrumented_behavior_test_source, instrumented_perf_test_source = response

From d914a0f9274bb0e157b7b9050dc01b8f7a06feda Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Wed, 7 May 2025 17:22:06 -0700
Subject: [PATCH 3/6] easier to retrieve data

---
 codeflash/code_utils/config_consts.py        | 2 +-
 codeflash/optimization/function_optimizer.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py
index cb9b0c7f2..6f1b77e2c 100644
--- a/codeflash/code_utils/config_consts.py
+++ b/codeflash/code_utils/config_consts.py
@@ -5,6 +5,6 @@
 MIN_IMPROVEMENT_THRESHOLD = 0.05
 MAX_TEST_FUNCTION_RUNS = 50
 MAX_CUMULATIVE_TEST_RUNTIME_NANOSECONDS = 100e6  # 100ms
-N_TESTS_TO_GENERATE = 2
+N_TESTS_TO_GENERATE = 4
 TOTAL_LOOPING_TIME = 10.0  # 10 second candidate benchmarking budget
 COVERAGE_THRESHOLD = 60.0
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
index 870e7b560..2e2171661 100644
--- a/codeflash/optimization/function_optimizer.py
+++ b/codeflash/optimization/function_optimizer.py
@@ -1237,14 +1237,14 @@ def generate_and_instrument_tests(
                 Path(self.original_module_path),
                 self.test_cfg,
                 INDIVIDUAL_TESTCASE_TIMEOUT,
-                self.function_trace_id,#[:-4]+"TST0" if run_experiment else self.function_trace_id,
+                self.function_trace_id[:-4]+"TST0" if run_experiment else self.function_trace_id,
                 test_index,
                 test_path,
                 test_perf_path,
                 single_prompt=False,
             )
             for test_index, (test_path, test_perf_path) in enumerate(
-                zip(generated_test_paths, generated_perf_test_paths)
+                zip(generated_test_paths[:2], generated_perf_test_paths[:2])
             )
         ]
         if run_experiment:
@@ -1258,13 +1258,13 @@ def generate_and_instrument_tests(
                     Path(self.original_module_path),
                     self.test_cfg,
                     INDIVIDUAL_TESTCASE_TIMEOUT,
-                    self.function_trace_id,#[:-4]+"TST1",
+                    self.function_trace_id[:-4]+"TST1",
                     test_index,
                     test_path,
                     test_perf_path,
                     single_prompt=True,
                 )
                 for test_index, (test_path, test_perf_path) in enumerate(
-                    zip(generated_test_paths, generated_perf_test_paths)
+                    zip(generated_test_paths[2:], generated_perf_test_paths[2:])
                 )]
         return original

From c955b51d7e6a14d34b7520ac0fb90cc7de797ce5 Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Wed, 7 May 2025 17:32:01 -0700
Subject: [PATCH 4/6] can't disturb prod

---
 codeflash/optimization/function_optimizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
index 2e2171661..5f40d2e11 100644
--- a/codeflash/optimization/function_optimizer.py
+++ b/codeflash/optimization/function_optimizer.py
@@ -1237,7 +1237,7 @@ def generate_and_instrument_tests(
                 Path(self.original_module_path),
                 self.test_cfg,
                 INDIVIDUAL_TESTCASE_TIMEOUT,
-                self.function_trace_id[:-4]+"TST0" if run_experiment else self.function_trace_id,
+                self.function_trace_id,
                 test_index,
                 test_path,
                 test_perf_path,
@@ -1258,7 +1258,7 @@ def generate_and_instrument_tests(
                     Path(self.original_module_path),
                     self.test_cfg,
                     INDIVIDUAL_TESTCASE_TIMEOUT,
-                    self.function_trace_id[:-4]+"TST1",
+                    self.function_trace_id,
                     test_index,
                     test_path,
                     test_perf_path,

From e444d24ecfd9c8db3fa4ae82c23bfc50d1c685f8 Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Thu, 8 May 2025 20:05:29 -0700
Subject: [PATCH 5/6] use single API endpoint

---
 codeflash/api/aiservice.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py
index afbc00092..ac9a2ffcd 100644
--- a/codeflash/api/aiservice.py
+++ b/codeflash/api/aiservice.py
@@ -285,10 +285,7 @@ def generate_regression_tests(
             "codeflash_version": codeflash_version,
         }
         try:
-            if single_prompt:
-                response = self.make_ai_service_request("/testgen-single-prompt", payload=payload, timeout=600)
-            else:
-                response = self.make_ai_service_request("/testgen", payload=payload, timeout=600)
+            response = self.make_ai_service_request("/testgen", payload=payload, timeout=600)
         except requests.exceptions.RequestException as e:
             logger.exception(f"Error generating tests: {e}")
             ph("cli-testgen-error-caught", {"error": str(e)})

From bfff678a1c96f93d803e5c91965466068944bfdb Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Mon, 12 May 2025 14:03:43 -0700
Subject: [PATCH 6/6] recalibrating expected improvement with new tests

---
 .github/workflows/end-to-end-test-init-optim.yaml  | 2 +-
 tests/scripts/end_to_end_test_init_optimization.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/end-to-end-test-init-optim.yaml b/.github/workflows/end-to-end-test-init-optim.yaml
index 2ef2e97c5..fd4aff1e0 100644
--- a/.github/workflows/end-to-end-test-init-optim.yaml
+++ b/.github/workflows/end-to-end-test-init-optim.yaml
@@ -20,7 +20,7 @@ jobs:
       COLUMNS: 110
       MAX_RETRIES: 3
       RETRY_DELAY: 5
-      EXPECTED_IMPROVEMENT_PCT: 300
+      EXPECTED_IMPROVEMENT_PCT: 30
       CODEFLASH_END_TO_END: 1
     steps:
       - name: 🛎️ Checkout
diff --git a/tests/scripts/end_to_end_test_init_optimization.py b/tests/scripts/end_to_end_test_init_optimization.py
index a19be5d82..f429e246a 100644
--- a/tests/scripts/end_to_end_test_init_optimization.py
+++ b/tests/scripts/end_to_end_test_init_optimization.py
@@ -9,7 +9,7 @@ def run_test(expected_improvement_pct: int) -> bool:
         file_path="remove_control_chars.py",
         function_name="CharacterRemover.remove_control_characters",
         test_framework="pytest",
-        min_improvement_x=1.0,
+        min_improvement_x=0.3,
         coverage_expectations=[
             CoverageExpectation(
                 function_name="CharacterRemover.remove_control_characters", expected_coverage=100.0, expected_lines=[14]