From 47983e662a6862a6080bbb7a37a475dfbeb52b14 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 3 May 2026 18:25:11 +0000
Subject: [PATCH 1/3] feat: select randomly when cache is empty instead of
 always picking first variant

When multiple experiment variants share the lowest invocation count
(including the initial empty-cache state where all counts are zero),
pick one uniformly at random instead of always defaulting to the first
declared variant. This prevents systematic bias toward the first variant
on fresh repositories or whenever counts happen to be equal.

Changes:
- pick_experiment.cjs: update pickVariant() to collect all tied variants
  and return tied[Math.floor(Math.random() * tied.length)]
- pick_experiment.test.cjs: update deterministic tie-break unit tests to
  verify randomness; add vi.spyOn(Math, 'random') in main() tests that
  need a predictable first-run outcome
- ADR-29534: update Rule 6 and Alternative 2 to reflect random tie-breaking
- experiments.md guide: update statistical balancing paragraph
- main_workflow_schema.json: update experiments field description

Agent-Logs-Url: https://github.com/github/gh-aw/sessions/58f16353-bee5-464f-91f9-9fee1f89f203

Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com>
---
 actions/setup/js/pick_experiment.cjs          | 16 ++---
 actions/setup/js/pick_experiment.test.cjs     | 58 ++++++++++++++++---
 ...matter-ab-experiments-variant-selection.md |  4 +-
 docs/src/content/docs/guides/experiments.md   |  2 +-
 pkg/parser/schemas/main_workflow_schema.json  |  2 +-
 5 files changed, 63 insertions(+), 19 deletions(-)

diff --git a/actions/setup/js/pick_experiment.cjs b/actions/setup/js/pick_experiment.cjs
index 1f23d10125..6715c9335e 100644
--- a/actions/setup/js/pick_experiment.cjs
+++ b/actions/setup/js/pick_experiment.cjs
@@ -20,7 +20,8 @@
  * Algorithm:
  *   When weight is provided the variant is chosen by weighted-random selection.
  *   Otherwise the variant with the lowest invocation count is selected next (ties are
- *   broken by variant order, yielding a deterministic round-robin across runs).
+ *   broken by random selection, ensuring no variant is systematically favoured on the
+ *   first run or whenever counts are equal).
  *   When start_date or end_date is provided and today falls outside that window the
  *   control variant (first variant) is used and no counter is incremented.
  */
@@ -124,8 +125,10 @@ function isWithinDateWindow(startDate, endDate, todayOverride) {
 
 /**
  * Pick the variant for one experiment using a balanced least-used selection.
- * The variant with the lowest cumulative count is chosen; ties are broken by
- * the order of the variants array so selection is deterministic.
+ * The variant with the lowest cumulative count is chosen; when multiple variants
+ * share the lowest count (including the initial empty-cache state where all counts
+ * are zero), one is selected at random to avoid systematically favouring the first
+ * declared variant.
  *
  * @param {string} name       - Experiment name
  * @param {string[]} variants - Array of variant values (length >= 2)
@@ -135,15 +138,14 @@ function isWithinDateWindow(startDate, endDate, todayOverride) {
 function pickVariant(name, variants, state) {
   const counts = state.counts[name] || {};
   let minCount = Infinity;
-  let selected = variants[0];
   for (const variant of variants) {
     const c = counts[variant] || 0;
     if (c < minCount) {
       minCount = c;
-      selected = variant;
     }
   }
-  return selected;
+  const tied = variants.filter(v => (counts[v] || 0) === minCount);
+  return tied[Math.floor(Math.random() * tied.length)];
 }
 
 /**
@@ -279,7 +281,7 @@ async function writeSummary(assignments, configs, state, core) {
     }
   }
 
-  lines.push("_Variants are selected by balanced round-robin (or weighted) to ensure statistical relevance across runs._");
+  lines.push("_Variants are selected by balanced round-robin (or weighted) to ensure statistical relevance across runs. Ties are broken randomly so no variant is systematically favoured on the first run._");
   await core.summary.addRaw(lines.join("\n")).write();
 }
 
diff --git a/actions/setup/js/pick_experiment.test.cjs b/actions/setup/js/pick_experiment.test.cjs
index 91f1878e83..56bada9be1 100644
--- a/actions/setup/js/pick_experiment.test.cjs
+++ b/actions/setup/js/pick_experiment.test.cjs
@@ -33,9 +33,15 @@ describe("pick_experiment", () => {
   // ── pickVariant ────────────────────────────────────────────────────────────
 
   describe("pickVariant", () => {
-    it("selects the first variant when counts are equal", () => {
+    it("selects one of the tied variants randomly when counts are equal", () => {
       const state = { counts: {} };
-      expect(pickVariant("f", ["A", "B"], state)).toBe("A");
+      // Run many times and verify both variants are eventually selected.
+      const results = new Set();
+      for (let i = 0; i < 100; i++) {
+        results.add(pickVariant("f", ["A", "B"], state));
+      }
+      expect(results).toContain("A");
+      expect(results).toContain("B");
     });
 
     it("selects the least-used variant", () => {
@@ -48,14 +54,27 @@ describe("pick_experiment", () => {
       expect(pickVariant("f", ["A", "B", "C"], state)).toBe("C");
     });
 
-    it("returns the first variant when all counts are equal (tie-break by order)", () => {
+    it("randomly selects from all tied variants when all counts are equal", () => {
       const state = { counts: { f: { A: 1, B: 1, C: 1 } } };
-      expect(pickVariant("f", ["A", "B", "C"], state)).toBe("A");
+      // Run many times and verify all three variants are selected.
+      const results = new Set();
+      for (let i = 0; i < 200; i++) {
+        results.add(pickVariant("f", ["A", "B", "C"], state));
+      }
+      expect(results).toContain("A");
+      expect(results).toContain("B");
+      expect(results).toContain("C");
     });
 
-    it("handles unknown experiment name (no counts yet)", () => {
+    it("handles unknown experiment name (no counts yet) by picking randomly", () => {
       const state = { counts: {} };
-      expect(pickVariant("new", ["X", "Y"], state)).toBe("X");
+      // Both variants must be reachable from an empty state.
+      const results = new Set();
+      for (let i = 0; i < 100; i++) {
+        results.add(pickVariant("new", ["X", "Y"], state));
+      }
+      expect(results).toContain("X");
+      expect(results).toContain("Y");
     });
   });
 
@@ -147,6 +166,9 @@ describe("pick_experiment", () => {
       process.env.GH_AW_EXPERIMENT_STATE_FILE = stateFile;
       process.env.GH_AW_EXPERIMENT_STATE_DIR = tmpDir;
 
+      // Force Math.random → 0 so the first tied variant ("A") is selected.
+      vi.spyOn(Math, "random").mockReturnValue(0);
+
       await main();
 
       // Individual output per experiment
@@ -154,6 +176,8 @@ describe("pick_experiment", () => {
       // Combined JSON output
       expect(mockCore.setOutput).toHaveBeenCalledWith("experiments", JSON.stringify({ feature1: "A" }));
       expect(mockCore.setFailed).not.toHaveBeenCalled();
+
+      vi.restoreAllMocks();
     });
 
     it("persists state between calls to simulate multi-run balance", async () => {
@@ -164,14 +188,18 @@ describe("pick_experiment", () => {
       process.env.GH_AW_EXPERIMENT_STATE_FILE = stateFile;
       process.env.GH_AW_EXPERIMENT_STATE_DIR = tmpDir;
 
+      // Force Math.random → 0 so the first tied variant ("X") is selected on the first run.
+      vi.spyOn(Math, "random").mockReturnValue(0);
+
       // First run → X
       await main();
       const firstCall = mockCore.setOutput.mock.calls.find(c => c[0] === "feat");
       expect(firstCall?.[1]).toBe("X");
 
+      vi.restoreAllMocks();
       vi.clearAllMocks();
 
-      // Second run → Y (state persisted from first call)
+      // Second run → Y (state persisted from first call; Y has the lower count)
       await main();
       const secondCall = mockCore.setOutput.mock.calls.find(c => c[0] === "feat");
       expect(secondCall?.[1]).toBe("Y");
@@ -198,12 +226,17 @@ describe("pick_experiment", () => {
       process.env.GH_AW_EXPERIMENT_STATE_FILE = stateFile;
       process.env.GH_AW_EXPERIMENT_STATE_DIR = tmpDir;
 
+      // Force Math.random → 0 so the first tied variant is chosen for each experiment.
+      vi.spyOn(Math, "random").mockReturnValue(0);
+
       await main();
 
       const assignmentsFile = path.join(tmpDir, "assignments.json");
       expect(fs.existsSync(assignmentsFile)).toBe(true);
       const assignments = JSON.parse(fs.readFileSync(assignmentsFile, "utf8"));
       expect(assignments).toEqual({ feature1: "A", style: "concise" });
+
+      vi.restoreAllMocks();
     });
 
     it("overwrites assignments.json on successive runs reflecting the current variant", async () => {
@@ -212,14 +245,18 @@ describe("pick_experiment", () => {
       process.env.GH_AW_EXPERIMENT_STATE_FILE = stateFile;
       process.env.GH_AW_EXPERIMENT_STATE_DIR = tmpDir;
 
+      // Force Math.random → 0 so the first tied variant ("X") is chosen on the first run.
+      vi.spyOn(Math, "random").mockReturnValue(0);
+
       // First run → X
       await main();
       const assignmentsFile = path.join(tmpDir, "assignments.json");
       expect(JSON.parse(fs.readFileSync(assignmentsFile, "utf8"))).toEqual({ feat: "X" });
 
+      vi.restoreAllMocks();
       vi.clearAllMocks();
 
-      // Second run → Y
+      // Second run → Y (Y has the lower count after first run recorded X)
       await main();
       expect(JSON.parse(fs.readFileSync(assignmentsFile, "utf8"))).toEqual({ feat: "Y" });
     });
@@ -265,10 +302,15 @@ describe("pick_experiment", () => {
       process.env.GH_AW_EXPERIMENT_STATE_FILE = stateFile;
       process.env.GH_AW_EXPERIMENT_STATE_DIR = tmpDir;
 
+      // Force Math.random → 0 so the first tied variant ("concise") is chosen.
+      vi.spyOn(Math, "random").mockReturnValue(0);
+
       await main();
 
       expect(mockCore.setOutput).toHaveBeenCalledWith("style", "concise");
       expect(mockCore.setFailed).not.toHaveBeenCalled();
+
+      vi.restoreAllMocks();
     });
 
     it("uses control variant when today is before start_date", async () => {
diff --git a/docs/adr/29534-frontmatter-ab-experiments-variant-selection.md b/docs/adr/29534-frontmatter-ab-experiments-variant-selection.md
index a70249b10f..5bc17e33d8 100644
--- a/docs/adr/29534-frontmatter-ab-experiments-variant-selection.md
+++ b/docs/adr/29534-frontmatter-ab-experiments-variant-selection.md
@@ -24,7 +24,7 @@ An external feature-flag service would provide a mature A/B testing API with das
 
 #### Alternative 2: Random Per-Run Variant Selection Without State Persistence
 
-Selecting a variant randomly on each run (e.g., `Math.random()`) requires no cache and no persistent state. This was rejected because it does not guarantee balance: over N runs, some variants may appear much more (or less) often than `N/K`, making statistically meaningful comparisons impossible without a large number of runs. The least-used counter approach achieves approximate balance in far fewer runs.
+Selecting a variant randomly on each run (e.g., `Math.random()`) requires no cache and no persistent state. This was rejected as the sole selection strategy because it does not guarantee balance: over N runs, some variants may appear much more (or less) often than `N/K`, making statistically meaningful comparisons impossible without a large number of runs. The least-used counter approach achieves approximate balance in far fewer runs. However, random selection is retained as the tie-breaking strategy within the least-used algorithm — when multiple variants share the minimum count (including the initial empty-cache state), one is chosen at random to avoid systematically favouring the first declared variant.
 
 #### Alternative 3: CI/CD Environment Variables Set Externally
 
@@ -65,7 +65,7 @@ Teams could manually pass variant values as repository variables or dispatch inp
 ### Variant Selection
 
 5. Implementations **MUST** select the variant with the lowest cumulative invocation count across all previous runs (least-used selection).
-6. When two or more variants share the lowest count (including the initial state where all counts are zero), implementations **MUST** break ties by selecting the variant appearing earliest in the declared list.
+6. When two or more variants share the lowest count (including the initial state where all counts are zero), implementations **MUST** break ties by selecting uniformly at random from the tied variants, so no variant is systematically favoured on the first run or whenever counts are equal.
 7. Variant counts **MUST** be persisted between workflow runs using the GitHub Actions cache, keyed by a combination of the sanitized workflow ID and the current run ID, with a restore-key prefix that matches any prior run for that workflow ID.
 8. Implementations **MUST** expose each selected variant as a named step output (`steps.pick-experiment.outputs.<experiment-name>`) and **MUST** also set a combined JSON output (`steps.pick-experiment.outputs.experiments`) containing all variant assignments.
 9. Implementations **MUST** upload the experiment state directory as an artifact named `experiment` (using `if: always()`) so that assignments are available for post-run analysis even when subsequent steps fail.
diff --git a/docs/src/content/docs/guides/experiments.md b/docs/src/content/docs/guides/experiments.md
index 779848e201..5f66aea29c 100644
--- a/docs/src/content/docs/guides/experiments.md
+++ b/docs/src/content/docs/guides/experiments.md
@@ -84,7 +84,7 @@ Address the issue described above.
 
 ## Statistical balancing
 
-The activation job maintains a per-variant invocation counter in an `actions/cache` entry keyed by workflow ID. The variant with the lowest cumulative count is selected on each run; ties are broken by variant order. Over N runs every variant is used approximately N/K times (K = variant count), providing basic A/B balance with no configuration.
+The activation job maintains a per-variant invocation counter in an `actions/cache` entry keyed by workflow ID. The variant with the lowest cumulative count is selected on each run; when multiple variants share the lowest count (including the very first run when the cache is empty), one is chosen at random so no variant is systematically favoured. Over N runs every variant is used approximately N/K times (K = variant count), providing basic A/B balance with no configuration.
 
 The counter persists across workflow runs via the GitHub Actions cache. A fresh repository starts from zero counts.
 
diff --git a/pkg/parser/schemas/main_workflow_schema.json b/pkg/parser/schemas/main_workflow_schema.json
index df0bbfee5a..8095ee9433 100644
--- a/pkg/parser/schemas/main_workflow_schema.json
+++ b/pkg/parser/schemas/main_workflow_schema.json
@@ -2749,7 +2749,7 @@
       ]
     },
     "experiments": {
-      "description": "A/B testing experiments. Each key is an experiment name; the value is either an array of two or more variant strings (bare-array form) or an object with a 'variants' field plus optional metadata fields (description, metric, weight, issue, start_date, end_date, hypothesis, secondary_metrics, guardrail_metrics, min_samples, owner). At runtime the activation job picks a variant using actions/cache to maintain consistent assignment across runs. Use ${{ experiments.<name> }} in the workflow prompt to reference the selected variant. When multiple experiments are declared, assignments are statistically balanced using a counter that round-robins across variants (or weighted when 'weight' is provided).",
+      "description": "A/B testing experiments. Each key is an experiment name; the value is either an array of two or more variant strings (bare-array form) or an object with a 'variants' field plus optional metadata fields (description, metric, weight, issue, start_date, end_date, hypothesis, secondary_metrics, guardrail_metrics, min_samples, owner). At runtime the activation job picks a variant using actions/cache to maintain consistent assignment across runs. Use ${{ experiments.<name> }} in the workflow prompt to reference the selected variant. When multiple experiments are declared, assignments are statistically balanced using a least-used counter that round-robins across variants (or weighted when 'weight' is provided); ties are broken randomly so no variant is systematically favoured on the first run.",
       "type": "object",
       "propertyNames": {
         "pattern": "^[a-zA-Z_][a-zA-Z0-9_]*$",

From e2084d94388ccb08976e96c98aab18ba5e0273dc Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 3 May 2026 18:26:51 +0000
Subject: [PATCH 2/3] refactor: optimize pickVariant to single pass and improve
 test robustness

- Use a single-pass algorithm in pickVariant(): build the tied array
  during the minCount scan (O(n) instead of O(2n))
- Increase probabilistic tie-break tests from 100 to 200 iterations for
  consistency with the three-variant test; add comments documenting the
  negligible (1/2)^200 flakiness bound

Agent-Logs-Url: https://github.com/github/gh-aw/sessions/58f16353-bee5-464f-91f9-9fee1f89f203

Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com>
---
 actions/setup/js/pick_experiment.cjs      | 5 ++++-
 actions/setup/js/pick_experiment.test.cjs | 6 ++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/actions/setup/js/pick_experiment.cjs b/actions/setup/js/pick_experiment.cjs
index 6715c9335e..6ad811d9a6 100644
--- a/actions/setup/js/pick_experiment.cjs
+++ b/actions/setup/js/pick_experiment.cjs
@@ -138,13 +138,16 @@ function isWithinDateWindow(startDate, endDate, todayOverride) {
 function pickVariant(name, variants, state) {
   const counts = state.counts[name] || {};
   let minCount = Infinity;
+  let tied = [];
   for (const variant of variants) {
     const c = counts[variant] || 0;
     if (c < minCount) {
       minCount = c;
+      tied = [variant];
+    } else if (c === minCount) {
+      tied.push(variant);
     }
   }
-  const tied = variants.filter(v => (counts[v] || 0) === minCount);
   return tied[Math.floor(Math.random() * tied.length)];
 }
 
diff --git a/actions/setup/js/pick_experiment.test.cjs b/actions/setup/js/pick_experiment.test.cjs
index 56bada9be1..0775a6761e 100644
--- a/actions/setup/js/pick_experiment.test.cjs
+++ b/actions/setup/js/pick_experiment.test.cjs
@@ -36,8 +36,9 @@ describe("pick_experiment", () => {
     it("selects one of the tied variants randomly when counts are equal", () => {
       const state = { counts: {} };
       // Run many times and verify both variants are eventually selected.
+      // The probability that one variant never appears in 200 trials is (1/2)^200, which is negligible.
       const results = new Set();
-      for (let i = 0; i < 100; i++) {
+      for (let i = 0; i < 200; i++) {
         results.add(pickVariant("f", ["A", "B"], state));
       }
       expect(results).toContain("A");
@@ -69,8 +70,9 @@ describe("pick_experiment", () => {
     it("handles unknown experiment name (no counts yet) by picking randomly", () => {
       const state = { counts: {} };
       // Both variants must be reachable from an empty state.
+      // The probability that one variant never appears in 200 trials is (1/2)^200, which is negligible.
       const results = new Set();
-      for (let i = 0; i < 100; i++) {
+      for (let i = 0; i < 200; i++) {
         results.add(pickVariant("new", ["X", "Y"], state));
       }
       expect(results).toContain("X");

From 395093392d545e453fff2671934b57bd609b04ce Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 3 May 2026 18:43:32 +0000
Subject: [PATCH 3/3] test: replace probabilistic tie-break tests with
 deterministic Math.random mocks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use vi.spyOn(Math, 'random').mockReturnValueOnce() with specific values
instead of running 200-iteration loops that rely on real randomness.
Each test now asserts an exact variant selection for a controlled random
input, eliminating any possibility of CI flakiness.

- 2-variant tie: 0 → "A", 0.5 → "B"
- 3-variant tie: 0 → "A", 0.4 → "B", 0.7 → "C"
- empty-cache 2-variant: 0 → "X", 0.5 → "Y"

Agent-Logs-Url: https://github.com/github/gh-aw/sessions/c7876ebb-f617-451d-b588-19e0a6b7aa98

Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com>
---
 actions/setup/js/pick_experiment.test.cjs | 55 ++++++++++++-----------
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/actions/setup/js/pick_experiment.test.cjs b/actions/setup/js/pick_experiment.test.cjs
index 0775a6761e..cfc8357b42 100644
--- a/actions/setup/js/pick_experiment.test.cjs
+++ b/actions/setup/js/pick_experiment.test.cjs
@@ -33,16 +33,17 @@ describe("pick_experiment", () => {
   // ── pickVariant ────────────────────────────────────────────────────────────
 
   describe("pickVariant", () => {
-    it("selects one of the tied variants randomly when counts are equal", () => {
+    it("breaks ties randomly for two-variant experiment when counts are equal", () => {
       const state = { counts: {} };
-      // Run many times and verify both variants are eventually selected.
-      // The probability that one variant never appears in 200 trials is (1/2)^200, which is negligible.
-      const results = new Set();
-      for (let i = 0; i < 200; i++) {
-        results.add(pickVariant("f", ["A", "B"], state));
-      }
-      expect(results).toContain("A");
-      expect(results).toContain("B");
+      // Math.floor(0 * 2) = 0 → tied[0] = "A"
+      vi.spyOn(Math, "random").mockReturnValueOnce(0);
+      expect(pickVariant("f", ["A", "B"], state)).toBe("A");
+
+      // Math.floor(0.5 * 2) = 1 → tied[1] = "B"
+      vi.spyOn(Math, "random").mockReturnValueOnce(0.5);
+      expect(pickVariant("f", ["A", "B"], state)).toBe("B");
+
+      vi.restoreAllMocks();
     });
 
     it("selects the least-used variant", () => {
@@ -57,26 +58,30 @@ describe("pick_experiment", () => {
 
     it("randomly selects from all tied variants when all counts are equal", () => {
       const state = { counts: { f: { A: 1, B: 1, C: 1 } } };
-      // Run many times and verify all three variants are selected.
-      const results = new Set();
-      for (let i = 0; i < 200; i++) {
-        results.add(pickVariant("f", ["A", "B", "C"], state));
-      }
-      expect(results).toContain("A");
-      expect(results).toContain("B");
-      expect(results).toContain("C");
+      // All three variants are tied; verify the random index is respected.
+      // Math.floor(0   * 3) = 0 → tied[0] = "A"
+      // Math.floor(0.4 * 3) = 1 → tied[1] = "B"  (0.4*3=1.2)
+      // Math.floor(0.7 * 3) = 2 → tied[2] = "C"  (0.7*3=2.1)
+      vi.spyOn(Math, "random").mockReturnValueOnce(0).mockReturnValueOnce(0.4).mockReturnValueOnce(0.7);
+      expect(pickVariant("f", ["A", "B", "C"], state)).toBe("A");
+      expect(pickVariant("f", ["A", "B", "C"], state)).toBe("B");
+      expect(pickVariant("f", ["A", "B", "C"], state)).toBe("C");
+
+      vi.restoreAllMocks();
     });
 
     it("handles unknown experiment name (no counts yet) by picking randomly", () => {
       const state = { counts: {} };
-      // Both variants must be reachable from an empty state.
-      // The probability that one variant never appears in 200 trials is (1/2)^200, which is negligible.
-      const results = new Set();
-      for (let i = 0; i < 200; i++) {
-        results.add(pickVariant("new", ["X", "Y"], state));
-      }
-      expect(results).toContain("X");
-      expect(results).toContain("Y");
+      // Both variants are tied with zero counts; verify the random index is respected.
+      // Math.floor(0   * 2) = 0 → tied[0] = "X"
+      // Math.floor(0.5 * 2) = 1 → tied[1] = "Y"
+      vi.spyOn(Math, "random").mockReturnValueOnce(0);
+      expect(pickVariant("new", ["X", "Y"], state)).toBe("X");
+
+      vi.spyOn(Math, "random").mockReturnValueOnce(0.5);
+      expect(pickVariant("new", ["X", "Y"], state)).toBe("Y");
+
+      vi.restoreAllMocks();
     });
   });