diff --git a/actions/setup/js/pick_experiment.cjs b/actions/setup/js/pick_experiment.cjs index 1f23d10125..6ad811d9a6 100644 --- a/actions/setup/js/pick_experiment.cjs +++ b/actions/setup/js/pick_experiment.cjs @@ -20,7 +20,8 @@ * Algorithm: * When weight is provided the variant is chosen by weighted-random selection. * Otherwise the variant with the lowest invocation count is selected next (ties are - * broken by variant order, yielding a deterministic round-robin across runs). + * broken by random selection, ensuring no variant is systematically favoured on the + * first run or whenever counts are equal). * When start_date or end_date is provided and today falls outside that window the * control variant (first variant) is used and no counter is incremented. */ @@ -124,8 +125,10 @@ function isWithinDateWindow(startDate, endDate, todayOverride) { /** * Pick the variant for one experiment using a balanced least-used selection. - * The variant with the lowest cumulative count is chosen; ties are broken by - * the order of the variants array so selection is deterministic. + * The variant with the lowest cumulative count is chosen; when multiple variants + * share the lowest count (including the initial empty-cache state where all counts + * are zero), one is selected at random to avoid systematically favouring the first + * declared variant. * * @param {string} name - Experiment name * @param {string[]} variants - Array of variant values (length >= 2) @@ -135,15 +138,17 @@ function isWithinDateWindow(startDate, endDate, todayOverride) { function pickVariant(name, variants, state) { const counts = state.counts[name] || {}; let minCount = Infinity; - let selected = variants[0]; + let tied = []; for (const variant of variants) { const c = counts[variant] || 0; if (c < minCount) { minCount = c; - selected = variant; + tied = [variant]; + } else if (c === minCount) { + tied.push(variant); } } - return selected; + return tied[Math.floor(Math.random() * tied.length)]; } /** @@ -279,7 +284,7 @@ async function writeSummary(assignments, configs, state, core) { } } - lines.push("_Variants are selected by balanced round-robin (or weighted) to ensure statistical relevance across runs._"); + lines.push("_Variants are selected by balanced round-robin (or weighted) to ensure statistical relevance across runs. Ties are broken randomly so no variant is systematically favoured on the first run._"); await core.summary.addRaw(lines.join("\n")).write(); } diff --git a/actions/setup/js/pick_experiment.test.cjs b/actions/setup/js/pick_experiment.test.cjs index 91f1878e83..cfc8357b42 100644 --- a/actions/setup/js/pick_experiment.test.cjs +++ b/actions/setup/js/pick_experiment.test.cjs @@ -33,9 +33,17 @@ describe("pick_experiment", () => { // ── pickVariant ──────────────────────────────────────────────────────────── describe("pickVariant", () => { - it("selects the first variant when counts are equal", () => { + it("breaks ties randomly for two-variant experiment when counts are equal", () => { const state = { counts: {} }; + // Math.floor(0 * 2) = 0 → tied[0] = "A" + vi.spyOn(Math, "random").mockReturnValueOnce(0); expect(pickVariant("f", ["A", "B"], state)).toBe("A"); + + // Math.floor(0.5 * 2) = 1 → tied[1] = "B" + vi.spyOn(Math, "random").mockReturnValueOnce(0.5); + expect(pickVariant("f", ["A", "B"], state)).toBe("B"); + + vi.restoreAllMocks(); }); it("selects the least-used variant", () => { @@ -48,14 +56,32 @@ describe("pick_experiment", () => { expect(pickVariant("f", ["A", "B", "C"], state)).toBe("C"); }); - it("returns the first variant when all counts are equal (tie-break by order)", () => { + it("randomly selects from all tied variants when all counts are equal", () => { const state = { counts: { f: { A: 1, B: 1, C: 1 } } }; + // All three variants are tied; verify the random index is respected. + // Math.floor(0 * 3) = 0 → tied[0] = "A" + // Math.floor(0.4 * 3) = 1 → tied[1] = "B" (0.4*3=1.2) + // Math.floor(0.7 * 3) = 2 → tied[2] = "C" (0.7*3=2.1) + vi.spyOn(Math, "random").mockReturnValueOnce(0).mockReturnValueOnce(0.4).mockReturnValueOnce(0.7); expect(pickVariant("f", ["A", "B", "C"], state)).toBe("A"); + expect(pickVariant("f", ["A", "B", "C"], state)).toBe("B"); + expect(pickVariant("f", ["A", "B", "C"], state)).toBe("C"); + + vi.restoreAllMocks(); }); - it("handles unknown experiment name (no counts yet)", () => { + it("handles unknown experiment name (no counts yet) by picking randomly", () => { const state = { counts: {} }; + // Both variants are tied with zero counts; verify the random index is respected. + // Math.floor(0 * 2) = 0 → tied[0] = "X" + // Math.floor(0.5 * 2) = 1 → tied[1] = "Y" + vi.spyOn(Math, "random").mockReturnValueOnce(0); expect(pickVariant("new", ["X", "Y"], state)).toBe("X"); + + vi.spyOn(Math, "random").mockReturnValueOnce(0.5); + expect(pickVariant("new", ["X", "Y"], state)).toBe("Y"); + + vi.restoreAllMocks(); }); }); @@ -147,6 +173,9 @@ describe("pick_experiment", () => { process.env.GH_AW_EXPERIMENT_STATE_FILE = stateFile; process.env.GH_AW_EXPERIMENT_STATE_DIR = tmpDir; + // Force Math.random → 0 so the first tied variant ("A") is selected. + vi.spyOn(Math, "random").mockReturnValue(0); + await main(); // Individual output per experiment @@ -154,6 +183,8 @@ describe("pick_experiment", () => { // Combined JSON output expect(mockCore.setOutput).toHaveBeenCalledWith("experiments", JSON.stringify({ feature1: "A" })); expect(mockCore.setFailed).not.toHaveBeenCalled(); + + vi.restoreAllMocks(); }); it("persists state between calls to simulate multi-run balance", async () => { @@ -164,14 +195,18 @@ describe("pick_experiment", () => { process.env.GH_AW_EXPERIMENT_STATE_FILE = stateFile; process.env.GH_AW_EXPERIMENT_STATE_DIR = tmpDir; + // Force Math.random → 0 so the first tied variant ("X") is selected on the first run. + vi.spyOn(Math, "random").mockReturnValue(0); + // First run → X await main(); const firstCall = mockCore.setOutput.mock.calls.find(c => c[0] === "feat"); expect(firstCall?.[1]).toBe("X"); + vi.restoreAllMocks(); vi.clearAllMocks(); - // Second run → Y (state persisted from first call) + // Second run → Y (state persisted from first call; Y has the lower count) await main(); const secondCall = mockCore.setOutput.mock.calls.find(c => c[0] === "feat"); expect(secondCall?.[1]).toBe("Y"); @@ -198,12 +233,17 @@ describe("pick_experiment", () => { process.env.GH_AW_EXPERIMENT_STATE_FILE = stateFile; process.env.GH_AW_EXPERIMENT_STATE_DIR = tmpDir; + // Force Math.random → 0 so the first tied variant is chosen for each experiment. + vi.spyOn(Math, "random").mockReturnValue(0); + await main(); const assignmentsFile = path.join(tmpDir, "assignments.json"); expect(fs.existsSync(assignmentsFile)).toBe(true); const assignments = JSON.parse(fs.readFileSync(assignmentsFile, "utf8")); expect(assignments).toEqual({ feature1: "A", style: "concise" }); + + vi.restoreAllMocks(); }); it("overwrites assignments.json on successive runs reflecting the current variant", async () => { @@ -212,14 +252,18 @@ describe("pick_experiment", () => { process.env.GH_AW_EXPERIMENT_STATE_FILE = stateFile; process.env.GH_AW_EXPERIMENT_STATE_DIR = tmpDir; + // Force Math.random → 0 so the first tied variant ("X") is chosen on the first run. + vi.spyOn(Math, "random").mockReturnValue(0); + // First run → X await main(); const assignmentsFile = path.join(tmpDir, "assignments.json"); expect(JSON.parse(fs.readFileSync(assignmentsFile, "utf8"))).toEqual({ feat: "X" }); + vi.restoreAllMocks(); vi.clearAllMocks(); - // Second run → Y + // Second run → Y (Y has the lower count after first run recorded X) await main(); expect(JSON.parse(fs.readFileSync(assignmentsFile, "utf8"))).toEqual({ feat: "Y" }); }); @@ -265,10 +309,15 @@ describe("pick_experiment", () => { process.env.GH_AW_EXPERIMENT_STATE_FILE = stateFile; process.env.GH_AW_EXPERIMENT_STATE_DIR = tmpDir; + // Force Math.random → 0 so the first tied variant ("concise") is chosen. + vi.spyOn(Math, "random").mockReturnValue(0); + await main(); expect(mockCore.setOutput).toHaveBeenCalledWith("style", "concise"); expect(mockCore.setFailed).not.toHaveBeenCalled(); + + vi.restoreAllMocks(); }); it("uses control variant when today is before start_date", async () => { diff --git a/docs/adr/29534-frontmatter-ab-experiments-variant-selection.md b/docs/adr/29534-frontmatter-ab-experiments-variant-selection.md index a70249b10f..5bc17e33d8 100644 --- a/docs/adr/29534-frontmatter-ab-experiments-variant-selection.md +++ b/docs/adr/29534-frontmatter-ab-experiments-variant-selection.md @@ -24,7 +24,7 @@ An external feature-flag service would provide a mature A/B testing API with das #### Alternative 2: Random Per-Run Variant Selection Without State Persistence -Selecting a variant randomly on each run (e.g., `Math.random()`) requires no cache and no persistent state. This was rejected because it does not guarantee balance: over N runs, some variants may appear much more (or less) often than `N/K`, making statistically meaningful comparisons impossible without a large number of runs. The least-used counter approach achieves approximate balance in far fewer runs. +Selecting a variant randomly on each run (e.g., `Math.random()`) requires no cache and no persistent state. This was rejected as the sole selection strategy because it does not guarantee balance: over N runs, some variants may appear much more (or less) often than `N/K`, making statistically meaningful comparisons impossible without a large number of runs. The least-used counter approach achieves approximate balance in far fewer runs. However, random selection is retained as the tie-breaking strategy within the least-used algorithm — when multiple variants share the minimum count (including the initial empty-cache state), one is chosen at random to avoid systematically favouring the first declared variant. #### Alternative 3: CI/CD Environment Variables Set Externally @@ -65,7 +65,7 @@ Teams could manually pass variant values as repository variables or dispatch inp ### Variant Selection 5. Implementations **MUST** select the variant with the lowest cumulative invocation count across all previous runs (least-used selection). -6. When two or more variants share the lowest count (including the initial state where all counts are zero), implementations **MUST** break ties by selecting the variant appearing earliest in the declared list. +6. When two or more variants share the lowest count (including the initial state where all counts are zero), implementations **MUST** break ties by selecting uniformly at random from the tied variants, so no variant is systematically favoured on the first run or whenever counts are equal. 7. Variant counts **MUST** be persisted between workflow runs using the GitHub Actions cache, keyed by a combination of the sanitized workflow ID and the current run ID, with a restore-key prefix that matches any prior run for that workflow ID. 8. Implementations **MUST** expose each selected variant as a named step output (`steps.pick-experiment.outputs.`) and **MUST** also set a combined JSON output (`steps.pick-experiment.outputs.experiments`) containing all variant assignments. 9. Implementations **MUST** upload the experiment state directory as an artifact named `experiment` (using `if: always()`) so that assignments are available for post-run analysis even when subsequent steps fail. diff --git a/docs/src/content/docs/guides/experiments.md b/docs/src/content/docs/guides/experiments.md index 779848e201..5f66aea29c 100644 --- a/docs/src/content/docs/guides/experiments.md +++ b/docs/src/content/docs/guides/experiments.md @@ -84,7 +84,7 @@ Address the issue described above. ## Statistical balancing -The activation job maintains a per-variant invocation counter in an `actions/cache` entry keyed by workflow ID. The variant with the lowest cumulative count is selected on each run; ties are broken by variant order. Over N runs every variant is used approximately N/K times (K = variant count), providing basic A/B balance with no configuration. +The activation job maintains a per-variant invocation counter in an `actions/cache` entry keyed by workflow ID. The variant with the lowest cumulative count is selected on each run; when multiple variants share the lowest count (including the very first run when the cache is empty), one is chosen at random so no variant is systematically favoured. Over N runs every variant is used approximately N/K times (K = variant count), providing basic A/B balance with no configuration. The counter persists across workflow runs via the GitHub Actions cache. A fresh repository starts from zero counts. diff --git a/pkg/parser/schemas/main_workflow_schema.json b/pkg/parser/schemas/main_workflow_schema.json index df0bbfee5a..8095ee9433 100644 --- a/pkg/parser/schemas/main_workflow_schema.json +++ b/pkg/parser/schemas/main_workflow_schema.json @@ -2749,7 +2749,7 @@ ] }, "experiments": { - "description": "A/B testing experiments. Each key is an experiment name; the value is either an array of two or more variant strings (bare-array form) or an object with a 'variants' field plus optional metadata fields (description, metric, weight, issue, start_date, end_date, hypothesis, secondary_metrics, guardrail_metrics, min_samples, owner). At runtime the activation job picks a variant using actions/cache to maintain consistent assignment across runs. Use ${{ experiments. }} in the workflow prompt to reference the selected variant. When multiple experiments are declared, assignments are statistically balanced using a counter that round-robins across variants (or weighted when 'weight' is provided).", + "description": "A/B testing experiments. Each key is an experiment name; the value is either an array of two or more variant strings (bare-array form) or an object with a 'variants' field plus optional metadata fields (description, metric, weight, issue, start_date, end_date, hypothesis, secondary_metrics, guardrail_metrics, min_samples, owner). At runtime the activation job picks a variant using actions/cache to maintain consistent assignment across runs. Use ${{ experiments. }} in the workflow prompt to reference the selected variant. When multiple experiments are declared, assignments are statistically balanced using a least-used counter that round-robins across variants (or weighted when 'weight' is provided); ties are broken randomly so no variant is systematically favoured on the first run.", "type": "object", "propertyNames": { "pattern": "^[a-zA-Z_][a-zA-Z0-9_]*$",