Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions actions/setup/js/pick_experiment.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
* Algorithm:
* When weight is provided the variant is chosen by weighted-random selection.
* Otherwise the variant with the lowest invocation count is selected next (ties are
* broken by variant order, yielding a deterministic round-robin across runs).
* broken by random selection, ensuring no variant is systematically favoured on the
* first run or whenever counts are equal).
* When start_date or end_date is provided and today falls outside that window the
* control variant (first variant) is used and no counter is incremented.
*/
Expand Down Expand Up @@ -124,8 +125,10 @@ function isWithinDateWindow(startDate, endDate, todayOverride) {

/**
* Pick the variant for one experiment using a balanced least-used selection.
* The variant with the lowest cumulative count is chosen; ties are broken by
* the order of the variants array so selection is deterministic.
* The variant with the lowest cumulative count is chosen; when multiple variants
* share the lowest count (including the initial empty-cache state where all counts
* are zero), one is selected at random to avoid systematically favouring the first
* declared variant.
*
* @param {string} name - Experiment name
* @param {string[]} variants - Array of variant values (length >= 2)
Expand All @@ -135,15 +138,17 @@ function isWithinDateWindow(startDate, endDate, todayOverride) {
function pickVariant(name, variants, state) {
  // Balanced least-used selection: gather every variant that shares the lowest
  // cumulative count, then choose one of them uniformly at random. Random
  // tie-breaking (which also covers the empty-cache state where every count is
  // zero) prevents the first declared variant from being systematically favoured.
  const counts = state.counts[name] || {};
  let minCount = Infinity;
  let tied = [];
  for (const variant of variants) {
    // A variant that has never been recorded counts as zero invocations.
    const c = counts[variant] || 0;
    if (c < minCount) {
      // Strictly lower count found: restart the tie list with this variant.
      minCount = c;
      tied = [variant];
    } else if (c === minCount) {
      tied.push(variant);
    }
  }
  // Math.random() is in [0, 1), so the computed index is always < tied.length.
  return tied[Math.floor(Math.random() * tied.length)];
}

/**
Expand Down Expand Up @@ -279,7 +284,7 @@ async function writeSummary(assignments, configs, state, core) {
}
}

lines.push("_Variants are selected by balanced round-robin (or weighted) to ensure statistical relevance across runs._");
lines.push("_Variants are selected by balanced round-robin (or weighted) to ensure statistical relevance across runs. Ties are broken randomly so no variant is systematically favoured on the first run._");
await core.summary.addRaw(lines.join("\n")).write();
}

Expand Down
59 changes: 54 additions & 5 deletions actions/setup/js/pick_experiment.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,17 @@ describe("pick_experiment", () => {
// ── pickVariant ────────────────────────────────────────────────────────────

describe("pickVariant", () => {
it("selects the first variant when counts are equal", () => {
it("breaks ties randomly for two-variant experiment when counts are equal", () => {
const state = { counts: {} };
// Math.floor(0 * 2) = 0 → tied[0] = "A"
vi.spyOn(Math, "random").mockReturnValueOnce(0);
expect(pickVariant("f", ["A", "B"], state)).toBe("A");

// Math.floor(0.5 * 2) = 1 → tied[1] = "B"
vi.spyOn(Math, "random").mockReturnValueOnce(0.5);
expect(pickVariant("f", ["A", "B"], state)).toBe("B");

vi.restoreAllMocks();
});

it("selects the least-used variant", () => {
Expand All @@ -48,14 +56,32 @@ describe("pick_experiment", () => {
expect(pickVariant("f", ["A", "B", "C"], state)).toBe("C");
});

it("returns the first variant when all counts are equal (tie-break by order)", () => {
it("randomly selects from all tied variants when all counts are equal", () => {
const state = { counts: { f: { A: 1, B: 1, C: 1 } } };
// All three variants are tied; verify the random index is respected.
// Math.floor(0 * 3) = 0 → tied[0] = "A"
// Math.floor(0.4 * 3) = 1 → tied[1] = "B" (0.4*3=1.2)
// Math.floor(0.7 * 3) = 2 → tied[2] = "C" (0.7*3=2.1)
vi.spyOn(Math, "random").mockReturnValueOnce(0).mockReturnValueOnce(0.4).mockReturnValueOnce(0.7);
expect(pickVariant("f", ["A", "B", "C"], state)).toBe("A");
expect(pickVariant("f", ["A", "B", "C"], state)).toBe("B");
expect(pickVariant("f", ["A", "B", "C"], state)).toBe("C");

vi.restoreAllMocks();
});

it("handles unknown experiment name (no counts yet)", () => {
it("handles unknown experiment name (no counts yet) by picking randomly", () => {
const state = { counts: {} };
// Both variants are tied with zero counts; verify the random index is respected.
// Math.floor(0 * 2) = 0 → tied[0] = "X"
// Math.floor(0.5 * 2) = 1 → tied[1] = "Y"
vi.spyOn(Math, "random").mockReturnValueOnce(0);
expect(pickVariant("new", ["X", "Y"], state)).toBe("X");

vi.spyOn(Math, "random").mockReturnValueOnce(0.5);
expect(pickVariant("new", ["X", "Y"], state)).toBe("Y");

vi.restoreAllMocks();
});
});

Expand Down Expand Up @@ -147,13 +173,18 @@ describe("pick_experiment", () => {
process.env.GH_AW_EXPERIMENT_STATE_FILE = stateFile;
process.env.GH_AW_EXPERIMENT_STATE_DIR = tmpDir;

// Force Math.random → 0 so the first tied variant ("A") is selected.
vi.spyOn(Math, "random").mockReturnValue(0);

await main();

// Individual output per experiment
expect(mockCore.setOutput).toHaveBeenCalledWith("feature1", "A");
// Combined JSON output
expect(mockCore.setOutput).toHaveBeenCalledWith("experiments", JSON.stringify({ feature1: "A" }));
expect(mockCore.setFailed).not.toHaveBeenCalled();

vi.restoreAllMocks();
});

it("persists state between calls to simulate multi-run balance", async () => {
Expand All @@ -164,14 +195,18 @@ describe("pick_experiment", () => {
process.env.GH_AW_EXPERIMENT_STATE_FILE = stateFile;
process.env.GH_AW_EXPERIMENT_STATE_DIR = tmpDir;

// Force Math.random → 0 so the first tied variant ("X") is selected on the first run.
vi.spyOn(Math, "random").mockReturnValue(0);

// First run → X
await main();
const firstCall = mockCore.setOutput.mock.calls.find(c => c[0] === "feat");
expect(firstCall?.[1]).toBe("X");

vi.restoreAllMocks();
vi.clearAllMocks();

// Second run → Y (state persisted from first call)
// Second run → Y (state persisted from first call; Y has the lower count)
await main();
const secondCall = mockCore.setOutput.mock.calls.find(c => c[0] === "feat");
expect(secondCall?.[1]).toBe("Y");
Expand All @@ -198,12 +233,17 @@ describe("pick_experiment", () => {
process.env.GH_AW_EXPERIMENT_STATE_FILE = stateFile;
process.env.GH_AW_EXPERIMENT_STATE_DIR = tmpDir;

// Force Math.random → 0 so the first tied variant is chosen for each experiment.
vi.spyOn(Math, "random").mockReturnValue(0);

await main();

const assignmentsFile = path.join(tmpDir, "assignments.json");
expect(fs.existsSync(assignmentsFile)).toBe(true);
const assignments = JSON.parse(fs.readFileSync(assignmentsFile, "utf8"));
expect(assignments).toEqual({ feature1: "A", style: "concise" });

vi.restoreAllMocks();
});

it("overwrites assignments.json on successive runs reflecting the current variant", async () => {
Expand All @@ -212,14 +252,18 @@ describe("pick_experiment", () => {
process.env.GH_AW_EXPERIMENT_STATE_FILE = stateFile;
process.env.GH_AW_EXPERIMENT_STATE_DIR = tmpDir;

// Force Math.random → 0 so the first tied variant ("X") is chosen on the first run.
vi.spyOn(Math, "random").mockReturnValue(0);

// First run → X
await main();
const assignmentsFile = path.join(tmpDir, "assignments.json");
expect(JSON.parse(fs.readFileSync(assignmentsFile, "utf8"))).toEqual({ feat: "X" });

vi.restoreAllMocks();
vi.clearAllMocks();

// Second run → Y
// Second run → Y (Y has the lower count after first run recorded X)
await main();
expect(JSON.parse(fs.readFileSync(assignmentsFile, "utf8"))).toEqual({ feat: "Y" });
});
Expand Down Expand Up @@ -265,10 +309,15 @@ describe("pick_experiment", () => {
process.env.GH_AW_EXPERIMENT_STATE_FILE = stateFile;
process.env.GH_AW_EXPERIMENT_STATE_DIR = tmpDir;

// Force Math.random → 0 so the first tied variant ("concise") is chosen.
vi.spyOn(Math, "random").mockReturnValue(0);

await main();

expect(mockCore.setOutput).toHaveBeenCalledWith("style", "concise");
expect(mockCore.setFailed).not.toHaveBeenCalled();

vi.restoreAllMocks();
});

it("uses control variant when today is before start_date", async () => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ An external feature-flag service would provide a mature A/B testing API with das

#### Alternative 2: Random Per-Run Variant Selection Without State Persistence

Selecting a variant randomly on each run (e.g., `Math.random()`) requires no cache and no persistent state. This was rejected because it does not guarantee balance: over N runs, some variants may appear much more (or less) often than `N/K`, making statistically meaningful comparisons impossible without a large number of runs. The least-used counter approach achieves approximate balance in far fewer runs.
Selecting a variant randomly on each run (e.g., `Math.random()`) requires no cache and no persistent state. This was rejected as the sole selection strategy because it does not guarantee balance: over N runs, some variants may appear much more (or less) often than `N/K`, making statistically meaningful comparisons impossible without a large number of runs. The least-used counter approach achieves approximate balance in far fewer runs. However, random selection is retained as the tie-breaking strategy within the least-used algorithm — when multiple variants share the minimum count (including the initial empty-cache state), one is chosen at random to avoid systematically favouring the first declared variant.

#### Alternative 3: CI/CD Environment Variables Set Externally

Expand Down Expand Up @@ -65,7 +65,7 @@ Teams could manually pass variant values as repository variables or dispatch inp
### Variant Selection

5. Implementations **MUST** select the variant with the lowest cumulative invocation count across all previous runs (least-used selection).
6. When two or more variants share the lowest count (including the initial state where all counts are zero), implementations **MUST** break ties by selecting the variant appearing earliest in the declared list.
6. When two or more variants share the lowest count (including the initial state where all counts are zero), implementations **MUST** break ties by selecting uniformly at random from the tied variants, so no variant is systematically favoured on the first run or whenever counts are equal.
7. Variant counts **MUST** be persisted between workflow runs using the GitHub Actions cache, keyed by a combination of the sanitized workflow ID and the current run ID, with a restore-key prefix that matches any prior run for that workflow ID.
8. Implementations **MUST** expose each selected variant as a named step output (`steps.pick-experiment.outputs.<experiment-name>`) and **MUST** also set a combined JSON output (`steps.pick-experiment.outputs.experiments`) containing all variant assignments.
9. Implementations **MUST** upload the experiment state directory as an artifact named `experiment` (using `if: always()`) so that assignments are available for post-run analysis even when subsequent steps fail.
Expand Down
2 changes: 1 addition & 1 deletion docs/src/content/docs/guides/experiments.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ Address the issue described above.

## Statistical balancing

The activation job maintains a per-variant invocation counter in an `actions/cache` entry keyed by workflow ID. The variant with the lowest cumulative count is selected on each run; ties are broken by variant order. Over N runs every variant is used approximately N/K times (K = variant count), providing basic A/B balance with no configuration.
The activation job maintains a per-variant invocation counter in an `actions/cache` entry keyed by workflow ID. The variant with the lowest cumulative count is selected on each run; when multiple variants share the lowest count (including the very first run when the cache is empty), one is chosen at random so no variant is systematically favoured. Over N runs every variant is used approximately N/K times (K = variant count), providing basic A/B balance with no configuration.

The counter persists across workflow runs via the GitHub Actions cache. A fresh repository starts from zero counts.

Expand Down
2 changes: 1 addition & 1 deletion pkg/parser/schemas/main_workflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -2749,7 +2749,7 @@
]
},
"experiments": {
"description": "A/B testing experiments. Each key is an experiment name; the value is either an array of two or more variant strings (bare-array form) or an object with a 'variants' field plus optional metadata fields (description, metric, weight, issue, start_date, end_date, hypothesis, secondary_metrics, guardrail_metrics, min_samples, owner). At runtime the activation job picks a variant using actions/cache to maintain consistent assignment across runs. Use ${{ experiments.<name> }} in the workflow prompt to reference the selected variant. When multiple experiments are declared, assignments are statistically balanced using a counter that round-robins across variants (or weighted when 'weight' is provided).",
"description": "A/B testing experiments. Each key is an experiment name; the value is either an array of two or more variant strings (bare-array form) or an object with a 'variants' field plus optional metadata fields (description, metric, weight, issue, start_date, end_date, hypothesis, secondary_metrics, guardrail_metrics, min_samples, owner). At runtime the activation job picks a variant using actions/cache to maintain consistent assignment across runs. Use ${{ experiments.<name> }} in the workflow prompt to reference the selected variant. When multiple experiments are declared, assignments are statistically balanced using a least-used counter that round-robins across variants (or weighted when 'weight' is provided); ties are broken randomly so no variant is systematically favoured on the first run.",
"type": "object",
"propertyNames": {
"pattern": "^[a-zA-Z_][a-zA-Z0-9_]*$",
Expand Down