From ce5b17084947daa43e4c6214410969bfd171a24d Mon Sep 17 00:00:00 2001 From: Vikrant Puppala Date: Tue, 26 May 2026 09:13:58 +0000 Subject: [PATCH 1/2] [PECOBLR-2461] fix(tests/e2e): de-collide MST table names across concurrent CI jobs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_unique_table_name` derived the table name purely from the test node id, so two CI runs racing on the same warehouse + catalog (e.g. PR + push to main landing within seconds, as happened in run 26410038645) would both target `peco.default.mst_pysql_` and step on each other's CREATE / DROP / DML. This caused the three "flaky" failures we kept seeing in `test_transactions.py`: - test_multi_table_commit → TRANSACTION_ROLLBACK_REQUIRED_AFTER_ABORT - test_executemany_rollback_in_txn → TABLE_OR_VIEW_NOT_FOUND - test_write_conflict_single_table → TABLE_OR_VIEW_ALREADY_EXISTS The companion helper `_unique_table_name_raw` already appended a uuid4 suffix for exactly this reason; the fixture helper was an oversight from #775. Add the same suffix here, reserving room so the result still fits in the 80-char cap. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala --- tests/e2e/test_transactions.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/e2e/test_transactions.py b/tests/e2e/test_transactions.py index e91afc0dd..fd4fd73ab 100644 --- a/tests/e2e/test_transactions.py +++ b/tests/e2e/test_transactions.py @@ -35,10 +35,17 @@ def _unique_table_name(request): - """Derive a unique Delta table name from the test node id.""" + """Derive a unique Delta table name from the test node id. + + The uuid suffix keeps tables unique across concurrent CI jobs that + share the same warehouse/catalog — without it, two runs racing on + the same test name collide on CREATE/DROP. + """ node_id = request.node.name sanitized = re.sub(r"[^a-z0-9_]", "_", node_id.lower()) - return f"mst_pysql_{sanitized}"[:80] + suffix = uuid.uuid4().hex[:8] + # Reserve room for the 9-char "_{suffix}" tail so total stays <= 80. + return f"mst_pysql_{sanitized}"[:71] + f"_{suffix}" def _unique_table_name_raw(suffix): From 665b37010fa61cc46cb0c412b48c94c5d2a103b0 Mon Sep 17 00:00:00 2001 From: Vikrant Puppala Date: Tue, 26 May 2026 09:40:16 +0000 Subject: [PATCH 2/2] also de-collide UC Volume tests and add PR-branch concurrency cancellation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While verifying the transactions fix, a second flake surfaced — the same cross-CI race pattern, this time on a UC Volume file path: tests/e2e/test_driver.py::TestPySQLCoreSuite:: test_uc_volume_put_fails_if_file_exists_and_overwrite_not_set → Failed: DID NOT RAISE Two tests in `tests/e2e/common/uc_volume_tests.py` PUT/GET/REMOVE against the hardcoded path `/Volumes/{catalog}/{schema}/e2etests/file1.csv`. When two CI runs overlap (as happened on this PR after the force-push for DCO sign-off), one run's REMOVE deletes the other run's file between its two PUTs, so the expected `FILE_IN_STAGING_PATH_ALREADY_EXISTS` never fires. Suffix the volume path with `uuid4().hex[:8]` in the two affected tests (test_uc_volume_life_cycle and test_uc_volume_put_fails_if_file_exists_ and_overwrite_not_set). The other UC Volume tests that reference the same path are exercising client-side / server-parse failure paths that never touch the file — leaving those alone keeps the diff focused. Also add a `concurrency` block to the E2E workflow: - On pull_request, cancel-in-progress: a new push / force-push on a PR cancels the previous run on that ref. Eliminates the entire class of "force-pushed during CI → two runs racing on shared warehouse state" failures. - On push to main, runs are NOT cancelled — each merge commit keeps its own clean CI signal so a regression on commit N can't be hidden by commit N+1 landing seconds later. Concurrent main runs can still collide on shared state, but the uuid-suffix conventions in the e2e tests are what keep that isolated. Signed-off-by: Vikrant Puppala --- .github/workflows/code-coverage.yml | 13 +++++++++++++ tests/e2e/common/uc_volume_tests.py | 24 ++++++++++++++++-------- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index 9f578ec9f..c5da8ab2d 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -11,6 +11,19 @@ on: pull_request: workflow_dispatch: +# Serialise E2E runs per ref so a force-push (or a fast follow-up commit) +# on a PR cancels the previous run instead of racing it against shared +# warehouse state (Delta tables, UC Volume files, etc.). +# +# Pushes to main are NOT cancelled — each merge commit needs its own clean +# CI signal so a regression on commit N doesn't get hidden by commit N+1 +# arriving seconds later. (Concurrent main runs can still collide on shared +# state, but that's the cost of preserving per-commit signal; the +# uuid-suffix conventions in the e2e tests are what keep them isolated.) +concurrency: + group: e2e-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + jobs: test-with-coverage: runs-on: diff --git a/tests/e2e/common/uc_volume_tests.py b/tests/e2e/common/uc_volume_tests.py index 5b4086f91..cf8797c63 100644 --- a/tests/e2e/common/uc_volume_tests.py +++ b/tests/e2e/common/uc_volume_tests.py @@ -1,5 +1,6 @@ import os import tempfile +from uuid import uuid4 import pytest import databricks.sql as sql @@ -40,12 +41,16 @@ def test_uc_volume_life_cycle(self, catalog, schema): with open(fh, "wb") as fp: fp.write(original_text) + # Unique per-run path so concurrent CI jobs sharing the same volume + # don't step on each other's PUT/GET/REMOVE. + volume_path = f"/Volumes/{catalog}/{schema}/e2etests/life_cycle_{uuid4().hex[:8]}.csv" + with self.connection( extra_params={"staging_allowed_local_path": temp_path} ) as conn: cursor = conn.cursor() - query = f"PUT '{temp_path}' INTO '/Volumes/{catalog}/{schema}/e2etests/file1.csv' OVERWRITE" + query = f"PUT '{temp_path}' INTO '{volume_path}' OVERWRITE" cursor.execute(query) # GET should succeed @@ -56,7 +61,7 @@ def test_uc_volume_life_cycle(self, catalog, schema): extra_params={"staging_allowed_local_path": new_temp_path} ) as conn: cursor = conn.cursor() - query = f"GET '/Volumes/{catalog}/{schema}/e2etests/file1.csv' TO '{new_temp_path}'" + query = f"GET '{volume_path}' TO '{new_temp_path}'" cursor.execute(query) with open(new_fh, "rb") as fp: @@ -66,7 +71,7 @@ def test_uc_volume_life_cycle(self, catalog, schema): # REMOVE should succeed - remove_query = f"REMOVE '/Volumes/{catalog}/{schema}/e2etests/file1.csv'" + remove_query = f"REMOVE '{volume_path}'" # Use minimal retry settings to fail fast extra_params = { @@ -84,7 +89,7 @@ def test_uc_volume_life_cycle(self, catalog, schema): Error, match="Staging operation over HTTP was unsuccessful: 404" ): cursor = conn.cursor() - query = f"GET '/Volumes/{catalog}/{schema}/e2etests/file1.csv' TO '{new_temp_path}'" + query = f"GET '{volume_path}' TO '{new_temp_path}'" cursor.execute(query) os.remove(temp_path) @@ -151,19 +156,22 @@ def test_uc_volume_put_fails_if_file_exists_and_overwrite_not_set( with open(fh, "wb") as fp: fp.write(original_text) + # Unique per-run path so a concurrent CI job's REMOVE doesn't delete + # our file between the two PUTs and silently turn the expected + # FILE_IN_STAGING_PATH_ALREADY_EXISTS into a successful PUT. + volume_path = f"/Volumes/{catalog}/{schema}/e2etests/put_conflict_{uuid4().hex[:8]}.csv" + def perform_put(): with self.connection( extra_params={"staging_allowed_local_path": temp_path} ) as conn: cursor = conn.cursor() - query = f"PUT '{temp_path}' INTO '/Volumes/{catalog}/{schema}/e2etests/file1.csv'" + query = f"PUT '{temp_path}' INTO '{volume_path}'" cursor.execute(query) def perform_remove(): try: - remove_query = ( - f"REMOVE '/Volumes/{catalog}/{schema}/e2etests/file1.csv'" - ) + remove_query = f"REMOVE '{volume_path}'" with self.connection( extra_params={"staging_allowed_local_path": "/"}