From 5ac50f7b0c11dd56d3f98866b061624b85b62ead Mon Sep 17 00:00:00 2001 From: Yahor Yuzefovich Date: Thu, 30 Oct 2025 19:59:35 +0000 Subject: [PATCH] copy: fix vectorized auto commit behavior When we implemented the vectorized INSERT which supports COPY in some cases, we missed one condition for auto-committing the txn that is present in the regular `tableWriterBase` path. Namely, we need to check whether the deadline that might be set on the txn hasn't expired yet, and if it has, we shouldn't be auto-committing and should be leaving it up to the connExecutor (which will try to refresh the deadline). The impact of the bug is that often if COPY took longer than 40s (controlled via `server.sqlliveness.ttl`), we'd hit the txn retry error and propagate it to the client. Release note (bug fix): Previously, the "atomic" COPY command (controlled via `copy_from_atomic_enabled`, which is `true` by default) could encounter RETRY_COMMIT_DEADLINE_EXCEEDED txn errors if the whole command took 1 minute or more. This was the case only when the vectorized engine was used for COPY and is now fixed. --- pkg/cmd/roachtest/tests/copyfrom.go | 11 +---------- pkg/sql/colexec/insert.go | 10 ++++++++-- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/pkg/cmd/roachtest/tests/copyfrom.go b/pkg/cmd/roachtest/tests/copyfrom.go index 04c6df98ed60..5723e2563b4f 100644 --- a/pkg/cmd/roachtest/tests/copyfrom.go +++ b/pkg/cmd/roachtest/tests/copyfrom.go @@ -148,16 +148,7 @@ func runCopyFromCRDB(ctx context.Context, t test.Test, c cluster.Cluster, sf int // Enable the verbose logging on relevant files to have better understanding // in case the test fails. startOpts.RoachprodOpts.ExtraArgs = append(startOpts.RoachprodOpts.ExtraArgs, "--vmodule=copy_from=2,insert=2") - // The roachtest frequently runs on overloaded instances and can timeout as - // a result. We've seen cases where the atomic COPY takes about 2 minutes to - // complete, so we set the closed TS and SQL liveness TTL to 5 minutes (to - // give enough safety gap, otherwise the closed TS system and lease system - // expireation might continuously push the COPY txn not allowing it ever to - // complete). - clusterSettings := install.MakeClusterSettings(install.ClusterSettingsOption{ - "kv.closed_timestamp.target_duration": "300s", - "server.sqlliveness.ttl": "300s", - }) + clusterSettings := install.MakeClusterSettings() c.Start(ctx, t.L(), startOpts, clusterSettings, c.All()) initTest(ctx, t, c, sf) db, err := c.ConnE(ctx, t.L(), 1) diff --git a/pkg/sql/colexec/insert.go b/pkg/sql/colexec/insert.go index 34b055bb7edf..cb16fb2b01ea 100644 --- a/pkg/sql/colexec/insert.go +++ b/pkg/sql/colexec/insert.go @@ -190,9 +190,15 @@ func (v *vectorInserter) Next() coldata.Batch { } colexecerror.ExpectedError(err) } - log.VEventf(ctx, 2, "copy running batch, autocommit: %v, final: %v, numrows: %d", v.autoCommit, end == b.Length(), end-start) + // Similar to tableWriterBase.finalize, we examine whether it's likely + // that we'll be able to auto-commit. If it seems unlikely based on the + // deadlie, we won't auto-commit which might allow the connExecutor to + // get a fresh deadline before committing. + autoCommit := v.autoCommit && end == b.Length() && + !v.flowCtx.Txn.DeadlineLikelySufficient() + log.VEventf(ctx, 2, "copy running batch, autocommit: %v, numrows: %d", autoCommit, end-start) var err error - if v.autoCommit && end == b.Length() { + if autoCommit { err = v.flowCtx.Txn.CommitInBatch(ctx, kvba.Batch) } else { err = v.flowCtx.Txn.Run(ctx, kvba.Batch)