From 0272d4e1cc93dd4e5a47608a25cdb3a459b4177f Mon Sep 17 00:00:00 2001 From: skuruppu Date: Mon, 7 Sep 2020 12:35:32 +1000 Subject: [PATCH] fix: retry PDML on EOS on DATA error (#5238) * fix: retry PDML on EOS on DATA error For long-running PDML queries (>= 30mins), there's a possibility that the gRPC connection is terminated with an error "Received unexpected EOS on DATA frame from server". We now retry the same transaction on this error. Fixes #5209 * test: added unit test to verify retry on error --- .../SpannerCommandTests.cs | 20 +++++++++++ .../V1/SpannerClientHelpers.cs | 30 ++++++++++++++++ .../EphemeralTransaction.cs | 34 ++++++++++++++----- 3 files changed, 75 insertions(+), 9 deletions(-) diff --git a/apis/Google.Cloud.Spanner.Data/Google.Cloud.Spanner.Data.Tests/SpannerCommandTests.cs b/apis/Google.Cloud.Spanner.Data/Google.Cloud.Spanner.Data.Tests/SpannerCommandTests.cs index 9555c0fe4724..3c745c741334 100644 --- a/apis/Google.Cloud.Spanner.Data/Google.Cloud.Spanner.Data.Tests/SpannerCommandTests.cs +++ b/apis/Google.Cloud.Spanner.Data/Google.Cloud.Spanner.Data.Tests/SpannerCommandTests.cs @@ -229,6 +229,26 @@ public void ExecuteReaderHasResourceHeader() It.Is(settings => HasResourcePrefixHeader(settings))), Times.Once()); } + [Fact] + public void PdmlRetriedOnEosError() + { + Mock spannerClientMock = SpannerClientHelpers + .CreateMockClient(Logger.DefaultLogger, MockBehavior.Strict); + spannerClientMock + .SetupBatchCreateSessionsAsync() + .SetupBeginTransactionAsync() + .SetupExecuteStreamingSqlForDmlThrowingEosError(); + + SpannerConnection connection = BuildSpannerConnection(spannerClientMock); + + var command = connection.CreateDmlCommand("UPDATE abc SET xyz = 1 WHERE Id > 1"); + long rowCount = command.ExecutePartitionedUpdate(); + Assert.True(rowCount > 0); + spannerClientMock.Verify(client => client.ExecuteStreamingSql( + It.IsAny(), + It.IsAny()), Times.Exactly(3)); + } + private Mock SetupExecuteStreamingSql(string optimizerVersion = "") { Mock spannerClientMock = SpannerClientHelpers diff --git a/apis/Google.Cloud.Spanner.Data/Google.Cloud.Spanner.Data.Tests/V1/SpannerClientHelpers.cs b/apis/Google.Cloud.Spanner.Data/Google.Cloud.Spanner.Data.Tests/V1/SpannerClientHelpers.cs index ef7fbe5470ea..0c0b17ba5636 100644 --- a/apis/Google.Cloud.Spanner.Data/Google.Cloud.Spanner.Data.Tests/V1/SpannerClientHelpers.cs +++ b/apis/Google.Cloud.Spanner.Data/Google.Cloud.Spanner.Data.Tests/V1/SpannerClientHelpers.cs @@ -16,6 +16,7 @@ using Google.Api.Gax.Grpc; using Google.Api.Gax.Grpc.Testing; using Google.Api.Gax.Testing; +using Google.Cloud.Spanner.Data; using Google.Cloud.Spanner.V1.Internal.Logging; using Google.Protobuf; using Google.Protobuf.WellKnownTypes; @@ -217,6 +218,35 @@ internal static Mock SetupExecuteStreamingSql(this Mock SetupExecuteStreamingSqlForDmlThrowingEosError(this Mock spannerClientMock) + { + const string eosError = "Received unexpected EOS on DATA frame from server"; + spannerClientMock + .SetupSequence(client => client.ExecuteStreamingSql( + It.IsAny(), + It.IsAny())) + .Throws(new SpannerException(ErrorCode.Internal, eosError)) + .Throws(new SpannerException(ErrorCode.Internal, eosError)) + .Returns(() => + { + IEnumerable results = new string[] {"token1", "token2", "token3"} + .Select((resumeToken, index) => new PartialResultSet + { + ResumeToken = ByteString.CopyFromUtf8(resumeToken), + Stats = new ResultSetStats + { + RowCountExact = 10 + } + }) + .ToList(); + var asyncResults = new AsyncStreamAdapter(results.ToAsyncEnumerable().GetAsyncEnumerator(default)); + var call = new AsyncServerStreamingCall(asyncResults, + Task.FromResult(new Metadata()), () => new Status(), () => new Metadata(), () => { }); + return new ExecuteStreamingSqlStreamImpl(call); + }); + return spannerClientMock; + } + private static ISetupSequentialResult SetupFailures(this ISetupSequentialResult setup, int failures, StatusCode statusCode, string exceptionMessage, TimeSpan? exceptionRetryDelay) { var exception = BuildRpcException(exceptionRetryDelay, statusCode, exceptionMessage); diff --git a/apis/Google.Cloud.Spanner.Data/Google.Cloud.Spanner.Data/EphemeralTransaction.cs b/apis/Google.Cloud.Spanner.Data/Google.Cloud.Spanner.Data/EphemeralTransaction.cs index 380a3f08103c..8c9a5b511459 100644 --- a/apis/Google.Cloud.Spanner.Data/Google.Cloud.Spanner.Data/EphemeralTransaction.cs +++ b/apis/Google.Cloud.Spanner.Data/Google.Cloud.Spanner.Data/EphemeralTransaction.cs @@ -14,6 +14,7 @@ using Google.Api.Gax; using Google.Cloud.Spanner.V1; +using System; using System.Collections.Generic; using System.Threading; using System.Threading.Tasks; @@ -45,17 +46,32 @@ async Task Impl() using (var transaction = await _connection.BeginTransactionImplAsync(_transactionOptions, TransactionMode.ReadWrite, cancellationToken).ConfigureAwait(false)) { transaction.CommitTimeout = timeoutSeconds; - long count = await ((ISpannerTransaction)transaction) - .ExecuteDmlAsync(request, cancellationToken, timeoutSeconds) - .ConfigureAwait(false); - - // This is somewhat ugly. PDML commits as it goes, so we don't need to, whereas non-partitioned - // DML needs the commit afterwards to finish up. - if (_transactionOptions.ModeCase != TransactionOptions.ModeOneofCase.PartitionedDml) + while (true) { - await transaction.CommitAsync(cancellationToken).ConfigureAwait(false); + try + { + long count = await ((ISpannerTransaction)transaction) + .ExecuteDmlAsync(request, cancellationToken, timeoutSeconds) + .ConfigureAwait(false); + + // This is somewhat ugly. PDML commits as it goes, so we don't need to, whereas non-partitioned + // DML needs the commit afterwards to finish up. + if (_transactionOptions.ModeCase != TransactionOptions.ModeOneofCase.PartitionedDml) + { + await transaction.CommitAsync(cancellationToken).ConfigureAwait(false); + } + return count; + } + catch (SpannerException e) when ( + _transactionOptions.ModeCase == TransactionOptions.ModeOneofCase.PartitionedDml && + e.ErrorCode == ErrorCode.Internal && + e.Message.Contains("Received unexpected EOS on DATA frame from server")) + { + // Retry with the same transaction. Since this error happens in long-lived + // transactions (>= 30 mins), it's unnecessary to do exponential backoff. + continue; + } } - return count; } } }