Skip to content

Commit

Permalink
feat: Make StreamingPull fail after 100 consecutive failures
Browse files Browse the repository at this point in the history
With a max backoff of 30 seconds (which we reach pretty quickly)
this means we'll only actually fail after about 45 minutes, so it
does allow quite a lot of time for recovery.
  • Loading branch information
jskeet committed May 30, 2024
1 parent 335abae commit d9d658b
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using Xunit;
Expand Down Expand Up @@ -1784,7 +1785,7 @@ public void StreamingPullRetry_NonRetriableException()
}

[Fact]
public void StreamingPullRetry_InternalErrorRetriesForever()
public void StreamingPullRetry_InternalErrorContinuesRetrying()
{
// A regular internal failure that's not due to an auth error.
var exception = new RpcException(new Status(StatusCode.Internal, "Bang"));
Expand All @@ -1801,6 +1802,24 @@ public void StreamingPullRetry_InternalErrorRetriesForever()
});
}

[Fact]
public void StreamingPullRetry_RetriableErrorEventuallyFails()
{
// A regular internal failure that's not due to an auth error.
var exception = new RpcException(new Status(StatusCode.Internal, "Bang"));

// When we've reached a limit of the number of exceptions we're happy to retry, we'll eventually fail.
// (This will take a long time, with all the backoffs involved...)
using var fake = Fake.Create(CreateBadMoveNextSequence(TimeSpan.FromSeconds(1), exception, 100, includeTrailing: true));

fake.Scheduler.Run(async () =>
{
var subscriberTask = fake.Subscriber.StartAsync((msg, ct) => throw new Exception("No messages should be provided"));
var subscriberEx = await Assert.ThrowsAsync<RpcException>(() => subscriberTask);
Assert.Equal(exception.Status, subscriberEx.Status);
});
}

/// <summary>
/// If the streaming pull call fails in MoveNext after a short time (e.g. 10 seconds)
/// we should retry with backoff.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1122,7 +1122,11 @@ private class RetryState
private const string GrpcCoreAuthExceptionPrefix = "Getting metadata from plugin failed with error: Exception occurred in metadata credentials plugin. ";
private const int MaxAuthExceptionsBeforeFailing = 4;

private const int ExceptionLimit = 100;
/// <summary>
/// We fail after this many consecutive failures, regardless of what the failure was.
/// With the backoffs involved, this will be after a pretty significant amount of time anyway.
/// </summary>
private const int ConcurrentFailureLimit = 100;

private readonly IClock _clock;
private readonly ILogger _logger;
Expand Down Expand Up @@ -1160,9 +1164,13 @@ internal bool RecordFailureAndCheckForRetry(Exception exception)
{
return false;
}
if (_exceptions.Count < ExceptionLimit)

_exceptions.Add(rpcEx);

// If we've reached our limit, fail regardless.
if (_exceptions.Count == ConcurrentFailureLimit)
{
_exceptions.Add(rpcEx);
return false;
}

var code = rpcEx.StatusCode;
Expand All @@ -1183,6 +1191,8 @@ internal bool RecordFailureAndCheckForRetry(Exception exception)
}

// If the exception was a failure due to auth, and we've seen some before, don't retry.
// The auth-related exceptions don't need to be consecutive: if we have transient auth related
// problems, then non-auth problems, then auth related problems again, we're definitely in a bad situation.
if (IsAuthException(rpcEx))
{
var count = _exceptions.Count(IsAuthException);
Expand Down

0 comments on commit d9d658b

Please sign in to comment.