-
Notifications
You must be signed in to change notification settings - Fork 1.1k
feat(gax-grpc): add configurable resize delta and warning for repeated resizing #12838
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
03e341c
2060fcf
e876b31
b3ffb64
8a5daa1
813e101
91ffb52
de28aef
827b22d
899736f
f9792b6
856f3f2
a6a34dc
2be58da
e61486f
cbef704
2895af5
068887e
947fad5
f62aa0c
697c336
387c68e
defe1bb
e84b9d2
35234d1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -69,6 +69,11 @@ | |
| * <p>Package-private for internal use. | ||
| */ | ||
| class ChannelPool extends ManagedChannel { | ||
| static final String CHANNEL_POOL_CONSECUTIVE_RESIZING_WARNING = | ||
| "Channel pool is repeatedly resizing. " | ||
| + "Consider adjusting `initialChannelCount` or `maxResizeDelta` to a more reasonable value. " | ||
| + "See https://docs.cloud.google.com/java/docs/troubleshooting to enable logging " | ||
| + "and set `com.google.api.gax.grpc.ChannelPool.level=FINEST` to log the channel pool resize behavior."; | ||
| @VisibleForTesting static final Logger LOG = Logger.getLogger(ChannelPool.class.getName()); | ||
| private static final java.time.Duration REFRESH_PERIOD = java.time.Duration.ofMinutes(50); | ||
|
|
||
|
|
@@ -84,6 +89,16 @@ class ChannelPool extends ManagedChannel { | |
| private final AtomicInteger indexTicker = new AtomicInteger(); | ||
| private final String authority; | ||
|
|
||
| // The number of consecutive resize cycles to wait before logging a warning about repeated | ||
| // resizing. This value was chosen to detect repeated requests for changes (multiple continuous | ||
| // increase or decrease attempts) without being too sensitive. | ||
| private static final int CONSECUTIVE_RESIZE_THRESHOLD = 5; | ||
|
|
||
| // Tracks the number of consecutive resize cycles where a resize actually occurred (either expand | ||
| // or shrink). Used to detect repeated resizing activity and log a warning. | ||
| // Note: This field is only accessed safely within resizeSafely() and does not need to be atomic. | ||
|
lqiu96 marked this conversation as resolved.
|
||
| private int consecutiveResizes = 0; | ||
|
lqiu96 marked this conversation as resolved.
|
||
|
|
||
| static ChannelPool create( | ||
| ChannelPoolSettings settings, | ||
| ChannelFactory channelFactory, | ||
|
|
@@ -275,7 +290,8 @@ private void resizeSafely() { | |
| * <li>Get the maximum number of outstanding RPCs since last invocation | ||
| * <li>Determine a valid range of number of channels to handle that many outstanding RPCs | ||
| * <li>If the current number of channel falls outside of that range, add or remove at most | ||
| * {@link ChannelPoolSettings#MAX_RESIZE_DELTA} to get closer to middle of that range. | ||
| * {@link ChannelPoolSettings#DEFAULT_MAX_RESIZE_DELTA} to get closer to middle of that | ||
| * range. | ||
| * </ul> | ||
| * | ||
| * <p>Not threadsafe, must be called under the entryWriteLock monitor | ||
|
|
@@ -313,9 +329,25 @@ void resize() { | |
| int currentSize = localEntries.size(); | ||
| int delta = tentativeTarget - currentSize; | ||
| int dampenedTarget = tentativeTarget; | ||
| if (Math.abs(delta) > ChannelPoolSettings.MAX_RESIZE_DELTA) { | ||
| dampenedTarget = | ||
| currentSize + (int) Math.copySign(ChannelPoolSettings.MAX_RESIZE_DELTA, delta); | ||
| if (Math.abs(delta) > settings.getMaxResizeDelta()) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we know why there was a limit in the first place? Were there any technical limitations?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIUC, it looks to just be a choice. Dampening and rate limit the channel growth to not overwhelm the client for a sudden burst. |
||
| dampenedTarget = currentSize + (int) Math.copySign(settings.getMaxResizeDelta(), delta); | ||
| } | ||
|
|
||
| // Only count as "resized" if the thresholds are crossed and Gax attempts to scale. Checking | ||
| // that `dampenedTarget != currentSize` would cause false positives when the pool is within | ||
| // bounds but not at the target (target aims for the middle of the bounds) | ||
| boolean resized = (currentSize < minChannels || currentSize > maxChannels); | ||
| if (resized) { | ||
| consecutiveResizes++; | ||
| } else { | ||
| consecutiveResizes = 0; | ||
| } | ||
|
|
||
| // Log warning only once when the consecutive threshold is reached to avoid spamming logs. Log | ||
| // message will repeat if the number of consecutive resizes resets (e.g. stabilizes for a bit). | ||
| // However, aim to log once to ensure that this does not incur log spam. | ||
| if (consecutiveResizes == CONSECUTIVE_RESIZE_THRESHOLD) { | ||
| LOG.warning(CHANNEL_POOL_CONSECUTIVE_RESIZING_WARNING); | ||
| } | ||
|
|
||
| // Only resize the pool when thresholds are crossed | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -59,7 +59,11 @@ public abstract class ChannelPoolSettings { | |
| static final Duration RESIZE_INTERVAL = Duration.ofMinutes(1); | ||
|
|
||
| /** The maximum number of channels that can be added or removed at a time. */ | ||
| static final int MAX_RESIZE_DELTA = 2; | ||
| static final int DEFAULT_MAX_RESIZE_DELTA = 2; | ||
|
|
||
| // Arbitrary limit to prevent unbounded growth and protect server/client resources. | ||
| // Capping at 25 ensures we don't scale too aggressively in a single cycle. | ||
| private static final int MAX_ALLOWED_RESIZE_DELTA = 25; | ||
|
|
||
| /** | ||
| * Threshold to start scaling down the channel pool. | ||
|
|
@@ -92,6 +96,22 @@ public abstract class ChannelPoolSettings { | |
| */ | ||
| public abstract int getMaxChannelCount(); | ||
|
|
||
| /** | ||
| * The maximum number of channels that can be added or removed at a time. | ||
| * | ||
| * <p>This setting limits the rate at which the channel pool can grow or shrink in a single resize | ||
| * period. The default value is {@value #DEFAULT_MAX_RESIZE_DELTA}. Increasing this value can help | ||
| * the pool better handle sudden bursts or spikes in requests by allowing it to scale up faster. | ||
| * Regardless of this setting, the number of channels will never exceed {@link | ||
| * #getMaxChannelCount()}. | ||
| * | ||
| * <p><b>Note:</b> This value cannot exceed {@value #MAX_ALLOWED_RESIZE_DELTA}. | ||
| * | ||
| * <p><b>Warning:</b> Higher values for resize delta may still result in performance degradation | ||
| * during spikes due to rapid scaling. | ||
| */ | ||
| public abstract int getMaxResizeDelta(); | ||
|
|
||
| /** | ||
| * The initial size of the channel pool. | ||
| * | ||
|
|
@@ -116,11 +136,7 @@ boolean isStaticSize() { | |
| return true; | ||
| } | ||
| // When the scaling threshold are not set | ||
| if (getMinRpcsPerChannel() == 0 && getMaxRpcsPerChannel() == Integer.MAX_VALUE) { | ||
| return true; | ||
| } | ||
|
|
||
| return false; | ||
| return getMinRpcsPerChannel() == 0 && getMaxRpcsPerChannel() == Integer.MAX_VALUE; | ||
| } | ||
|
|
||
| public abstract Builder toBuilder(); | ||
|
|
@@ -132,6 +148,9 @@ public static ChannelPoolSettings staticallySized(int size) { | |
| .setMaxRpcsPerChannel(Integer.MAX_VALUE) | ||
| .setMinChannelCount(size) | ||
| .setMaxChannelCount(size) | ||
| // Static pools don't resize so this value doesn't affect operation. However, | ||
| // validation still checks that resize delta doesn't exceed channel pool size. | ||
| .setMaxResizeDelta(Math.min(DEFAULT_MAX_RESIZE_DELTA, size)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should still give it a max limit instead of being unbounded. Otherwise customers might expect this to handle 100x request spike as well.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The delta is capped to between [1, MAX_CHANNEL_COUNT]. The javadocs already mention that the number of channels can never exceed the total number of channels configured (Default 200)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But is 200 a realistic number? I think it makes sense to allow customers configure resizeDelta to be more than 2, but more than 10 (We need to come up with an more acurate number) may introduce other performance issues. For example
Can we reach out to the gRPC team and get some suggestions?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This PR does not intend to fix channel pool's issues. It only exposing a setting to allow users to configure a value that they want that fits within the current bounds of the existing logic. The default resize delta remains 2 and if they choose a different value, then they can test it and figure if it works for their workload. There are limitations to the existing channel pool logic that can use some broader changes. There is a project proposal to investigate (b/503856499) how to make this better overall.
I just aim provide a safe default and let customers tinker with that they think best. Delta of 10 or 25 may work better for different workloads. If we want to fix ChannelPooling, then I think other changes would be more beneficial than investigating the resize delta value.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree that this PR is not to fix ChannelPooling. However, the new setter could make it easier for customers to exploit the current limitations of ChannelPooling, hence I think it would still be better to set an upper limit.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I'm not sure what you mean by exploiting here. If they configure a high value where performance degrades and doesn't work for them, they can rollback this change. If a high delta works for their use case, then they can opt to keep it. Regardless, I think we may have to agree to disagree on setting a hard bound here. I don't think we should set an arbitrary bound for user configuration regardless of current limitations, barring something that doesn't fit the logic like negative resize delta. IMO, if it has drastic performance concerns, it would be beneficial to see user workload configurations as well as the issue reports. It gives us signal about their requirements and helps us see what would be needed for a future ChannelPool overhaul (e.g. channel priming, power-of-two, etc). And the channelpool issues give us more direct data point to prioritize the project (instead of slightly tangential reports of increased client-side request latency). I'm worried that limiting this "exploit" hides the need to fix the underlying channelpool issues. Rather than having to talk to gRPC and having to investigate the possible default upper bound limits, how we compromise and I set this to a higher default upper bound value (e.g. 25?). I'll add javadocs about potential performance concerns for setting a higher delta?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated to clamp to 25 max for the resize delta with warning about performance |
||
| .build(); | ||
| } | ||
|
|
||
|
|
@@ -142,7 +161,8 @@ public static Builder builder() { | |
| .setMaxChannelCount(200) | ||
| .setMinRpcsPerChannel(0) | ||
| .setMaxRpcsPerChannel(Integer.MAX_VALUE) | ||
| .setPreemptiveRefreshEnabled(false); | ||
| .setPreemptiveRefreshEnabled(false) | ||
| .setMaxResizeDelta(DEFAULT_MAX_RESIZE_DELTA); | ||
| } | ||
|
|
||
| @AutoValue.Builder | ||
|
|
@@ -159,6 +179,15 @@ public abstract static class Builder { | |
|
|
||
| public abstract Builder setPreemptiveRefreshEnabled(boolean enabled); | ||
|
|
||
| /** | ||
| * Sets the maximum number of channels that can be added or removed in a single resize cycle. | ||
| * This acts as a rate limiter to prevent wild fluctuations. | ||
| * | ||
| * <p><b>Warning:</b> Higher values for resize delta may still result in performance degradation | ||
| * during spikes due to rapid scaling. | ||
| */ | ||
| public abstract Builder setMaxResizeDelta(int count); | ||
|
|
||
| abstract ChannelPoolSettings autoBuild(); | ||
|
|
||
| public ChannelPoolSettings build() { | ||
|
|
@@ -178,6 +207,14 @@ public ChannelPoolSettings build() { | |
| "initial channel count must be less than maxChannelCount"); | ||
| Preconditions.checkState( | ||
| s.getInitialChannelCount() > 0, "Initial channel count must be greater than 0"); | ||
| Preconditions.checkState( | ||
| s.getMaxResizeDelta() > 0, "Max resize delta must be greater than 0"); | ||
| Preconditions.checkState( | ||
| s.getMaxResizeDelta() <= MAX_ALLOWED_RESIZE_DELTA, | ||
| "Max resize delta cannot be greater than " + MAX_ALLOWED_RESIZE_DELTA); | ||
| Preconditions.checkState( | ||
| s.getMaxResizeDelta() <= s.getMaxChannelCount(), | ||
| "Max resize delta cannot be greater than max channel count"); | ||
| return s; | ||
| } | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: would it make sense to also add max/minRpcsPerChannel in the message? E.g.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My plan is to change this to reference a public guide that I'm going to write (after metrics). WDYT if I leave this for now (for the immediate Datastore ask) and then update it to the public guide?