Skip to content

Commit

Permalink
feat: Add metric for scaling duration
Browse files Browse the repository at this point in the history
Adds a distribution metric for scaling duration, and only records success once operation is complete
  • Loading branch information
nielm committed May 3, 2024
1 parent 9187ef9 commit c53fe10
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 23 deletions.
70 changes: 60 additions & 10 deletions src/autoscaler-common/counters_base.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ const {version: packageVersion} = require('../../package.json');
* @typedef {{
* counterName: string,
* counterDesc: string,
* counterType?: "CUMULATIVE" | "HISTOGRAM" // default=COUNTER
* counterUnits?: string
* counterHistogramBuckets?: number[]
* }} CounterDefinition
*/

Expand Down Expand Up @@ -123,7 +126,13 @@ let exporterMode;
/**
* Global counters object, populated by createCounters.
*
* @type {Map<String,OpenTelemetryApi.Counter>} counter Name to counter instance
* @type {Map<
* String,
* {
* cumulative?: OpenTelemetryApi.Counter,
* histogram?: OpenTelemetryApi.Histogram
* }
* >} counter Name to counter instance
*/
const COUNTERS = new Map();

Expand Down Expand Up @@ -348,27 +357,67 @@ async function createCounters(counterDefinitions) {
if (COUNTERS.get(counterDef.counterName)) {
throw new Error('Counter already created: ' + counterDef.counterName);
}
COUNTERS.set(
counterDef.counterName,
meter.createCounter(COUNTERS_PREFIX + counterDef.counterName, {
description: counterDef.counterDesc,
}),
);
switch (counterDef.counterType || 'CUMULATIVE') {
case 'CUMULATIVE':
COUNTERS.set(counterDef.counterName, {
cumulative: meter.createCounter(
COUNTERS_PREFIX + counterDef.counterName,
{
description: counterDef.counterDesc,
unit: counterDef.counterUnits,
},
),
});
break;
case 'HISTOGRAM':
COUNTERS.set(counterDef.counterName, {
histogram: meter.createHistogram(
COUNTERS_PREFIX + counterDef.counterName,
{
description: counterDef.counterDesc,
unit: counterDef.counterUnits,
advice: {
explicitBucketBoundaries: counterDef.counterHistogramBuckets,
},
},
),
});
break;
default:
throw new Error(
`Invalid counter type for ${counterDef.counterName}: ${counterDef.counterType}`,
);
}
}
}

/**
* Increment a counter.
* Increment a cumulative counter.
*
* @param {string} counterName
* @param {OpenTelemetryApi.Attributes} [counterAttributes]
*/
function incCounter(counterName, counterAttributes) {
const counter = COUNTERS.get(counterName);
if (!counter) {
if (!counter?.cumulative) {
throw new Error('Unknown counter: ' + counterName);
}
counter.cumulative.add(1, counterAttributes);
}

/**
* Record a histogram counter value.
*
* @param {string} counterName
* @param {number} value
* @param {OpenTelemetryApi.Attributes} [counterAttributes]
*/
function recordValue(counterName, value, counterAttributes) {
const counter = COUNTERS.get(counterName);
if (!counter?.histogram) {
throw new Error('Unknown counter: ' + counterName);
}
counter.add(1, counterAttributes);
counter.histogram.record(value, counterAttributes);
}

let lastForceFlushTime = 0;
Expand Down Expand Up @@ -482,6 +531,7 @@ module.exports = {
COUNTER_ATTRIBUTE_NAMES,
createCounters,
incCounter,
recordValue,
tryFlush,
setTryFlushEnabled,
};
29 changes: 29 additions & 0 deletions src/scaler/scaler-core/counters.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ const COUNTER_NAMES = {
SCALING_FAILED: COUNTERS_PREFIX + 'scaling-failed',
REQUESTS_SUCCESS: COUNTERS_PREFIX + 'requests-success',
REQUESTS_FAILED: COUNTERS_PREFIX + 'requests-failed',
SCALING_DURATION: COUNTERS_PREFIX + 'scaling-duration',
};

const ATTRIBUTE_NAMES = {
Expand Down Expand Up @@ -71,6 +72,17 @@ const COUNTERS = [
counterName: COUNTER_NAMES.REQUESTS_FAILED,
counterDesc: 'The number of scaling request messages that failed',
},
{
counterName: COUNTER_NAMES.SCALING_DURATION,
counterDesc: 'The time taken to complete the scaling operation',
counterType: 'HISTOGRAM',
counterUnits: 'ms', // milliseconds
// This creates a set of 25 buckets with exponential growth
// starting at 0, increasing to 6553_500ms ~= 109mins
counterHistogramBuckets: [...Array(25).keys()].map((n) =>
Math.floor(100 * (2 ** (n / 1.5) - 1)),
),
},
];

const pendingInit = CountersBase.createCounters(COUNTERS);
Expand Down Expand Up @@ -156,11 +168,28 @@ async function incRequestsFailedCounter() {
CountersBase.incCounter(COUNTER_NAMES.REQUESTS_FAILED);
}

/**
* Record scaling duration to the distribution.
*
* @param {number} durationMillis
* @param {AutoscalerSpanner} spanner config object
* @param {number} requestedSize
*/
async function recordScalingDuration(durationMillis, spanner, requestedSize) {
await pendingInit;
CountersBase.recordValue(
COUNTER_NAMES.SCALING_DURATION,
Math.floor(durationMillis),
_getCounterAttributes(spanner, requestedSize),
);
}

module.exports = {
incScalingSuccessCounter,
incScalingFailedCounter,
incScalingDeniedCounter,
incRequestsSuccessCounter,
incRequestsFailedCounter,
recordScalingDuration,
tryFlush: CountersBase.tryFlush,
};
31 changes: 24 additions & 7 deletions src/scaler/scaler-core/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -398,10 +398,6 @@ async function processScalingRequest(spanner, autoscalerState) {
lastScalingCompleteTimestamp: null,
});
eventType = 'SCALING';

// TODO: When no longer handling backward compatibility in state
// this should be moved to the LRO check.
await Counters.incScalingSuccessCounter(spanner, suggestedSize);
} catch (err) {
logger.error({
message: `----- ${spanner.projectId}/${spanner.instanceId}: Unsuccessful scaling attempt: ${err}`,
Expand Down Expand Up @@ -647,9 +643,12 @@ async function readStateCheckOngoingLRO(spanner, autoscalerState) {
? {nodeCount: metadata.instance?.nodeCount}
: {processingUnits: metadata.instance?.processingUnits};
const requestedSizeValue =
spanner.units === AutoscalerUnits.NODES
? metadata.instance?.nodeCount || undefined
: metadata.instance?.processingUnits || undefined;
(spanner.units === AutoscalerUnits.NODES
? // one of the following 2 values should always be set, but as a fallback
// set the currentSize
metadata.instance?.nodeCount || undefined
: metadata.instance?.processingUnits || undefined) ||
spanner.currentSize;
const displayedRequestedSize = JSON.stringify(requestedSize);

if (operationState.done) {
Expand All @@ -664,6 +663,7 @@ async function readStateCheckOngoingLRO(spanner, autoscalerState) {
requestedSize: requestedSize,
payload: spanner,
});

// Clear last operation ID from savedState and set completion time.
savedState.scalingOperationId = null;
if (endTimestamp) {
Expand All @@ -676,6 +676,15 @@ async function readStateCheckOngoingLRO(spanner, autoscalerState) {
savedState.lastScalingCompleteTimestamp =
savedState.lastScalingTimestamp;
}

// Record success counters.
await Counters.recordScalingDuration(
savedState.lastScalingCompleteTimestamp -
savedState.lastScalingTimestamp,
spanner,
requestedSizeValue,
);
await Counters.incScalingSuccessCounter(spanner, requestedSizeValue);
} else {
// Last operation failed with an error
logger.error({
Expand Down Expand Up @@ -739,6 +748,14 @@ async function readStateCheckOngoingLRO(spanner, autoscalerState) {
});
savedState.scalingOperationId = null;
savedState.lastScalingCompleteTimestamp = savedState.lastScalingTimestamp;
// Record success counters.
await Counters.recordScalingDuration(
savedState.lastScalingCompleteTimestamp - savedState.lastScalingTimestamp,
spanner,
spanner.currentSize,
);
await Counters.incScalingSuccessCounter(spanner, spanner.currentSize);

return {
savedState,
expectedFulfillmentPeriod: undefined,
Expand Down
49 changes: 43 additions & 6 deletions src/scaler/scaler-core/test/index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ describe('#processScalingRequest', () => {
incScalingSuccessCounter: sinon.stub(),
incScalingFailedCounter: sinon.stub(),
incScalingDeniedCounter: sinon.stub(),
recordScalingDuration: sinon.stub(),
};
const stubScaleSpannerInstance = sinon.stub();
const getSuggestedSizeStub = sinon.stub();
Expand Down Expand Up @@ -154,11 +155,7 @@ describe('#processScalingRequest', () => {

assert.equals(stubScaleSpannerInstance.callCount, 1);
assert.equals(stubScaleSpannerInstance.getCall(0).args[1], suggestedSize);
assert.equals(countersStub.incScalingSuccessCounter.callCount, 1);
assert.equals(
countersStub.incScalingSuccessCounter.getCall(0).args[1],
suggestedSize,
);
assert.equals(countersStub.incScalingSuccessCounter.callCount, 0);
assert.equals(countersStub.incScalingDeniedCounter.callCount, 0);
assert.equals(countersStub.incScalingFailedCounter.callCount, 0);

Expand Down Expand Up @@ -420,6 +417,8 @@ describe('#readStateCheckOngoingLRO', () => {

const countersStub = {
incScalingFailedCounter: sinon.stub(),
incScalingSuccessCounter: sinon.stub(),
recordScalingDuration: sinon.stub(),
};

/** @type {StateData} */
Expand Down Expand Up @@ -503,6 +502,8 @@ describe('#readStateCheckOngoingLRO', () => {
}),
);
sinon.assert.notCalled(getOperation);
sinon.assert.notCalled(countersStub.incScalingSuccessCounter);
sinon.assert.notCalled(countersStub.recordScalingDuration);
sinon.assert.notCalled(stateStub.updateState);
});

Expand All @@ -525,6 +526,14 @@ describe('#readStateCheckOngoingLRO', () => {
);

sinon.assert.calledOnce(getOperation);
// Failure to get the operation is considered a 'success'...
sinon.assert.calledOnce(countersStub.incScalingSuccessCounter);
sinon.assert.calledOnceWithMatch(
countersStub.recordScalingDuration,
sinon.match(0), // duration
sinon.match.any, // spanner
sinon.match(spannerParams.currentSize), // size
);
sinon.assert.calledWith(stateStub.updateState, expectedState);
});

Expand All @@ -547,6 +556,14 @@ describe('#readStateCheckOngoingLRO', () => {
);

sinon.assert.calledOnce(getOperation);
// Failure to get the operation is considered a 'success'...
sinon.assert.calledOnce(countersStub.incScalingSuccessCounter);
sinon.assert.calledOnceWithMatch(
countersStub.recordScalingDuration,
sinon.match(0), // duration
sinon.match.any, // spanner
sinon.match(spannerParams.currentSize), // size
);
sinon.assert.calledWith(stateStub.updateState, expectedState);
});

Expand Down Expand Up @@ -574,8 +591,10 @@ describe('#readStateCheckOngoingLRO', () => {
);

sinon.assert.calledOnce(getOperation);
sinon.assert.notCalled(countersStub.incScalingSuccessCounter);
sinon.assert.notCalled(countersStub.recordScalingDuration);
sinon.assert.calledOnce(countersStub.incScalingFailedCounter);
sinon.assert.calledWith(stateStub.updateState, expectedState);
assert.equals(countersStub.incScalingFailedCounter.callCount, 1);
});

it('should clear the operation if no metadata', async () => {
Expand All @@ -598,6 +617,13 @@ describe('#readStateCheckOngoingLRO', () => {
);

sinon.assert.calledOnce(getOperation);
sinon.assert.calledOnce(countersStub.incScalingSuccessCounter);
sinon.assert.calledOnceWithMatch(
countersStub.recordScalingDuration,
sinon.match(0), // duration
sinon.match.any, // spanner
sinon.match(spannerParams.currentSize), // size
);
sinon.assert.calledWith(stateStub.updateState, expectedState);
});

Expand All @@ -618,6 +644,8 @@ describe('#readStateCheckOngoingLRO', () => {
);

sinon.assert.calledOnce(getOperation);
sinon.assert.notCalled(countersStub.incScalingSuccessCounter);
sinon.assert.notCalled(countersStub.recordScalingDuration);
sinon.assert.calledWith(stateStub.updateState, originalAutoscalerState);
});

Expand All @@ -640,6 +668,8 @@ describe('#readStateCheckOngoingLRO', () => {
);

sinon.assert.calledOnce(getOperation);
sinon.assert.notCalled(countersStub.incScalingSuccessCounter);
sinon.assert.notCalled(countersStub.recordScalingDuration);
sinon.assert.calledWith(stateStub.updateState, originalAutoscalerState);
});

Expand All @@ -666,6 +696,13 @@ describe('#readStateCheckOngoingLRO', () => {
);

sinon.assert.calledOnce(getOperation);
sinon.assert.calledOnce(countersStub.incScalingSuccessCounter);
sinon.assert.calledOnceWithMatch(
countersStub.recordScalingDuration,
sinon.match(60_000), // duration
sinon.match.any, // spanner
sinon.match(2000), // size
);
sinon.assert.calledWith(stateStub.updateState, expectedState);
});
});

0 comments on commit c53fe10

Please sign in to comment.