From c6bbb26b7124fee4f37ba76abd7ad63a4ed43e37 Mon Sep 17 00:00:00 2001 From: Andrew Nowak Date: Mon, 6 Oct 2025 17:28:54 +0100 Subject: [PATCH 1/5] enable rotating nodes into new ASGs --- README.md | 12 +++++++++++ cloudformation.yaml | 1 + src/addNode.ts | 8 +++---- src/autoScalingGroupCheck.ts | 2 +- src/aws/autoscaling.ts | 40 +++++++++++++++++++++++++++-------- src/clusterSizeCheck.ts | 4 ++-- src/elasticsearch/types.ts | 2 +- src/getTargetNode.ts | 14 ++++++------ src/reattachTargetInstance.ts | 7 +++++- src/utils/handlerInputs.ts | 5 ++++- 10 files changed, 70 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index e14f643..beca381 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,18 @@ in this project. The frequency of node rotations is passed into the template as ### Running Manually Sometimes it's useful to rotate an ES node manually (e.g. during an ES upgrade), you can optionally pass a `targetInstanceId` in the step function input object. It's usually easiest to open an existing execution and click `New Execution` then just edit the input object. +### Rotating nodes into a new ASG + +Very occasionally, it is required to migrate a cluster into a new Autoscaling Group. To do this with the node rotation step function by: + +1. Follow the setup steps above. +1. Create the new ASG with DesiredCapacity set to 0. +1. Set the MinimumCapacity of the old ASG to 0. +1. Tag the new ASG with `gu:riffraff:new-asg = True`. +1. Run as normal, either manually or letting the schedule rotate the instances. + +The step function will detect and launch new instances in the new ASG, while removing nodes from the old ASG. + ## Implementation This Step Function triggers a number of TypeScript lambdas, which coordinate the process of replacing a node by: diff --git a/cloudformation.yaml b/cloudformation.yaml index 9014e88..0584da9 100644 --- a/cloudformation.yaml +++ b/cloudformation.yaml @@ -114,6 +114,7 @@ Resources: Action: - autoscaling:DetachInstances - autoscaling:AttachInstances + - autoscaling:SetDesiredCapacity - autoscaling:TerminateInstanceInAutoScalingGroup Resource: - !Sub arn:aws:autoscaling:${AWS::Region}:${AWS::AccountId}:autoScalingGroup:*:autoScalingGroupName/* diff --git a/src/addNode.ts b/src/addNode.ts index 1f23817..b19b7e3 100644 --- a/src/addNode.ts +++ b/src/addNode.ts @@ -1,4 +1,4 @@ -import {detachInstance} from './aws/autoscaling'; +import {launchNewInstance} from './aws/autoscaling'; import {AddNodeResponse, ClusterStatusResponse} from './utils/handlerInputs'; import {Elasticsearch} from './elasticsearch/elasticsearch'; import {Instance} from './aws/types'; @@ -7,17 +7,17 @@ import {ElasticsearchClusterStatus} from './elasticsearch/types'; export async function handler(event: ClusterStatusResponse): Promise { const targetInstance: Instance = event.targetElasticSearchNode.ec2Instance; - const asg: string = event.asgName; + const asg: string = event.destinationAsgName; const elasticsearchClient = new Elasticsearch(targetInstance.id) return new Promise((resolve, reject) => { elasticsearchClient.updateRebalancingStatus("none") - .then(() => detachInstance(targetInstance, asg)) + .then(() => launchNewInstance(targetInstance, asg)) .then(() => elasticsearchClient.getClusterHealth()) .then((clusterStatus: ElasticsearchClusterStatus) => { const response: AddNodeResponse = { - "asgName": asg, + "destinationAsgName": asg, "targetElasticSearchNode": event.targetElasticSearchNode, "expectedClusterSize": clusterStatus.number_of_nodes + 1 }; diff --git a/src/autoScalingGroupCheck.ts b/src/autoScalingGroupCheck.ts index 4f7bbc6..21ced16 100644 --- a/src/autoScalingGroupCheck.ts +++ b/src/autoScalingGroupCheck.ts @@ -3,7 +3,7 @@ import {getASG} from "./aws/autoscaling"; export async function handler(event: AsgInput): Promise { try { - const asg = await getASG(event.asgName) + const asg = await getASG(event.destinationAsgName) if (asg.MaxSize <= asg.Instances.length) { const error = `ASG MaxSize must be greater than Desired Capacity to allow for ReattachTargetInstance step.` diff --git a/src/aws/autoscaling.ts b/src/aws/autoscaling.ts index 91ae7b3..93ce530 100644 --- a/src/aws/autoscaling.ts +++ b/src/aws/autoscaling.ts @@ -5,22 +5,44 @@ import { AutoScaling, AutoScalingGroup, DescribeAutoScalingGroupsCommand, DescribeAutoScalingGroupsCommandOutput, DetachInstancesCommand, DetachInstancesCommandOutput, + SetDesiredCapacityCommand, TerminateInstanceInAutoScalingGroupCommand, TerminateInstanceInAutoScalingGroupCommandOutput } from "@aws-sdk/client-auto-scaling"; const awsAutoscaling = new AutoScaling(); -export function detachInstance(instance: Instance, asgName: string): Promise { - console.log(`Detaching ${instance.id} from ${asgName}. This should also bring a new instance into the ASG`); - const params = { - InstanceIds: [ instance.id ], - AutoScalingGroupName: asgName, - ShouldDecrementDesiredCapacity: false - }; - const req = new DetachInstancesCommand(params); +export async function launchNewInstance(instance: Instance, asgName: string): Promise { + if (instance.autoScalingGroupName === asgName) { + console.log(`Detaching ${instance.id} from ${asgName}. This should also bring a new instance into the ASG`); + const params = { + InstanceIds: [ instance.id ], + AutoScalingGroupName: asgName, + ShouldDecrementDesiredCapacity: false + }; + const req = new DetachInstancesCommand(params); - return retry(() => awsAutoscaling.send(req), `detaching instance ${instance.id}`, 5) + return retry(() => awsAutoscaling.send(req), `detaching instance ${instance.id}`, 5) + } else { + console.log(`Launch new instance to new ASG ${asgName}.`); + const asgs = await retry( + () => awsAutoscaling.send(new DescribeAutoScalingGroupsCommand({ + AutoScalingGroupNames: [asgName] + })), + `getting current capacity in ${asgName}`, + 5, + ); + const capacity = asgs.AutoScalingGroups[0].DesiredCapacity; + + return retry( + () => awsAutoscaling.send(new SetDesiredCapacityCommand({ + AutoScalingGroupName: asgName, + DesiredCapacity: capacity + 1, + })), + `launching new instance in ${asgName}`, + 5, + ); + } } export function attachInstance(instance: Instance, asgName: string): Promise<{}> { diff --git a/src/clusterSizeCheck.ts b/src/clusterSizeCheck.ts index 1301108..c8a9a6e 100644 --- a/src/clusterSizeCheck.ts +++ b/src/clusterSizeCheck.ts @@ -7,7 +7,7 @@ import {getASG} from "./aws/autoscaling"; export async function handler(event: AddNodeResponse): Promise { - const asg = await getASG(event.asgName) + const asg = await getASG(event.destinationAsgName) const instanceIds = asg.Instances.map(i => i.InstanceId) const newestInstance = await getSpecificInstance(instanceIds, findNewestInstance) const elasticsearchClient = new Elasticsearch(event.targetElasticSearchNode.ec2Instance.id) @@ -27,7 +27,7 @@ export async function handler(event: AddNodeResponse): Promise { const response: TargetAndNewNodeResponse = { - "asgName": event.asgName, + "destinationAsgName": event.destinationAsgName, "targetElasticSearchNode": event.targetElasticSearchNode, "newestElasticsearchNode": newestElasticsearchNode }; diff --git a/src/elasticsearch/types.ts b/src/elasticsearch/types.ts index dc6e86d..44164df 100644 --- a/src/elasticsearch/types.ts +++ b/src/elasticsearch/types.ts @@ -12,7 +12,7 @@ export class ElasticsearchNode { nodeId: string; isMasterEligible: boolean; - constructor(instance: Instance, nodeId: string, isMasterEligible) { + constructor(instance: Instance, nodeId: string, isMasterEligible: boolean) { this.ec2Instance = instance; this.nodeId = nodeId; this.isMasterEligible = isMasterEligible diff --git a/src/getTargetNode.ts b/src/getTargetNode.ts index 26d56a4..6609ef1 100644 --- a/src/getTargetNode.ts +++ b/src/getTargetNode.ts @@ -14,14 +14,14 @@ export async function handler(event: StateMachineInput): Promise asg.AutoScalingGroupName); + const eligibleASGs = await getASGsByTag(event.autoScalingGroupDiscoveryTagKey, "true"); const eligibleInstances = ( // TODO it would be nice to not need the Tags on the instances as well, but currently used in the ElasticsearchAdminSsmPolicy IAM policy in cloudformation.yaml await getInstancesByTag(event.autoScalingGroupDiscoveryTagKey, "true") - ).filter(i => eligibleASGs.includes(i.autoScalingGroupName)); + ).filter(i => eligibleASGs.some(a => a.AutoScalingGroupName === i.autoScalingGroupName)); + + const newASG = eligibleASGs.find(a => a.Tags?.some(tag => tag.Key === 'gu:riffraff:new-asg')); // We can manually run rotation against a particular instance if needed if(event.targetInstanceId) { @@ -38,7 +38,9 @@ export async function handler(event: StateMachineInput): Promise { const targetInstance: Instance = event.targetElasticSearchNode.ec2Instance; - const asg: string = event.asgName; + const asg: string = event.destinationAsgName; + + if (targetInstance.autoScalingGroupName !== asg) { + console.log(`New instance launched in different ASG than target instance, so nothing to reattach`); + return event; + } return new Promise((resolve, reject) => { attachInstance(targetInstance, asg) diff --git a/src/utils/handlerInputs.ts b/src/utils/handlerInputs.ts index 20c051c..5ff1e14 100644 --- a/src/utils/handlerInputs.ts +++ b/src/utils/handlerInputs.ts @@ -8,7 +8,10 @@ export interface StateMachineInput { } export interface AsgInput { - asgName: string; + // The ASG a node will be rotated _into_. In certain circumstances this may + // not be the same as the ASG the node was rotated _out of_, which is stored at + // targetElasticSearchNode.ec2Instance.autoScalingGroupName. + destinationAsgName: string; targetElasticSearchNode: ElasticsearchNode; } From a3a04638c7e6f0f5a09cda066d6511f268a1f42d Mon Sep 17 00:00:00 2001 From: Andrew Nowak Date: Thu, 9 Oct 2025 14:30:57 +0100 Subject: [PATCH 2/5] improve checking that destination ASG is a good match for the source ASG --- src/getTargetNode.ts | 48 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/src/getTargetNode.ts b/src/getTargetNode.ts index 6609ef1..a841df1 100644 --- a/src/getTargetNode.ts +++ b/src/getTargetNode.ts @@ -4,6 +4,44 @@ import {getInstancesByTag} from './aws/ec2Instances'; import {getASGsByTag} from "./aws/autoscaling"; import {Elasticsearch} from "./elasticsearch/elasticsearch"; import {Instance} from "./aws/types"; +import { type AutoScalingGroup } from '@aws-sdk/client-auto-scaling'; + +function asgTagsToRecord(asg: AutoScalingGroup): Record { + return Object.fromEntries(asg.Tags.map(tag => ([tag.Key, tag.Value]))); +} + +/** attempt to find an ASG with the same tagging as the target instance's, + * but identifies itself as a 'new' ASG (using the 'gu:riffraff:new-asg' + * tag) so should be the destination of the node rotation. + * Only respects the Stage/Stack/App tags, when defined on the target + * instance's ASG. + */ +function findNewAsgMatchingInstanceAsg( + eligibleASGs: AutoScalingGroup[], + targetInstance: Instance, +): AutoScalingGroup | undefined { + + const targetInstanceAsg = eligibleASGs + .find(a => a.AutoScalingGroupName === targetInstance.autoScalingGroupName); + + if (!targetInstanceAsg) { + throw new Error(`Couldn't find target instance's ASG (${targetInstance.autoScalingGroupName}) in the list of eligible ASGs - that shouldn't happen!`); + } + + const targetAsgTags = asgTagsToRecord(targetInstanceAsg); + const expectedTags = ['App', 'Stack', 'Stage'].filter(t => t in targetAsgTags); + + const newAsg = eligibleASGs.find(a => { + const asgTags = asgTagsToRecord(a); + + return a.AutoScalingGroupName !== targetInstanceAsg.AutoScalingGroupName + && expectedTags.every(t => asgTags[t] === targetAsgTags[t]) + && 'gu:riffraff:new-asg' in asgTags; + }); + + return newAsg; + +} export async function handler(event: StateMachineInput): Promise { const runningExecutionsPromise = totalRunningExecutions(event.stepFunctionArn) @@ -21,8 +59,6 @@ export async function handler(event: StateMachineInput): Promise eligibleASGs.some(a => a.AutoScalingGroupName === i.autoScalingGroupName)); - const newASG = eligibleASGs.find(a => a.Tags?.some(tag => tag.Key === 'gu:riffraff:new-asg')); - // We can manually run rotation against a particular instance if needed if(event.targetInstanceId) { const targetInstance = eligibleInstances.find(i => i.id === event.targetInstanceId); @@ -39,7 +75,9 @@ export async function handler(event: StateMachineInput): Promise Date: Thu, 9 Oct 2025 14:31:16 +0100 Subject: [PATCH 3/5] add a bunch of tests for getTargetNode behaviour --- test/getTargetNode.test.ts | 407 +++++++++++++++++++++++++++++++++++++ 1 file changed, 407 insertions(+) create mode 100644 test/getTargetNode.test.ts diff --git a/test/getTargetNode.test.ts b/test/getTargetNode.test.ts new file mode 100644 index 0000000..e2fcb8d --- /dev/null +++ b/test/getTargetNode.test.ts @@ -0,0 +1,407 @@ +import { handler } from '../src/getTargetNode'; +import { totalRunningExecutions } from '../src/aws/stepFunctions'; +import { getInstancesByTag } from '../src/aws/ec2Instances'; +import { getASGsByTag } from '../src/aws/autoscaling'; +import { Elasticsearch } from '../src/elasticsearch/elasticsearch'; +import { StateMachineInput } from '../src/utils/handlerInputs'; +import { Instance } from '../src/aws/types'; +import { AutoScalingGroup } from '@aws-sdk/client-auto-scaling'; + +jest.mock('../src/aws/stepFunctions'); +jest.mock('../src/aws/ec2Instances'); +jest.mock('../src/aws/autoscaling'); +jest.mock('../src/elasticsearch/elasticsearch'); + +const mockedTotalRunningExecutions = totalRunningExecutions as jest.MockedFunction; +const mockedGetInstancesByTag = getInstancesByTag as jest.MockedFunction; +const mockedGetASGsByTag = getASGsByTag as jest.MockedFunction; + +describe('getTargetNode handler', () => { + + beforeEach(() => { + jest.clearAllMocks(); + }); + + it('should skip rotation when multiple executions are running', async () => { + mockedTotalRunningExecutions.mockResolvedValue(2); + + const event: StateMachineInput = { + stepFunctionArn: 'arn:aws:states:us-east-1:123456789:stateMachine:test', + autoScalingGroupDiscoveryTagKey: 'es-rotation', + ageThresholdInDays: 7, + }; + + const result = await handler(event); + + expect(result).toEqual({ skipRotation: true }); + expect(mockedTotalRunningExecutions).toHaveBeenCalledWith(event.stepFunctionArn); + expect(mockedGetInstancesByTag).not.toHaveBeenCalled(); + expect(mockedGetASGsByTag).not.toHaveBeenCalled(); + }); + + it('should skip rotation if there are no instances tagged for rotation', async () => { + mockedTotalRunningExecutions.mockResolvedValue(1); + mockedGetInstancesByTag.mockResolvedValue([]); + + const event: StateMachineInput = { + stepFunctionArn: 'arn:aws:states:us-east-1:123456789:stateMachine:test', + autoScalingGroupDiscoveryTagKey: 'es-rotation', + ageThresholdInDays: 7, + }; + + const result = await handler(event); + + expect(result).toEqual({ skipRotation: true }); + expect(mockedTotalRunningExecutions).toHaveBeenCalledWith(event.stepFunctionArn); + expect(mockedGetInstancesByTag).toHaveBeenCalledWith(event.autoScalingGroupDiscoveryTagKey, "true"); + + }); + + it('should skip rotation if all instances tagged for rotation are too old', async () => { + mockedTotalRunningExecutions.mockResolvedValue(1); + + const twoYearsAgo = new Date(); + twoYearsAgo.setFullYear(twoYearsAgo.getFullYear() - 2); + + const twoDaysAgo = new Date(); + twoDaysAgo.setDate(twoDaysAgo.getDate() - 2); + + const mockInstances: Instance[] = [ + { id: 'i-123', launchTime: twoDaysAgo, autoScalingGroupName: 'asg-1', privateIp: 'IP' }, + { id: 'i-456', launchTime: twoDaysAgo, autoScalingGroupName: 'asg-1', privateIp: 'IP' }, + ]; + + mockedGetInstancesByTag.mockResolvedValue(mockInstances); + + const mockASGs: AutoScalingGroup[] = [ + { AutoScalingGroupName: 'asg-1', Tags: [{ Key: 'es-rotation', Value: 'true' }], MinSize: 1, MaxSize: 3, DesiredCapacity: 2, DefaultCooldown: 300, AvailabilityZones: ['us-east-1a'], HealthCheckType: 'EC2', CreatedTime: twoYearsAgo }, + ]; + + mockedGetASGsByTag.mockResolvedValue(mockASGs); + + const event: StateMachineInput = { + stepFunctionArn: 'arn:aws:states:us-east-1:123456789:stateMachine:test', + autoScalingGroupDiscoveryTagKey: 'es-rotation', + ageThresholdInDays: 7, + }; + + const result = await handler(event); + + expect(result).toEqual({ skipRotation: true }); + expect(mockedTotalRunningExecutions).toHaveBeenCalledWith(event.stepFunctionArn); + expect(mockedGetInstancesByTag).toHaveBeenCalledWith(event.autoScalingGroupDiscoveryTagKey, "true"); + }); + + it('should return the oldest instance and its ASG when a suitable instance is found', async () => { + mockedTotalRunningExecutions.mockResolvedValue(1); + + const tenDaysAgo = new Date(); + tenDaysAgo.setDate(tenDaysAgo.getDate() - 10); + + const twentyDaysAgo = new Date(); + twentyDaysAgo.setDate(twentyDaysAgo.getDate() - 20); + + const mockInstances: Instance[] = [ + { id: 'i-123', launchTime: tenDaysAgo, autoScalingGroupName: 'asg-1', privateIp: '10.0.0.1' }, + { id: 'i-456', launchTime: twentyDaysAgo, autoScalingGroupName: 'asg-1', privateIp: '10.0.0.2' }, + ]; + + mockedGetInstancesByTag.mockResolvedValue(mockInstances); + + const mockASGs: AutoScalingGroup[] = [ + { AutoScalingGroupName: 'asg-1', Tags: [{ Key: 'es-rotation', Value: 'true' }], MinSize: 1, MaxSize: 3, DesiredCapacity: 2, DefaultCooldown: 300, AvailabilityZones: ['us-east-1a'], HealthCheckType: 'EC2', CreatedTime: new Date() }, + ]; + + mockedGetASGsByTag.mockResolvedValue(mockASGs); + + const mockElasticsearch = { + getElasticsearchNode: jest.fn().mockImplementation((instance: Instance) => + ({ ec2Instance: instance, nodeId: 'node-456', isMasterEligible: true })), + }; + (Elasticsearch as jest.MockedClass).mockImplementation(() => mockElasticsearch as any); + + const event: StateMachineInput = { + stepFunctionArn: 'arn:aws:states:us-east-1:123456789:stateMachine:test', + autoScalingGroupDiscoveryTagKey: 'es-rotation', + ageThresholdInDays: 7, + }; + + const result = await handler(event); + + expect(result).toEqual({ + skipRotation: false, + targetElasticSearchNode: { + ec2Instance: mockInstances[1], + nodeId: 'node-456', + isMasterEligible: true + }, + destinationAsgName: 'asg-1', + }); + expect(mockedTotalRunningExecutions).toHaveBeenCalledWith(event.stepFunctionArn); + expect(mockedGetInstancesByTag).toHaveBeenCalledWith(event.autoScalingGroupDiscoveryTagKey, "true"); + expect(mockedGetASGsByTag).toHaveBeenCalledWith(event.autoScalingGroupDiscoveryTagKey, "true"); + expect(mockElasticsearch.getElasticsearchNode).toHaveBeenCalledWith(mockInstances[1]); + }); + + it('should return the specified target instance when provided in the event', async () => { + mockedTotalRunningExecutions.mockResolvedValue(1); + + const tenDaysAgo = new Date(); + tenDaysAgo.setDate(tenDaysAgo.getDate() - 10); + + const twentyDaysAgo = new Date(); + twentyDaysAgo.setDate(twentyDaysAgo.getDate() - 20); + + const mockInstances: Instance[] = [ + { id: 'i-123', launchTime: tenDaysAgo, autoScalingGroupName: 'asg-1', privateIp: '10.0.0.1' }, + { id: 'i-456', launchTime: twentyDaysAgo, autoScalingGroupName: 'asg-1', privateIp: '10.0.0.2' }, + ]; + + mockedGetInstancesByTag.mockResolvedValue(mockInstances); + + const mockASGs: AutoScalingGroup[] = [ + { AutoScalingGroupName: 'asg-1', Tags: [{ Key: 'es-rotation', Value: 'true' }], MinSize: 1, MaxSize: 3, DesiredCapacity: 2, DefaultCooldown: 300, AvailabilityZones: ['us-east-1a'], HealthCheckType: 'EC2', CreatedTime: new Date() }, + ]; + + mockedGetASGsByTag.mockResolvedValue(mockASGs); + + const mockElasticsearch = { + getElasticsearchNode: jest.fn().mockImplementation((instance: Instance) => + ({ ec2Instance: instance, nodeId: 'node-123', isMasterEligible: false })), + }; + (Elasticsearch as jest.MockedClass).mockImplementation(() => mockElasticsearch as any); + + const event: StateMachineInput = { + stepFunctionArn: 'arn:aws:states:us-east-1:123456789:stateMachine:test', + autoScalingGroupDiscoveryTagKey: 'es-rotation', + ageThresholdInDays: 7, + targetInstanceId: 'i-123', // an old instance, but not the oldest + }; + + const result = await handler(event); + + expect(result).toEqual({ + skipRotation: false, + targetElasticSearchNode: { + ec2Instance: mockInstances[0], + nodeId: 'node-123', + isMasterEligible: false + }, + destinationAsgName: 'asg-1', + }); + expect(mockedTotalRunningExecutions).toHaveBeenCalledWith(event.stepFunctionArn); + expect(mockedGetInstancesByTag).toHaveBeenCalledWith(event.autoScalingGroupDiscoveryTagKey, "true"); + expect(mockedGetASGsByTag).toHaveBeenCalledWith(event.autoScalingGroupDiscoveryTagKey, "true"); + expect(mockElasticsearch.getElasticsearchNode).toHaveBeenCalledWith(mockInstances[0]); + }); + + it('should rotate a node into a new ASG if one exists with the new asg tag', async () => { + mockedTotalRunningExecutions.mockResolvedValue(1); + + const tenDaysAgo = new Date(); + tenDaysAgo.setDate(tenDaysAgo.getDate() - 10); + + const twentyDaysAgo = new Date(); + twentyDaysAgo.setDate(twentyDaysAgo.getDate() - 20); + + const mockInstances: Instance[] = [ + { id: 'i-123', launchTime: tenDaysAgo, autoScalingGroupName: 'asg-old', privateIp: '10.0.0.1' }, + { id: 'i-456', launchTime: twentyDaysAgo, autoScalingGroupName: 'asg-old', privateIp: '10.0.0.2' }, + ]; + + mockedGetInstancesByTag.mockResolvedValue(mockInstances); + + const mockASGs: AutoScalingGroup[] = [ + { AutoScalingGroupName: 'asg-old', Tags: [{ Key: 'es-rotation', Value: 'true' }], MinSize: 1, MaxSize: 3, DesiredCapacity: 2, DefaultCooldown: 300, AvailabilityZones: ['us-east-1a'], HealthCheckType: 'EC2', CreatedTime: new Date() }, + { AutoScalingGroupName: 'asg-new', Tags: [{ Key: 'es-rotation', Value: 'true' }, { Key: 'gu:riffraff:new-asg', Value: 'true' }], MinSize: 0, MaxSize: 3, DesiredCapacity: 0, DefaultCooldown: 300, AvailabilityZones: ['us-east-1a'], HealthCheckType: 'EC2', CreatedTime: new Date() }, + ]; + + mockedGetASGsByTag.mockResolvedValue(mockASGs); + + const mockElasticsearch = { + getElasticsearchNode: jest.fn().mockImplementation((instance: Instance) => + ({ ec2Instance: instance, nodeId: 'node-456', isMasterEligible: true })), + }; + (Elasticsearch as jest.MockedClass).mockImplementation(() => mockElasticsearch as any); + + const event: StateMachineInput = { + stepFunctionArn: 'arn:aws:states:us-east-1:123456789:stateMachine:test', + autoScalingGroupDiscoveryTagKey: 'es-rotation', + ageThresholdInDays: 7, + }; + + const result = await handler(event); + + expect(result).toEqual({ + skipRotation: false, + targetElasticSearchNode: { + ec2Instance: mockInstances[1], + nodeId: 'node-456', + isMasterEligible: true + }, + destinationAsgName: 'asg-new', + }); + expect(mockedTotalRunningExecutions).toHaveBeenCalledWith(event.stepFunctionArn); + expect(mockedGetInstancesByTag).toHaveBeenCalledWith(event.autoScalingGroupDiscoveryTagKey, "true"); + expect(mockedGetASGsByTag).toHaveBeenCalledWith(event.autoScalingGroupDiscoveryTagKey, "true"); + expect(mockElasticsearch.getElasticsearchNode).toHaveBeenCalledWith(mockInstances[1]); + }); + + it('should not rotate a node into a new ASG if it doesn\'t match with other tags', async () => { + mockedTotalRunningExecutions.mockResolvedValue(1); + + const tenDaysAgo = new Date(); + tenDaysAgo.setDate(tenDaysAgo.getDate() - 10); + + const twentyDaysAgo = new Date(); + twentyDaysAgo.setDate(twentyDaysAgo.getDate() - 20); + + const mockInstances: Instance[] = [ + { id: 'i-123', launchTime: tenDaysAgo, autoScalingGroupName: 'asg-old', privateIp: '10.0.0.1' }, + { id: 'i-456', launchTime: twentyDaysAgo, autoScalingGroupName: 'asg-old', privateIp: '10.0.0.2' }, + ]; + + mockedGetInstancesByTag.mockResolvedValue(mockInstances); + + const mockASGs: AutoScalingGroup[] = [ + { AutoScalingGroupName: 'asg-old', Tags: [{ Key: 'es-rotation', Value: 'true' }, { Key: 'Stage', Value: 'CODE' }], MinSize: 1, MaxSize: 3, DesiredCapacity: 2, DefaultCooldown: 300, AvailabilityZones: ['us-east-1a'], HealthCheckType: 'EC2', CreatedTime: new Date() }, + { AutoScalingGroupName: 'asg-new', Tags: [{ Key: 'es-rotation', Value: 'true' }, { Key: 'Stage', Value: 'PROD' }, { Key: 'gu:riffraff:new-asg', Value: 'true' }], MinSize: 0, MaxSize: 3, DesiredCapacity: 0, DefaultCooldown: 300, AvailabilityZones: ['us-east-1a'], HealthCheckType: 'EC2', CreatedTime: new Date() }, + ]; + + mockedGetASGsByTag.mockResolvedValue(mockASGs); + + const mockElasticsearch = { + getElasticsearchNode: jest.fn().mockImplementation((instance: Instance) => + ({ ec2Instance: instance, nodeId: 'node-456', isMasterEligible: true })), + }; + (Elasticsearch as jest.MockedClass).mockImplementation(() => mockElasticsearch as any); + + const event: StateMachineInput = { + stepFunctionArn: 'arn:aws:states:us-east-1:123456789:stateMachine:test', + autoScalingGroupDiscoveryTagKey: 'es-rotation', + ageThresholdInDays: 7, + }; + + const result = await handler(event); + + expect(result).toEqual({ + skipRotation: false, + targetElasticSearchNode: { + ec2Instance: mockInstances[1], + nodeId: 'node-456', + isMasterEligible: true + }, + destinationAsgName: 'asg-old', // despite there being a new ASG, it doesn't match so rotate into the old ASG + }); + expect(mockedTotalRunningExecutions).toHaveBeenCalledWith(event.stepFunctionArn); + expect(mockedGetInstancesByTag).toHaveBeenCalledWith(event.autoScalingGroupDiscoveryTagKey, "true"); + expect(mockedGetASGsByTag).toHaveBeenCalledWith(event.autoScalingGroupDiscoveryTagKey, "true"); + expect(mockElasticsearch.getElasticsearchNode).toHaveBeenCalledWith(mockInstances[1]); + }); + + it('should rotate a chosen node into a new ASG if there is one with matching tags', async () => { + mockedTotalRunningExecutions.mockResolvedValue(1); + + const tenDaysAgo = new Date(); + tenDaysAgo.setDate(tenDaysAgo.getDate() - 10); + + const twentyDaysAgo = new Date(); + twentyDaysAgo.setDate(twentyDaysAgo.getDate() - 20); + + const mockInstances: Instance[] = [ + { id: 'i-123', launchTime: tenDaysAgo, autoScalingGroupName: 'asg-old', privateIp: '10.0.0.1' }, + { id: 'i-456', launchTime: twentyDaysAgo, autoScalingGroupName: 'asg-old', privateIp: '10.0.0.2' }, + ]; + + mockedGetInstancesByTag.mockResolvedValue(mockInstances); + + const mockASGs: AutoScalingGroup[] = [ + { AutoScalingGroupName: 'asg-old', Tags: [{ Key: 'es-rotation', Value: 'true' }, { Key: 'Stage', Value: 'PROD' }], MinSize: 1, MaxSize: 3, DesiredCapacity: 2, DefaultCooldown: 300, AvailabilityZones: ['us-east-1a'], HealthCheckType: 'EC2', CreatedTime: new Date() }, + { AutoScalingGroupName: 'asg-new', Tags: [{ Key: 'es-rotation', Value: 'true' }, { Key: 'Stage', Value: 'PROD' }, { Key: 'gu:riffraff:new-asg', Value: 'true' }], MinSize: 0, MaxSize: 3, DesiredCapacity: 0, DefaultCooldown: 300, AvailabilityZones: ['us-east-1a'], HealthCheckType: 'EC2', CreatedTime: new Date() }, + ]; + + mockedGetASGsByTag.mockResolvedValue(mockASGs); + + const mockElasticsearch = { + getElasticsearchNode: jest.fn().mockImplementation((instance: Instance) => + ({ ec2Instance: instance, nodeId: instance.id, isMasterEligible: true })), + }; + (Elasticsearch as jest.MockedClass).mockImplementation(() => mockElasticsearch as any); + + const event: StateMachineInput = { + stepFunctionArn: 'arn:aws:states:us-east-1:123456789:stateMachine:test', + autoScalingGroupDiscoveryTagKey: 'es-rotation', + ageThresholdInDays: 7, + targetInstanceId: 'i-123', + }; + + const result = await handler(event); + + expect(result).toEqual({ + skipRotation: false, + targetElasticSearchNode: { + ec2Instance: mockInstances[0], + nodeId: 'i-123', + isMasterEligible: true + }, + destinationAsgName: 'asg-new', + }); + expect(mockedTotalRunningExecutions).toHaveBeenCalledWith(event.stepFunctionArn); + expect(mockedGetInstancesByTag).toHaveBeenCalledWith(event.autoScalingGroupDiscoveryTagKey, "true"); + expect(mockedGetASGsByTag).toHaveBeenCalledWith(event.autoScalingGroupDiscoveryTagKey, "true"); + expect(mockElasticsearch.getElasticsearchNode).toHaveBeenCalledWith(mockInstances[0]); + }); + + it('should not rotate a chosen node into a new ASG if there is not one with matching tags', async () => { + mockedTotalRunningExecutions.mockResolvedValue(1); + + const tenDaysAgo = new Date(); + tenDaysAgo.setDate(tenDaysAgo.getDate() - 10); + + const twentyDaysAgo = new Date(); + twentyDaysAgo.setDate(twentyDaysAgo.getDate() - 20); + + const mockInstances: Instance[] = [ + { id: 'i-123', launchTime: tenDaysAgo, autoScalingGroupName: 'asg-old', privateIp: '10.0.0.1' }, + { id: 'i-456', launchTime: twentyDaysAgo, autoScalingGroupName: 'asg-old', privateIp: '10.0.0.2' }, + ]; + + mockedGetInstancesByTag.mockResolvedValue(mockInstances); + + const mockASGs: AutoScalingGroup[] = [ + { AutoScalingGroupName: 'asg-old', Tags: [{ Key: 'es-rotation', Value: 'true' }, { Key: 'Stage', Value: 'CODE' }], MinSize: 1, MaxSize: 3, DesiredCapacity: 2, DefaultCooldown: 300, AvailabilityZones: ['us-east-1a'], HealthCheckType: 'EC2', CreatedTime: new Date() }, + { AutoScalingGroupName: 'asg-new', Tags: [{ Key: 'es-rotation', Value: 'true' }, { Key: 'Stage', Value: 'PROD' }, { Key: 'gu:riffraff:new-asg', Value: 'true' }], MinSize: 0, MaxSize: 3, DesiredCapacity: 0, DefaultCooldown: 300, AvailabilityZones: ['us-east-1a'], HealthCheckType: 'EC2', CreatedTime: new Date() }, + ]; + + mockedGetASGsByTag.mockResolvedValue(mockASGs); + + const mockElasticsearch = { + getElasticsearchNode: jest.fn().mockImplementation((instance: Instance) => + ({ ec2Instance: instance, nodeId: instance.id, isMasterEligible: true })), + }; + (Elasticsearch as jest.MockedClass).mockImplementation(() => mockElasticsearch as any); + + const event: StateMachineInput = { + stepFunctionArn: 'arn:aws:states:us-east-1:123456789:stateMachine:test', + autoScalingGroupDiscoveryTagKey: 'es-rotation', + ageThresholdInDays: 7, + targetInstanceId: 'i-123', + }; + + const result = await handler(event); + + expect(result).toEqual({ + skipRotation: false, + targetElasticSearchNode: { + ec2Instance: mockInstances[0], + nodeId: 'i-123', + isMasterEligible: true + }, + destinationAsgName: 'asg-old', + }); + expect(mockedTotalRunningExecutions).toHaveBeenCalledWith(event.stepFunctionArn); + expect(mockedGetInstancesByTag).toHaveBeenCalledWith(event.autoScalingGroupDiscoveryTagKey, "true"); + expect(mockedGetASGsByTag).toHaveBeenCalledWith(event.autoScalingGroupDiscoveryTagKey, "true"); + expect(mockElasticsearch.getElasticsearchNode).toHaveBeenCalledWith(mockInstances[0]); + }); +}); From d6ed006d9eae5f4a0e7273e39612f9166ffe2e90 Mon Sep 17 00:00:00 2001 From: Andrew Nowak Date: Thu, 9 Oct 2025 14:48:04 +0100 Subject: [PATCH 4/5] a couple more README clarifications --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index beca381..c351a06 100644 --- a/README.md +++ b/README.md @@ -35,11 +35,18 @@ Very occasionally, it is required to migrate a cluster into a new Autoscaling Gr 1. Follow the setup steps above. 1. Create the new ASG with DesiredCapacity set to 0. 1. Set the MinimumCapacity of the old ASG to 0. -1. Tag the new ASG with `gu:riffraff:new-asg = True`. +1. Tag the new ASG with `gu:riffraff:new-asg = True`. (This is the tag that is already used by riff-raff for identifying the newer ASG during migrations). 1. Run as normal, either manually or letting the schedule rotate the instances. The step function will detect and launch new instances in the new ASG, while removing nodes from the old ASG. +> [!WARNING] +> This feature has been developed and tested for Elasticsearch clusters which exist in a single ASG, and the "new" ASG can be +> matched to the "old" one using Stage/Stack/App tags. If your usecase doesn't match this, you'll likely need to do some more testing +> and possibly improve this feature. If you're at all unsure, get in touch in the Elasticsearch chat space and we can figure out +> any potential issues together. + + ## Implementation This Step Function triggers a number of TypeScript lambdas, which coordinate the process of replacing a node by: From 00c7b7ada8350610fec88e6e2d256074ab7ca65d Mon Sep 17 00:00:00 2001 From: Andrew Nowak <10963046+andrew-nowak@users.noreply.github.com> Date: Tue, 14 Oct 2025 12:43:16 +0100 Subject: [PATCH 5/5] More defensive programming on values returned from AWS API Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/aws/autoscaling.ts | 6 ++++++ src/getTargetNode.ts | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/aws/autoscaling.ts b/src/aws/autoscaling.ts index 93ce530..edcdc38 100644 --- a/src/aws/autoscaling.ts +++ b/src/aws/autoscaling.ts @@ -32,7 +32,13 @@ export async function launchNewInstance(instance: Instance, asgName: string): Pr `getting current capacity in ${asgName}`, 5, ); + if (!asgs.AutoScalingGroups || asgs.AutoScalingGroups.length === 0) { + throw new Error(`No AutoScalingGroup found with name ${asgName}`); + } const capacity = asgs.AutoScalingGroups[0].DesiredCapacity; + if (typeof capacity !== 'number' || isNaN(capacity)) { + throw new Error(`DesiredCapacity is not defined or not a number for ASG ${asgName}`); + } return retry( () => awsAutoscaling.send(new SetDesiredCapacityCommand({ diff --git a/src/getTargetNode.ts b/src/getTargetNode.ts index a841df1..591ad05 100644 --- a/src/getTargetNode.ts +++ b/src/getTargetNode.ts @@ -7,7 +7,11 @@ import {Instance} from "./aws/types"; import { type AutoScalingGroup } from '@aws-sdk/client-auto-scaling'; function asgTagsToRecord(asg: AutoScalingGroup): Record { - return Object.fromEntries(asg.Tags.map(tag => ([tag.Key, tag.Value]))); + return Object.fromEntries( + (asg.Tags ?? []) + .filter(tag => tag.Key !== undefined && tag.Value !== undefined) + .map(tag => [tag.Key!, tag.Value!]) + ); } /** attempt to find an ASG with the same tagging as the target instance's,