fix: Align UnhealthyInstance alarm period with underlying metric period

GuUnhealthyInstancesAlarm is built on top of the UnHealthyHostCount metric, which the Load Balancer measures and publishes at 60-second intervals. With the alarm period set to 5 minutes, CloudWatch has to aggregate the underlying metric into 5-minute buckets and pick (in our case) the maximum value of each bucket. The result of this operation forms the basis for the alarm to calculate the set of `alarm data points` used to decide whether we are in an alarm state or not. The results of this bucketing operation can be unstable, because CloudWatch alarms operate on a rolling-window basis. This makes the triggering of the alarm itself unstable: it is prone to false recoveries shortly after the initial alarm, and to false alarms upon recovery. By setting the alarm period to the same period as the underlying metric, the two become synchronised and alarm conditions become much more stable. The evaluation periods (12 -> 60) and datapoints-to-alarm (6 -> 30) are scaled accordingly, so the alarm still looks at a one-hour window and still fires when half of the datapoints in it are breaching.
guardian · Jan 27, 2022 · 54ef1ef · 54ef1ef
1 parent a7c03c0
commit 54ef1ef
Showing 1 changed file with 3 additions and 3 deletions.
diff --git a/src/constructs/cloudwatch/ec2-alarms.ts b/src/constructs/cloudwatch/ec2-alarms.ts
@@ -63,8 +63,8 @@ export class GuUnhealthyInstancesAlarm extends GuAlarm {
   constructor(scope: GuStack, props: GuUnhealthyInstancesAlarmProps) {
     const alarmName = `Unhealthy instances for ${props.app} in ${scope.stage}`;
 
-    const period = Duration.minutes(5);
-    const evaluationPeriods = 12;
+    const period = Duration.minutes(1);
+    const evaluationPeriods = 60;
     const evaluationInterval = Duration.minutes(period.toMinutes() * evaluationPeriods).toHumanString();
 
     const alarmDescription = `${props.app}'s instances have failed healthchecks several times over the last ${evaluationInterval}.
@@ -80,7 +80,7 @@ export class GuUnhealthyInstancesAlarm extends GuAlarm {
       treatMissingData: TreatMissingData.NOT_BREACHING,
       threshold: 1,
       comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
-      datapointsToAlarm: 6,
+      datapointsToAlarm: 30,
       evaluationPeriods,
     };
     super(scope, AppIdentity.suffixText(props, "UnhealthyInstancesAlarm"), alarmProps);