Support monitor period variables for all metrics

dubiety · Sep 2, 2021 · eb23546 · eb23546
1 parent 41e82ab
commit eb23546
Show file tree

Hide file tree

Showing 4 changed files with 90 additions and 13 deletions.
diff --git a/Makefile b/Makefile
@@ -1,5 +1,5 @@
 SHELL := /bin/bash
-TERRAFORM_VERSION ?= 1.0.1
+TERRAFORM_VERSION ?= 1.0.5
 
 -include $(shell curl -sSL -o .build-harness "https://git.io/build-harness"; echo .build-harness)
 

diff --git a/README.md b/README.md
@@ -102,6 +102,17 @@ module "es_alarms" {
 | `monitor_kms`                                 | Enable monitoring of KMS-related metrics, enable if using KMS | bool | `false` | no |
 | `monitor_master_cpu_utilization_too_high`     | Enable monitoring of CPU utilization of master nodes are too high. Only enable this when dedicated master is enabled | bool | `false` | no |
 | `monitor_master_jvm_memory_pressure_too_high` | Enable monitoring of JVM memory pressure of master nodes are too high. Only enable this when dedicated master is enabled | bool | `false` | no |
+| `monitor_min_available_nodes_period`          | The period, in seconds, over which the statistic is applied for the minimum available nodes alarm | string | `86400` | no |
+| `monitor_automated_snapshot_failure_period`   | The period, in seconds, over which the statistic is applied for the automated snapshot failure alarm | string | `60` | no |
+| `monitor_cluster_index_writes_blocked_period` | The period, in seconds, over which the statistic is applied for the cluster index writes blocked alarm | string | `300` | no |
+| `monitor_cluster_status_is_red_period`        | The period, in seconds, over which the statistic is applied for the cluster status red alarm | string | `60` | no |
+| `monitor_cluster_status_is_yellow_period`     | The period, in seconds, over which the statistic is applied for the cluster status yellow alarm | string | `60` | no |
+| `monitor_cpu_utilization_too_high_period`     | The period, in seconds, over which the statistic is applied for the CPU utilization too high alarm | string | `900` | no |
+| `monitor_free_storage_space_too_low_period`   | The period, in seconds, over which the statistic is applied for the free storage space too low alarm | string | `60` | no |
+| `monitor_jvm_memory_pressure_too_high_period` | The period, in seconds, over which the statistic is applied for the JVM memory pressure too high alarm | string | `900` | no |
+| `monitor_kms_period`                          | The period, in seconds, over which the statistic is applied for the KMS-related alarms | string | `60` | no |
+| `monitor_master_cpu_utilization_too_high_period`     | The period, in seconds, over which the statistic is applied for the master nodes CPU utilization too high alarm | string | `900` | no |
+| `monitor_master_jvm_memory_pressure_too_high_period` | The period, in seconds, over which the statistic is applied for the master nodes JVM memory pressure too high alarm | string | `900` | no |
 | `create_sns_topic`                            | Will create an SNS topic, if you set this to false you MUST set `sns_topic` to a FULL ARN | bool | `true` | no |
 | `sns_topic`                                   | SNS topic you want to specify. If leave empty, it will use a prefix and a timestamp appended.  If `create_sns_topic` is set to false, this MUST be a FULL ARN | string | `""` | no |
 | `sns_topic_postfix`                           | SNS topic postfix | string | `""` | no |

diff --git a/alarms.tf b/alarms.tf
@@ -16,7 +16,7 @@ resource "aws_cloudwatch_metric_alarm" "cluster_status_is_red" {
   evaluation_periods  = "1"
   metric_name         = "ClusterStatus.red"
   namespace           = "AWS/ES"
-  period              = "60"
+  period              = var.monitor_cluster_status_is_red_period
   statistic           = "Maximum"
   threshold           = "1"
   alarm_description   = "Average elasticsearch cluster status is in red over last 1 minute"
@@ -38,7 +38,7 @@ resource "aws_cloudwatch_metric_alarm" "cluster_status_is_yellow" {
   evaluation_periods  = var.alarm_cluster_status_is_yellow_periods
   metric_name         = "ClusterStatus.yellow"
   namespace           = "AWS/ES"
-  period              = "60"
+  period              = var.monitor_cluster_status_is_yellow_period
   statistic           = "Maximum"
   threshold           = "1"
   alarm_description   = "Average elasticsearch cluster status is in yellow over last ${var.alarm_cluster_status_is_yellow_periods} minute(s)"
@@ -60,7 +60,7 @@ resource "aws_cloudwatch_metric_alarm" "free_storage_space_too_low" {
   evaluation_periods  = var.alarm_free_storage_space_too_low_periods
   metric_name         = "FreeStorageSpace"
   namespace           = "AWS/ES"
-  period              = "60"
+  period              = var.monitor_free_storage_space_too_low_period
   statistic           = "Minimum"
   threshold           = local.thresholds["FreeStorageSpaceThreshold"]
   alarm_description   = "Average elasticsearch free storage space over last ${var.alarm_free_storage_space_too_low_periods} minute(s) is too low"
@@ -82,7 +82,7 @@ resource "aws_cloudwatch_metric_alarm" "cluster_index_writes_blocked" {
   evaluation_periods  = "1"
   metric_name         = "ClusterIndexWritesBlocked"
   namespace           = "AWS/ES"
-  period              = "300"
+  period              = var.monitor_cluster_index_writes_blocked_period
   statistic           = "Maximum"
   threshold           = "1"
   alarm_description   = "Elasticsearch index writes being blocker over last 5 minutes"
@@ -104,7 +104,7 @@ resource "aws_cloudwatch_metric_alarm" "insufficient_available_nodes" {
   evaluation_periods  = "1"
   metric_name         = "Nodes"
   namespace           = "AWS/ES"
-  period              = "86400"
+  period              = var.monitor_min_available_nodes_period
   statistic           = "Minimum"
   threshold           = local.thresholds["MinimumAvailableNodes"]
   alarm_description   = "Elasticsearch nodes minimum < ${local.thresholds["MinimumAvailableNodes"]} for 1 day"
@@ -126,7 +126,7 @@ resource "aws_cloudwatch_metric_alarm" "automated_snapshot_failure" {
   evaluation_periods  = "1"
   metric_name         = "AutomatedSnapshotFailure"
   namespace           = "AWS/ES"
-  period              = "60"
+  period              = var.monitor_automated_snapshot_failure_period
   statistic           = "Maximum"
   threshold           = "1"
   alarm_description   = "Elasticsearch automated snapshot failed over last 1 minute"
@@ -148,7 +148,7 @@ resource "aws_cloudwatch_metric_alarm" "cpu_utilization_too_high" {
   evaluation_periods  = "3"
   metric_name         = "CPUUtilization"
   namespace           = "AWS/ES"
-  period              = "900"
+  period              = var.monitor_cpu_utilization_too_high_period
   statistic           = "Average"
   threshold           = local.thresholds["CPUUtilizationThreshold"]
   alarm_description   = "Average elasticsearch cluster CPU utilization over last 45 minutes too high"
@@ -169,7 +169,7 @@ resource "aws_cloudwatch_metric_alarm" "jvm_memory_pressure_too_high" {
   evaluation_periods  = "1"
   metric_name         = "JVMMemoryPressure"
   namespace           = "AWS/ES"
-  period              = "900"
+  period              = var.monitor_jvm_memory_pressure_too_high_period
   statistic           = "Maximum"
   threshold           = local.thresholds["JVMMemoryPressureThreshold"]
   alarm_description   = "Elasticsearch JVM memory pressure is too high over last 15 minutes"
@@ -190,7 +190,7 @@ resource "aws_cloudwatch_metric_alarm" "master_cpu_utilization_too_high" {
   evaluation_periods  = "3"
   metric_name         = "MasterCPUUtilization"
   namespace           = "AWS/ES"
-  period              = "900"
+  period              = var.monitor_master_cpu_utilization_too_high_period
   statistic           = "Average"
   threshold           = local.thresholds["MasterCPUUtilizationThreshold"]
   alarm_description   = "Average elasticsearch cluster CPU utilization over last 45 minutes too high"
@@ -211,7 +211,7 @@ resource "aws_cloudwatch_metric_alarm" "master_jvm_memory_pressure_too_high" {
   evaluation_periods  = "1"
   metric_name         = "MasterJVMMemoryPressure"
   namespace           = "AWS/ES"
-  period              = "900"
+  period              = var.monitor_master_jvm_memory_pressure_too_high_period
   statistic           = "Maximum"
   threshold           = local.thresholds["MasterJVMMemoryPressureThreshold"]
   alarm_description   = "Elasticsearch JVM memory pressure is too high over last 15 minutes"
@@ -232,7 +232,7 @@ resource "aws_cloudwatch_metric_alarm" "kms_key_error" {
   evaluation_periods  = "1"
   metric_name         = "KMSKeyError"
   namespace           = "AWS/ES"
-  period              = "60"
+  period              = var.monitor_kms_period
   statistic           = "Maximum"
   threshold           = "1"
   alarm_description   = "Elasticsearch KMS Key Error failed over last 1 minute"
@@ -254,7 +254,7 @@ resource "aws_cloudwatch_metric_alarm" "kms_key_inaccessible" {
   evaluation_periods  = "1"
   metric_name         = "KMSKeyInaccessible"
   namespace           = "AWS/ES"
-  period              = "60"
+  period              = var.monitor_kms_period
   statistic           = "Maximum"
   threshold           = "1"
   alarm_description   = "Elasticsearch KMS Key Inaccessible failed over last 1 minute"

diff --git a/variables.tf b/variables.tf
@@ -99,6 +99,72 @@ variable "monitor_master_jvm_memory_pressure_too_high" {
   default     = false
 }
 
+variable "monitor_min_available_nodes_period" {
+  description = "The period of the minimum available nodes should the statistics be applied in seconds"
+  type        = string
+  default     = "86400"
+}
+
+variable "monitor_cluster_status_is_red_period" {
+  description = "The period of the cluster status is in red should the statistics be applied in seconds"
+  type        = string
+  default     = "60"
+}
+
+variable "monitor_cluster_status_is_yellow_period" {
+  description = "The period of the cluster status is in yellow should the statistics be applied in seconds"
+  type        = string
+  default     = "60"
+}
+
+variable "monitor_free_storage_space_too_low_period" {
+  description = "The period of the cluster average free storage is too low should the statistics be applied in seconds"
+  type        = string
+  default     = "60"
+}
+
+variable "monitor_cluster_index_writes_blocked_period" {
+  description = "The period of the cluster index writes being blocked should the statistics be applied in seconds"
+  type        = string
+  default     = "300"
+}
+
+variable "monitor_automated_snapshot_failure_period" {
+  description = "The period of the automated snapshot failure should the statistics be applied in seconds"
+  type        = string
+  default     = "60"
+}
+
+variable "monitor_cpu_utilization_too_high_period" {
+  description = "The period of the CPU utilization is too high should the statistics be applied in seconds"
+  type        = string
+  default     = "900"
+}
+
+variable "monitor_jvm_memory_pressure_too_high_period" {
+  description = "The period of the JVM memory pressure is too high should the statistics be applied in seconds"
+  type        = string
+  default     = "900"
+}
+
+variable "monitor_kms_period" {
+  description = "The period of the KMS-related metrics should the statistics be applied in seconds"
+  type        = string
+  default     = "60"
+}
+
+variable "monitor_master_cpu_utilization_too_high_period" {
+  description = "The period of the CPU utilization of master nodes are too high should the statistics be applied in seconds"
+  type        = string
+  default     = "900"
+}
+
+variable "monitor_master_jvm_memory_pressure_too_high_period" {
+  description = "The period of the JVM memory pressure of master nodes are too high should the statistics be applied in seconds"
+  type        = string
+  default     = "900"
+}
+
 variable "free_storage_space_threshold" {
   description = "The minimum amount of available storage space in MegaByte."
   type        = number