Skip to content

Commit

Permalink
Support monitor period variables for all metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
dubiety committed Sep 2, 2021
1 parent 41e82ab commit eb23546
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 13 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
SHELL := /bin/bash
TERRAFORM_VERSION ?= 1.0.1
TERRAFORM_VERSION ?= 1.0.5

-include $(shell curl -sSL -o .build-harness "https://git.io/build-harness"; echo .build-harness)

Expand Down
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,17 @@ module "es_alarms" {
| `monitor_kms` | Enable monitoring of KMS-related metrics, enable if using KMS | bool | `false` | no |
| `monitor_master_cpu_utilization_too_high` | Enable monitoring for CPU utilization of master nodes being too high. Only enable this when dedicated master is enabled | bool | `false` | no |
| `monitor_master_jvm_memory_pressure_too_high` | Enable monitoring for JVM memory pressure of master nodes being too high. Only enable this when dedicated master is enabled | bool | `false` | no |
| `monitor_min_available_nodes_period` | The period, in seconds, over which the statistic is applied for the minimum available nodes alarm | string | `86400` | no |
| `monitor_automated_snapshot_failure_period` | The period, in seconds, over which the statistic is applied for the automated snapshot failure alarm | string | `60` | no |
| `monitor_cluster_index_writes_blocked_period` | The period, in seconds, over which the statistic is applied for the cluster index writes blocked alarm | string | `300` | no |
| `monitor_cluster_status_is_red_period` | The period, in seconds, over which the statistic is applied for the cluster status is red alarm | string | `60` | no |
| `monitor_cluster_status_is_yellow_period` | The period, in seconds, over which the statistic is applied for the cluster status is yellow alarm | string | `60` | no |
| `monitor_cpu_utilization_too_high_period` | The period, in seconds, over which the statistic is applied for the CPU utilization too high alarm | string | `900` | no |
| `monitor_free_storage_space_too_low_period` | The period, in seconds, over which the statistic is applied for the free storage space too low alarm | string | `60` | no |
| `monitor_jvm_memory_pressure_too_high_period` | The period, in seconds, over which the statistic is applied for the JVM memory pressure too high alarm | string | `900` | no |
| `monitor_kms_period` | The period, in seconds, over which the statistic is applied for the KMS-related alarms | string | `60` | no |
| `monitor_master_cpu_utilization_too_high_period` | The period, in seconds, over which the statistic is applied for the master nodes CPU utilization too high alarm | string | `900` | no |
| `monitor_master_jvm_memory_pressure_too_high_period` | The period, in seconds, over which the statistic is applied for the master nodes JVM memory pressure too high alarm | string | `900` | no |
| `create_sns_topic` | Will create an SNS topic, if you set this to false you MUST set `sns_topic` to a FULL ARN | bool | `true` | no |
| `sns_topic` | SNS topic you want to specify. If left empty, it will use a prefix and a timestamp appended. If `create_sns_topic` is set to false, this MUST be a FULL ARN | string | `""` | no |
| `sns_topic_postfix` | SNS topic postfix | string | `""` | no |
Expand Down
24 changes: 12 additions & 12 deletions alarms.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ resource "aws_cloudwatch_metric_alarm" "cluster_status_is_red" {
evaluation_periods = "1"
metric_name = "ClusterStatus.red"
namespace = "AWS/ES"
period = "60"
period = var.monitor_cluster_status_is_red_period
statistic = "Maximum"
threshold = "1"
alarm_description = "Average elasticsearch cluster status is in red over last 1 minute"
Expand All @@ -38,7 +38,7 @@ resource "aws_cloudwatch_metric_alarm" "cluster_status_is_yellow" {
evaluation_periods = var.alarm_cluster_status_is_yellow_periods
metric_name = "ClusterStatus.yellow"
namespace = "AWS/ES"
period = "60"
period = var.monitor_cluster_status_is_yellow_period
statistic = "Maximum"
threshold = "1"
alarm_description = "Average elasticsearch cluster status is in yellow over last ${var.alarm_cluster_status_is_yellow_periods} minute(s)"
Expand All @@ -60,7 +60,7 @@ resource "aws_cloudwatch_metric_alarm" "free_storage_space_too_low" {
evaluation_periods = var.alarm_free_storage_space_too_low_periods
metric_name = "FreeStorageSpace"
namespace = "AWS/ES"
period = "60"
period = var.monitor_free_storage_space_too_low_period
statistic = "Minimum"
threshold = local.thresholds["FreeStorageSpaceThreshold"]
alarm_description = "Average elasticsearch free storage space over last ${var.alarm_free_storage_space_too_low_periods} minute(s) is too low"
Expand All @@ -82,7 +82,7 @@ resource "aws_cloudwatch_metric_alarm" "cluster_index_writes_blocked" {
evaluation_periods = "1"
metric_name = "ClusterIndexWritesBlocked"
namespace = "AWS/ES"
period = "300"
period = var.monitor_cluster_index_writes_blocked_period
statistic = "Maximum"
threshold = "1"
alarm_description = "Elasticsearch index writes being blocker over last 5 minutes"
Expand All @@ -104,7 +104,7 @@ resource "aws_cloudwatch_metric_alarm" "insufficient_available_nodes" {
evaluation_periods = "1"
metric_name = "Nodes"
namespace = "AWS/ES"
period = "86400"
period = var.monitor_min_available_nodes_period
statistic = "Minimum"
threshold = local.thresholds["MinimumAvailableNodes"]
alarm_description = "Elasticsearch nodes minimum < ${local.thresholds["MinimumAvailableNodes"]} for 1 day"
Expand All @@ -126,7 +126,7 @@ resource "aws_cloudwatch_metric_alarm" "automated_snapshot_failure" {
evaluation_periods = "1"
metric_name = "AutomatedSnapshotFailure"
namespace = "AWS/ES"
period = "60"
period = var.monitor_automated_snapshot_failure_period
statistic = "Maximum"
threshold = "1"
alarm_description = "Elasticsearch automated snapshot failed over last 1 minute"
Expand All @@ -148,7 +148,7 @@ resource "aws_cloudwatch_metric_alarm" "cpu_utilization_too_high" {
evaluation_periods = "3"
metric_name = "CPUUtilization"
namespace = "AWS/ES"
period = "900"
period = var.monitor_cpu_utilization_too_high_period
statistic = "Average"
threshold = local.thresholds["CPUUtilizationThreshold"]
alarm_description = "Average elasticsearch cluster CPU utilization over last 45 minutes too high"
Expand All @@ -169,7 +169,7 @@ resource "aws_cloudwatch_metric_alarm" "jvm_memory_pressure_too_high" {
evaluation_periods = "1"
metric_name = "JVMMemoryPressure"
namespace = "AWS/ES"
period = "900"
period = var.monitor_jvm_memory_pressure_too_high_period
statistic = "Maximum"
threshold = local.thresholds["JVMMemoryPressureThreshold"]
alarm_description = "Elasticsearch JVM memory pressure is too high over last 15 minutes"
Expand All @@ -190,7 +190,7 @@ resource "aws_cloudwatch_metric_alarm" "master_cpu_utilization_too_high" {
evaluation_periods = "3"
metric_name = "MasterCPUUtilization"
namespace = "AWS/ES"
period = "900"
period = var.monitor_master_cpu_utilization_too_high_period
statistic = "Average"
threshold = local.thresholds["MasterCPUUtilizationThreshold"]
alarm_description = "Average elasticsearch cluster CPU utilization over last 45 minutes too high"
Expand All @@ -211,7 +211,7 @@ resource "aws_cloudwatch_metric_alarm" "master_jvm_memory_pressure_too_high" {
evaluation_periods = "1"
metric_name = "MasterJVMMemoryPressure"
namespace = "AWS/ES"
period = "900"
period = var.monitor_master_jvm_memory_pressure_too_high_period
statistic = "Maximum"
threshold = local.thresholds["MasterJVMMemoryPressureThreshold"]
alarm_description = "Elasticsearch JVM memory pressure is too high over last 15 minutes"
Expand All @@ -232,7 +232,7 @@ resource "aws_cloudwatch_metric_alarm" "kms_key_error" {
evaluation_periods = "1"
metric_name = "KMSKeyError"
namespace = "AWS/ES"
period = "60"
period = var.monitor_kms_period
statistic = "Maximum"
threshold = "1"
alarm_description = "Elasticsearch KMS Key Error failed over last 1 minute"
Expand All @@ -254,7 +254,7 @@ resource "aws_cloudwatch_metric_alarm" "kms_key_inaccessible" {
evaluation_periods = "1"
metric_name = "KMSKeyInaccessible"
namespace = "AWS/ES"
period = "60"
period = var.monitor_kms_period
statistic = "Maximum"
threshold = "1"
alarm_description = "Elasticsearch KMS Key Inaccessible failed over last 1 minute"
Expand Down
66 changes: 66 additions & 0 deletions variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,72 @@ variable "monitor_master_jvm_memory_pressure_too_high" {
default = false
}

# Period (seconds) passed to the `period` argument of the alarm on the
# "Nodes" metric. The default of 86400 seconds equals one day and matches
# the value that was previously hard-coded.
variable "monitor_min_available_nodes_period" {
  type        = string
  default     = "86400"
  description = "The period of the minimum available nodes should the statistics be applied in seconds"
}

# Period (seconds) passed to the `period` argument of the alarm on the
# "ClusterStatus.red" metric. The default keeps the previously hard-coded
# 60 seconds.
variable "monitor_cluster_status_is_red_period" {
  type        = string
  default     = "60"
  description = "The period of the cluster status is in red should the statistics be applied in seconds"
}

# Period (seconds) passed to the `period` argument of the alarm on the
# "ClusterStatus.yellow" metric. The default keeps the previously hard-coded
# 60 seconds.
variable "monitor_cluster_status_is_yellow_period" {
  type        = string
  default     = "60"
  description = "The period of the cluster status is in yellow should the statistics be applied in seconds"
}

# Period (seconds) passed to the `period` argument of the alarm on the
# "FreeStorageSpace" metric. The default keeps the previously hard-coded
# 60 seconds.
variable "monitor_free_storage_space_too_low_period" {
  type        = string
  default     = "60"
  description = "The period of the cluster average free storage is too low should the statistics be applied in seconds"
}

# Period (seconds) passed to the `period` argument of the alarm on the
# "ClusterIndexWritesBlocked" metric. The default keeps the previously
# hard-coded 300 seconds (5 minutes).
variable "monitor_cluster_index_writes_blocked_period" {
  type        = string
  default     = "300"
  description = "The period of the cluster index writes being blocked should the statistics be applied in seconds"
}

# Period (seconds) passed to the `period` argument of the alarm on the
# "AutomatedSnapshotFailure" metric. The default keeps the previously
# hard-coded 60 seconds.
variable "monitor_automated_snapshot_failure_period" {
  type        = string
  default     = "60"
  description = "The period of the automated snapshot failure should the statistics be applied in seconds"
}

# Period (seconds) passed to the `period` argument of the alarm on the
# "CPUUtilization" metric. The default keeps the previously hard-coded
# 900 seconds (15 minutes).
variable "monitor_cpu_utilization_too_high_period" {
  type        = string
  default     = "900"
  description = "The period of the CPU utilization is too high should the statistics be applied in seconds"
}

# Period (seconds) passed to the `period` argument of the alarm on the
# "JVMMemoryPressure" metric. The default keeps the previously hard-coded
# 900 seconds (15 minutes).
variable "monitor_jvm_memory_pressure_too_high_period" {
  type        = string
  default     = "900"
  description = "The period of the JVM memory pressure is too high should the statistics be applied in seconds"
}

# Period (seconds) shared by both KMS alarms: it is passed to the `period`
# argument of the "KMSKeyError" and "KMSKeyInaccessible" metric alarms.
# The default keeps the previously hard-coded 60 seconds.
variable "monitor_kms_period" {
  type        = string
  default     = "60"
  description = "The period of the KMS-related metrics should the statistics be applied in seconds"
}

# Period (seconds) passed to the `period` argument of the alarm on the
# "MasterCPUUtilization" metric. The default keeps the previously hard-coded
# 900 seconds (15 minutes).
variable "monitor_master_cpu_utilization_too_high_period" {
  type        = string
  default     = "900"
  description = "The period of the CPU utilization of master nodes are too high should the statistics be applied in seconds"
}

# Period (seconds) passed to the `period` argument of the alarm on the
# "MasterJVMMemoryPressure" metric. The default keeps the previously
# hard-coded 900 seconds (15 minutes).
variable "monitor_master_jvm_memory_pressure_too_high_period" {
  type        = string
  default     = "900"
  description = "The period of the JVM memory pressure of master nodes are too high should the statistics be applied in seconds"
}

variable "free_storage_space_threshold" {
description = "The minimum amount of available storage space in MegaByte."
type = number
Expand Down

0 comments on commit eb23546

Please sign in to comment.