Skip to content

Commit

Permalink
Merge branch 'master' into heartbeat-running-vm
Browse files Browse the repository at this point in the history
  • Loading branch information
haedri committed Jun 10, 2024
2 parents a8d2623 + 93dd417 commit a94c344
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/severity.md
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@
|AWS ElastiCache redis replication lag|X|X|-|-|-|
|AWS ElastiCache redis commands|-|X|-|-|-|
|AWS ElastiCache redis network conntrack allowance exceeded|X|-|-|-|-|
|AWS ElastiCache redis database capacity usage|X|X|-|-|-|


## integration_aws-elasticsearch
Expand Down
2 changes: 2 additions & 0 deletions modules/integration_aws-elasticache-redis/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ This module creates the following SignalFx detectors which could contain one or
|AWS ElastiCache redis replication lag|X|X|-|-|-|
|AWS ElastiCache redis commands|-|X|-|-|-|
|AWS ElastiCache redis network conntrack allowance exceeded|X|-|-|-|-|
|AWS ElastiCache redis database capacity usage|X|X|-|-|-|

## How to collect required metrics?

Expand All @@ -99,6 +100,7 @@ Here is the list of required metrics for detectors in this module.

* `CacheHits`
* `CacheMisses`
* `DatabaseCapacityUsagePercentage`
* `EngineCPUUtilization`
* `GetTypeCmds`
* `NetworkConntrackAllowanceExceeded`
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Detector definition for "AWS ElastiCache redis database capacity usage".
# NOTE(review): this looks like the generator input from which the
# `database_capacity_usage` Terraform detector and variables are produced — confirm.
module: AWS ElastiCache redis
name: database capacity usage

# Both flags are enabled without a default expression.
# NOTE(review): presumably this exposes the `*_aggregation_function` and
# `*_transformation_function` variables with empty defaults — confirm.
transformation: true
aggregation: true

# Base SignalFlow filter applied to every signal of this detector.
filtering: "filter('namespace', 'AWS/ElastiCache') and filter('stat', 'upper') and filter('CacheNodeId', '*')"

signals:
  signal:
    metric: DatabaseCapacityUsagePercentage

rules:
  # Critical: signal > 90 for 10 minutes.
  critical:
    threshold: 90
    comparator: ">"
    lasting_duration: 10m
  # Major: signal > 80 for 10 minutes; declared as depending on the critical rule.
  major:
    threshold: 80
    comparator: ">"
    dependency: critical
    lasting_duration: 10m
41 changes: 41 additions & 0 deletions modules/integration_aws-elasticache-redis/detectors-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -196,3 +196,44 @@ EOF
max_delay = var.network_conntrack_allowance_exceeded_max_delay
}

# Detector alerting on the `DatabaseCapacityUsagePercentage` metric of the
# `AWS/ElastiCache` namespace, with a Critical and a Major rule.
resource "signalfx_detector" "database_capacity_usage" {
  name      = format("%s %s", local.detector_name_prefix, "AWS ElastiCache redis database capacity usage")
  max_delay = var.database_capacity_usage_max_delay
  tags      = compact(concat(local.common_tags, local.tags, var.extra_tags))

  # `teams` uses the first non-empty list of `var.teams` and
  # `var.authorized_writer_teams`; `try` yields null when neither can be used.
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  authorized_writer_teams = var.authorized_writer_teams

  # SignalFlow program:
  #   - `signal` is the filtered metric with the optional aggregation and
  #     transformation suffixes appended verbatim.
  #   - CRIT fires when the signal is above the critical threshold.
  #   - MAJOR fires when the signal is above the major threshold and the
  #     critical condition is not met.
  #   - `lasting` / `at_least` are only rendered when the matching
  #     lasting duration variable is not null.
  program_text = <<-EOF
    base_filtering = filter('namespace', 'AWS/ElastiCache') and filter('stat', 'upper') and filter('CacheNodeId', '*')
    signal = data('DatabaseCapacityUsagePercentage', filter=base_filtering and ${module.filtering.signalflow})${var.database_capacity_usage_aggregation_function}${var.database_capacity_usage_transformation_function}.publish('signal')
    detect(when(signal > ${var.database_capacity_usage_threshold_critical}%{if var.database_capacity_usage_lasting_duration_critical != null}, lasting='${var.database_capacity_usage_lasting_duration_critical}', at_least=${var.database_capacity_usage_at_least_percentage_critical}%{endif})).publish('CRIT')
    detect(when(signal > ${var.database_capacity_usage_threshold_major}%{if var.database_capacity_usage_lasting_duration_major != null}, lasting='${var.database_capacity_usage_lasting_duration_major}', at_least=${var.database_capacity_usage_at_least_percentage_major}%{endif}) and (not when(signal > ${var.database_capacity_usage_threshold_critical}%{if var.database_capacity_usage_lasting_duration_critical != null}, lasting='${var.database_capacity_usage_lasting_duration_critical}', at_least=${var.database_capacity_usage_at_least_percentage_critical}%{endif}))).publish('MAJOR')
EOF

  # Critical rule, bound to the 'CRIT' detect label published above.
  rule {
    severity     = "Critical"
    detect_label = "CRIT"
    description  = "is too high > ${var.database_capacity_usage_threshold_critical}"

    # First non-null wins: severity-specific flag, detector-wide flag, global flag.
    disabled = coalesce(var.database_capacity_usage_disabled_critical, var.database_capacity_usage_disabled, var.detectors_disabled)

    # Detector-specific recipients for "critical" when provided and non-empty,
    # otherwise the module-wide critical recipients.
    notifications = try(coalescelist(lookup(var.database_capacity_usage_notifications, "critical", []), var.notifications.critical), null)

    tip         = var.database_capacity_usage_tip
    runbook_url = try(coalesce(var.database_capacity_usage_runbook_url, var.runbook_url), "")

    # Custom subject/body are used only when explicitly set (non-empty).
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  # Major rule, bound to the 'MAJOR' detect label published above.
  rule {
    severity     = "Major"
    detect_label = "MAJOR"
    description  = "is too high > ${var.database_capacity_usage_threshold_major}"

    # First non-null wins: severity-specific flag, detector-wide flag, global flag.
    disabled = coalesce(var.database_capacity_usage_disabled_major, var.database_capacity_usage_disabled, var.detectors_disabled)

    # Detector-specific recipients for "major" when provided and non-empty,
    # otherwise the module-wide major recipients.
    notifications = try(coalescelist(lookup(var.database_capacity_usage_notifications, "major", []), var.notifications.major), null)

    tip         = var.database_capacity_usage_tip
    runbook_url = try(coalesce(var.database_capacity_usage_runbook_url, var.runbook_url), "")

    # Custom subject/body are used only when explicitly set (non-empty).
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }
}

5 changes: 5 additions & 0 deletions modules/integration_aws-elasticache-redis/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ output "cpu_high" {
value = signalfx_detector.cpu_high
}

# Exposes the whole `database_capacity_usage` detector resource to callers of the module.
output "database_capacity_usage" {
  value       = signalfx_detector.database_capacity_usage
  description = "Detector resource for database_capacity_usage"
}

output "network_conntrack_allowance_exceeded" {
description = "Detector resource for network_conntrack_allowance_exceeded"
value = signalfx_detector.network_conntrack_allowance_exceeded
Expand Down
90 changes: 90 additions & 0 deletions modules/integration_aws-elasticache-redis/variables-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -390,3 +390,93 @@ variable "network_conntrack_allowance_exceeded_at_least_percentage_critical" {
type = number
default = 1
}
# database_capacity_usage detector

# Per-severity recipients override; the detector looks up the "critical" and
# "major" keys and falls back to the module-wide notifications otherwise.
variable "database_capacity_usage_notifications" {
  type        = map(list(string))
  default     = {}
  description = "Notification recipients list per severity overridden for database_capacity_usage detector"
}

# SignalFlow suffix appended verbatim right after the `data(...)` call of the
# detector signal; empty by default (no aggregation).
variable "database_capacity_usage_aggregation_function" {
  type        = string
  default     = ""
  description = "Aggregation function and group by for database_capacity_usage detector (i.e. \".mean(by=['host'])\")"
}

# SignalFlow suffix appended verbatim after the aggregation suffix of the
# detector signal; empty by default (no transformation).
variable "database_capacity_usage_transformation_function" {
  type        = string
  default     = ""
  description = "Transformation function for database_capacity_usage detector (i.e. \".mean(over='5m')\")"
}

# Passed as-is to the detector's `max_delay` argument.
variable "database_capacity_usage_max_delay" {
  type        = number
  default     = null
  description = "Enforce max delay for database_capacity_usage detector (use \"0\" or \"null\" for \"Auto\")"
}

# Used as the `tip` of both the Critical and the Major rules of the detector.
variable "database_capacity_usage_tip" {
  type        = string
  default     = ""
  description = "Suggested first course of action or any note useful for incident handling"
}

# Detector-specific runbook URL; the rules coalesce it with the module-wide
# `var.runbook_url` and end up with "" when neither is usable.
variable "database_capacity_usage_runbook_url" {
  type        = string
  default     = ""
  description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
}

# Detector-wide switch; consulted after the severity-specific flag and before
# the global `var.detectors_disabled` (first non-null value wins).
variable "database_capacity_usage_disabled" {
  type        = bool
  default     = null
  description = "Disable all alerting rules for database_capacity_usage detector"
}

# Highest-priority switch for the Critical rule; when null the detector-wide
# and then the global flags are used instead.
variable "database_capacity_usage_disabled_critical" {
  type        = bool
  default     = null
  description = "Disable critical alerting rule for database_capacity_usage detector"
}

# Highest-priority switch for the Major rule; when null the detector-wide
# and then the global flags are used instead.
variable "database_capacity_usage_disabled_major" {
  type        = bool
  default     = null
  description = "Disable major alerting rule for database_capacity_usage detector"
}

# The Critical rule fires when the signal is strictly above this value; the
# same value is also used to exclude the critical condition from the Major rule.
variable "database_capacity_usage_threshold_critical" {
  type        = number
  default     = 90
  description = "Critical threshold for database_capacity_usage detector"
}

# Rendered as the `lasting` argument of the critical condition; when null,
# both `lasting` and `at_least` are omitted from the program.
variable "database_capacity_usage_lasting_duration_critical" {
  type        = string
  default     = "10m"
  description = "Minimum duration that conditions must be true before raising alert"
}

# Rendered as the `at_least` argument of the critical condition; only used
# when the critical lasting duration is not null.
variable "database_capacity_usage_at_least_percentage_critical" {
  type        = number
  default     = 1
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
}
# The Major rule fires when the signal is strictly above this value while the
# critical condition is not met.
variable "database_capacity_usage_threshold_major" {
  type        = number
  default     = 80
  description = "Major threshold for database_capacity_usage detector"
}

# Rendered as the `lasting` argument of the major condition; when null,
# both `lasting` and `at_least` are omitted for that condition.
variable "database_capacity_usage_lasting_duration_major" {
  type        = string
  default     = "10m"
  description = "Minimum duration that conditions must be true before raising alert"
}

# Rendered as the `at_least` argument of the major condition; only used
# when the major lasting duration is not null.
variable "database_capacity_usage_at_least_percentage_major" {
  type        = number
  default     = 1
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
}

0 comments on commit a94c344

Please sign in to comment.