Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IPS-623: Add BE alarms #1995

Merged
merged 2 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 49 additions & 1 deletion deploy-delete-user-data/template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,31 @@ Resources:
QueueName: DeleteAccountSQSQueue
RedrivePolicy: !Sub "{ \"deadLetterTargetArn\": \"arn:aws:sqs:${AWS::Region}:${AWS::AccountId}:DeleteAccountSNSDLQ\", \"maxReceiveCount\": \"5\" }"

#Delete Account Queue
#Delete Account Queue alarm for old messages
DeleteAccountSQSMessagesNotConsumedAlarm:
  # Raises an alarm when messages sit unconsumed in DeleteAccountSQSQueue:
  # a single one-minute datapoint whose oldest-message age reaches the
  # threshold is enough to switch the alarm to ALARM.
  Type: AWS::CloudWatch::Alarm
  Properties:
    AlarmName: !Sub '${AWS::StackName}-DeleteAccountSQSMessagesNotConsumedAlarm'
    AlarmDescription: >-
      Trigger an alarm when the age of the oldest message in the
      DeleteAccountSQSQueue is 5 or more minutes

    # Metric being watched: oldest-message age of the queue declared above.
    Namespace: AWS/SQS
    MetricName: ApproximateAgeOfOldestMessage
    Dimensions:
      - Name: QueueName
        Value: !GetAtt DeleteAccountSQSQueue.QueueName
    Statistic: Maximum

    # Evaluation: 1 breaching datapoint out of 1 sixty-second period.
    Period: 60
    EvaluationPeriods: 1
    DatapointsToAlarm: 1
    # 300 corresponds to the "5 or more minutes" in the description.
    Threshold: 300
    ComparisonOperator: GreaterThanOrEqualToThreshold
    # Periods with no datapoint are treated as healthy.
    TreatMissingData: notBreaching

    # Both the ALARM and the OK transition are sent to the imported
    # alarm-alerts-topic export.
    ActionsEnabled: true
    AlarmActions:
      - !ImportValue alarm-alerts-topic
    OKActions:
      - !ImportValue alarm-alerts-topic

#IPV Delete Account Queue
IPVDeleteAccountSQSQueue:
DependsOn: DeleteAccountSNSDLQ
Type: AWS::SQS::Queue
Expand All @@ -282,6 +306,30 @@ Resources:
QueueName: IPVDeleteAccountSQSQueue
RedrivePolicy: !Sub "{ \"deadLetterTargetArn\": \"arn:aws:sqs:${AWS::Region}:${AWS::AccountId}:DeleteAccountSNSDLQ\", \"maxReceiveCount\": \"5\" }"

#IPV Delete Account Queue alarm for old messages
IPVDeleteAccountSQSMessagesNotConsumedAlarm:
  # Raises an alarm when messages sit unconsumed in IPVDeleteAccountSQSQueue:
  # a single one-minute datapoint whose oldest-message age reaches the
  # threshold is enough to switch the alarm to ALARM.
  Type: AWS::CloudWatch::Alarm
  Properties:
    AlarmName: !Sub '${AWS::StackName}-IPVDeleteAccountSQSMessagesNotConsumedAlarm'
    AlarmDescription: >-
      Trigger an alarm when the age of the oldest message in the
      IPVDeleteAccountSQSQueue is 5 or more minutes

    # Metric being watched: oldest-message age of the queue declared above.
    Namespace: AWS/SQS
    MetricName: ApproximateAgeOfOldestMessage
    Dimensions:
      - Name: QueueName
        Value: !GetAtt IPVDeleteAccountSQSQueue.QueueName
    Statistic: Maximum

    # Evaluation: 1 breaching datapoint out of 1 sixty-second period.
    Period: 60
    EvaluationPeriods: 1
    DatapointsToAlarm: 1
    # 300 corresponds to the "5 or more minutes" in the description.
    Threshold: 300
    ComparisonOperator: GreaterThanOrEqualToThreshold
    # Periods with no datapoint are treated as healthy.
    TreatMissingData: notBreaching

    # Both the ALARM and the OK transition are sent to the imported
    # alarm-alerts-topic export.
    ActionsEnabled: true
    AlarmActions:
      - !ImportValue alarm-alerts-topic
    OKActions:
      - !ImportValue alarm-alerts-topic

#DLQ for SNS Subscription
DeleteAccountSNSDLQ:
Type: AWS::SQS::Queue
Expand Down
92 changes: 92 additions & 0 deletions deploy/template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3184,6 +3184,12 @@ Resources:
ArnLike:
"kms:EncryptionContext:aws:logs:arn": !Sub "arn:aws:logs:${AWS::Region}:${AWS::AccountId}:*"

####################################################################
# #
# Monitoring & Alerts #
# #
####################################################################

CoreApiPrivateGw5xxErrors:
Type: AWS::CloudWatch::Alarm
Condition: IsNotDevelopment
Expand Down Expand Up @@ -3250,6 +3256,92 @@ Resources:
Period: 300
Stat: Sum

LatencyAlarm:
  # Alarms on elevated latency of the "<stack name> - Core API" API Gateway.
  # The alarm is driven by a metric-math expression (see Metrics below), so no
  # top-level MetricName/Namespace/Dimensions are declared: CloudFormation
  # documents those as mutually exclusive with Metrics.
  Type: AWS::CloudWatch::Alarm
  Properties:
    AlarmName: !Sub "${AWS::StackName}-apiGWLatencyAlarm"
    AlarmDescription: "There has been increased latency on backend api-gateway"
    ActionsEnabled: true
    # Both the ALARM and the OK transition are sent to the imported
    # alarm-alerts-topic export; nothing is sent for INSUFFICIENT_DATA.
    AlarmActions:
      - !ImportValue alarm-alerts-topic
    OKActions:
      - !ImportValue alarm-alerts-topic
    InsufficientDataActions: []
    # Fires when 2 of the last 5 one-minute datapoints are >= 2500.
    EvaluationPeriods: 5
    DatapointsToAlarm: 2
    Threshold: 2500
    ComparisonOperator: GreaterThanOrEqualToThreshold
    # Periods with no datapoint are treated as healthy.
    TreatMissingData: notBreaching
    Metrics:
      # The only series that returns data: it reports 0 whenever fewer than 10
      # requests were counted in the period, so a handful of slow calls during
      # quiet periods cannot trip the alarm; otherwise it is the max latency.
      - Id: safeLatency
        Label: safeLatency
        ReturnData: true
        Expression: IF(invocations<10,0,maxLatency)
      # Request count per minute, used only as input to safeLatency.
      - Id: invocations
        ReturnData: false
        MetricStat:
          Metric:
            Namespace: AWS/ApiGateway
            MetricName: Count
            Dimensions:
              - Name: ApiName
                Value: !Sub "${AWS::StackName} - Core API"
          Period: 60
          Stat: Sum
      # Worst-case latency per minute, used only as input to safeLatency.
      - Id: maxLatency
        ReturnData: false
        MetricStat:
          Metric:
            Namespace: AWS/ApiGateway
            MetricName: Latency
            Dimensions:
              - Name: ApiName
                Value: !Sub "${AWS::StackName} - Core API"
          Period: 60
          Stat: Maximum

LambdaThrottleAlarm:
  # Alarms as soon as any Lambda throttle is recorded. No Dimensions are set,
  # so the Throttles metric is not scoped to a single function.
  Type: AWS::CloudWatch::Alarm
  Properties:
    AlarmDescription: "Trigger the alarm if any lambda in the account throttles"
    AlarmName: !Sub "${AWS::StackName}-LambdaThrottleAlarm"
    ActionsEnabled: true
    # Both the ALARM and the OK transition are sent to the imported
    # alarm-alerts-topic export; nothing is sent for INSUFFICIENT_DATA.
    AlarmActions:
      - !ImportValue alarm-alerts-topic
    OKActions:
      - !ImportValue alarm-alerts-topic
    InsufficientDataActions: []
    MetricName: Throttles
    Namespace: AWS/Lambda
    Statistic: Sum
    # Periods with no datapoint are treated as healthy.
    TreatMissingData: notBreaching
    Period: 60 # This is the minimum value for the AWS Namespace
    EvaluationPeriods: 1
    DatapointsToAlarm: 1
    # ">= 1" so that a single throttle in the period raises the alarm, as the
    # description promises; "> 1" would silently ignore a lone throttle.
    Threshold: 1
    ComparisonOperator: GreaterThanOrEqualToThreshold

LambdaDurationAlarm:
  # Alarms on long-running Lambda invocations: fires when the maximum Duration
  # exceeds the threshold in 3 of the last 5 one-minute periods. No Dimensions
  # are set, so the metric is not scoped to a single function.
  Type: AWS::CloudWatch::Alarm
  Properties:
    AlarmName: !Sub "${AWS::StackName}-LambdaDurationAlarm"
    # Kept in step with Threshold below (30000 ms = 30 seconds).
    AlarmDescription: "Alarm for Lambda functions running longer than 30 seconds"
    ActionsEnabled: true
    # Both the ALARM and the OK transition are sent to the imported
    # alarm-alerts-topic export.
    AlarmActions:
      - !ImportValue alarm-alerts-topic
    OKActions:
      - !ImportValue alarm-alerts-topic
    MetricName: Duration
    Namespace: AWS/Lambda
    Statistic: Maximum
    Period: 60
    EvaluationPeriods: 5
    DatapointsToAlarm: 3
    Threshold: 30000 # 30k milliseconds = 30 seconds
    ComparisonOperator: GreaterThanThreshold
    # Periods with no datapoint are treated as healthy.
    TreatMissingData: notBreaching

Outputs:
IPVCorePrivateAPIGatewayID:
Description: Core Back Private API Gateway ID
Expand Down