Skip to content

Commit

Permalink
Separate handling of CF and CloudTrail events
Browse files Browse the repository at this point in the history
  • Loading branch information
eoinsha committed Feb 9, 2021
1 parent 478bc92 commit 945bd8a
Show file tree
Hide file tree
Showing 11 changed files with 226 additions and 85 deletions.
1 change: 1 addition & 0 deletions events/stack-delete-event.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"Action": "Delete"}
119 changes: 103 additions & 16 deletions template.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
AWSTemplateFormatVersion: '2010-09-09'
Transform: AWS::Serverless-2016-10-31
Description: >
slic-watch
Automatic CloudWatch metrics, alarms and dashboards
Metadata:
Expand All @@ -21,9 +23,8 @@ Parameters:
Type: String
Description: >
A comma-separated list of AWS services to watch.
All supported services are included by default
(`lambda,kinesis,sqs,dynamodb`).
Default: 'lambda,kinesis,sqs,dynamodb'
All supported services are included by default (`lambda`).
Default: 'lambda'
TagFilter:
Type: String
Description: >
Expand Down Expand Up @@ -55,14 +56,50 @@ Globals:
POWERTOOLS_SERVICE_NAME: slicwatch
LOG_LEVEL: INFO
SNS_ALARMS_TOPIC: !Ref SNSAlarmsTopic
LAMBDA_THROTTLES_RATE_THRESHOLD: !Ref LambdaThrottlesRateThreshold
LAMBDA_ERRORS_PERIOD: !Ref LambdaErrorsPeriod
LAMBDA_ERRORS_THRESHOLD: !Ref LambdaErrorsThreshold
TAG_FILTER: !Ref TagFilter
WATCH_SERVICES: !Ref WatchServices

Resources:
WatchExistingResources:
HandleStack:
Type: AWS::Serverless::Function
Properties:
Description: Find existing resources and create or update CloudWatch alarms, metrics and dashboards
Description: Create CloudWatch alarms and dashboards in response to CloudFormation update of this stack
CodeUri: watch/
Handler: watch_handler.watch_existing
Handler: watch_handler.handle_stack
Runtime: python3.8
Timeout: 30
Policies:
- Statement:
Effect: Allow
Action:
- cloudwatch:ListMetrics
- cloudwatch:PutDashboard
- cloudwatch:PutMetricAlarm
- cloudwatch:PutMetricData
Resource: '*'
- Statement:
Effect: Allow
Action:
- lambda:GetFunction
- lambda:ListFunctions
Resource: '*'

HandleStackLogGroup:
Type: AWS::Logs::LogGroup
DeletionPolicy: Retain
Properties:
LogGroupName: !Sub /aws/lambda/${HandleStack}
RetentionInDays: 7

HandleFunctionCreateUpdate:
Type: AWS::Serverless::Function
Properties:
Description: Create CloudWatch alarms and dashboards in response to Lambda function create or update
CodeUri: watch/
Handler: watch_handler.handle_function_create_updates
Runtime: python3.8
Timeout: 30
Policies:
Expand Down Expand Up @@ -94,13 +131,53 @@ Resources:
- lambda.amazonaws.com
eventName:
- prefix: CreateFunction
- prefix: UpdateFunctionConfiguration

HandleFunctionCreateUpdateLogGroup:
Type: AWS::Logs::LogGroup
DeletionPolicy: Retain
Properties:
LogGroupName: !Sub /aws/lambda/${HandleFunctionCreateUpdate}
RetentionInDays: 7

HandleFunctionDelete:
Type: AWS::Serverless::Function
Properties:
Description: Delete CloudWatch alarms and dashboards in response to Lambda function delete
CodeUri: watch/
Handler: watch_handler.handle_function_delete
Runtime: python3.8
Timeout: 30
Policies:
- Statement:
Effect: Allow
Action:
- cloudwatch:ListMetrics
- cloudwatch:PutDashboard
- cloudwatch:PutMetricAlarm
- cloudwatch:PutMetricData
Resource: '*'
Events:
CloudTrailTrigger:
Type: EventBridgeRule
Properties:
Pattern:
source:
- aws.lambda
detail-type:
- AWS API Call via CloudTrail
detail:
eventSource:
- lambda.amazonaws.com
eventName:
- prefix: DeleteFunction
- prefix: UpdateFunctionConfiguration

WatchExistingResourcesLogGroup:
HandleFunctionDeleteLogGroup:
Type: AWS::Logs::LogGroup
DeletionPolicy: Retain
Properties:
LogGroupName: !Sub /aws/lambda/${WatchExistingResources}
LogGroupName: !Sub /aws/lambda/${HandleFunctionDelete}
RetentionInDays: 7

DeleteResources:
Expand Down Expand Up @@ -155,23 +232,33 @@ Resources:
ApplicationId: arn:aws:serverlessrepo:us-east-1:374852340823:applications/lambda-invocation-cfn-custom-resource
SemanticVersion: 1.4.0

InvokeWatchExisting:
HandleStackCreate:
Type: Custom::LambdaInvocation
DependsOn:
- WatchExistingResources
- HandleStack
- LambdaInvocationCustomResource
Properties:
ServiceToken: !GetAtt LambdaInvocationCustomResource.Outputs.FunctionArn
FunctionName: !Ref WatchExistingResources
FunctionName: !Ref HandleStack
InvocationType: Event
When:
- Create
- Update
Payload:
WatchServices: !Ref WatchServices
TagFilter: !Ref TagFilter
LambdaErrorsThreshold: !Ref LambdaErrorsThreshold
LambdaErrorsPeriod: !Ref LambdaErrorsPeriod
Action: Create

HandleStackDelete:
Type: Custom::LambdaInvocation
DependsOn:
- HandleStack
- LambdaInvocationCustomResource
Properties:
ServiceToken: !GetAtt LambdaInvocationCustomResource.Outputs.FunctionArn
FunctionName: !Ref HandleStack
InvocationType: Event
When:
- Delete
Payload:
Action: Delete

SNSAlarmsTopic:
Type: AWS::SNS::Topic
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/watch/test_alarms.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def test_update_alarms(lambda_functions):
throttles_percent_threshold=0.0,
duration_percent_timeout_threshold=90.0,
)
update_alarms(config)
update_alarms(config, lambda_functions)

cw = boto3.client("cloudwatch")
alarms_response = cw.describe_alarms()
Expand Down Expand Up @@ -85,7 +85,7 @@ def test_delete_alarm(lambda_functions):
throttles_percent_threshold=0.0,
duration_percent_timeout_threshold=90.0,
)
update_alarms(config)
update_alarms(config, lambda_functions)

fn_name = list(lambda_functions.keys())[-1] # Pick a function for which alarms are to be deleted
delete_alarms(fn_name)
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/watch/test_dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
def test_dashboard(lambda_functions):
from dashboard import update_dashboard

update_dashboard()
update_dashboard(lambda_functions)

cw = boto3.client("cloudwatch")
dash = json.loads(cw.get_dashboard(DashboardName="SLICWatch")["DashboardBody"])
Expand Down
16 changes: 8 additions & 8 deletions tests/unit/watch/test_watch_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ def __init__(self):

@mock_cloudwatch
def test_handle_event(lambda_functions):
os.environ["SNS_ALARMS_TOPIC"] = "TestAlarmsTopic"
event = {
'Period': 99,
'DurationPercentTimeoutThreshold': 47.3
}

from watch_handler import watch_existing
watch_existing(event, DummyContext())
os.environ.update({
'SNS_ALARMS_TOPIC': 'TestAlarmsTopic',
'PERIOD': '99',
'DURATION_PERCENT_TIMEOUT_THRESHOLD': '47.3',
})

from watch_handler import handle_stack
handle_stack({}, DummyContext())

cw_client = boto3.client('cloudwatch')
f1_lambda_errors_alarm = cw_client.describe_alarms(AlarmNames=['LambdaErrors_TestFunction1'])['MetricAlarms'][0]
Expand Down
86 changes: 52 additions & 34 deletions watch/alarms.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from typing import Iterable, Mapping

import boto3
import os
from functools import partial

from aws_lambda_powertools import Logger
from concurrent import futures

from alarm_config import LambdaAlarmsConfig
from lambda_function import LambdaFunction
from lambdas import get_applicable_lambdas

SNS_ALARMS_TOPIC = os.getenv('SNS_ALARMS_TOPIC')
MAX_PUT_ALARM_CONCURRENCY = 3
Expand All @@ -15,11 +17,15 @@
cloudwatch_client = boto3.client('cloudwatch')


def _create_lambda_errors_alarm(func: LambdaFunction, config: LambdaAlarmsConfig):
""" Create an alarm for Lambda errors """
def _create_lambda_errors_alarm(func: LambdaFunction, config: LambdaAlarmsConfig) -> str:
""" Create an alarm for Lambda errors
:param func The Lambda function for which error alarms are to be created
:param config The alarm configuration
:return The alarm name
"""
alarm_name = f'LambdaErrors_{func.name}'
LOG.info(f'Creating alarm {alarm_name}')
return cloudwatch_client.put_metric_alarm(
LOG.info('Creating errors alarm', extra={'AlarmName': alarm_name})
cloudwatch_client.put_metric_alarm(
AlarmName=alarm_name,
Period=config.period,
EvaluationPeriods=1,
Expand All @@ -33,18 +39,19 @@ def _create_lambda_errors_alarm(func: LambdaFunction, config: LambdaAlarmsConfig
Dimensions=[{'Name': 'FunctionName', 'Value': func.name}],
AlarmActions=[SNS_ALARMS_TOPIC]
)
return alarm_name


def _create_lambda_throttles_alarm(func: LambdaFunction, config: LambdaAlarmsConfig):
def _create_lambda_throttles_alarm(func: LambdaFunction, config: LambdaAlarmsConfig) -> str:
""" Create an alarm on the number of throttles as a percentage
of invocations for a given period
:func_name The Lambda function name
:threshold The minimum percentage of throttles to invocations to raise the alarm
:period The period for evaluation in seconds """
:param func The Lambda function for which the throttles alarm is to be created
:param config The alarm configuration
:return The alarm name """
alarm_name = f'LambdaThrottles_{func.name}'
LOG.info(f'Creating alarm {alarm_name}')
return cloudwatch_client.put_metric_alarm(
LOG.info('Creating throttles alarm', extra={'AlarmName': alarm_name})
cloudwatch_client.put_metric_alarm(
AlarmName=alarm_name,
EvaluationPeriods=1,
DatapointsToAlarm=1,
Expand Down Expand Up @@ -88,13 +95,18 @@ def _create_lambda_throttles_alarm(func: LambdaFunction, config: LambdaAlarmsCon
AlarmDescription=f'Alarm for Lambda {func.name} throttles/invocations',
AlarmActions=[SNS_ALARMS_TOPIC]
)
return alarm_name


def _create_lambda_duration_alarms(func: LambdaFunction, config: LambdaAlarmsConfig):
""" Create an alarm for Lambda duration when it reaches a percentage threshold of the function timeout """
""" Create an alarm for Lambda duration when it reaches a percentage threshold of the function timeout
:param func The Lambda function for which the duration alarm is to be created
:param config The alarm configuration
:return The alarm name """
alarm_name = f'LambdaDuration_{func.name}'
LOG.info(f'Creating alarm {alarm_name}')
return cloudwatch_client.put_metric_alarm(
LOG.info('Creating duration alarm', extra={'AlarmName': alarm_name})
cloudwatch_client.put_metric_alarm(
AlarmName=alarm_name,
Period=config.period,
EvaluationPeriods=1,
Expand All @@ -108,41 +120,47 @@ def _create_lambda_duration_alarms(func: LambdaFunction, config: LambdaAlarmsCon
Dimensions=[{'Name': 'FunctionName', 'Value': func.name}],
AlarmActions=[SNS_ALARMS_TOPIC]
)
return alarm_name


def _create_lambda_alarms(func: LambdaFunction, config: LambdaAlarmsConfig):
_create_lambda_errors_alarm(func, config)
_create_lambda_throttles_alarm(func, config)
_create_lambda_duration_alarms(func, config)
def create_lambda_alarms(func: LambdaFunction, config: LambdaAlarmsConfig):
return [
_create_lambda_errors_alarm(func, config),
_create_lambda_throttles_alarm(func, config),
_create_lambda_duration_alarms(func, config),
]


def update_alarms(config: LambdaAlarmsConfig):
def update_alarms(config: LambdaAlarmsConfig, lambda_functions: Mapping[str, LambdaFunction]) -> Iterable[str]:
""" Create or update alarms for Lambda functions
:param config: The alarm configuration parameters for Lambda functions
:param lambda_functions: The set of Lambda functions for which alarms are to be created
:return Alarm names
"""
lambda_functions = get_applicable_lambdas()
LOG.info(f'Creating alarms for {lambda_functions}')
LOG.info('Creating alarms', extra={'Functions': lambda_functions, 'Count': len(lambda_functions)})

with futures.ThreadPoolExecutor(max_workers=MAX_PUT_ALARM_CONCURRENCY) as executor:
wait_for = [
executor.submit(_create_lambda_alarms, func, config)
for func in lambda_functions.values()
]
alarm_names = executor.map(
partial(create_lambda_alarms, config=config),
lambda_functions.values()
)
return list(alarm_names)

for future in futures.as_completed(wait_for):
LOG.info(future.result())


def delete_alarms(func_name: str):
def delete_alarms(func_name: str) -> Iterable[str]:
""" Delete alarms created by this stack for a specified Lambda function
:param func_name: Lambda function name
:return The deleted alarm names
"""
# DeleteAlarms fails silently if the alarms with any
response = cloudwatch_client.delete_alarms(AlarmNames=[
# TODO - include SLIC Watch indicator in Alarm names to avoid collision
alarm_names = [
f'LambdaErrors_{func_name}',
f'LambdaThrottles_{func_name}',
f'LambdaDuration_{func_name}',
])
LOG.info(response)
f'LambdaDuration_{func_name}'
]
# DeleteAlarms fails silently if alarms with any of these names do not exist
cloudwatch_client.delete_alarms(AlarmNames=alarm_names)
return alarm_names
6 changes: 4 additions & 2 deletions watch/dashboard.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import json
from typing import Mapping

from aws_lambda_powertools import Logger
import boto3

import dashboard_lambda
from lambda_function import LambdaFunction

DASHBOARD_PERIOD = '-PT3H'
WIDGET_WIDTH = 24
Expand All @@ -15,8 +17,8 @@
cw_client = boto3.client('cloudwatch')


def update_dashboard():
widgets = dashboard_lambda.get_widgets()
def update_dashboard(lambda_functions: Mapping[str, LambdaFunction]):
widgets = dashboard_lambda.get_widgets(lambda_functions)
lay_out_widgets(widgets)
dash = {
'start': DASHBOARD_PERIOD,
Expand Down

0 comments on commit 945bd8a

Please sign in to comment.