diff --git a/events/stack-delete-event.json b/events/stack-delete-event.json new file mode 100644 index 00000000..872c12a8 --- /dev/null +++ b/events/stack-delete-event.json @@ -0,0 +1 @@ +{"Action": "Delete"} diff --git a/template.yaml b/template.yaml index f0441862..55beed10 100644 --- a/template.yaml +++ b/template.yaml @@ -1,6 +1,8 @@ AWSTemplateFormatVersion: '2010-09-09' Transform: AWS::Serverless-2016-10-31 Description: > + slic-watch + Automatic CloudWatch metrics, alarms and dashboards Metadata: @@ -21,9 +23,8 @@ Parameters: Type: String Description: > A comma-separated list of AWS services to watch. - All supported services are included by default - (`lambda,kinesis,sqs,dynamodb`). - Default: 'lambda,kinesis,sqs,dynamodb' + All supported services are included by default (`lambda`). + Default: 'lambda' TagFilter: Type: String Description: > @@ -55,14 +56,50 @@ Globals: POWERTOOLS_SERVICE_NAME: slicwatch LOG_LEVEL: INFO SNS_ALARMS_TOPIC: !Ref SNSAlarmsTopic + LAMBDA_THROTTLES_RATE_THRESHOLD: !Ref LambdaThrottlesRateThreshold + LAMBDA_ERRORS_PERIOD: !Ref LambdaErrorsPeriod + LAMBDA_ERRORS_THRESHOLD: !Ref LambdaErrorsThreshold + TAG_FILTER: !Ref TagFilter + WATCH_SERVICES: !Ref WatchServices Resources: - WatchExistingResources: + HandleStack: Type: AWS::Serverless::Function Properties: - Description: Find existing resources and create or update CloudWatch alarms, metrics and dashboards + Description: Create CloudWatch alarms and dashboards in response to CloudFormation update of this stack CodeUri: watch/ - Handler: watch_handler.watch_existing + Handler: watch_handler.handle_stack + Runtime: python3.8 + Timeout: 30 + Policies: + - Statement: + Effect: Allow + Action: + - cloudwatch:ListMetrics + - cloudwatch:PutDashboard + - cloudwatch:PutMetricAlarm + - cloudwatch:PutMetricData + Resource: '*' + - Statement: + Effect: Allow + Action: + - lambda:GetFunction + - lambda:ListFunctions + Resource: '*' + + HandleStackLogGroup: + Type: AWS::Logs::LogGroup + DeletionPolicy: Retain + Properties: + LogGroupName: !Sub /aws/lambda/${HandleStack} + RetentionInDays: 7 + + HandleFunctionCreateUpdate: + Type: AWS::Serverless::Function + Properties: + Description: Create CloudWatch alarms and dashboards in response to Lambda function create or update + CodeUri: watch/ + Handler: watch_handler.handle_function_create_updates Runtime: python3.8 Timeout: 30 Policies: @@ -94,13 +131,53 @@ Resources: - lambda.amazonaws.com eventName: - prefix: CreateFunction + - prefix: UpdateFunctionConfiguration + + HandleFunctionCreateUpdateLogGroup: + Type: AWS::Logs::LogGroup + DeletionPolicy: Retain + Properties: + LogGroupName: !Sub /aws/lambda/${HandleFunctionCreateUpdate} + RetentionInDays: 7 + + HandleFunctionDelete: + Type: AWS::Serverless::Function + Properties: + Description: Delete CloudWatch alarms and dashboards in response to Lambda function delete + CodeUri: watch/ + Handler: watch_handler.handle_function_delete + Runtime: python3.8 + Timeout: 30 + Policies: + - Statement: + Effect: Allow + Action: + - cloudwatch:ListMetrics + - cloudwatch:PutDashboard + - cloudwatch:PutMetricAlarm + - cloudwatch:PutMetricData + Resource: '*' + Events: + CloudTrailTrigger: + Type: EventBridgeRule + Properties: + Pattern: + source: + - aws.lambda + detail-type: + - AWS API Call via CloudTrail + detail: + eventSource: + - lambda.amazonaws.com + eventName: - prefix: DeleteFunction + - prefix: UpdateFunctionConfiguration - WatchExistingResourcesLogGroup: + HandleFunctionDeleteLogGroup: Type: AWS::Logs::LogGroup DeletionPolicy: Retain Properties: - LogGroupName: !Sub /aws/lambda/${WatchExistingResources} + LogGroupName: !Sub /aws/lambda/${HandleFunctionDelete} RetentionInDays: 7 DeleteResources: @@ -155,23 +232,33 @@ Resources: ApplicationId: arn:aws:serverlessrepo:us-east-1:374852340823:applications/lambda-invocation-cfn-custom-resource SemanticVersion: 1.4.0 - InvokeWatchExisting: + HandleStackCreate: Type: Custom::LambdaInvocation DependsOn: - - WatchExistingResources + - HandleStack - LambdaInvocationCustomResource Properties: ServiceToken: !GetAtt LambdaInvocationCustomResource.Outputs.FunctionArn - FunctionName: !Ref WatchExistingResources + FunctionName: !Ref HandleStack InvocationType: Event When: - Create - - Update Payload: - WatchServices: !Ref WatchServices - TagFilter: !Ref TagFilter - LambdaErrorsThreshold: !Ref LambdaErrorsThreshold - LambdaErrorsPeriod: !Ref LambdaErrorsPeriod + Action: Create + + HandleStackDelete: + Type: Custom::LambdaInvocation + DependsOn: + - HandleStack + - LambdaInvocationCustomResource + Properties: + ServiceToken: !GetAtt LambdaInvocationCustomResource.Outputs.FunctionArn + FunctionName: !Ref HandleStack + InvocationType: Event + When: + - Delete + Payload: + Action: Delete SNSAlarmsTopic: Type: AWS::SNS::Topic diff --git a/tests/unit/watch/test_alarms.py b/tests/unit/watch/test_alarms.py index 3a2b99cf..8f5f7a00 100644 --- a/tests/unit/watch/test_alarms.py +++ b/tests/unit/watch/test_alarms.py @@ -26,7 +26,7 @@ def test_update_alarms(lambda_functions): throttles_percent_threshold=0.0, duration_percent_timeout_threshold=90.0, ) - update_alarms(config) + update_alarms(config, lambda_functions) cw = boto3.client("cloudwatch") alarms_response = cw.describe_alarms() @@ -85,7 +85,7 @@ def test_delete_alarm(lambda_functions): throttles_percent_threshold=0.0, duration_percent_timeout_threshold=90.0, ) - update_alarms(config) + update_alarms(config, lambda_functions) fn_name = list(lambda_functions.keys())[-1] # Pick a function for which alarms are to be deleted delete_alarms(fn_name) diff --git a/tests/unit/watch/test_dashboard.py b/tests/unit/watch/test_dashboard.py index 65909a52..aa9654c6 100644 --- a/tests/unit/watch/test_dashboard.py +++ b/tests/unit/watch/test_dashboard.py @@ -7,7 +7,7 @@ def test_dashboard(lambda_functions): from dashboard import update_dashboard - update_dashboard() + update_dashboard(lambda_functions) cw = boto3.client("cloudwatch") dash = json.loads(cw.get_dashboard(DashboardName="SLICWatch")["DashboardBody"]) diff --git a/tests/unit/watch/test_watch_handler.py b/tests/unit/watch/test_watch_handler.py index f094159c..6c2c4ec0 100644 --- a/tests/unit/watch/test_watch_handler.py +++ b/tests/unit/watch/test_watch_handler.py @@ -15,14 +15,14 @@ def __init__(self): @mock_cloudwatch def test_handle_event(lambda_functions): - os.environ["SNS_ALARMS_TOPIC"] = "TestAlarmsTopic" - event = { - 'Period': 99, - 'DurationPercentTimeoutThreshold': 47.3 - } - - from watch_handler import watch_existing - watch_existing(event, DummyContext()) + os.environ.update({ + 'SNS_ALARMS_TOPIC': 'TestAlarmsTopic', + 'PERIOD': '99', + 'DURATION_PERCENT_TIMEOUT_THRESHOLD': '47.3', + }) + + from watch_handler import handle_stack + handle_stack({}, DummyContext()) cw_client = boto3.client('cloudwatch') f1_lambda_errors_alarm = cw_client.describe_alarms(AlarmNames=['LambdaErrors_TestFunction1'])['MetricAlarms'][0] diff --git a/watch/alarms.py b/watch/alarms.py index e975636f..52e74559 100644 --- a/watch/alarms.py +++ b/watch/alarms.py @@ -1,12 +1,14 @@ +from typing import Iterable, Mapping + import boto3 import os +from functools import partial from aws_lambda_powertools import Logger from concurrent import futures from alarm_config import LambdaAlarmsConfig from lambda_function import LambdaFunction -from lambdas import get_applicable_lambdas SNS_ALARMS_TOPIC = os.getenv('SNS_ALARMS_TOPIC') MAX_PUT_ALARM_CONCURRENCY = 3 @@ -15,11 +17,15 @@ cloudwatch_client = boto3.client('cloudwatch') -def _create_lambda_errors_alarm(func: LambdaFunction, config: LambdaAlarmsConfig): - """ Create an alarm for Lambda errors """ +def _create_lambda_errors_alarm(func: LambdaFunction, config: LambdaAlarmsConfig) -> str: + """ Create an alarm for Lambda errors + :param func The Lambda function for which error alarms are to be created + :param config The alarm configuration + :return The alarm name + """ alarm_name = f'LambdaErrors_{func.name}' - LOG.info(f'Creating alarm {alarm_name}') - return cloudwatch_client.put_metric_alarm( + LOG.info('Creating errors alarm', extra={'AlarmName': alarm_name}) + cloudwatch_client.put_metric_alarm( AlarmName=alarm_name, Period=config.period, EvaluationPeriods=1, @@ -33,18 +39,19 @@ def _create_lambda_errors_alarm(func: LambdaFunction, config: LambdaAlarmsConfig Dimensions=[{'Name': 'FunctionName', 'Value': func.name}], AlarmActions=[SNS_ALARMS_TOPIC] ) + return alarm_name -def _create_lambda_throttles_alarm(func: LambdaFunction, config: LambdaAlarmsConfig): +def _create_lambda_throttles_alarm(func: LambdaFunction, config: LambdaAlarmsConfig) -> str: """ Create an alarm on the number of throttles as a percentage of invocations for a given period - :func_name The Lambda function name - :threshold The minimum percentage of throttles to invocations to raise the alarm - :period The period for evaluation in seconds """ + :param func The Lambda function for which error alarms are to be created + :param config The alarm configuration + :return The alarm name """ alarm_name = f'LambdaThrottles_{func.name}' - LOG.info(f'Creating alarm {alarm_name}') - return cloudwatch_client.put_metric_alarm( + LOG.info('Creating throttles alarm', extra={'AlarmName': alarm_name}) + cloudwatch_client.put_metric_alarm( AlarmName=alarm_name, EvaluationPeriods=1, DatapointsToAlarm=1, @@ -88,13 +95,18 @@ def _create_lambda_throttles_alarm(func: LambdaFunction, config: LambdaAlarmsCon AlarmDescription=f'Alarm for Lambda {func.name} throttles/invocations', AlarmActions=[SNS_ALARMS_TOPIC] ) + return alarm_name def _create_lambda_duration_alarms(func: LambdaFunction, config: LambdaAlarmsConfig): - """ Create an alarm for Lambda duration when it reaches a percentage threshold of the function timeout """ + """ Create an alarm for Lambda duration when it reaches a percentage threshold of the function timeout + + :param func The Lambda function for which error alarms are to be created + :param config The alarm configuration + :return The alarm name """ alarm_name = f'LambdaDuration_{func.name}' - LOG.info(f'Creating alarm {alarm_name}') - return cloudwatch_client.put_metric_alarm( + LOG.info('Creating duration alarm', extra={'AlarmName': alarm_name}) + cloudwatch_client.put_metric_alarm( AlarmName=alarm_name, Period=config.period, EvaluationPeriods=1, @@ -108,41 +120,47 @@ def _create_lambda_duration_alarms(func: LambdaFunction, config: LambdaAlarmsCon Dimensions=[{'Name': 'FunctionName', 'Value': func.name}], AlarmActions=[SNS_ALARMS_TOPIC] ) + return alarm_name -def _create_lambda_alarms(func: LambdaFunction, config: LambdaAlarmsConfig): - _create_lambda_errors_alarm(func, config) - _create_lambda_throttles_alarm(func, config) - _create_lambda_duration_alarms(func, config) +def create_lambda_alarms(func: LambdaFunction, config: LambdaAlarmsConfig): + return [ + _create_lambda_errors_alarm(func, config), + _create_lambda_throttles_alarm(func, config), + _create_lambda_duration_alarms(func, config), + ] -def update_alarms(config: LambdaAlarmsConfig): +def update_alarms(config: LambdaAlarmsConfig, lambda_functions: Mapping[str, LambdaFunction]) -> Iterable[str]: """ Create or update alarms for Lambda functions :param config: The alarm configuration parameters for Lambda functions + :param lambda_functions: The set of Lambda functions for which alarms are to be created + + :return Alarm names """ - lambda_functions = get_applicable_lambdas() - LOG.info(f'Creating alarms for {lambda_functions}') + LOG.info('Creating alarms', extra={'Functions': lambda_functions, 'Count': len(lambda_functions)}) with futures.ThreadPoolExecutor(max_workers=MAX_PUT_ALARM_CONCURRENCY) as executor: - wait_for = [ - executor.submit(_create_lambda_alarms, func, config) - for func in lambda_functions.values() - ] + alarm_names = executor.map( + partial(create_lambda_alarms, config=config), + lambda_functions.values() + ) + return list(alarm_names) - for future in futures.as_completed(wait_for): - LOG.info(future.result()) - -def delete_alarms(func_name: str): +def delete_alarms(func_name: str) -> Iterable[str]: """ Delete alarms created by this stack for a specified Lambda function :param func_name: Lambda function name + :return The deleted alarm names """ - # DeleteAlarms fails silently if the alarms with any - response = cloudwatch_client.delete_alarms(AlarmNames=[ + # TODO - include SLIC Watch indicator in Alarm names to avoid collision + alarm_names = [ f'LambdaErrors_{func_name}', f'LambdaThrottles_{func_name}', - f'LambdaDuration_{func_name}', - ]) - LOG.info(response) + f'LambdaDuration_{func_name}' + ] + # DeleteAlarms fails silently if the alarms with any + cloudwatch_client.delete_alarms(AlarmNames=alarm_names) + return alarm_names diff --git a/watch/dashboard.py b/watch/dashboard.py index 9708a2c0..df4e4d2f 100644 --- a/watch/dashboard.py +++ b/watch/dashboard.py @@ -1,9 +1,11 @@ import json +from typing import Mapping from aws_lambda_powertools import Logger import boto3 import dashboard_lambda +from lambda_function import LambdaFunction DASHBOARD_PERIOD = '-PT3H' WIDGET_WIDTH = 24 @@ -15,8 +17,8 @@ cw_client = boto3.client('cloudwatch') -def update_dashboard(): - widgets = dashboard_lambda.get_widgets() +def update_dashboard(lambda_functions: Mapping[str, LambdaFunction]): + widgets = dashboard_lambda.get_widgets(lambda_functions) lay_out_widgets(widgets) dash = { 'start': DASHBOARD_PERIOD, diff --git a/watch/dashboard_lambda.py b/watch/dashboard_lambda.py index c5734892..09dde432 100644 --- a/watch/dashboard_lambda.py +++ b/watch/dashboard_lambda.py @@ -1,4 +1,6 @@ -from lambdas import get_applicable_lambdas +from typing import Mapping + +from lambda_function import LambdaFunction from widget import create_metric_widget LAMBDA_FUNCTION_METRICS = [('Duration', 'Average'), ('Duration', 'p95'), @@ -33,6 +35,5 @@ def create_function_widgets(lambda_functions: list): return widgets -def get_widgets(): - lambda_functions = get_applicable_lambdas() +def get_widgets(lambda_functions: Mapping[str, LambdaFunction]): return [create_all_functions_widget()] + create_function_widgets(lambda_functions) diff --git a/watch/lambdas.py b/watch/lambdas.py index bbe2b0e4..65d505a9 100644 --- a/watch/lambdas.py +++ b/watch/lambdas.py @@ -2,17 +2,24 @@ import boto3 +from aws_lambda_powertools import Logger from lambda_function import LambdaFunction lambda_client = boto3.client('lambda') +LOG = Logger() +MAX_FUNCTIONS = 50 # Avoid creating too many resources -def get_applicable_lambdas(): + +def get_applicable_lambdas() -> Mapping[str, LambdaFunction]: """ Find all Lambdas and group them by tag so we can do tag filtering """ functions: Mapping[str: LambdaFunction] = {} # Moto still returns duplicates so we use the dict response to deduplicate for response in lambda_client.get_paginator('list_functions').paginate(): for function in response['Functions']: + if len(functions) == MAX_FUNCTIONS: + LOG.warning(f'Maximum number of Lambda functions ({MAX_FUNCTIONS})' + f' reached. Additional functions will be ignored.') name = function['FunctionName'] func_response = lambda_client.get_function(FunctionName=name) config = func_response['Configuration'] diff --git a/watch/requirements-dev.txt b/watch/requirements-dev.txt index 51510b94..ced7e999 100644 --- a/watch/requirements-dev.txt +++ b/watch/requirements-dev.txt @@ -1,5 +1,4 @@ -#moto>=1.3,<1.4 -git+https://github.com/eoinsha/moto.git@master#egg=moto +moto==1.3.17.dev230 # Awaiting full release https://github.com/spulec/moto/pull/3419#issuecomment-770359267 pytest>=6,<7 coveralls>=2.1,<3 docker>=4.3<5 diff --git a/watch/watch_handler.py b/watch/watch_handler.py index 3ec2d304..7ea92cff 100644 --- a/watch/watch_handler.py +++ b/watch/watch_handler.py @@ -1,34 +1,60 @@ """ Lambda handler module invoked during stack create and update. -Adds monitoring and alarms for new/updated resouces +Adds monitoring and alarms for new/updated resources """ +import os from aws_lambda_powertools import Logger import stringcase from alarm_config import LambdaAlarmsConfig from dashboard import update_dashboard -from alarms import update_alarms +from alarms import update_alarms, delete_alarms, create_lambda_alarms +from lambda_function import LambdaFunction +from lambdas import get_applicable_lambdas LOG = Logger() +def _get_alarm_config() -> LambdaAlarmsConfig: + """ Get alarm configuration from CloudFormation stack parameters passed as environment """ + alarm_config_fields = {} + for field_name, field in LambdaAlarmsConfig.__dataclass_fields__.items(): + env_name = stringcase.uppercase(field_name) + if env_name in os.environ: + alarm_config_fields[field_name] = field.type(os.environ[env_name]) + + return LambdaAlarmsConfig(**alarm_config_fields) + + @LOG.inject_lambda_context -def watch_existing(event, _): - LOG.info({'event': event}) - """ Handle create/update/delete of CloudFormation stacks """ - # services = event.get('WatchServices' - # tag_filter = event['TagFilter'] +def handle_stack(_event, _context): + """ Handle create/update of CloudFormation stacks """ + config = _get_alarm_config() + lambda_functions = get_applicable_lambdas() + update_dashboard(lambda_functions) + alarm_names = update_alarms(config, lambda_functions) - update_dashboard() + return alarm_names - alarm_config_fields = {} - for field_name, field in LambdaAlarmsConfig.__dataclass_fields__.items(): - param_name = stringcase.pascalcase(field_name) - if param_name in event: - alarm_config_fields[field_name] = field.type(event[param_name]) - config = LambdaAlarmsConfig(**alarm_config_fields) - update_alarms(config) +@LOG.inject_lambda_context +def handle_function_create_update(event, _): + """ Handle FunctionCreate or UpdateFunctionConfiguration via AWS CloudTrail via EventBridge """ + req = event['detail']['requestParameters'] + func = LambdaFunction( + func_name=req['functionName'], + runtime=req['runtime'], + timeout=req['timeout'], + memory_size=req['memorySize'], + tags=req['tags'] + ) + config = _get_alarm_config() + create_lambda_alarms(func, config) - return {} + +@LOG.inject_lambda_context +def handle_function_delete(event, _): + """ Handle FunctionDelete via AWS CloudTrail via EventBridge """ + func_name = event['detail']['requestParameters']['functionName'] + delete_alarms(func_name)