Merged

35 commits
85e69c2
Add pipeline to cleanup cloud resources
mrodm Feb 21, 2024
7a21eca
Test in main pipeline
mrodm Feb 21, 2024
95b0cb2
Update secrets in pre-hook
mrodm Feb 22, 2024
bf30001
Enable filter by date
mrodm Feb 22, 2024
8beb942
Fix field name
mrodm Feb 22, 2024
b4d1b9c
Add missing backslash
mrodm Feb 22, 2024
b5bcb16
Add environment variable for docker registry
mrodm Feb 22, 2024
c96510b
Add docker registry var for test step
mrodm Feb 22, 2024
2f50da8
Set the correct paths for configs
mrodm Feb 22, 2024
edebdea
Skip filter by date in aws nodes
mrodm Feb 22, 2024
68f9591
Remove "value" from converters section
mrodm Feb 22, 2024
215fea6
Add SQS queues
mrodm Feb 22, 2024
5322187
Add state filter for nodes
mrodm Feb 22, 2024
45107fd
Add environment filter for nodes gcp
mrodm Feb 22, 2024
d7d47d4
Rename pipeline file
mrodm Feb 22, 2024
cc96647
Add email notification - currently disabled
mrodm Feb 22, 2024
d548a7b
Install awscli
mrodm Feb 22, 2024
890a985
Set quiet for unzip
mrodm Feb 22, 2024
214a2c7
Add validation commands
mrodm Feb 22, 2024
30e551f
Show errors
mrodm Feb 23, 2024
9632fb2
Fix url to download aws binary
mrodm Feb 23, 2024
4b92dc0
Set variables for period
mrodm Feb 23, 2024
92879a3
Refactor functions and rename variable for date
mrodm Feb 23, 2024
d06503c
Update collapsed section pending
mrodm Feb 23, 2024
cbba05d
Update retention period values
mrodm Feb 23, 2024
13ad1c5
Add dry-run option
mrodm Feb 23, 2024
3fd3eb7
Fix docker image name
mrodm Feb 23, 2024
68c6ff1
Restore all steps in main pipeline
mrodm Feb 23, 2024
27a920d
Merge remote-tracking branch 'upstream/main' into add_daily_cleanup_job
mrodm Mar 4, 2024
8617e65
Merge remote-tracking branch 'upstream/main' into add_daily_cleanup_job
mrodm Mar 4, 2024
b1026b0
Remove debug leftovers
mrodm Mar 5, 2024
bbe1265
Keep same name for resources to delete var
mrodm Mar 5, 2024
6889361
Fix condition to check stale resources in AWS
mrodm Mar 5, 2024
cc0ffaa
Add echo messages
mrodm Mar 5, 2024
5138a38
Just take into account lines starting with line 4
mrodm Mar 5, 2024
75 changes: 75 additions & 0 deletions .buildkite/configs/cleanup.aws.yml
@@ -0,0 +1,75 @@
---
version: "1.0"

accounts:
  - name: "${ACCOUNT_PROJECT}"
    driver: "aws"
    options:
      key: '${ACCOUNT_KEY}'
      secret: '${ACCOUNT_SECRET}'

scanners:
  - account_name: "${ACCOUNT_PROJECT}"
    resources:
      - type: 'node'
        regions:
          - us-east-1
        filters:
          - type: "<"
            pointer: "/created_at"
            param: "${CREATION_DATE}"
            converters:
              param: "date"
          - type: "regex"
            pointer: "/extra/tags/repo"
            param: "^(elastic-package|integrations)"
          - type: "="
            pointer: "/extra/tags/environment"
            param: "ci"
          - type: "regex"
            pointer: "/name"
            param: "^elastic-package-(.*)"
          - type: "!="
            pointer: "/state"
            param: "unknown"
          - type: "!="
            pointer: "/state"
            param: "terminated"
      - type: 'object_storage_bucket'
        regions:
          - us-east-1
        filters:
          - type: "<"
            pointer: "/created_at"
            param: "${CREATION_DATE}"
            converters:
              param: "date"
              value: "date"
          - type: "regex"
            pointer: "/extra/tags/repo"
            param: "^(elastic-package|integrations)"
          - type: "="
            pointer: "/extra/tags/environment"
            param: "ci"
          - type: "regex"
            pointer: "/name"
            param: "^elastic-package-(.*)"
      - type: 'queue'
        regions:
          - us-east-1
        filters:
          - type: "<"
            pointer: "/extra/tags/created_at"
            param: "${CREATION_DATE}"
            converters:
              param: "date"
              value: "date_epoch_ms"
          - type: "regex"
            pointer: "/extra/tags/repo"
            param: "^(elastic-package|integrations)"
          - type: "="
            pointer: "/extra/tags/environment"
            param: "ci"
          - type: "regex"
            pointer: "/id"
            param: "^https://(.*)/elastic-package-(.*)"
40 changes: 40 additions & 0 deletions .buildkite/configs/cleanup.gcp.yml
@@ -0,0 +1,40 @@
---
version: "1.0"

accounts:
  - name: "${ACCOUNT_PROJECT}"
    driver: "gce"
    options:
      key: "${ACCOUNT_KEY}"
      secret: "${ACCOUNT_SECRET}"
      project: "${ACCOUNT_PROJECT}"

scanners:
  - account_name: "${ACCOUNT_PROJECT}"
    resources:
      - type: "node"
        regions:
          - "us-east1"
        filters:
          - type: "<"
            pointer: "/extra/creationTimestamp"
            param: "${CREATION_DATE}"
            converters:
              param: "date"
              value: "date"
          - type: "="
            pointer: "/extra/labels/repo"
            param: "elastic-package"
          - type: "="
            pointer: "/extra/labels/environment"
            param: "ci"
          - type: "regex"
            pointer: "/name"
            param: "^elastic-package-(.*)"
          - type: "!="
            pointer: "/state"
            param: "unknown"
          - type: "!="
            pointer: "/state"
            param: "terminated"

23 changes: 21 additions & 2 deletions .buildkite/hooks/pre-command
@@ -47,15 +47,15 @@ export CREATED_DATE
# https://buildkite.com/docs/pipelines/managing-log-output#redacted-environment-variables

if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-package" && ("$BUILDKITE_STEP_KEY" =~ ^integration-parallel || "$BUILDKITE_STEP_KEY" =~ ^integration-false_positives) ]]; then
PRIVATE_CI_GCS_CREDENTIALS_SECRET=$(retry 5 vault kv get -field plaintext -format=json ${PRIVATE_CI_GCS_CREDENTIALS_PATH})
PRIVATE_CI_GCS_CREDENTIALS_SECRET=$(retry 5 vault kv get -field plaintext -format=json ${PRIVATE_CI_GCS_CREDENTIALS_PATH} | jq -c)
export PRIVATE_CI_GCS_CREDENTIALS_SECRET
export JOB_GCS_BUCKET_INTERNAL="ingest-buildkite-ci"
fi

if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-package" && "$BUILDKITE_STEP_KEY" == "integration-parallel-gcp" ]]; then
ELASTIC_PACKAGE_GCP_PROJECT_SECRET=$(retry 5 vault read -field projectId ${GCP_SERVICE_ACCOUNT_SECRET_PATH})
export ELASTIC_PACKAGE_GCP_PROJECT_SECRET
ELASTIC_PACKAGE_GCP_CREDENTIALS_SECRET=$(retry 5 vault read -field credentials ${GCP_SERVICE_ACCOUNT_SECRET_PATH})
ELASTIC_PACKAGE_GCP_CREDENTIALS_SECRET=$(retry 5 vault read -field credentials ${GCP_SERVICE_ACCOUNT_SECRET_PATH} | jq -c)
export ELASTIC_PACKAGE_GCP_CREDENTIALS_SECRET

# Environment variables required by the service deployer
@@ -87,3 +87,22 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-package-test-with-integrations" &&
    GITHUB_TOKEN=$(retry 5 vault kv get -field token ${GITHUB_TOKEN_VAULT_PATH})
    export GITHUB_TOKEN
fi

if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-package-cloud-cleanup" && "$BUILDKITE_STEP_KEY" == "cloud-cleanup" ]]; then
ELASTIC_PACKAGE_AWS_SECRET_KEY=$(retry 5 vault kv get -field secret_key ${AWS_SERVICE_ACCOUNT_SECRET_PATH})
export ELASTIC_PACKAGE_AWS_SECRET_KEY
ELASTIC_PACKAGE_AWS_ACCESS_KEY=$(retry 5 vault kv get -field access_key ${AWS_SERVICE_ACCOUNT_SECRET_PATH})
export ELASTIC_PACKAGE_AWS_ACCESS_KEY
ELASTIC_PACKAGE_AWS_USER_SECRET=$(retry 5 vault kv get -field user ${AWS_SERVICE_ACCOUNT_SECRET_PATH})
export ELASTIC_PACKAGE_AWS_USER_SECRET

ELASTIC_PACKAGE_GCP_CREDENTIALS_SECRET=$(retry 5 vault read -field credentials ${GCP_SERVICE_ACCOUNT_SECRET_PATH} | jq -c)
export ELASTIC_PACKAGE_GCP_CREDENTIALS_SECRET
ELASTIC_PACKAGE_GCP_KEY_SECRET=$(echo "${ELASTIC_PACKAGE_GCP_CREDENTIALS_SECRET}" | jq -r '.private_key' | tr -d '\n')
export ELASTIC_PACKAGE_GCP_KEY_SECRET
ELASTIC_PACKAGE_GCP_PROJECT_SECRET=$(retry 5 vault read -field projectId ${GCP_SERVICE_ACCOUNT_SECRET_PATH})
export ELASTIC_PACKAGE_GCP_PROJECT_SECRET
ELASTIC_PACKAGE_GCP_EMAIL_SECRET=$(retry 5 vault read -field username ${GCP_SERVICE_ACCOUNT_SECRET_PATH})
export ELASTIC_PACKAGE_GCP_EMAIL_SECRET
fi

1 change: 1 addition & 0 deletions .buildkite/hooks/pre-exit
@@ -15,3 +15,4 @@ unset ELASTIC_PACKAGE_AWS_ACCESS_KEY
unset ELASTIC_PACKAGE_AWS_SECRET_KEY
unset AWS_ACCESS_KEY_ID
unset AWS_SECRET_ACCESS_KEY

22 changes: 22 additions & 0 deletions .buildkite/pipeline.cloud-cleanup.yml
@@ -0,0 +1,22 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/buildkite/pipeline-schema/main/schema.json

# Removes stale Cloud resources (AWS and GCP) that match the expected labels and name prefixes and are older than 24 hours
name: elastic-package-cloud-cleanup

env:
  DOCKER_REGISTRY: docker.elastic.co
  NOTIFY_TO: "ecosystem-team@elastic.co"

steps:
  - label: "Cloud Cleanup"
    key: "cloud-cleanup"
    command: ".buildkite/scripts/cloud-cleanup.sh"
    env:
      RESOURCE_RETENTION_PERIOD: "24 hours"
      DRY_RUN: "true"
    agents:
      provider: "gcp"

notify:
  - email: "$NOTIFY_TO"
    if: "build.state == 'failed' && build.env('BUILDKITE_PULL_REQUEST') == 'false'"
126 changes: 126 additions & 0 deletions .buildkite/scripts/cloud-cleanup.sh
@@ -0,0 +1,126 @@
#!/usr/bin/env bash

source .buildkite/scripts/install_deps.sh

set -euo pipefail

AWS_RESOURCES_FILE="aws.resources.txt"
GCP_RESOURCES_FILE="gcp.resources.txt"

RESOURCE_RETENTION_PERIOD="${RESOURCE_RETENTION_PERIOD:-"24 hours"}"
export DELETE_RESOURCES_BEFORE_DATE=$(date -Is -d "${RESOURCE_RETENTION_PERIOD} ago")

CLOUD_REAPER_IMAGE="${DOCKER_REGISTRY}/observability-ci/cloud-reaper:0.3.0"

resources_to_delete=0

COMMAND="validate"
if [[ "${DRY_RUN}" != "true" ]]; then
    COMMAND="plan" # TODO: to be changed to "destroy --confirm"
else
    COMMAND="plan"
fi

any_resources_to_delete() {
    local file=$1
    local number=0
    # First three lines are like:
    # ⇒ Loading configuration...
    # ✓ Succeeded to load configuration
    # Scanning resources... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
    number=$(tail -n +4 "${file}" | wc -l)
    if [ "${number}" -eq 0 ]; then
        return 1
    fi
    return 0
}

cloud_reaper_aws() {
    echo "Validating configuration"
    docker run --rm -v $(pwd)/.buildkite/configs/cleanup.aws.yml:/etc/cloud-reaper/config.yml \
        -e ACCOUNT_SECRET="${ELASTIC_PACKAGE_AWS_SECRET_KEY}" \
        -e ACCOUNT_KEY="${ELASTIC_PACKAGE_AWS_ACCESS_KEY}" \
        -e ACCOUNT_PROJECT="${ELASTIC_PACKAGE_AWS_USER_SECRET}" \
        -e CREATION_DATE="${DELETE_RESOURCES_BEFORE_DATE}" \
        "${CLOUD_REAPER_IMAGE}" \
        cloud-reaper \
        --config /etc/cloud-reaper/config.yml \
        validate

    echo "Scanning resources"
    docker run --rm -v $(pwd)/.buildkite/configs/cleanup.aws.yml:/etc/cloud-reaper/config.yml \
        -e ACCOUNT_SECRET="${ELASTIC_PACKAGE_AWS_SECRET_KEY}" \
        -e ACCOUNT_KEY="${ELASTIC_PACKAGE_AWS_ACCESS_KEY}" \
        -e ACCOUNT_PROJECT="${ELASTIC_PACKAGE_AWS_USER_SECRET}" \
        -e CREATION_DATE="${DELETE_RESOURCES_BEFORE_DATE}" \
        "${CLOUD_REAPER_IMAGE}" \
        cloud-reaper \
        --config /etc/cloud-reaper/config.yml \
        ${COMMAND} | tee "${AWS_RESOURCES_FILE}"
}

cloud_reaper_gcp() {
    echo "Validating configuration"
    docker run --rm -v $(pwd)/.buildkite/configs/cleanup.gcp.yml:/etc/cloud-reaper/config.yml \
        -e ACCOUNT_SECRET="${ELASTIC_PACKAGE_GCP_KEY_SECRET}" \
        -e ACCOUNT_KEY="${ELASTIC_PACKAGE_GCP_EMAIL_SECRET}" \
        -e ACCOUNT_PROJECT="${ELASTIC_PACKAGE_GCP_PROJECT_SECRET}" \
        -e CREATION_DATE="${DELETE_RESOURCES_BEFORE_DATE}" \
        "${CLOUD_REAPER_IMAGE}" \
        cloud-reaper \
        --config /etc/cloud-reaper/config.yml \
        validate

    echo "Scanning resources"
    docker run --rm -v $(pwd)/.buildkite/configs/cleanup.gcp.yml:/etc/cloud-reaper/config.yml \
        -e ACCOUNT_SECRET="${ELASTIC_PACKAGE_GCP_KEY_SECRET}" \
        -e ACCOUNT_KEY="${ELASTIC_PACKAGE_GCP_EMAIL_SECRET}" \
        -e ACCOUNT_PROJECT="${ELASTIC_PACKAGE_GCP_PROJECT_SECRET}" \
        -e CREATION_DATE="${DELETE_RESOURCES_BEFORE_DATE}" \
        "${CLOUD_REAPER_IMAGE}" \
        cloud-reaper \
        --config /etc/cloud-reaper/config.yml \
        ${COMMAND} | tee "${GCP_RESOURCES_FILE}"
}

echo "--- Cleaning up GCP resources older than ${DELETE_RESOURCES_BEFORE_DATE}..."
cloud_reaper_gcp

if any_resources_to_delete "${GCP_RESOURCES_FILE}"; then
    echo "Pending GCP resources"
    resources_to_delete=1
fi

echo "--- Cleaning up AWS resources older than ${DELETE_RESOURCES_BEFORE_DATE}..."
cloud_reaper_aws

if any_resources_to_delete "${AWS_RESOURCES_FILE}" ; then
    echo "Pending AWS resources"
    resources_to_delete=1
fi

if [ "${resources_to_delete}" -eq 1 ]; then
    message="There are resources to be deleted"
    echo "${message}"
    if running_on_buildkite ; then
        buildkite-agent annotate \
            "${message}" \
            --context "ctx-cloud-reaper-error" \
            --style "error"
    fi
    exit 1
fi

# TODO: List and delete the required resources using aws cli
echo "--- Cleaning up other AWS resources older than ${DELETE_RESOURCES_BEFORE_DATE}"
echo "--- Installing awscli"
with_aws_cli


What is the deadline? So we can add support for other resources in cloud-reaper?

Contributor Author

There is no hard deadline for this. We were planning to merge this PR to start checking (with the plan command) whether there is any stale resource among the resource types currently supported by cloud-reaper.

We were thinking of adding, in follow-up PRs, at least listing and filtering by tags for the other resources using the aws CLI directly, starting with Redshift clusters. But we would prefer to do that through cloud-reaper too, to keep the whole cleanup process in the same tool.

Should we wait for cloud-reaper support for the other resources instead? Or should we add the Redshift listing/filtering with the aws CLI for now, plus some notification? Those Redshift clusters were the resources that were left behind in some builds @jsoriano @kpollich


If it can wait for some time, I should be able to add support for those resources.

Contributor Author

> I should be able to add support for those resources.
Thanks @amannocci !

I think probably we could wait for some time, WDYT @jsoriano @kpollich ?

I've just created a PR to check whether any AWS Redshift cluster is left stale by CI builds. That would let us receive email notifications if any cluster is left behind, until that support is added to cloud-reaper:
#1710

Once this support is added to cloud-reaper, this logic can certainly be moved there, so the cleanup of all resources is carried out with the same tool.

In any case, this PR could be merged so we can start checking that no resources such as instances or queues are left behind.

Member

> I think probably we could wait for some time, WDYT @jsoriano @kpollich ?

I think we can go for now with the supported resources and add the missing ones later.

> In any case, this PR could be merged so we can start checking that no resources such as instances or queues are left behind.

+1


export AWS_ACCESS_KEY_ID="${ELASTIC_PACKAGE_AWS_ACCESS_KEY}"
export AWS_SECRET_ACCESS_KEY="${ELASTIC_PACKAGE_AWS_SECRET_KEY}"
export AWS_DEFAULT_REGION=us-east-1

echo "--- TODO: Cleaning up Redshift clusters"
echo "--- TODO: Cleaning up IAM roles"
echo "--- TODO: Cleaning up IAM policies"
echo "--- TODO: Cleaning up Schedulers"
14 changes: 14 additions & 0 deletions .buildkite/scripts/install_deps.sh
@@ -147,3 +147,17 @@ with_jq() {
    chmod +x "${WORKSPACE}/bin/jq"
    jq --version
}

with_aws_cli() {
    check_platform_architecture

    if ! which aws; then
        curl -s "https://awscli.amazonaws.com/awscli-exe-${platform_type_lowercase}-${hw_type}.zip" -o "awscliv2.zip"
        unzip -q awscliv2.zip
        sudo ./aws/install
        rm -rf awscliv2.zip aws
        aws --version
        return
    fi
    echo "\"aws\" already installed"
}
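
The `check_platform_architecture` helper (defined elsewhere in install_deps.sh) is expected to set the `platform_type_lowercase` and `hw_type` variables interpolated into the download URL above. As an assumed example of what that resolves to on a typical Linux x86_64 CI agent:

```bash
# Assumed example values; the real ones come from check_platform_architecture.
platform_type_lowercase="linux"
hw_type="x86_64"
echo "https://awscli.amazonaws.com/awscli-exe-${platform_type_lowercase}-${hw_type}.zip"
# -> https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip
```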
7 changes: 7 additions & 0 deletions .buildkite/scripts/tooling.sh
@@ -67,3 +67,10 @@ google_cloud_logout_active_account() {
        unset GOOGLE_APPLICATION_CREDENTIALS
    fi
}

running_on_buildkite() {
    if [[ "${BUILDKITE:-"false"}" == "true" ]]; then
        return 0
    fi
    return 1
}
47 changes: 47 additions & 0 deletions catalog-info.yaml
@@ -108,3 +108,50 @@ spec:
          access_level: MANAGE_BUILD_AND_READ
        everyone:
          access_level: READ_ONLY

---
# yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/e57ee3bed7a6f73077a3f55a38e76e40ec87a7cf/rre.schema.json
apiVersion: backstage.io/v1alpha1
kind: Resource
metadata:
  name: buildkite-pipeline-elastic-package-cloud-cleanup
  description: Clean up stale cloud resources
  links:
    - title: Pipeline
      url: https://buildkite.com/elastic/elastic-package-cloud-cleanup

spec:
  type: buildkite-pipeline
  owner: group:ingest-fp
  system: buildkite
  implementation:
    apiVersion: buildkite.elastic.dev/v1
    kind: Pipeline
    metadata:
      name: elastic-package-cloud-cleanup
      description: Buildkite pipeline for cleaning stale resources in cloud providers
    spec:
      pipeline_file: ".buildkite/pipeline.cloud-cleanup.yml"
      provider_settings:
        build_pull_request_forks: false
        build_pull_requests: false # requires filter_enabled and filter_condition settings as below when used with buildkite-pr-bot
        publish_commit_status: false # do not update status of commits for this pipeline
        build_tags: false
        build_branches: false
        filter_enabled: true
        filter_condition: >-
          build.pull_request.id == null || (build.creator.name == 'elasticmachine' && build.pull_request.id != null)
      cancel_intermediate_builds: false # do not cancel any build to avoid inconsistent states
      skip_intermediate_builds: true # just need to run the latest commit
      repository: elastic/elastic-package
      schedules:
        Daily main:
          branch: main
          cronline: "00 1 * * *"
          message: Daily Cloud cleanup
      teams:
        ingest-fp:
          access_level: MANAGE_BUILD_AND_READ
        everyone:
          access_level: BUILD_AND_READ