From c4c93c2921ac17c187cccc0171e875b0c2d4785a Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Sun, 21 Nov 2021 21:00:03 -0800 Subject: [PATCH 1/3] Fix nightly CI tests (timeout) --- .circleci/config.yml | 119 ++++++++++++++++++++++++++++++++----------- 1 file changed, 90 insertions(+), 29 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index dec3d496b1..074d02e55a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -25,16 +25,6 @@ commands: name: Login to Quay command: docker login -u=$QUAY_USERNAME -p=$QUAY_PASSWORD quay.io - install-e2e-dependencies: - description: Install E2E Tests Dependencies - steps: - - run: - name: Install Dependencies - command: | - pip install boto3 pyyaml awscli - pip install -e ./test/e2e - pip install https://s3-us-west-2.amazonaws.com/get-cortex/master/python/cortex-master.tar.gz - run-e2e-tests: description: Creates a temporary cluster and runs the cortex E2E tests parameters: @@ -47,16 +37,6 @@ commands: type: string default: "#builds" steps: - - run: - name: Create Cluster - command: cortex cluster up << parameters.config >> --configure-env cortex -y - - run: - name: Run E2E Tests - no_output_timeout: 30m - command: | - pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia --arm-nodegroups arm --skip-autoscaling --skip-load --skip-long-running - pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia -k test_autoscaling - pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia -k test_load - run: name: Delete Cluster command: cortex cluster down --config << parameters.config >> -y @@ -187,15 +167,14 @@ jobs: command: make ci-amend-images no_output_timeout: 20m - e2e-tests: + cluster-up: docker: - image: cimg/python:3.6 - environment: - CORTEX_TEST_BATCH_S3_PATH: s3://cortex-nightly-artifacts/test/jobs steps: - - setup_remote_docker - - checkout - - install-e2e-dependencies + - run: + name: Install Cortex CLI + command: | + pip install https://s3-us-west-2.amazonaws.com/get-cortex/master/python/cortex-master.tar.gz - run: name: Initialize Credentials command: | @@ -240,8 +219,68 @@ jobs: - run: name: Verify configuration of credentials command: aws sts get-caller-identity | jq ".Arn" | grep "dev-cortex-nightly-us-east-1" - - run-e2e-tests: - config: ./cluster.yaml + - run: + name: Create Cluster + command: cortex cluster up cluster.yaml --configure-env cortex -y + - slack/notify: + event: fail + channel: "#builds" + template: basic_fail_1 + + e2e-tests: + docker: + - image: cimg/python:3.6 + environment: + CORTEX_TEST_BATCH_S3_PATH: s3://cortex-nightly-artifacts/test/jobs + steps: + - checkout + - run: + name: Install Dependencies + command: | + pip install boto3 pyyaml awscli + pip install -e ./test/e2e + pip install https://s3-us-west-2.amazonaws.com/get-cortex/master/python/cortex-master.tar.gz + - run: + name: Initialize Credentials + command: | + echo 'export AWS_ACCESS_KEY_ID=${NIGHTLY_AWS_ACCESS_KEY_ID}' >> $BASH_ENV + echo 'export AWS_SECRET_ACCESS_KEY=${NIGHTLY_AWS_SECRET_ACCESS_KEY}' >> $BASH_ENV + - run: + name: Configure Cortex CLI + command: cortex env configure cortex --operator-endpoint # TODO + - run: + name: Run E2E Tests + no_output_timeout: 30m + command: | + pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia --arm-nodegroups arm --skip-autoscaling --skip-load --skip-long-running + pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia -k test_autoscaling + pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia -k test_load + - slack/notify: + event: fail + channel: "#builds" + template: basic_fail_1 + + cluster-down: + docker: + - image: cimg/python:3.6 + steps: + - run: + name: Install Cortex CLI + command: | + pip install https://s3-us-west-2.amazonaws.com/get-cortex/master/python/cortex-master.tar.gz + - run: + name: Initialize Credentials + command: | + echo 'export AWS_ACCESS_KEY_ID=${NIGHTLY_AWS_ACCESS_KEY_ID}' >> $BASH_ENV + echo 'export AWS_SECRET_ACCESS_KEY=${NIGHTLY_AWS_SECRET_ACCESS_KEY}' >> $BASH_ENV + - run: + name: Delete Cluster + command: cortex cluster down --name cortex-nightly --region us-east-1 -y + when: always + - slack/notify: + event: fail + channel: "#builds" + template: basic_fail_1 workflows: build: @@ -294,7 +333,18 @@ workflows: - master - /^[0-9]+\.[0-9]+$/ - nightly: + nightly-cluster-up: + triggers: + - schedule: + cron: "0 0 * * *" + filters: + branches: + only: + - master + jobs: + - cluster-up + + nightly-e2e-tests: triggers: - schedule: cron: "0 0 * * *" @@ -304,3 +354,14 @@ workflows: - master jobs: - e2e-tests + + nightly-cluster-down: + triggers: + - schedule: + cron: "0 0 * * *" + filters: + branches: + only: + - master + jobs: + - cluster-down From 2fd30e4cc7e26ed6a86979cc18fb75f13878595b Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Mon, 22 Nov 2021 11:12:21 -0800 Subject: [PATCH 2/3] Add get_operator_url.py --- .circleci/config.yml | 27 +++------------------------ dev/get_operator_url.py | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 24 deletions(-) create mode 100644 dev/get_operator_url.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 074d02e55a..31f07c6a33 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -25,27 +25,6 @@ commands: name: Login to Quay command: docker login -u=$QUAY_USERNAME -p=$QUAY_PASSWORD quay.io - run-e2e-tests: - description: Creates a temporary cluster and runs the cortex E2E tests - parameters: - config: - description: cluster config file path - type: string - default: ./cluster.yaml - slack_channel: - description: "slack channel where failed builds will be posted (should start with #)" - type: string - default: "#builds" - steps: - - run: - name: Delete Cluster - command: cortex cluster down --config << parameters.config >> -y - when: always - - slack/notify: - event: fail - channel: << parameters.slack_channel >> - template: basic_fail_1 - jobs: lint: docker: @@ -247,7 +226,7 @@ jobs: echo 'export AWS_SECRET_ACCESS_KEY=${NIGHTLY_AWS_SECRET_ACCESS_KEY}' >> $BASH_ENV - run: name: Configure Cortex CLI - command: cortex env configure cortex --operator-endpoint # TODO + command: cortex env configure cortex --operator-endpoint $(python dev/get_operator_url.py cortex-nightly us-east-1) - run: name: Run E2E Tests no_output_timeout: 30m @@ -347,7 +326,7 @@ workflows: nightly-e2e-tests: triggers: - schedule: - cron: "0 0 * * *" + cron: "0 1 * * *" filters: branches: only: @@ -358,7 +337,7 @@ workflows: nightly-cluster-down: triggers: - schedule: - cron: "0 0 * * *" + cron: "0 2 * * *" filters: branches: only: diff --git a/dev/get_operator_url.py b/dev/get_operator_url.py new file mode 100644 index 0000000000..4aec6c93d5 --- /dev/null +++ b/dev/get_operator_url.py @@ -0,0 +1,39 @@ +import sys +import boto3 + + +def main(): + cluster_name = sys.argv[1] + region = sys.argv[2] + operator_url = get_operator_url(cluster_name, region) + print("https://" + operator_url) + + +def get_operator_url(cluster_name, region): + client_elbv2 = boto3.client("elbv2", region_name=region) + + paginator = client_elbv2.get_paginator("describe_load_balancers") + for load_balancer_page in paginator.paginate(PaginationConfig={"PageSize": 20}): + load_balancers = { + load_balancer["LoadBalancerArn"]: load_balancer + for load_balancer in load_balancer_page["LoadBalancers"] + } + tag_descriptions = client_elbv2.describe_tags(ResourceArns=list(load_balancers.keys()))[ + "TagDescriptions" + ] + for tag_description in tag_descriptions: + foundClusterNameTag = False + foundLoadBalancerTag = False + for tags in tag_description["Tags"]: + if tags["Key"] == "cortex.dev/cluster-name" and tags["Value"] == cluster_name: + foundClusterNameTag = True + if tags["Key"] == "cortex.dev/load-balancer" and tags["Value"] == "operator": + foundLoadBalancerTag = True + if foundClusterNameTag and foundLoadBalancerTag: + load_balancer = load_balancers[tag_description["ResourceArn"]] + return load_balancer["DNSName"] + + +# usage: python get_operator_url.py CLUSTER_NAME REGION +if __name__ == "__main__": + main() From ad5208a99940ef2595b092f857a919cf0219caea Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Mon, 22 Nov 2021 11:20:43 -0800 Subject: [PATCH 3/3] Add license --- dev/get_operator_url.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/dev/get_operator_url.py b/dev/get_operator_url.py index 4aec6c93d5..72cb6394d9 100644 --- a/dev/get_operator_url.py +++ b/dev/get_operator_url.py @@ -1,3 +1,17 @@ +# Copyright 2021 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import boto3