140 changes: 90 additions & 50 deletions .circleci/config.yml
@@ -25,47 +25,6 @@ commands:
          name: Login to Quay
          command: docker login -u=$QUAY_USERNAME -p=$QUAY_PASSWORD quay.io

  install-e2e-dependencies:
    description: Install E2E Tests Dependencies
    steps:
      - run:
          name: Install Dependencies
          command: |
            pip install boto3 pyyaml awscli
            pip install -e ./test/e2e
            pip install https://s3-us-west-2.amazonaws.com/get-cortex/master/python/cortex-master.tar.gz

  run-e2e-tests:
    description: Creates a temporary cluster and runs the cortex E2E tests
    parameters:
      config:
        description: cluster config file path
        type: string
        default: ./cluster.yaml
      slack_channel:
        description: "slack channel where failed builds will be posted (should start with #)"
        type: string
        default: "#builds"
    steps:
      - run:
          name: Create Cluster
          command: cortex cluster up << parameters.config >> --configure-env cortex -y
      - run:
          name: Run E2E Tests
          no_output_timeout: 30m
          command: |
            pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia --arm-nodegroups arm --skip-autoscaling --skip-load --skip-long-running
            pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia -k test_autoscaling
            pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia -k test_load
      - run:
          name: Delete Cluster
          command: cortex cluster down --config << parameters.config >> -y
          when: always
      - slack/notify:
          event: fail
          channel: << parameters.slack_channel >>
          template: basic_fail_1

jobs:
  lint:
    docker:
@@ -187,15 +146,14 @@ jobs:
          command: make ci-amend-images
          no_output_timeout: 20m

  e2e-tests:
  cluster-up:
    docker:
      - image: cimg/python:3.6
    environment:
      CORTEX_TEST_BATCH_S3_PATH: s3://cortex-nightly-artifacts/test/jobs
    steps:
      - setup_remote_docker
      - checkout
      - install-e2e-dependencies
      - run:
          name: Install Cortex CLI
          command: |
            pip install https://s3-us-west-2.amazonaws.com/get-cortex/master/python/cortex-master.tar.gz
      - run:
          name: Initialize Credentials
          command: |
@@ -240,8 +198,68 @@ jobs:
      - run:
          name: Verify configuration of credentials
          command: aws sts get-caller-identity | jq ".Arn" | grep "dev-cortex-nightly-us-east-1"
      - run-e2e-tests:
          config: ./cluster.yaml
      - run:
          name: Create Cluster
          command: cortex cluster up cluster.yaml --configure-env cortex -y
      - slack/notify:
          event: fail
          channel: "#builds"
          template: basic_fail_1

  e2e-tests:
    docker:
      - image: cimg/python:3.6
    environment:
      CORTEX_TEST_BATCH_S3_PATH: s3://cortex-nightly-artifacts/test/jobs
    steps:
      - checkout
      - run:
          name: Install Dependencies
          command: |
            pip install boto3 pyyaml awscli
            pip install -e ./test/e2e
            pip install https://s3-us-west-2.amazonaws.com/get-cortex/master/python/cortex-master.tar.gz
      - run:
          name: Initialize Credentials
          command: |
            echo 'export AWS_ACCESS_KEY_ID=${NIGHTLY_AWS_ACCESS_KEY_ID}' >> $BASH_ENV
            echo 'export AWS_SECRET_ACCESS_KEY=${NIGHTLY_AWS_SECRET_ACCESS_KEY}' >> $BASH_ENV
      - run:
          name: Configure Cortex CLI
          command: cortex env configure cortex --operator-endpoint $(python dev/get_operator_url.py cortex-nightly us-east-1)
      - run:
          name: Run E2E Tests
          no_output_timeout: 30m
          command: |
            pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia --arm-nodegroups arm --skip-autoscaling --skip-load --skip-long-running
            pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia -k test_autoscaling
            pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia -k test_load
      - slack/notify:
          event: fail
          channel: "#builds"
          template: basic_fail_1

  cluster-down:
    docker:
      - image: cimg/python:3.6
    steps:
      - run:
          name: Install Cortex CLI
          command: |
            pip install https://s3-us-west-2.amazonaws.com/get-cortex/master/python/cortex-master.tar.gz
      - run:
          name: Initialize Credentials
          command: |
            echo 'export AWS_ACCESS_KEY_ID=${NIGHTLY_AWS_ACCESS_KEY_ID}' >> $BASH_ENV
            echo 'export AWS_SECRET_ACCESS_KEY=${NIGHTLY_AWS_SECRET_ACCESS_KEY}' >> $BASH_ENV
      - run:
          name: Delete Cluster
          command: cortex cluster down --name cortex-nightly --region us-east-1 -y
          when: always
      - slack/notify:
          event: fail
          channel: "#builds"
          template: basic_fail_1

workflows:
  build:
@@ -294,13 +312,35 @@ workflows:
                - master
                - /^[0-9]+\.[0-9]+$/

  nightly:
  nightly-cluster-up:
    triggers:
      - schedule:
          cron: "0 0 * * *"
          filters:
            branches:
              only:
                - master
    jobs:
      - cluster-up

  nightly-e2e-tests:
    triggers:
      - schedule:
          cron: "0 1 * * *"
          filters:
            branches:
              only:
                - master
    jobs:
      - e2e-tests

  nightly-cluster-down:
    triggers:
      - schedule:
          cron: "0 2 * * *"
          filters:
            branches:
              only:
                - master
    jobs:
      - cluster-down
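
Read end to end, the former single nightly workflow is now three independently scheduled workflows an hour apart. The sketch below strings the commands from the three jobs into one manual sequence against the nightly account; it is illustrative only, assembled from the steps above (and assumes the E2E test dependencies and AWS credentials from the Install Dependencies / Initialize Credentials steps are already in place), not a script that exists in the repo:

# 1. cluster-up (00:00 UTC): install the nightly CLI and create the cluster
pip install https://s3-us-west-2.amazonaws.com/get-cortex/master/python/cortex-master.tar.gz
cortex cluster up cluster.yaml --configure-env cortex -y

# 2. e2e-tests (01:00 UTC): point the cortex env at the running cluster, then run the suites
cortex env configure cortex --operator-endpoint $(python dev/get_operator_url.py cortex-nightly us-east-1)
pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia --arm-nodegroups arm --skip-autoscaling --skip-load --skip-long-running
pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia -k test_autoscaling
pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia -k test_load

# 3. cluster-down (02:00 UTC): tear the cluster down whether or not the tests passed
cortex cluster down --name cortex-nightly --region us-east-1 -y
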
53 changes: 53 additions & 0 deletions dev/get_operator_url.py
@@ -0,0 +1,53 @@
# Copyright 2021 Cortex Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import boto3


def main():
    cluster_name = sys.argv[1]
    region = sys.argv[2]
    operator_url = get_operator_url(cluster_name, region)
    print("https://" + operator_url)


def get_operator_url(cluster_name, region):
    client_elbv2 = boto3.client("elbv2", region_name=region)

    paginator = client_elbv2.get_paginator("describe_load_balancers")
    for load_balancer_page in paginator.paginate(PaginationConfig={"PageSize": 20}):
        load_balancers = {
            load_balancer["LoadBalancerArn"]: load_balancer
            for load_balancer in load_balancer_page["LoadBalancers"]
        }
        tag_descriptions = client_elbv2.describe_tags(ResourceArns=list(load_balancers.keys()))[
            "TagDescriptions"
        ]
        for tag_description in tag_descriptions:
            foundClusterNameTag = False
            foundLoadBalancerTag = False
            for tags in tag_description["Tags"]:
                if tags["Key"] == "cortex.dev/cluster-name" and tags["Value"] == cluster_name:
                    foundClusterNameTag = True
                if tags["Key"] == "cortex.dev/load-balancer" and tags["Value"] == "operator":
                    foundLoadBalancerTag = True
            if foundClusterNameTag and foundLoadBalancerTag:
                load_balancer = load_balancers[tag_description["ResourceArn"]]
                return load_balancer["DNSName"]


# usage: python get_operator_url.py CLUSTER_NAME REGION
if __name__ == "__main__":
    main()
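
The e2e-tests job consumes this script via command substitution in its Configure Cortex CLI step. A minimal manual check of the lookup might look like the following; the exported variable names mirror the CI job and are assumptions about the local environment, not requirements of the script itself:

# assumes boto3 is installed and the nightly cluster is already up in us-east-1
export AWS_ACCESS_KEY_ID=${NIGHTLY_AWS_ACCESS_KEY_ID}
export AWS_SECRET_ACCESS_KEY=${NIGHTLY_AWS_SECRET_ACCESS_KEY}
python dev/get_operator_url.py cortex-nightly us-east-1
# prints the operator load balancer endpoint, e.g. https://<operator-elb-dns-name>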