diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go index 0467712afc..34620db870 100644 --- a/cli/cmd/cluster.go +++ b/cli/cmd/cluster.go @@ -129,7 +129,7 @@ var _clusterCmd = &cobra.Command{ } var _clusterUpCmd = &cobra.Command{ - Use: "up [CLUSTER_CONFIG_FILE]", + Use: "up CLUSTER_CONFIG_FILE", Short: "spin up a cluster on aws", Args: cobra.ExactArgs(1), Run: func(cmd *cobra.Command, args []string) { @@ -743,7 +743,7 @@ func printInfoOperatorResponse(clusterConfig clusterconfig.Config, operatorEndpo infoResponse.ClusterConfig.Config = clusterConfig fmt.Println(console.Bold("cluster config:")) - fmt.Println(fmt.Sprintf("%s: %s", clusterconfig.APIVersionUserKey, infoResponse.ClusterConfig.APIVersion)) + fmt.Println(fmt.Sprintf("cluster version: %s", infoResponse.ClusterConfig.APIVersion)) fmt.Print(yamlString) printInfoPricing(infoResponse, clusterConfig) diff --git a/docs/clients/cli.md b/docs/clients/cli.md index ddf4e043c5..e33bdd8271 100644 --- a/docs/clients/cli.md +++ b/docs/clients/cli.md @@ -98,7 +98,7 @@ Flags: spin up a cluster on aws Usage: - cortex cluster up [CLUSTER_CONFIG_FILE] [flags] + cortex cluster up CLUSTER_CONFIG_FILE [flags] Flags: -e, --configure-env string name of environment to configure (default "aws") diff --git a/docs/clusters/instances/spot.md b/docs/clusters/instances/spot.md index 9c650ae0dc..2f69f33e7b 100644 --- a/docs/clusters/instances/spot.md +++ b/docs/clusters/instances/spot.md @@ -7,7 +7,7 @@ node_groups: - name: node-group-1 # whether to use spot instances for this node group (default: false) - spot: false + spot: false # this must be set to true to use spot instances spot_config: # additional instance types with identical or better specs than the primary cluster instance type (defaults to only the primary instance type) diff --git a/docs/clusters/management/create.md b/docs/clusters/management/create.md index b7c8e613a2..5e778707f7 100644 --- a/docs/clusters/management/create.md +++ b/docs/clusters/management/create.md @@ -38,7 +38,7 @@ node_groups: instance_volume_size: 50 # disk storage size per instance (GB) instance_volume_type: gp2 # instance volume type [gp2 | io1 | st1 | sc1] # instance_volume_iops: 3000 # instance volume iops (only applicable to io1) - spot: false # enable spot instances + spot: false # whether to use spot instances - name: ng-gpu instance_type: g4dn.xlarge @@ -48,15 +48,6 @@ node_groups: instance_volume_type: gp2 # instance_volume_iops: 3000 spot: false - - - name: ng-inferentia - instance_type: inf1.xlarge - min_instances: 1 - max_instances: 5 - instance_volume_size: 50 - instance_volume_type: gp2 - # instance_volume_iops: 3000 - spot: false ... # subnet visibility [public (instances will have public IPs) | private (instances will not have public IPs)] diff --git a/manager/refresh_cluster_config.py b/manager/refresh_cluster_config.py deleted file mode 100644 index ca1656c137..0000000000 --- a/manager/refresh_cluster_config.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright 2021 Cortex Labs, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import boto3 -import sys -import yaml -import os - - -def get_autoscaling_group(): - client = boto3.client("autoscaling", region_name=os.environ["CORTEX_REGION"]) - paginator = client.get_paginator("describe_auto_scaling_groups") - page_iterator = paginator.paginate(PaginationConfig={"PageSize": 100}) - - filtered_asgs = page_iterator.search( - "AutoScalingGroups[?contains(Tags[?Key==`{}`].Value, `{}`) && Tags[?Key==`{}`].Value]".format( - "alpha.eksctl.io/cluster-name", - os.environ["CORTEX_CLUSTER_NAME"], - "k8s.io/cluster-autoscaler/node-template/label/workload", - ) - ) - asgs = list(filtered_asgs) - if len(asgs) == 0: - raise Exception( - "unable to find autoscaling groups belong to cluster " - + os.environ["CORTEX_CLUSTER_NAME"] - ) - return asgs - - -def get_launch_template(launch_template_id): - client = boto3.client("ec2", region_name=os.environ["CORTEX_REGION"]) - resp = client.describe_launch_template_versions(LaunchTemplateId=launch_template_id) - return resp["LaunchTemplateVersions"][0]["LaunchTemplateData"] - - -def extract_nodegroup_name(asg): - for tag in asg["Tags"]: - if tag["Key"] == "eksctl.io/v1alpha2/nodegroup-name": - return tag["Value"] - raise Exception( - "tag {} not set for autoscaling group {}".format( - "eksctl.io/v1alpha2/nodegroup-name", asg["AutoScalingGroupName"] - ) - ) - - -def refresh_yaml(configmap_yaml_path, output_yaml_path): - with open(configmap_yaml_path, "r") as f: - cluster_configmap = yaml.safe_load(f) - - cluster_configmap_str = cluster_configmap["data"]["cluster.yaml"] - cluster_config = yaml.safe_load(cluster_configmap_str) - - asgs = get_autoscaling_group() - # only possible when backup is enabled - - if cluster_config["spot"] and cluster_config.get("spot_config", {}).get( - "on_demand_backup", False - ): - if len(asgs) != 2: - raise Exception( - "expected 2 autoscaling groups but found {} autoscaling groups".format(len(asgs)) - ) - asg_names = set() - for group in asgs: - nodegroup_name = extract_nodegroup_name(group) - if nodegroup_name == "ng-cortex-worker-spot": - asg = group - asg_names.add(nodegroup_name) - if "ng-cortex-worker-on-demand" not in asg_names: - raise Exception( - "expected autoscaling group with tag eksctl.io/v1alpha2/nodegroup-name={}".format( - "ng-cortex-worker-on-demand" - ) - ) - if "ng-cortex-worker-spot" not in asg_names: - raise Exception( - "expected autoscaling group with tag eksctl.io/v1alpha2/nodegroup-name={}".format( - "ng-cortex-worker-spot" - ) - ) - elif cluster_config["spot"]: - if len(asgs) != 1: - raise Exception( - "expected 1 autoscaling groups but found {} autoscaling groups".format(len(asgs)) - ) - if "ng-cortex-worker-spot" not in extract_nodegroup_name(asgs[0]): - raise Exception( - "unable to find autoscaling group with tag eksctl.io/v1alpha2/nodegroup-name={}".format( - "ng-cortex-worker-spot" - ) - ) - asg = asgs[0] - else: - if len(asgs) != 1: - raise Exception( - "expected 1 autoscaling groups but found {} autoscaling groups".format(len(asgs)) - ) - if "ng-cortex-worker-on-demand" not in extract_nodegroup_name(asgs[0]): - raise Exception( - "unable to find autoscaling group with tag eksctl.io/v1alpha2/nodegroup-name={}".format( - "ng-cortex-worker-on-demand" - ) - ) - asg = asgs[0] - - cluster_config["min_instances"] = asg["MinSize"] - cluster_config["max_instances"] = asg["MaxSize"] - - if len(cluster_config.get("subnets", [])) == 0: - cluster_config["availability_zones"] = asg["AvailabilityZones"] - - if asg.get("MixedInstancesPolicy") is not None: - launch_template = get_launch_template( - asg["MixedInstancesPolicy"]["LaunchTemplate"]["LaunchTemplateSpecification"][ - "LaunchTemplateId" - ] - ) - else: - launch_template = get_launch_template(asg["LaunchTemplate"]["LaunchTemplateId"]) - - cluster_config["instance_type"] = launch_template["InstanceType"] - - if launch_template.get("BlockDeviceMappings"): - cluster_config["instance_volume_size"] = launch_template["BlockDeviceMappings"][0]["Ebs"][ - "VolumeSize" - ] - else: - cluster_config["instance_volume_size"] = 20 # AWS volume default - - if asg.get("LaunchTemplate") is not None: - cluster_config["spot"] = False - cluster_config["spot_config"] = None - - if asg.get("MixedInstancesPolicy") is not None: - mixed_instance_policy = asg["MixedInstancesPolicy"] - cluster_config["spot"] = True - spot_config = {"on_demand_backup": len(asgs) == 2} - instances_distribution_metadata = mixed_instance_policy["InstancesDistribution"] - spot_config["on_demand_base_capacity"] = instances_distribution_metadata[ - "OnDemandBaseCapacity" - ] - spot_config["on_demand_percentage_above_base_capacity"] = instances_distribution_metadata[ - "OnDemandPercentageAboveBaseCapacity" - ] - spot_config["max_price"] = float(instances_distribution_metadata["SpotMaxPrice"]) - spot_config["instance_pools"] = instances_distribution_metadata["SpotInstancePools"] - - instance_distribution = [ - node["InstanceType"] for node in mixed_instance_policy["LaunchTemplate"]["Overrides"] - ] - spot_config["instance_distribution"] = instance_distribution - - cluster_config["spot_config"] = spot_config - - with open(output_yaml_path, "w") as f: - yaml.dump(cluster_config, f) - - -if __name__ == "__main__": - refresh_yaml(configmap_yaml_path=sys.argv[1], output_yaml_path=sys.argv[2]) diff --git a/pkg/operator/resources/asyncapi/api.go b/pkg/operator/resources/asyncapi/api.go index 0e5b6153a4..390e59aab5 100644 --- a/pkg/operator/resources/asyncapi/api.go +++ b/pkg/operator/resources/asyncapi/api.go @@ -218,11 +218,6 @@ func GetAllAPIs(pods []kcore.Pod, deployments []kapps.Deployment) ([]schema.APIR return nil, err } - //allMetrics, err := GetMultipleMetrics(apis) - //if err != nil { - // return nil, err - //} - realtimeAPIs := make([]schema.APIResponse, len(apis)) for i := range apis { @@ -233,9 +228,8 @@ func GetAllAPIs(pods []kcore.Pod, deployments []kapps.Deployment) ([]schema.APIR } realtimeAPIs[i] = schema.APIResponse{ - Spec: api, - Status: &statuses[i], - //Metrics: &allMetrics[i], + Spec: api, + Status: &statuses[i], Endpoint: endpoint, } } diff --git a/pkg/types/clusterconfig/config_key.go b/pkg/types/clusterconfig/config_key.go index e5d48c5a81..345fd44315 100644 --- a/pkg/types/clusterconfig/config_key.go +++ b/pkg/types/clusterconfig/config_key.go @@ -34,62 +34,23 @@ const ( InstanceVolumeIOPSKey = "instance_volume_iops" InstancePoolsKey = "instance_pools" MaxPriceKey = "max_price" - - NetworkKey = "network" - SubnetKey = "subnet" - TagsKey = "tags" - ClusterNameKey = "cluster_name" - RegionKey = "region" - AvailabilityZonesKey = "availability_zones" - SubnetsKey = "subnets" - AvailabilityZoneKey = "availability_zone" - SubnetIDKey = "subnet_id" - SSLCertificateARNKey = "ssl_certificate_arn" - CortexPolicyARNKey = "cortex_policy_arn" - IAMPolicyARNsKey = "iam_policy_arns" - BucketKey = "bucket" - SubnetVisibilityKey = "subnet_visibility" - NATGatewayKey = "nat_gateway" - APILoadBalancerSchemeKey = "api_load_balancer_scheme" - OperatorLoadBalancerSchemeKey = "operator_load_balancer_scheme" - VPCCIDRKey = "vpc_cidr" - TelemetryKey = "telemetry" - - // User facing string - NodeGroupsUserKey = "node groups" - SpotUserKey = "use spot instances" - InstanceTypeUserKey = "instance type" - AcceleratorTypeUserKey = "accelerator type" - AcceleratorsPerInstanceUserKey = "accelerators per instance" - MinInstancesUserKey = "min instances" - MaxInstancesUserKey = "max instances" - InstanceVolumeSizeUserKey = "instance volume size (Gi)" - InstanceVolumeTypeUserKey = "instance volume type" - InstanceVolumeIOPSUserKey = "instance volume iops" - InstanceDistributionUserKey = "spot instance distribution" - OnDemandBaseCapacityUserKey = "spot on demand base capacity" - OnDemandPercentageAboveBaseCapacityUserKey = "spot on demand percentage above base capacity" - MaxPriceUserKey = "spot max price ($ per hour)" - InstancePoolsUserKey = "spot instance pools" - - APIVersionUserKey = "cluster version" - ClusterNameUserKey = "cluster name" - RegionUserKey = "aws region" - AvailabilityZonesUserKey = "availability zones" - AvailabilityZoneUserKey = "availability zone" - SubnetsUserKey = "subnets" - SubnetIDUserKey = "subnet id" - TagsUserKey = "tags" - SSLCertificateARNUserKey = "ssl certificate arn" - CortexPolicyARNUserKey = "cortex policy arn" - IAMPolicyARNsUserKey = "iam policy arns" - BucketUserKey = "s3 bucket" - NetworkUserKey = "network" - SubnetUserKey = "subnet" - SubnetVisibilityUserKey = "subnet visibility" - NATGatewayUserKey = "nat gateway" - APILoadBalancerSchemeUserKey = "api load balancer scheme" - OperatorLoadBalancerSchemeUserKey = "operator load balancer scheme" - VPCCIDRUserKey = "vpc cidr" - TelemetryUserKey = "telemetry" + NetworkKey = "network" + SubnetKey = "subnet" + TagsKey = "tags" + ClusterNameKey = "cluster_name" + RegionKey = "region" + AvailabilityZonesKey = "availability_zones" + SubnetsKey = "subnets" + AvailabilityZoneKey = "availability_zone" + SubnetIDKey = "subnet_id" + SSLCertificateARNKey = "ssl_certificate_arn" + CortexPolicyARNKey = "cortex_policy_arn" + IAMPolicyARNsKey = "iam_policy_arns" + BucketKey = "bucket" + SubnetVisibilityKey = "subnet_visibility" + NATGatewayKey = "nat_gateway" + APILoadBalancerSchemeKey = "api_load_balancer_scheme" + OperatorLoadBalancerSchemeKey = "operator_load_balancer_scheme" + VPCCIDRKey = "vpc_cidr" + TelemetryKey = "telemetry" )