Skip to content

Commit

Permalink
chore: Add v1beta1/NodePool metrics controller (aws#486)
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathan-innis committed Aug 24, 2023
1 parent acb9571 commit 1405169
Show file tree
Hide file tree
Showing 6 changed files with 349 additions and 10 deletions.
2 changes: 1 addition & 1 deletion pkg/controllers/metrics/node/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ var provisioner *v1alpha5.Provisioner
func TestAPIs(t *testing.T) {
ctx = TestContextWithLogger(t)
RegisterFailHandler(Fail)
RunSpecs(t, "Controllers/Metrics/State")
RunSpecs(t, "NodeMetrics")
}

var _ = BeforeSuite(func() {
Expand Down
172 changes: 172 additions & 0 deletions pkg/controllers/metrics/nodepool/controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nodepool

import (
"context"
"strings"
"time"

"github.com/samber/lo"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
"knative.dev/pkg/logging"

"github.com/prometheus/client_golang/prometheus"
v1 "k8s.io/api/core/v1"
controllerruntime "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"
crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/aws/karpenter-core/pkg/apis/v1beta1"
"github.com/aws/karpenter-core/pkg/metrics"
corecontroller "github.com/aws/karpenter-core/pkg/operator/controller"
)

const (
resourceTypeLabel = "resource_type"
nodePoolNameLabel = "nodepool"
nodePoolSubsystem = "nodepool"
)

var (
limitGaugeVec = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metrics.Namespace,
Subsystem: nodePoolSubsystem,
Name: "limit",
Help: "The nodepool limits are the limits specified on the provisioner that restrict the quantity of resources provisioned. Labeled by nodepool name and resource type.",
},
[]string{
resourceTypeLabel,
nodePoolNameLabel,
},
)
usageGaugeVec = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metrics.Namespace,
Subsystem: nodePoolSubsystem,
Name: "usage",
Help: "The nodepool usage is the amount of resources that have been provisioned by a particular nodepool. Labeled by nodepool name and resource type.",
},
[]string{
resourceTypeLabel,
nodePoolNameLabel,
},
)
usagePctGaugeVec = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metrics.Namespace,
Subsystem: nodePoolSubsystem,
Name: "usage_pct",
Help: "The nodepool usage percentage is the percentage of each resource used based on the resources provisioned and the limits that have been configured. Labeled by nodepool name and resource type.",
},
[]string{
resourceTypeLabel,
nodePoolNameLabel,
},
)
)

func init() {
crmetrics.Registry.MustRegister(limitGaugeVec, usageGaugeVec, usagePctGaugeVec)
}

type Controller struct {
kubeClient client.Client
metricStore *metrics.Store
}

// NewController constructs a controller instance
func NewController(kubeClient client.Client) corecontroller.Controller {
return &Controller{
kubeClient: kubeClient,
metricStore: metrics.NewStore(),
}
}

func (c *Controller) Name() string {
return "metrics.nodepool"
}

// Reconcile executes a termination control loop for the resource
func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named(c.Name()).With("nodepool", req.Name))
nodePool := &v1beta1.NodePool{}
if err := c.kubeClient.Get(ctx, req.NamespacedName, nodePool); err != nil {
if errors.IsNotFound(err) {
c.metricStore.Delete(req.NamespacedName.String())
}
return reconcile.Result{}, client.IgnoreNotFound(err)
}
c.metricStore.Update(req.NamespacedName.String(), buildMetrics(nodePool))
// periodically update our metrics per nodepool even if nothing has changed
return reconcile.Result{RequeueAfter: 5 * time.Minute}, nil
}

func buildMetrics(nodePool *v1beta1.NodePool) (res []*metrics.StoreMetric) {
for gaugeVec, resourceList := range map[*prometheus.GaugeVec]v1.ResourceList{
usageGaugeVec: nodePool.Status.Resources,
limitGaugeVec: getLimits(nodePool),
usagePctGaugeVec: getUsagePercentage(nodePool),
} {
for k, v := range resourceList {
res = append(res, &metrics.StoreMetric{
GaugeVec: gaugeVec,
Labels: makeLabels(nodePool, strings.ReplaceAll(strings.ToLower(string(k)), "-", "_")),
Value: lo.Ternary(k == v1.ResourceCPU, float64(v.MilliValue())/float64(1000), float64(v.Value())),
})
}
}
return res
}

func getLimits(nodePool *v1beta1.NodePool) v1.ResourceList {
if nodePool.Spec.Limits != nil {
return v1.ResourceList(nodePool.Spec.Limits)
}
return v1.ResourceList{}
}

func getUsagePercentage(nodePool *v1beta1.NodePool) v1.ResourceList {
usage := v1.ResourceList{}
for k, v := range getLimits(nodePool) {
limitValue := v.AsApproximateFloat64()
usedValue := nodePool.Status.Resources[k]
if limitValue == 0 {
usage[k] = *resource.NewQuantity(100, resource.DecimalSI)
} else {
usage[k] = *resource.NewQuantity(int64(usedValue.AsApproximateFloat64()/limitValue*100), resource.DecimalSI)
}
}
return usage
}

func makeLabels(nodePool *v1beta1.NodePool, resourceTypeName string) prometheus.Labels {
return prometheus.Labels{
resourceTypeLabel: resourceTypeName,
nodePoolNameLabel: nodePool.Name,
}
}

func (c *Controller) Builder(_ context.Context, m manager.Manager) corecontroller.Builder {
return corecontroller.Adapt(
controllerruntime.
NewControllerManagedBy(m).
For(&v1beta1.NodePool{}),
)
}
169 changes: 169 additions & 0 deletions pkg/controllers/metrics/nodepool/suite_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nodepool_test

import (
"context"
"strings"
"testing"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
. "knative.dev/pkg/logging/testing"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/aws/karpenter-core/pkg/apis"
"github.com/aws/karpenter-core/pkg/apis/v1beta1"
"github.com/aws/karpenter-core/pkg/controllers/metrics/nodepool"
"github.com/aws/karpenter-core/pkg/operator/controller"
"github.com/aws/karpenter-core/pkg/operator/scheme"
"github.com/aws/karpenter-core/pkg/test"
. "github.com/aws/karpenter-core/pkg/test/expectations"
)

var nodePoolController controller.Controller
var ctx context.Context
var env *test.Environment

func TestAPIs(t *testing.T) {
ctx = TestContextWithLogger(t)
RegisterFailHandler(Fail)
RunSpecs(t, "NodePoolMetrics")
}

var _ = BeforeSuite(func() {
env = test.NewEnvironment(scheme.Scheme, test.WithCRDs(apis.CRDs...))
nodePoolController = nodepool.NewController(env.Client)
})

var _ = AfterSuite(func() {
Expect(env.Stop()).To(Succeed(), "Failed to stop environment")
})

var _ = Describe("Metrics", func() {
var nodePool *v1beta1.NodePool
BeforeEach(func() {
nodePool = test.NodePool(v1beta1.NodePool{
Spec: v1beta1.NodePoolSpec{
Template: v1beta1.NodeClaimTemplate{
Spec: v1beta1.NodeClaimSpec{
NodeClass: &v1beta1.NodeClassReference{
Name: "default",
},
},
},
},
})
})
It("should update the nodepool limit metrics", func() {
limits := v1beta1.Limits{
v1.ResourceCPU: resource.MustParse("10"),
v1.ResourceMemory: resource.MustParse("10Mi"),
v1.ResourceEphemeralStorage: resource.MustParse("100Gi"),
}
nodePool.Spec.Limits = limits
ExpectApplied(ctx, env.Client, nodePool)
ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))

for k, v := range limits {
m, found := FindMetricWithLabelValues("karpenter_nodepool_limit", map[string]string{
"nodepool": nodePool.GetName(),
"resource_type": strings.ReplaceAll(k.String(), "-", "_"),
})
Expect(found).To(BeTrue())
Expect(m.GetGauge().GetValue()).To(BeNumerically("~", v.AsApproximateFloat64()))
}
})
It("should update the nodepool usage metrics", func() {
resources := v1.ResourceList{
v1.ResourceCPU: resource.MustParse("10"),
v1.ResourceMemory: resource.MustParse("10Mi"),
v1.ResourceEphemeralStorage: resource.MustParse("100Gi"),
}
nodePool.Status.Resources = resources

ExpectApplied(ctx, env.Client, nodePool)
ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))

for k, v := range resources {
m, found := FindMetricWithLabelValues("karpenter_nodepool_usage", map[string]string{
"nodepool": nodePool.GetName(),
"resource_type": strings.ReplaceAll(k.String(), "-", "_"),
})
Expect(found).To(BeTrue())
Expect(m.GetGauge().GetValue()).To(BeNumerically("~", v.AsApproximateFloat64()))
}
})
It("should update the usage percentage metrics correctly", func() {
resources := v1.ResourceList{
v1.ResourceCPU: resource.MustParse("10"),
v1.ResourceMemory: resource.MustParse("10Mi"),
v1.ResourceEphemeralStorage: resource.MustParse("100Gi"),
}
limits := v1beta1.Limits{
v1.ResourceCPU: resource.MustParse("100"),
v1.ResourceMemory: resource.MustParse("100Mi"),
v1.ResourceEphemeralStorage: resource.MustParse("1000Gi"),
}
nodePool.Spec.Limits = limits
nodePool.Status.Resources = resources

ExpectApplied(ctx, env.Client, nodePool)
ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))

for k := range resources {
m, found := FindMetricWithLabelValues("karpenter_nodepool_usage_pct", map[string]string{
"nodepool": nodePool.GetName(),
"resource_type": strings.ReplaceAll(k.String(), "-", "_"),
})
Expect(found).To(BeTrue())
Expect(m.GetGauge().GetValue()).To(BeNumerically("~", 10))
}
})
It("should delete the nodepool state metrics on nodepool delete", func() {
expectedMetrics := []string{"karpenter_nodepool_limit", "karpenter_nodepool_usage", "karpenter_nodepool_usage_pct"}
nodePool.Spec.Limits = v1beta1.Limits{
v1.ResourceCPU: resource.MustParse("100"),
v1.ResourceMemory: resource.MustParse("100Mi"),
v1.ResourceEphemeralStorage: resource.MustParse("1000Gi"),
}
nodePool.Status.Resources = v1.ResourceList{
v1.ResourceCPU: resource.MustParse("10"),
v1.ResourceMemory: resource.MustParse("10Mi"),
v1.ResourceEphemeralStorage: resource.MustParse("100Gi"),
}
ExpectApplied(ctx, env.Client, nodePool)
ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))

for _, name := range expectedMetrics {
_, found := FindMetricWithLabelValues(name, map[string]string{
"nodepool": nodePool.GetName(),
})
Expect(found).To(BeTrue())
}

ExpectDeleted(ctx, env.Client, nodePool)
ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))

for _, name := range expectedMetrics {
_, found := FindMetricWithLabelValues(name, map[string]string{
"nodepool": nodePool.GetName(),
})
Expect(found).To(BeFalse())
}
})
})
2 changes: 1 addition & 1 deletion pkg/controllers/metrics/pod/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ var env *test.Environment
func TestAPIs(t *testing.T) {
ctx = TestContextWithLogger(t)
RegisterFailHandler(Fail)
RunSpecs(t, "Controllers/Metrics/Pod")
RunSpecs(t, "PodMetrics")
}

var _ = BeforeSuite(func() {
Expand Down
10 changes: 4 additions & 6 deletions pkg/controllers/metrics/provisioner/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ const (
var (
limitGaugeVec = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "karpenter",
Namespace: metrics.Namespace,
Subsystem: "provisioner",
Name: "limit",
Help: "The Provisioner Limits are the limits specified on the provisioner that restrict the quantity of resources provisioned. Labeled by provisioner name and resource type.",
Expand All @@ -54,7 +54,7 @@ var (
)
usageGaugeVec = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "karpenter",
Namespace: metrics.Namespace,
Subsystem: "provisioner",
Name: "usage",
Help: "The Provisioner Usage is the amount of resources that have been provisioned by a particular provisioner. Labeled by provisioner name and resource type.",
Expand All @@ -63,7 +63,7 @@ var (
)
usagePctGaugeVec = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "karpenter",
Namespace: metrics.Namespace,
Subsystem: "provisioner",
Name: "usage_pct",
Help: "The Provisioner Usage Percentage is the percentage of each resource used based on the resources provisioned and the limits that have been configured in the range [0,100]. Labeled by provisioner name and resource type.",
Expand All @@ -73,9 +73,7 @@ var (
)

func init() {
crmetrics.Registry.MustRegister(limitGaugeVec)
crmetrics.Registry.MustRegister(usageGaugeVec)
crmetrics.Registry.MustRegister(usagePctGaugeVec)
crmetrics.Registry.MustRegister(limitGaugeVec, usageGaugeVec, usagePctGaugeVec)
}

func labelNames() []string {
Expand Down

0 comments on commit 1405169

Please sign in to comment.