Skip to content

Commit

Permalink
Merge pull request #7 from converged-computing/add/network-design
Browse files Browse the repository at this point in the history
add networking standalone design
  • Loading branch information
vsoch committed Aug 2, 2023
2 parents 805cb0a + df056c3 commit 0eab453
Show file tree
Hide file tree
Showing 22 changed files with 540 additions and 64 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ View our ⭐️ [Documentation](https://converged-computing.github.io/metrics-op

## Dinosaur TODO

- Add networking implementation and test netmark
- Add tests as proper tests in CI
- **Bug that config map not cleaning up with deletion**
- Need a strategy for storing metrics output / logs
Expand Down
24 changes: 18 additions & 6 deletions api/v1alpha1/metric_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"reflect"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)

// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
Expand All @@ -30,6 +31,7 @@ import (
type MetricSetSpec struct {

// The name of the metric (that will be associated with a flavor like storage)
// +optional
Metrics []Metric `json:"metrics"`

// Service name for the JobSet (MetricsSet) cluster network
Expand Down Expand Up @@ -58,7 +60,7 @@ type MetricSetSpec struct {
// +kubebuilder:default=1
// +default=1
// +optional
Completions int32 `json:"completions"`
Pods int32 `json:"pods"`
}

// Storage that will be monitored
Expand Down Expand Up @@ -137,6 +139,11 @@ type Metric struct {
// +optional
Rate int32 `json:"rate"`

// Metric Options
// Metric specific options
// +optional
Options map[string]intstr.IntOrString `json:"options"`

// Completions
// Number of completions to do, more relevant for service type applications
// that run forever, or a storage metric. If not set (0) then don't set a limit
Expand All @@ -152,6 +159,8 @@ type Metric struct {
func (m *MetricSet) GetPodLabels() map[string]string {
podLabels := map[string]string{}
podLabels["cluster-name"] = m.Name
// This is for the headless service
podLabels["metricset-name"] = m.Name
podLabels["namespace"] = m.Namespace
podLabels["app.kubernetes.io/name"] = m.Name
return podLabels
Expand All @@ -172,25 +181,28 @@ type MetricSet struct {
Status MetricSetStatus `json:"status,omitempty"`
}

// Determine if an application is present
// Determine if an application or storage is present, or standalone
func (m *MetricSet) HasApplication() bool {
	// An application is considered present when the spec's Application
	// differs from the zero-value Application struct.
	var empty Application
	isEmpty := reflect.DeepEqual(m.Spec.Application, empty)
	return !isEmpty
}

// HasStorage reports whether the MetricSet spec defines a storage entry,
// i.e., whether the spec's Storage differs from the zero-value Storage struct.
func (m *MetricSet) HasStorage() bool {
	var empty Storage
	isEmpty := reflect.DeepEqual(m.Spec.Storage, empty)
	return !isEmpty
}
// IsStandalone reports whether the MetricSet defines neither storage nor an
// application.
func (m *MetricSet) IsStandalone() bool {
	// Storage is checked first; the application check only runs when no
	// storage is present.
	return !(m.HasStorage() || m.HasApplication())
}

// Validate a requested metricset
func (m *MetricSet) Validate() bool {

// An application or storage setup is required
if !m.HasApplication() && !m.HasStorage() {
fmt.Printf("😥️ An application OR storage entry is required.\n")
if !m.HasApplication() && !m.HasStorage() && !m.IsStandalone() {
fmt.Printf("😥️ An application OR storage OR standalone entry is required.\n")
return false
}

// We don't currently support running both at once
// (but should be fine to allow extra standalone)
if m.HasApplication() && m.HasStorage() {
fmt.Printf("😥️ An application OR storage entry is required, not both.\n")
return false
Expand All @@ -202,7 +214,7 @@ func (m *MetricSet) Validate() bool {
}

// Storage or an application can have completions (replicas)
if m.Spec.Completions < 1 {
if m.Spec.Pods < 1 {
fmt.Printf("😥️ Completions must be >= 1.")
return false
}
Expand Down
8 changes: 8 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 13 additions & 7 deletions config/crd/bases/flux-framework.org_metricsets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,6 @@ spec:
- command
- image
type: object
completions:
default: 1
description: Number of job completions (e.g., pods)
format: int32
type: integer
deadlineSeconds:
default: 31500000
description: Should the job be limited to a particular number of seconds?
Expand Down Expand Up @@ -122,6 +117,14 @@ spec:
type: integer
name:
type: string
options:
additionalProperties:
anyOf:
- type: integer
- type: string
x-kubernetes-int-or-string: true
description: Metric Options Metric specific options
type: object
rate:
default: 10
description: Global attributes shared by all metrics Sampling
Expand All @@ -132,6 +135,11 @@ spec:
- name
type: object
type: array
pods:
default: 1
description: Number of job completions (e.g., pods)
format: int32
type: integer
serviceName:
default: ms
description: Service name for the JobSet (MetricsSet) cluster network
Expand Down Expand Up @@ -172,8 +180,6 @@ spec:
required:
- volume
type: object
required:
- metrics
type: object
status:
description: MetricStatus defines the observed state of Metric
Expand Down
13 changes: 8 additions & 5 deletions controllers/metric/metric.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,23 +91,26 @@ func (r *MetricSetReconciler) ensureJobSet(

var js *jobset.JobSet
if set.HasApplication() {
r.Log.Info("Creating application JobSet for MetricSet")
r.Log.Info("Creating application or standalone JobSet for MetricSet")
js, err = mctrl.GetApplicationJobSet(set, metrics)

} else if set.HasStorage() {
r.Log.Info("Creating storage JobSet for MetricSet")
js, err = mctrl.GetStorageJobSet(set, metrics)

} else {

// We shouldn't get here
r.Log.Info("A MetricSet must be for an application or storage.")
return js, ctrl.Result{}, err
r.Log.Info("Assuming standalone MetricSet.")
js, err = mctrl.GetStandaloneJobSet(set, metrics, map[string]api.Volume{}, false)
}
ctrl.SetControllerReference(set, js, r.Scheme)
if err != nil {
return js, ctrl.Result{}, err
}
r.Log.Info(
"🎉 Creating Metrics JobSet 🎉",
"Namespace:", js.Namespace,
"Name:", js.Name,
)
err = r.Client.Create(ctx, js)
if err != nil {
r.Log.Error(
Expand Down
9 changes: 5 additions & 4 deletions controllers/metric/metric_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,11 @@ func (r *MetricSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
} else if m.RequiresStorage() && set.HasStorage() {
r.Log.Info("Found storage metric", metric.Name, m.Description())
metrics = append(metrics, m)
} else if m.RequiresApplication() && set.HasStorage() {
r.Log.Info("Metric %s is for storage, but found application. Skipping.", metric.Name)
} else if m.RequiresStorage() && set.HasApplication() {
r.Log.Info("Metric %s is for application, but found storage. Skipping.", metric.Name)
} else if m.Standalone() && set.IsStandalone() {
r.Log.Info("Found standalone metric", metric.Name, m.Description())
metrics = append(metrics, m)
} else {
r.Log.Info("Metric %s is mismatched for expected MetricSet, skipping.", metric.Name)
}
}

Expand Down
17 changes: 9 additions & 8 deletions docs/development/designs.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,17 @@ That looks like this:
![img/storage-pod.png](img/storage-pod.png)


### Networking
### Standalone

**not implemented yet**
A standalone metric does not require an application container or a storage specification; instead, it uses a "standalone" setting that indicates it runs on its own. As an example, for a networking tool that uses MPI to run across nodes, we can set the number of pods to a number greater than 1, and the operator will create an indexed job with that many pods to run the command. That might look like this:

For this I am planning a similar design to the above two, except we won't need a shared process namespace or a volume. We can simply create an indexed job with the tool, and then (already implemented) use the shared network (headless service) for the tool to test communication between pods. This will likely need resources exposed to ensure one pod / hostname, so we will add that. I am first going to test this with Netmark, and then look into other HPC network testing tools.
![img/standalone-metric.png](img/standalone-metric.png)

### Others


There are likely others (and I need to think about it)
We don't technically need a shared process space, a storage setup, or an application.
And actually, the headless service that provides the network is available for storage
or applications as well - we just don't use it in the previous examples! The ability
to scale (via a number of pods > 1) is also a feature of storage and services if your
tool requires that.


## Database for Metric Storage
Expand All @@ -50,4 +51,4 @@ I want to try creating a consistent database that can be used to store metrics a
separately. Best case, we can manage it for them, or (better) not require it at all.
I don't want anything complicated (I don't want to re-create prometheus or a monitoring service!)

- Original diagrams are available on [Excalidraw](https://excalidraw.com/#json=U1quv0he2C1VpqenUBpa6,Rk-sw8Ku5iqdsSC49aJOBw)
- Original diagrams are available on [Excalidraw](https://excalidraw.com/#json=GSpMds50rqhuwMARRNcgA,m3HHKWx2hwNnWoS8GxoTzg)
Binary file added docs/development/img/standalone-metric.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions docs/getting_started/custom-resource-definition.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ Specifically, you must choose ONE of:
Where an application will be run for some number of pods (completions) and measured by metrics pods (separate pods) OR a storage metric will run directly, and with some
number of pods (completions) to bind to the storage and measure.

### completions
### pods

The number of completions for an application or storage metric test will correspond with the number of indexed job completions (pods) for the storage or application JobSet. This defaults to 1, meaning we run in a non-indexed mode. The indexed mode is determined automatically by this variable, where "1" indicates non-indexed, and >1 is indexed.
The number of pods for an application or storage metric test will correspond with the number of indexed job completions (which comes down to pods) for the storage or application JobSet. This defaults to 1, meaning we run in a non-indexed mode. The indexed mode is determined automatically by this variable, where "1" indicates non-indexed, and >1 is indexed.

### application

Expand Down
11 changes: 10 additions & 1 deletion docs/getting_started/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ The following metrics are under development (or being planned). These will be ea

- *io-sysstat*: the "iostat" executable of the sysstat library.

### Standalone

- *network-netmark*: this is currently a private container/software, but we have support for it when it's ready to be made public.

### Apps to be Measured

- LAMMPS (already in tests)
Expand All @@ -20,7 +24,12 @@ The following metrics are under development (or being planned). These will be ea

### Metrics To Be Added

- https://github.com/glennklockwood/bioinformatics-profile
- https://github.com/glennklockwood/bioinformatics-profile
- HPCToolkit
- https://dl.acm.org/doi/pdf/10.1145/3611007
- https://hpc.fau.de/research/tools/likwid/
- https://www.vi-hps.org/tools/tools.html
- https://open.xdmod.org/10.0/index.html


## Examples
Expand Down
11 changes: 9 additions & 2 deletions docs/getting_started/user-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,15 @@ Generally, you'll be defining an application container with one or more metrics

### Application Metrics

An application with metrics will allow
An application with metrics will allow you to run the application, and measure one or more metrics alongside it. This is done via sidecar containers.

### Storage Metric

A storage or IO metric will simply create the volume of interest that you request, and run the tool there. Read/write is important here - e.g., if the metric needs to write to the volume, a read only volume won't work.

For storage metrics, you aren't required to
### Standalone Metric

A standalone metric does not require special storage or an application! As an example,
a networking metric can simply be run with some number of pods (via an indexed job).

For more detail about this design, see the [developer docs](../development/index.md).
2 changes: 0 additions & 2 deletions docs/index.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# The Metrics Operator

<img style="width:50%" alt="Coming Soon" src="_static/images/coming-soon.png">

Welcome to the Metrics Operator Documentation!

The Metrics Operator is a Kubernetes Cluster [Operator](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/)
Expand Down
20 changes: 13 additions & 7 deletions examples/dist/metrics-operator-arm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,6 @@ spec:
- command
- image
type: object
completions:
default: 1
description: Number of job completions (e.g., pods)
format: int32
type: integer
deadlineSeconds:
default: 31500000
description: Should the job be limited to a particular number of seconds? Approximately one year. This cannot be zero or job won't start
Expand All @@ -117,6 +112,14 @@ spec:
type: integer
name:
type: string
options:
additionalProperties:
anyOf:
- type: integer
- type: string
x-kubernetes-int-or-string: true
description: Metric Options Metric specific options
type: object
rate:
default: 10
description: Global attributes shared by all metrics Sampling rate in seconds. Defaults to every 10 seconds
Expand All @@ -126,6 +129,11 @@ spec:
- name
type: object
type: array
pods:
default: 1
description: Number of job completions (e.g., pods)
format: int32
type: integer
serviceName:
default: ms
description: Service name for the JobSet (MetricsSet) cluster network
Expand Down Expand Up @@ -163,8 +171,6 @@ spec:
required:
- volume
type: object
required:
- metrics
type: object
status:
description: MetricStatus defines the observed state of Metric
Expand Down

0 comments on commit 0eab453

Please sign in to comment.