Merge pull request #7 from converged-computing/add/network-design

add networking standalone design
converged-computing · Aug 2, 2023 · 0eab453 · 0eab453
2 parents 805cb0a + df056c3
commit 0eab453
Show file tree

Hide file tree

Showing 22 changed files with 540 additions and 64 deletions.
diff --git a/README.md b/README.md
@@ -10,7 +10,6 @@ View our ⭐️ [Documentation](https://converged-computing.github.io/metrics-op
 
 ## Dinosaur TODO
 
-- Add networking implementation and test netmark
 - Add tests as proper tests in CI
 - **Bug that config map not cleaning up with deletion**
 - Need a strategy for storing metrics output / logs

diff --git a/api/v1alpha1/metric_types.go b/api/v1alpha1/metric_types.go
@@ -21,6 +21,7 @@ import (
 	"reflect"
 
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 )
 
 // EDIT THIS FILE!  THIS IS SCAFFOLDING FOR YOU TO OWN!
@@ -30,6 +31,7 @@ import (
 type MetricSetSpec struct {
 
 	// The name of the metric (that will be associated with a flavor like storage)
+	// +optional
 	Metrics []Metric `json:"metrics"`
 
 	// Service name for the JobSet (MetricsSet) cluster network
@@ -58,7 +60,7 @@ type MetricSetSpec struct {
 	// +kubebuilder:default=1
 	// +default=1
 	// +optional
-	Completions int32 `json:"completions"`
+	Pods int32 `json:"pods"`
 }
 
 // Storage that will be monitored
@@ -137,6 +139,11 @@ type Metric struct {
 	// +optional
 	Rate int32 `json:"rate"`
 
+	// Metric Options
+	// Metric specific options
+	// +optional
+	Options map[string]intstr.IntOrString `json:"options"`
+
 	// Completions
 	// Number of completions to do, more relevant for service type applications
 	// that run forever, or a storage metric. If not set (0) then don't set a limit
@@ -152,6 +159,8 @@ type Metric struct {
 func (m *MetricSet) GetPodLabels() map[string]string {
 	podLabels := map[string]string{}
 	podLabels["cluster-name"] = m.Name
+	// This is for the headless service
+	podLabels["metricset-name"] = m.Name
 	podLabels["namespace"] = m.Namespace
 	podLabels["app.kubernetes.io/name"] = m.Name
 	return podLabels
@@ -172,25 +181,28 @@ type MetricSet struct {
 	Status MetricSetStatus `json:"status,omitempty"`
 }
 
-// Determine if an application is present
+// Determine if an application or storage is present, or standalone
 func (m *MetricSet) HasApplication() bool {
 	return !reflect.DeepEqual(m.Spec.Application, Application{})
 }
-
 func (m *MetricSet) HasStorage() bool {
 	return !reflect.DeepEqual(m.Spec.Storage, Storage{})
 }
+func (m *MetricSet) IsStandalone() bool {
+	return !m.HasStorage() && !m.HasApplication()
+}
 
 // Validate a requested metricset
 func (m *MetricSet) Validate() bool {
 
 	// An application or storage setup is required
-	if !m.HasApplication() && !m.HasStorage() {
-		fmt.Printf("😥️ An application OR storage entry is required.\n")
+	if !m.HasApplication() && !m.HasStorage() && !m.IsStandalone() {
+		fmt.Printf("😥️ An application OR storage OR standalone entry is required.\n")
 		return false
 	}
 
 	// We don't currently support running both at once
+	// (but should be fine to allow extra standalone)
 	if m.HasApplication() && m.HasStorage() {
 		fmt.Printf("😥️ An application OR storage entry is required, not both.\n")
 		return false
@@ -202,7 +214,7 @@ func (m *MetricSet) Validate() bool {
 	}
 
 	// Storage or an application can have completions (replicas)
-	if m.Spec.Completions < 1 {
+	if m.Spec.Pods < 1 {
 		fmt.Printf("😥️ Completions must be >= 1.")
 		return false
 	}

diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
diff --git a/config/crd/bases/flux-framework.org_metricsets.yaml b/config/crd/bases/flux-framework.org_metricsets.yaml
@@ -89,11 +89,6 @@ spec:
                 - command
                 - image
                 type: object
-              completions:
-                default: 1
-                description: Number of job completions (e.g., pods)
-                format: int32
-                type: integer
               deadlineSeconds:
                 default: 31500000
                 description: Should the job be limited to a particular number of seconds?
@@ -122,6 +117,14 @@ spec:
                       type: integer
                     name:
                       type: string
+                    options:
+                      additionalProperties:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        x-kubernetes-int-or-string: true
+                      description: Metric Options Metric specific options
+                      type: object
                     rate:
                       default: 10
                       description: Global attributes shared by all metrics Sampling
@@ -132,6 +135,11 @@ spec:
                   - name
                   type: object
                 type: array
+              pods:
+                default: 1
+                description: Number of job completions (e.g., pods)
+                format: int32
+                type: integer
               serviceName:
                 default: ms
                 description: Service name for the JobSet (MetricsSet) cluster network
@@ -172,8 +180,6 @@ spec:
                 required:
                 - volume
                 type: object
-            required:
-            - metrics
             type: object
           status:
             description: MetricStatus defines the observed state of Metric

diff --git a/controllers/metric/metric.go b/controllers/metric/metric.go
@@ -91,23 +91,26 @@ func (r *MetricSetReconciler) ensureJobSet(
 
 		var js *jobset.JobSet
 		if set.HasApplication() {
-			r.Log.Info("Creating application JobSet for MetricSet")
+			r.Log.Info("Creating application or standalone JobSet for MetricSet")
 			js, err = mctrl.GetApplicationJobSet(set, metrics)
 
 		} else if set.HasStorage() {
 			r.Log.Info("Creating storage JobSet for MetricSet")
 			js, err = mctrl.GetStorageJobSet(set, metrics)
 
 		} else {
-
-			// We shouldn't get here
-			r.Log.Info("A MetricSet must be for an application or storage.")
-			return js, ctrl.Result{}, err
+			r.Log.Info("Assuming standalone MetricSet.")
+			js, err = mctrl.GetStandaloneJobSet(set, metrics, map[string]api.Volume{}, false)
 		}
 		ctrl.SetControllerReference(set, js, r.Scheme)
 		if err != nil {
 			return js, ctrl.Result{}, err
 		}
+		r.Log.Info(
+			"🎉 Creating Metrics JobSet 🎉",
+			"Namespace:", js.Namespace,
+			"Name:", js.Name,
+		)
 		err = r.Client.Create(ctx, js)
 		if err != nil {
 			r.Log.Error(

diff --git a/controllers/metric/metric_controller.go b/controllers/metric/metric_controller.go
@@ -126,10 +126,11 @@ func (r *MetricSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 		} else if m.RequiresStorage() && set.HasStorage() {
 			r.Log.Info("Found storage metric", metric.Name, m.Description())
 			metrics = append(metrics, m)
-		} else if m.RequiresApplication() && set.HasStorage() {
-			r.Log.Info("Metric %s is for storage, but found application. Skipping.", metric.Name)
-		} else if m.RequiresStorage() && set.HasApplication() {
-			r.Log.Info("Metric %s is for application, but found storage. Skipping.", metric.Name)
+		} else if m.Standalone() && set.IsStandalone() {
+			r.Log.Info("Found standalone metric", metric.Name, m.Description())
+			metrics = append(metrics, m)
+		} else {
+			r.Log.Info("Metric %s is mismatched for expected MetricSet, skipping.", metric.Name)
 		}
 	}
 

diff --git a/docs/development/designs.md b/docs/development/designs.md
@@ -32,16 +32,17 @@ That looks like this:
 ![img/storage-pod.png](img/storage-pod.png)
 
 
-### Networking
+### Standalone
 
-**not implemented yet**
+A standalone metric does not require an application container or a storage specification, but rather uses a "standalone" setting that indicates it runs on its own. As an example, for a networking tool that uses MPI to run across nodes, we can set the number of pods to a number greater than 1, and we will then create an indexed job with that many pods to run the command. That might look like this:
 
-For this I am planning a similar design to the above two, except we won't need a shared process namespace or a volume. We can simply create an indexed job with the tool, and then (already implemented) use the shared network (headless service) for the tool to test communication between pods. This will likely need resources exposed to ensure one pod / hostname, so we will add that. I am first going to test this with Netmark, and then look into other HPC network testing tools.
+![img/standalone-metric.png](img/standalone-metric.png)
 
-### Others
-
-
-There are likely others (and I need to think about it)
+We don't technically need a shared process space, a storage setup, or an application.
+And actually, the headless service that provides the network is available for storage
+or applications as well - we just don't use it in the previous examples! The ability
+to scale (via a number of pods > 1) is also a feature of storage and services if your
+tool requires that.
 
 
 ## Database for Metric Storage
@@ -50,4 +51,4 @@ I want to try creating a consistent database that can be used to store metrics a
 separately. Best case, we can manage it for them, or (better) not require it at all.
 I don't want anything complicated (I don't want to re-create prometheus or a monitoring service!)
 
- - Original diagrams are available on [Excalidraw](https://excalidraw.com/#json=U1quv0he2C1VpqenUBpa6,Rk-sw8Ku5iqdsSC49aJOBw)
+ - Original diagrams are available on [Excalidraw](https://excalidraw.com/#json=GSpMds50rqhuwMARRNcgA,m3HHKWx2hwNnWoS8GxoTzg)
diff --git a/docs/development/img/standalone-metric.png b/docs/development/img/standalone-metric.png
diff --git a/docs/getting_started/custom-resource-definition.md b/docs/getting_started/custom-resource-definition.md
@@ -40,9 +40,9 @@ Specifically, you must choose ONE of:
 Where an application will be run for some number of pods (completions) and measured by metrics pods (separate pods) OR a storage metric will run directly, and with some
 number of pods (completions) to bind to the storage and measure.
 
-### completions
+### pods
 
-The number of completions for an application or storage metric test will correspond with the number of indexed job completions (pods) for the storage or application JobSet. This defaults to 1, meaning we run in a non-indexed mode. The indexed mode is determined automatically by this variable, where "1" indicates non-indexed, and >1 is indexed.
+The number of pods for an application or storage metric test corresponds to the number of indexed job completions (i.e., pods) for the storage or application JobSet. This defaults to 1, meaning we run in a non-indexed mode. The indexed mode is determined automatically by this variable, where "1" indicates non-indexed, and >1 is indexed.
 
 ### application
 

diff --git a/docs/getting_started/metrics.md b/docs/getting_started/metrics.md
@@ -12,6 +12,10 @@ The following metrics are under development (or being planned). These will be ea
 
  - *io-sysstat*: the "iostat" executable of the sysstat library.
 
+### Standalone
+
+ - *network-netmark*: this is currently a private container/software, but we have support for it when it's ready to be made public.
+
 ### Apps to be Measured
 
  - LAMMPS (already in tests)
@@ -20,7 +24,12 @@ The following metrics are under development (or being planned). These will be ea
 
 ### Metrics To Be Added
 
- -  https://github.com/glennklockwood/bioinformatics-profile
+ - https://github.com/glennklockwood/bioinformatics-profile
+ - HPCToolkit
+ - https://dl.acm.org/doi/pdf/10.1145/3611007
+ - https://hpc.fau.de/research/tools/likwid/
+ - https://www.vi-hps.org/tools/tools.html
+ - https://open.xdmod.org/10.0/index.html
 
 
 ## Examples

diff --git a/docs/getting_started/user-guide.md b/docs/getting_started/user-guide.md
@@ -77,8 +77,15 @@ Generally, you'll be defining an application container with one or more metrics
 
 ### Application Metrics
 
-An application with metrics will allow 
+An application with metrics will allow you to run the application, and measure one or more metrics alongside it. This is done via sidecar containers.
+
+### Storage Metric
+
 A storage or IO metric will simply create the volume of interest that you request, and run the tool there. Read/write is important here - e.g., if the metric needs to write to the volume, a read only volume won't work.
 
-For storage metrics, you aren't required to 
+### Standalone Metric
+
+A standalone metric does not require special storage or an application! As an example,
+a networking metric can simply be run with some number of pods (via the indexed jobs).
 
+For more detail about this design, see the [developer docs](../development/index.md).
diff --git a/docs/index.md b/docs/index.md
@@ -1,7 +1,5 @@
 # The Metrics Operator
 
-<img style="width:50%" alt="Coming Soon" src="_static/images/coming-soon.png">
-
 Welcome to the Metrics Operator Documentation!
 
 The Metrics Operator is a Kubernetes Cluster [Operator](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/)

diff --git a/examples/dist/metrics-operator-arm.yaml b/examples/dist/metrics-operator-arm.yaml
@@ -91,11 +91,6 @@ spec:
                 - command
                 - image
                 type: object
-              completions:
-                default: 1
-                description: Number of job completions (e.g., pods)
-                format: int32
-                type: integer
               deadlineSeconds:
                 default: 31500000
                 description: Should the job be limited to a particular number of seconds? Approximately one year. This cannot be zero or job won't start
@@ -117,6 +112,14 @@ spec:
                       type: integer
                     name:
                       type: string
+                    options:
+                      additionalProperties:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        x-kubernetes-int-or-string: true
+                      description: Metric Options Metric specific options
+                      type: object
                     rate:
                       default: 10
                       description: Global attributes shared by all metrics Sampling rate in seconds. Defaults to every 10 seconds
@@ -126,6 +129,11 @@ spec:
                   - name
                   type: object
                 type: array
+              pods:
+                default: 1
+                description: Number of job completions (e.g., pods)
+                format: int32
+                type: integer
               serviceName:
                 default: ms
                 description: Service name for the JobSet (MetricsSet) cluster network
@@ -163,8 +171,6 @@ spec:
                 required:
                 - volume
                 type: object
-            required:
-            - metrics
             type: object
           status:
             description: MetricStatus defines the observed state of Metric