Skip to content

Commit

Permalink
Merge pull request #27 from converged-computing/add-state-endpoint
Browse files Browse the repository at this point in the history
feat: state endpoint
  • Loading branch information
vsoch committed Apr 6, 2024
2 parents e81a52b + 5a90abb commit 85520ee
Show file tree
Hide file tree
Showing 26 changed files with 1,030 additions and 269 deletions.
10 changes: 9 additions & 1 deletion Makefile
Expand Up @@ -79,6 +79,10 @@ test: tidy ## Runs unit tests
server: ## Runs uncompiled version of the server
go run cmd/server/server.go --global-token rainbow

.PHONY: server-verbose
server-verbose: ## Runs uncompiled version of the server with verbose logging
go run cmd/server/server.go --loglevel 6 --global-token rainbow

.PHONY: stream
stream: ## Runs the interface client
go run cmd/stream/stream.go
Expand All @@ -88,9 +92,13 @@ register: ## Run mock registration
go run cmd/rainbow/rainbow.go register cluster --cluster-name keebler --nodes-json ./docs/examples/scheduler/cluster-nodes.json --config-path ./docs/examples/scheduler/rainbow-config.yaml --save

.PHONY: subsystem
subsystem: ## Run mock registration
subsystem: ## Register subsystem
go run cmd/rainbow/rainbow.go register subsystem --subsystem io --nodes-json ./docs/examples/scheduler/cluster-io-subsystem.json --config-path ./docs/examples/scheduler/rainbow-config.yaml

.PHONY: update-state
update-state: ## Update state
go run cmd/rainbow/rainbow.go update state --state-file ./docs/examples/scheduler/cluster-state.json --config-path ./docs/examples/scheduler/rainbow-config.yaml

.PHONY: tag
tag: ## Creates release tag
git tag -s -m "version bump to $(VERSION)" $(VERSION)
Expand Down
27 changes: 27 additions & 0 deletions api/v1/rainbow.proto
Expand Up @@ -18,6 +18,10 @@ service RainbowScheduler {
// Job Submission - request for submitting a job to a named cluster
rpc SubmitJob(SubmitJobRequest) returns (SubmitJobResponse);

// Update State - allow a cluster to provide state metadata
// This is intended for use by a selection algorithm
rpc UpdateState(UpdateStateRequest) returns (UpdateStateResponse);

// Request Job - ask the rainbow scheduler for up to max jobs
rpc ReceiveJobs(ReceiveJobsRequest) returns (ReceiveJobsResponse);

Expand All @@ -37,6 +41,29 @@ message RegisterRequest {
google.protobuf.Timestamp sent = 5;
}

// UpdateStateRequest allows a cluster to set arbitrary metadata
// for its state. State metadata is used for selection algorithms.
message UpdateStateRequest {
// Name of the cluster whose state is being updated
string cluster = 1;
// Secret for the named cluster
// NOTE(review): presumably the secret issued at registration - confirm
string secret = 2;

// State metadata for the cluster. We are generous in that the
// payload can be a flat set of key value pairs, which will be
// parsed into types within the graph database
string payload = 3;
}

// UpdateStateResponse reports the outcome of an UpdateState request
message UpdateStateResponse {
// ResultType enumerates the possible outcomes of a state update
// NOTE(review): PARTIAL presumably means only some of the provided
// key value pairs were applied - confirm against the server
enum ResultType {
UPDATE_STATE_UNSPECIFIED = 0;
UPDATE_STATE_PARTIAL = 1;
UPDATE_STATE_SUCCESS = 2;
UPDATE_STATE_ERROR = 3;
}
// Outcome of the update
ResultType status = 1;
}


// SubmitJobRequest takes a job name, cluster name
// and requires the cluster token. Since we want to be generic,
// we currently accept nodes, tasks, and the command
Expand Down
17 changes: 17 additions & 0 deletions cmd/rainbow/rainbow.go
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/converged-computing/rainbow/cmd/rainbow/receive"
"github.com/converged-computing/rainbow/cmd/rainbow/register"
"github.com/converged-computing/rainbow/cmd/rainbow/submit"
"github.com/converged-computing/rainbow/cmd/rainbow/update"
"github.com/converged-computing/rainbow/pkg/types"

// Register database backends and selection algorithms
Expand Down Expand Up @@ -41,6 +42,7 @@ func main() {
submitCmd := parser.NewCommand("submit", "Submit a job to a rainbow scheduler")
receiveCmd := parser.NewCommand("receive", "Receive and accept jobs")
registerClusterCmd := registerCmd.NewCommand("cluster", "Register a new cluster")
updateCmd := parser.NewCommand("update", "Update a cluster")

// Configuration
configCmd := parser.NewCommand("config", "Interact with rainbow configs")
Expand Down Expand Up @@ -69,6 +71,10 @@ func main() {
// Register subsystem (requires config file for authentication)
subsysCmd := registerCmd.NewCommand("subsystem", "Register a new subsystem")

// Update subcommands - currently only state is supported
stateCmd := updateCmd.NewCommand("state", "Update the state for a known cluster")
stateFile := stateCmd.String("", "state-file", &argparse.Options{Help: "JSON file with key, value attributes for the cluster"})

// Submit (note that command for now needs to be in quotes to get the whole thing)
token := submitCmd.String("", "token", &argparse.Options{Default: defaultSecret, Help: "Client token to submit jobs with."})
nodes := submitCmd.Int("n", "nodes", &argparse.Options{Default: 1, Help: "Number of nodes to request"})
Expand All @@ -91,6 +97,17 @@ func main() {
log.Fatalf("Issue with config: %s\n", err)
}

} else if stateCmd.Happened() {
err := update.UpdateState(
*host,
*clusterName,
*stateFile,
*cfg,
)
if err != nil {
log.Fatalf("Issue with register subsystem: %s\n", err)
}

} else if registerCmd.Happened() {

if subsysCmd.Happened() {
Expand Down
51 changes: 51 additions & 0 deletions cmd/rainbow/update/state.go
@@ -0,0 +1,51 @@
package update

import (
"context"
"fmt"
"log"

"github.com/converged-computing/rainbow/pkg/client"
"github.com/converged-computing/rainbow/pkg/config"
)

// UpdateState updates state for a cluster
func UpdateState(
host,
clusterName,
stateFile,
cfgFile string,
) error {

c, err := client.NewClient(host)
if err != nil {
return err
}

// A config file is required here
if cfgFile == "" {
return fmt.Errorf("an existing configuration file is required to update an existing cluster")
}
if stateFile == "" {
return fmt.Errorf("a state file (json with key value pairs) is required to update state")
}
// Read in the config, if provided, command line takes preference
cfg, err := config.NewRainbowClientConfig(cfgFile, "", "", "", "", "")
if err != nil {
return err
}

log.Printf("updating state for cluster: %s", cfg.Scheduler.Name)

// Last argument is subsystem name, which we can derive from graph
response, err := c.UpdateState(
context.Background(),
cfg.Cluster.Name,
cfg.Cluster.Secret,
stateFile,
)
// If we get here, success! Dump all the stuff.
log.Printf("%s", response)
return err

}
38 changes: 38 additions & 0 deletions docs/commands.md
Expand Up @@ -7,7 +7,11 @@ The following commands are currently supported. For Python, see the [README](htt
You can run the server (with defaults) as follows:

```bash
# Regular logging
make server

# Verbose logging
make server-verbose
```
```console
go run cmd/server/server.go --global-token rainbow
Expand Down Expand Up @@ -180,6 +184,40 @@ the following reasons:
In Computer Science I think they are used interchangeably. For next steps we will be updating the memory graph database to be a little more meaty (adding proper metadata and likely a summary of resources at the top as a quick "does it satisfy" heuristic)
and then working on the next interaction, the client submit command, which is going to hit the `Satisfies` endpoint. I will write up more about the database and submit design after that.

## Update State

A cluster state is intended to be a superficial view of the cluster status. It's not considered a subsystem because (for the time being) we are only considering a flat listing of key value pairs that describe a cluster. The data is also intended to be small so it can be provided via this update endpoint more frequently. As an example, an update payload may look like the following:

```json
{
"cost-per-node": 12,
"nodes-free": 100
}
```

The above would not be suited for a real-world deployment (for example, there are many more costs than per node, and occupancy goes beyond nodes free),
but it is appropriate for small tests and simulations. The metadata above will be provided, on a cluster level, for a final selection algorithm (to use or not). So after you've created your cluster, let's update the state.

```bash
make update-state
```
```console
Adding edge from socket -contains-> core
Adding edge from socket -contains-> core
2024/04/05 18:38:13 We have made an in memory graph (subsystem cluster) with 45 vertices!
Metrics for subsystem cluster{
"cluster": 1,
"core": 36,
"node": 3,
"rack": 1,
"socket": 3
}
2024/04/05 18:38:16 📝️ received state update: keebler
Updating state cost-per-node to 12
Updating state max-jobs to 100
```
In verbose logging mode (`make server-verbose`) you will see the values updated, as shown above. They are also in blue, which you can't see! Note that this state metadata is provided to a selection algorithm, and we will be adding more interesting ones soon for experiments!

## Register Subsystem

Adding a subsystem means adding another graph that has nodes with edges that connect (in some meaningful way) to the dominant subsystem.
Expand Down
4 changes: 4 additions & 0 deletions docs/examples/scheduler/cluster-state.json
@@ -0,0 +1,4 @@
{
"cost-per-node": 12,
"max-jobs": 100
}
2 changes: 1 addition & 1 deletion docs/examples/scheduler/rainbow-config.yaml
Expand Up @@ -8,7 +8,7 @@ scheduler:
name: match
cluster:
name: keebler
secret: 79643f8e-6408-4cd1-bd1a-76aa499d5864
secret: 76948f58-8655-48c1-aaab-fccedf2bb383
graphdatabase:
name: memory
host: 127.0.0.1:50051
Expand Down

0 comments on commit 85520ee

Please sign in to comment.