From 45b411fd923146423593460b561f3ac7e7f1853c Mon Sep 17 00:00:00 2001 From: Chris Kery <140152984+chriskery@users.noreply.github.com> Date: Mon, 2 Oct 2023 14:10:09 +0800 Subject: [PATCH] Feature/dev torque schema (#6) * torqueue init * support torce cluster create * support centos and ubuntu base image for torque test case * polymerization pbspro cmd --- README.md | 10 +-- docs/quick-start.md | 65 +++++++++------ manifests/samples/kustomization.yaml | 4 +- ...{torque-centos.yaml => pbspro-centos.yaml} | 4 +- ...{torque-pbspro.yaml => pbspro-pbspro.yaml} | 4 +- ...{torque-ubuntu.yaml => pbspro-ubuntu.yaml} | 4 +- .../samples/slurm-centos-hostNetwork.yaml | 2 +- manifests/samples/slurm-centos.yaml | 2 +- .../cluster_schema/pbspro_schema/config.go | 31 +++++++ .../configmap.go | 81 ++++++++++++++----- .../{torque_schema => pbspro_schema}/env.go | 2 +- .../initcontainer.go | 10 +-- .../pbspro_schema.go} | 47 ++++++----- .../{torque_schema => pbspro_schema}/util.go | 30 ++++--- .../cluster_schema/register_cluster_schema.go | 6 +- .../cluster_schema/torque_schema/config.go | 31 ------- 16 files changed, 198 insertions(+), 135 deletions(-) rename manifests/samples/{torque-centos.yaml => pbspro-centos.yaml} (95%) rename manifests/samples/{torque-pbspro.yaml => pbspro-pbspro.yaml} (95%) rename manifests/samples/{torque-ubuntu.yaml => pbspro-ubuntu.yaml} (95%) create mode 100644 pkg/controller/cluster_schema/pbspro_schema/config.go rename pkg/controller/cluster_schema/{torque_schema => pbspro_schema}/configmap.go (55%) rename pkg/controller/cluster_schema/{torque_schema => pbspro_schema}/env.go (99%) rename pkg/controller/cluster_schema/{torque_schema => pbspro_schema}/initcontainer.go (94%) rename pkg/controller/cluster_schema/{torque_schema/torque_schema.go => pbspro_schema/pbspro_schema.go} (83%) rename pkg/controller/cluster_schema/{torque_schema => pbspro_schema}/util.go (85%) delete mode 100644 pkg/controller/cluster_schema/torque_schema/config.go diff --git 
a/README.md b/README.md index 823a674..0913144 100644 --- a/README.md +++ b/README.md @@ -3,28 +3,26 @@ [![Coverage Status](https://coveralls.io/repos/github/chriskery/kubecluster/badge.svg?branch=master)](https://coveralls.io/github/chriskery/kubecluster?branch=master) [![Go Report Card](https://goreportcard.com/badge/github.com/chriskery/kubecluster)](https://goreportcard.com/report/github.com/chriskery/kubecluster) -### The kubecluster implements a mechanism that makes it easy to build Slurm/Torque clusters on Kubernetes. +### Kubecluster implements a mechanism that makes it easy to build Slurm/PBS Professional clusters on Kubernetes. ## Features Kubecluster uses Pods to simulate nodes in different clusters, currently supports the following cluster types : - [Slurm](pkg/controller/slurm_schema) -- [Torque( PBS )](pkg/controller/torque_schema) +- [PBS Professional](pkg/controller/cluster_schema/pbspro_schema) ## Getting Started You’ll need a Kubernetes cluster to run against. You can use [KIND](https://sigs.k8s.io/kind) to get a local cluster for testing, or run against a remote cluster. **Note:** Your controller will automatically use the current context in your kubeconfig file (i.e. whatever cluster `kubectl cluster-info` shows). ## Installation -### Master Branch - -```bash +```sh kubectl apply -k "github.com/chriskery/kubecluster/manifests/default" ``` ## Quick Start -Please refer to the [quick-start.md](docs/quick-start.md) and [Kubeflow Training User Guide](https://www.kubeflow.org/docs/guides/components/tftraining/) for more information. +Please refer to the [quick-start.md](docs/quick-start.md) for more information. 
### How it works diff --git a/docs/quick-start.md b/docs/quick-start.md index f59614a..e3cf142 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -1,15 +1,15 @@ -## Create a Torque Cluster +## Build a pbspro Cluster -**Create Torque YAML** +**Create pbspro YAML** ``` -kubectl create -f ../manifests/samples/torque-centos.yaml +kubectl create -f ../manifests/samples/pbspro-centos.yaml ``` -The torque centos example create a torque cluster with 1 server and 1 worker, -so it will create two pods to simulate two nodes for the torque cluster +The pbspro centos example creates a pbspro cluster with 1 server and 1 worker, +so it will create two pods to simulate the two nodes of the pbspro cluster -**Get Torque Status** +**Get kubeclusters Status** Execute the following command: ``` @@ -19,40 +19,41 @@ The output is like: ```shell > kubectl get kubeclusters NAME AGE STATE -torque-centos-sample 3s Running +pbspro-centos-sample 3s Running ``` -Now you can enter the " server node " and use this torque-centos-sample look like you're actually using a physical torque cluster +Now you can enter the "server node" and use it as if you were actually using a physical pbspro cluster ``` > kubectl get pods NAME READY STATUS RESTARTS AGE nginx-deployment-5bc4c45dc9-npwxp 1/1 Running 16 46h -torque-centos-sample-cpu-0 1/1 Running 0 2m43s -torque-centos-sample-server-0 1/1 Running 0 2m43s +pbspro-centos-sample-cpu-0 1/1 Running 0 2m43s +pbspro-centos-sample-server-0 1/1 Running 0 2m43s ``` -torque-centos-sample-server-0 is the server node of cluster torque-centos-sample +pbspro-centos-sample-server-0 is the server node of cluster pbspro-centos-sample ``` -> kubectl exec -it torque-centos-sample-server-0 /bin/bash +> kubectl exec -it pbspro-centos-sample-server-0 /bin/bash kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead. 
-[root@torque-centos-sample-server-0 /]# +[root@pbspro-centos-sample-server-0 /]# ``` -**Using Torque Cluster** -Viewing Nodes' status of torque-centos-sample +**Using pbspro Cluster** + +Viewing Nodes' status of pbspro-centos-sample ``` -[root@torque-centos-sample-server-0 pbs]# pbsnodes -a -torque-centos-sample-server-0 - Mom = torque-centos-sample-server-0 +[root@pbspro-centos-sample-server-0 pbs]# pbsnodes -a +pbspro-centos-sample-server-0 + Mom = pbspro-centos-sample-server-0 Port = 15002 pbs_version = 19.0.0 ntype = PBS state = free pcpus = 16 resources_available.arch = linux - resources_available.host = torque-centos-sample-server-0 + resources_available.host = pbspro-centos-sample-server-0 resources_available.mem = 64756484kb resources_available.ncpus = 16 - resources_available.vnode = torque-centos-sample-server-0 + resources_available.vnode = pbspro-centos-sample-server-0 resources_assigned.accelerator_memory = 0kb resources_assigned.hbmem = 0kb resources_assigned.mem = 0kb @@ -63,8 +64,8 @@ torque-centos-sample-server-0 sharing = default_shared last_state_change_time = Thu Sep 28 07:05:43 2023 -torque-centos-sample-cpu-0 - Mom = 10-244-0-56.torque-centos-sample-cpu-0.default.svc.cluster.local +pbspro-centos-sample-cpu-0 + Mom = 10-244-0-56.pbspro-centos-sample-cpu-0.default.svc.cluster.local Port = 15002 pbs_version = 19.0.0 ntype = PBS @@ -74,7 +75,7 @@ torque-centos-sample-cpu-0 resources_available.host = 10-244-0-56 resources_available.mem = 64756484kb resources_available.ncpus = 16 - resources_available.vnode = torque-centos-sample-cpu-0 + resources_available.vnode = pbspro-centos-sample-cpu-0 resources_assigned.accelerator_memory = 0kb resources_assigned.hbmem = 0kb resources_assigned.mem = 0kb @@ -85,5 +86,23 @@ torque-centos-sample-cpu-0 sharing = default_shared last_state_change_time = Thu Sep 28 07:05:43 2023 ``` +Switch to the normal user and submit the job using [qsub](https://www.jlab.org/hpc/PBS/qsub.html) +```shell 
+[root@pbspro-centos-sample-server-0 /]# useradd pbsexample +[root@pbspro-centos-sample-server-0 /]# su pbsexample +[pbsexample@pbspro-centos-sample-server-0 /]$ qsub -- hostname +2.pbspro-centos-sample-server-0 +[pbsexample@pbspro-centos-sample-server-0 /]$ +``` +Use [qstat](https://docs.adaptivecomputing.com/torque/4-0-2/Content/topics/commands/qstat.htm) to view the job we just submitted +``` +[pbsexample@pbspro-centos-sample-server-0 /]$ qstat -a + +pbspro-centos-sample-server-0: + Req'd Req'd Elap +Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time +--------------- -------- -------- ---------- ------ --- --- ------ ----- - ----- +2.pbspro-centos pbsexamp workq STDIN 1377 1 1 -- -- E 00:00 +``` diff --git a/manifests/samples/kustomization.yaml b/manifests/samples/kustomization.yaml index fec454f..993b872 100644 --- a/manifests/samples/kustomization.yaml +++ b/manifests/samples/kustomization.yaml @@ -3,6 +3,6 @@ resources: - slurm-centos.yaml - slurm-centos-hostNetwork.yaml - slurm-ubuntu.yaml -- torque-centos.yaml -- torque-ubuntu.yaml +- pbspro-centos.yaml +- pbspro-ubuntu.yaml #+kubebuilder:scaffold:manifestskustomizesamples diff --git a/manifests/samples/torque-centos.yaml b/manifests/samples/pbspro-centos.yaml similarity index 95% rename from manifests/samples/torque-centos.yaml rename to manifests/samples/pbspro-centos.yaml index 30cf8c2..0bb1e63 100644 --- a/manifests/samples/torque-centos.yaml +++ b/manifests/samples/pbspro-centos.yaml @@ -5,9 +5,9 @@ metadata: app.kubernetes.io/part-of: kubecluster app.kubernetes.io/managed-by: kustomize app.kubernetes.io/created-by: kubecluster - name: torque-centos-sample + name: pbspro-centos-sample spec: - clusterType: torque + clusterType: pbspro clusterReplicaSpec: Server: replicas: 1 diff --git a/manifests/samples/torque-pbspro.yaml b/manifests/samples/pbspro-pbspro.yaml similarity index 95% rename from manifests/samples/torque-pbspro.yaml rename to manifests/samples/pbspro-pbspro.yaml index 
1bf03d5..95ff4dc 100644 --- a/manifests/samples/torque-pbspro.yaml +++ b/manifests/samples/pbspro-pbspro.yaml @@ -5,9 +5,9 @@ metadata: app.kubernetes.io/part-of: kubecluster app.kubernetes.io/managed-by: kustomize app.kubernetes.io/created-by: kubecluster - name: torque-pbspro-sample + name: pbspro-pbspro-sample spec: - clusterType: torque + clusterType: pbspro clusterReplicaSpec: Server: replicas: 1 diff --git a/manifests/samples/torque-ubuntu.yaml b/manifests/samples/pbspro-ubuntu.yaml similarity index 95% rename from manifests/samples/torque-ubuntu.yaml rename to manifests/samples/pbspro-ubuntu.yaml index ca13e0e..38d7fa6 100644 --- a/manifests/samples/torque-ubuntu.yaml +++ b/manifests/samples/pbspro-ubuntu.yaml @@ -5,9 +5,9 @@ metadata: app.kubernetes.io/part-of: kubecluster app.kubernetes.io/managed-by: kustomize app.kubernetes.io/created-by: kubecluster - name: torque-ubuntu-sample + name: pbspro-ubuntu-sample spec: - clusterType: torque + clusterType: pbspro clusterReplicaSpec: Server: replicas: 1 diff --git a/manifests/samples/slurm-centos-hostNetwork.yaml b/manifests/samples/slurm-centos-hostNetwork.yaml index 586b31b..ddaeaa0 100644 --- a/manifests/samples/slurm-centos-hostNetwork.yaml +++ b/manifests/samples/slurm-centos-hostNetwork.yaml @@ -5,7 +5,7 @@ metadata: app.kubernetes.io/part-of: kubecluster app.kubernetes.io/managed-by: kustomize app.kubernetes.io/created-by: kubecluster - name: centos-hostnetwork-sample + name: slurm-centos-hostnetwork-sample spec: clusterType: slurm clusterReplicaSpec: diff --git a/manifests/samples/slurm-centos.yaml b/manifests/samples/slurm-centos.yaml index b9f594e..7fe613e 100644 --- a/manifests/samples/slurm-centos.yaml +++ b/manifests/samples/slurm-centos.yaml @@ -5,7 +5,7 @@ metadata: app.kubernetes.io/part-of: kubecluster app.kubernetes.io/managed-by: kustomize app.kubernetes.io/created-by: kubecluster - name: centos-sample + name: slurm-centos-sample spec: clusterType: slurm clusterReplicaSpec: diff --git 
a/pkg/controller/cluster_schema/pbspro_schema/config.go b/pkg/controller/cluster_schema/pbspro_schema/config.go new file mode 100644 index 0000000..b656ccc --- /dev/null +++ b/pkg/controller/cluster_schema/pbspro_schema/config.go @@ -0,0 +1,31 @@ +package pbspro_schema + +import "flag" + +// Config is the global configuration for the training operator. +var config struct { + pbsproSchemaInitContainerTemplateFile string + pbsproSchemaInitContainerImage string + pbsproSchemaInitContainerMaxTries int +} + +const ( + // pbsproSchemaInitContainerImageDefault is the default image for the pbsproSchema + // init container. + pbsproSchemaInitContainerImageDefault = "registry.cn-shanghai.aliyuncs.com/eflops-bcp/pbs-minimal:v1" + // pbsproSchemaInitContainerTemplateFileDefault is the default template file for + // the pbsproSchema init container. + pbsproSchemaInitContainerTemplateFileDefault = "/etc/config/initContainer.yaml" + // pbsproSchemaInitContainerMaxTriesDefault is the default number of tries for the pbsproSchema init container. 
+ pbsproSchemaInitContainerMaxTriesDefault = 100 +) + +func init() { + // pbsproSchema related flags + flag.StringVar(&config.pbsproSchemaInitContainerImage, "pbsproSchema-init-container-image", + pbsproSchemaInitContainerImageDefault, "The image for pbsproSchema init container") + flag.StringVar(&config.pbsproSchemaInitContainerTemplateFile, "pbsproSchema-init-container-template-file", + pbsproSchemaInitContainerTemplateFileDefault, "The template file for pbsproSchema init container") + flag.IntVar(&config.pbsproSchemaInitContainerMaxTries, "pbsproSchema-init-container-max-tries", + pbsproSchemaInitContainerMaxTriesDefault, "The number of tries for the pbsproSchema init container") +} diff --git a/pkg/controller/cluster_schema/torque_schema/configmap.go b/pkg/controller/cluster_schema/pbspro_schema/configmap.go similarity index 55% rename from pkg/controller/cluster_schema/torque_schema/configmap.go rename to pkg/controller/cluster_schema/pbspro_schema/configmap.go index af055ff..5c479ce 100644 --- a/pkg/controller/cluster_schema/torque_schema/configmap.go +++ b/pkg/controller/cluster_schema/pbspro_schema/configmap.go @@ -1,4 +1,4 @@ -package torque_schema +package pbspro_schema import ( "fmt" @@ -17,18 +17,20 @@ const ( ) const ( - TorqueConfDir = "/etc" + pbsproConfDir = "/etc" PBSConfKey = "pbs.conf" PBSServerConfKey = "pbs-server.conf" PBSWorkerConfKey = "pbs-worker.conf" - PBSConf = TorqueConfDir + "/" + PBSConfKey + PBSConf = pbsproConfDir + "/" + PBSConfKey PBSMonPrivConfig = "/var/spool/pbs/mom_priv/config" PBSMomConfigKey = "privConfig" PBSMonPrivMountPath = "/tmp/var/spool/pbs/mom_priv/config" ServerEntrypoint = "entrypoint.sh" + WorkerEntrypoint = "worker-entrypoint.sh" ServerEntryPointMountPath = "/tmp/" + ServerEntrypoint + WorkerEntryPointMountPath = "/tmp/" + WorkerEntrypoint PBS_START_MOM_VALUE = "1" NOT_PBS_START_MOM_VALUE = "0" @@ -54,11 +56,11 @@ PBS_HOME=/var/spool/pbs ` ) -func (t *TorqueClusterSchemaReconciler) ReconcileConfigMap( +func (p 
*pbsproClusterSchemaReconciler) ReconcileConfigMap( kcluster *kubeclusterorgv1alpha1.KubeCluster, configMap *corev1.ConfigMap, ) error { - isNeedReconcile := t.isNeedReconcileConfigMap(configMap) + isNeedReconcile := p.isNeedReconcileConfigMap(configMap) if !isNeedReconcile { return nil } @@ -68,27 +70,31 @@ func (t *TorqueClusterSchemaReconciler) ReconcileConfigMap( _, exists := configMap.Data[PBSServerConfKey] if !exists { - configMap.Data[PBSServerConfKey] = t.genServerPBSConf(kcluster) + configMap.Data[PBSServerConfKey] = p.genServerPBSConf(kcluster) } _, exists = configMap.Data[PBSWorkerConfKey] if !exists { - configMap.Data[PBSWorkerConfKey] = t.genWorkerPBSConf(kcluster) + configMap.Data[PBSWorkerConfKey] = p.genWorkerPBSConf(kcluster) } _, exists = configMap.Data[PBSMomConfigKey] if !exists { - configMap.Data[PBSMomConfigKey] = t.genPBSMomConfig(kcluster) + configMap.Data[PBSMomConfigKey] = p.genPBSMomConfig(kcluster) } _, exists = configMap.Data[ServerEntrypoint] if !exists { - configMap.Data[ServerEntrypoint] = t.genPBSServerEntrypoint(kcluster) + configMap.Data[ServerEntrypoint] = p.genPBSServerEntrypoint(kcluster) + } + _, exists = configMap.Data[WorkerEntrypoint] + if !exists { + configMap.Data[WorkerEntrypoint] = p.genPBSWorkerEntrypoint(kcluster) } return nil } -func (t *TorqueClusterSchemaReconciler) isNeedReconcileConfigMap(configMap *corev1.ConfigMap) bool { +func (p *pbsproClusterSchemaReconciler) isNeedReconcileConfigMap(configMap *corev1.ConfigMap) bool { _, exists := configMap.Data[PBSConfKey] if !exists { return true @@ -98,7 +104,7 @@ func (t *TorqueClusterSchemaReconciler) isNeedReconcileConfigMap(configMap *core return !exists } -func (t *TorqueClusterSchemaReconciler) genServerPBSConf(kcluster *kubeclusterorgv1alpha1.KubeCluster) string { +func (p *pbsproClusterSchemaReconciler) genServerPBSConf(kcluster *kubeclusterorgv1alpha1.KubeCluster) string { serverName := common.GenGeneralName(kcluster.Name, SchemaReplicaTypeServer, 
strconv.Itoa(0)) pbsConf := strings.Replace(pbsServerConfTemplate, placeHolderPBS_SERVER, serverName, 1) pbsConf = strings.Replace(pbsConf, placeHolderPBS_START_MOM, PBS_START_MOM_VALUE, 1) @@ -106,36 +112,67 @@ func (t *TorqueClusterSchemaReconciler) genServerPBSConf(kcluster *kubeclusteror return pbsConf } -func (s *TorqueClusterSchemaReconciler) genWorkerPBSConf(kcluster *kubeclusterorgv1alpha1.KubeCluster) string { +func (p *pbsproClusterSchemaReconciler) genWorkerPBSConf(kcluster *kubeclusterorgv1alpha1.KubeCluster) string { serverName := common.GenGeneralName(kcluster.Name, SchemaReplicaTypeServer, strconv.Itoa(0)) pbsConf := strings.Replace(pbsWorkerConfTemplate, placeHolderPBS_SERVER, serverName, 1) pbsConf = strings.Replace(pbsConf, placeHolderPBS_EXEC, PBSExec, 1) return pbsConf } -func (t *TorqueClusterSchemaReconciler) genPBSMomConfig(kcluster *kubeclusterorgv1alpha1.KubeCluster) string { +func (p *pbsproClusterSchemaReconciler) genPBSMomConfig(kcluster *kubeclusterorgv1alpha1.KubeCluster) string { serverName := common.GenGeneralName(kcluster.Name, SchemaReplicaTypeServer, strconv.Itoa(0)) pbsMonPrivConfig := strings.Replace(pbsMonPrivConfigTemplate, placeHolderClientHost, serverName, 1) return pbsMonPrivConfig } -func (t *TorqueClusterSchemaReconciler) UpdateConfigMap(_ *kubeclusterorgv1alpha1.KubeCluster, _ *corev1.ConfigMap) error { +func (p *pbsproClusterSchemaReconciler) UpdateConfigMap(_ *kubeclusterorgv1alpha1.KubeCluster, _ *corev1.ConfigMap) error { return nil } -func (t *TorqueClusterSchemaReconciler) genPBSServerEntrypoint(kcluster *kubeclusterorgv1alpha1.KubeCluster) string { +const qmgrCreateNodeCmds = `for node_name in "${node_names[@]}"; do + node_exists=0 + + while [ $node_exists -eq 0 ]; do + echo "try create node $node_name" + {{.Pbsnodes}} "$node_name" > /dev/null 2>&1 + + return_code=$? 
+ if [ $return_code -eq 0 ]; then + echo "create node $node_name success" + node_exists=1 + else + {{.Qmgr}} -c "create node $node_name" + sleep 5 + fi + done +done` + +func (p *pbsproClusterSchemaReconciler) genPBSServerEntrypoint(kcluster *kubeclusterorgv1alpha1.KubeCluster) string { var entrypointShell = "#!/bin/bash\n" + for _, cmd := range genServerCommand() { + entrypointShell = fmt.Sprintf("%s\n%s\n", entrypointShell, cmd) + } + + nodeNames := make([]string, 0) for replicaType, spec := range kcluster.Spec.ClusterReplicaSpec { totalReplica := *spec.Replicas for i := 0; i < int(totalReplica); i++ { - entrypointShell = fmt.Sprintf( - "%s\n%s", - entrypointShell, - fmt.Sprintf("%s/qmgr -c \"create node %s\"", PBSBin, common.GenGeneralName(kcluster.GetName(), replicaType, strconv.Itoa(i))), - ) + nodeName := common.GenGeneralName(kcluster.GetName(), replicaType, strconv.Itoa(i)) + nodeNames = append(nodeNames, fmt.Sprintf("\"%s\"", nodeName)) } } - entrypointShell = fmt.Sprintf("%s\n%s", entrypointShell, "rm -rf /var/spool/pbs") - entrypointShell = fmt.Sprintf("%s\n%s", entrypointShell, "sleep infinity") + entrypointShell = fmt.Sprintf("%s\n%s\n", entrypointShell, fmt.Sprintf("node_names=(%s)", strings.Join(nodeNames, " "))) + qmgrCreateNodeCmd := strings.Replace(qmgrCreateNodeCmds, "{{.Pbsnodes}}", PBSNodes, 1) + qmgrCreateNodeCmd = strings.Replace(qmgrCreateNodeCmd, "{{.Qmgr}}", fmt.Sprintf("%s/qmgr", PBSBin), 1) + entrypointShell = fmt.Sprintf("%s\n%s\n", entrypointShell, qmgrCreateNodeCmd) + entrypointShell = fmt.Sprintf("%s\n%s\n", entrypointShell, "sleep infinity") + return entrypointShell +} + +func (p *pbsproClusterSchemaReconciler) genPBSWorkerEntrypoint(_ *kubeclusterorgv1alpha1.KubeCluster) string { + var entrypointShell = "#!/bin/bash\n" + for _, cmd := range genWorkerCommand() { + entrypointShell = fmt.Sprintf("%s\n%s\n", entrypointShell, cmd) + } return entrypointShell } diff --git a/pkg/controller/cluster_schema/torque_schema/env.go 
b/pkg/controller/cluster_schema/pbspro_schema/env.go similarity index 99% rename from pkg/controller/cluster_schema/torque_schema/env.go rename to pkg/controller/cluster_schema/pbspro_schema/env.go index c19216f..b9ef418 100644 --- a/pkg/controller/cluster_schema/torque_schema/env.go +++ b/pkg/controller/cluster_schema/pbspro_schema/env.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License -package torque_schema +package pbspro_schema import ( kubeclusterorgv1alpha1 "github.com/chriskery/kubecluster/apis/kubecluster.org/v1alpha1" diff --git a/pkg/controller/cluster_schema/torque_schema/initcontainer.go b/pkg/controller/cluster_schema/pbspro_schema/initcontainer.go similarity index 94% rename from pkg/controller/cluster_schema/torque_schema/initcontainer.go rename to pkg/controller/cluster_schema/pbspro_schema/initcontainer.go index 5a20726..7a6604d 100644 --- a/pkg/controller/cluster_schema/torque_schema/initcontainer.go +++ b/pkg/controller/cluster_schema/pbspro_schema/initcontainer.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License -package torque_schema +package pbspro_schema import ( "bytes" @@ -31,7 +31,7 @@ import ( var ( initContainerTemplate = ` -- name: init-torque +- name: init-pbspro image: {{.InitContainerImage}} imagePullPolicy: IfNotPresent resources: @@ -55,9 +55,9 @@ type initContainerGenerator struct { func getInitContainerGenerator() *initContainerGenerator { onceInitContainer.Do(func() { icGenerator = &initContainerGenerator{ - template: getInitContainerTemplateOrDefault(config.TorqueSchemaInitContainerTemplateFile), - image: config.TorqueSchemaInitContainerImage, - maxTries: config.TorqueSchemaInitContainerMaxTries, + template: getInitContainerTemplateOrDefault(config.pbsproSchemaInitContainerTemplateFile), + image: config.pbsproSchemaInitContainerImage, + maxTries: 
config.pbsproSchemaInitContainerMaxTries, } }) return icGenerator diff --git a/pkg/controller/cluster_schema/torque_schema/torque_schema.go b/pkg/controller/cluster_schema/pbspro_schema/pbspro_schema.go similarity index 83% rename from pkg/controller/cluster_schema/torque_schema/torque_schema.go rename to pkg/controller/cluster_schema/pbspro_schema/pbspro_schema.go index f516d6e..2ece23a 100644 --- a/pkg/controller/cluster_schema/torque_schema/torque_schema.go +++ b/pkg/controller/cluster_schema/pbspro_schema/pbspro_schema.go @@ -1,4 +1,4 @@ -package torque_schema +package pbspro_schema import ( "context" @@ -14,13 +14,18 @@ import ( ) const ( - ClusterSchemaKind = "torque" + ClusterSchemaKind = "pbspro" PBSSH = "/etc/profile.d/pbs.sh" PBSCmd = "/etc/init.d/pbs" PBSExec = "/opt/pbs" PBSBin = PBSExec + "/bin" + PBSSBin = PBSExec + "/sbin" + + PBSNodes = PBSBin + "/pbsnodes" + PBSIff = PBSSBin + "/pbs_iff" + PBSRcp = PBSSBin + "/pbs_rcp" PBSInitShell = "/opt/pbs/init.sh" @@ -34,25 +39,25 @@ const ( SchemaReplicaTypeServer kubeclusterorgv1alpha1.ReplicaType = "Server" ) -func NewTorqueClusterReconciler(_ context.Context, mgr ctrl.Manager) (common.ClusterSchemaReconciler, error) { - return &TorqueClusterSchemaReconciler{ +func NewpbsproClusterReconciler(_ context.Context, mgr ctrl.Manager) (common.ClusterSchemaReconciler, error) { + return &pbsproClusterSchemaReconciler{ ControllerExpectations: *expectation.NewControllerExpectations(), Recorder: mgr.GetEventRecorderFor(common.ControllerName), }, nil } -type TorqueClusterSchemaReconciler struct { +type pbsproClusterSchemaReconciler struct { expectation.ControllerExpectations // Recorder is an event recorder for recording Event resources to the // Kubernetes API. 
Recorder record.EventRecorder } -func (t *TorqueClusterSchemaReconciler) Default(kcluster *kubeclusterorgv1alpha1.KubeCluster) { +func (p *pbsproClusterSchemaReconciler) Default(kcluster *kubeclusterorgv1alpha1.KubeCluster) { // Update the key of Controller replica to camel case. kubeclusterorgv1alpha1.SetTypeNameToCamelCase(kcluster.Spec.ClusterReplicaSpec, SchemaReplicaTypeServer) for _, spec := range kcluster.Spec.ClusterReplicaSpec { - index := kubeclusterorgv1alpha1.GetDefaultContainerIndex(&spec.Template.Spec, t.GetDefaultContainerName()) + index := kubeclusterorgv1alpha1.GetDefaultContainerIndex(&spec.Template.Spec, p.GetDefaultContainerName()) if ok := kubeclusterorgv1alpha1.HasDefaultPort(&spec.Template.Spec, index, "serve-port"); !ok { kubeclusterorgv1alpha1.SetDefaultPort(&spec.Template.Spec, "serve-port", int32(15001), index) } @@ -66,7 +71,7 @@ func (t *TorqueClusterSchemaReconciler) Default(kcluster *kubeclusterorgv1alpha1 } -func (t *TorqueClusterSchemaReconciler) UpdateClusterStatus( +func (p *pbsproClusterSchemaReconciler) UpdateClusterStatus( kcluster *kubeclusterorgv1alpha1.KubeCluster, clusterStatus *kubeclusterorgv1alpha1.ClusterStatus, rtype kubeclusterorgv1alpha1.ReplicaType, @@ -122,13 +127,13 @@ func (t *TorqueClusterSchemaReconciler) UpdateClusterStatus( common.RestartedClustersCounterInc(kcluster.GetNamespace(), kcluster.Spec.ClusterType) } else { if rtype != SchemaReplicaTypeServer { - util.LoggerForCluster(kcluster).Infof("KubeCLuster %s/%s continues regardless %d %s replica(t) failed .", + util.LoggerForCluster(kcluster).Infof("KubeCLuster %s/%s continues regardless %d %s replica(p) failed .", kcluster.Namespace, kcluster.Name, failed, rtype) } else { - msg := fmt.Sprintf("KubeCLuster %s/%s has failed because %d %s replica(t) failed.", + msg := fmt.Sprintf("KubeCLuster %s/%s has failed because %d %s replica(p) failed.", kcluster.Namespace, kcluster.Name, failed, rtype) - t.Recorder.Event(kcluster, corev1.EventTypeNormal, 
util.NewReason(kubeclusterorgv1alpha1.KubeClusterKind, util.ClusterFailedReason), msg) + p.Recorder.Event(kcluster, corev1.EventTypeNormal, util.NewReason(kubeclusterorgv1alpha1.KubeClusterKind, util.ClusterFailedReason), msg) if clusterStatus.CompletionTime == nil { now := metav1.Now() clusterStatus.CompletionTime = &now @@ -140,14 +145,14 @@ func (t *TorqueClusterSchemaReconciler) UpdateClusterStatus( } } -func (t *TorqueClusterSchemaReconciler) IsController( +func (p *pbsproClusterSchemaReconciler) IsController( spec map[kubeclusterorgv1alpha1.ReplicaType]*kubeclusterorgv1alpha1.ReplicaSpec, rType kubeclusterorgv1alpha1.ReplicaType, index int) bool { return (SchemaReplicaTypeServer) == (rType) } -func (t *TorqueClusterSchemaReconciler) SetClusterSpec( +func (p *pbsproClusterSchemaReconciler) SetClusterSpec( kcluster *kubeclusterorgv1alpha1.KubeCluster, podTemplate *corev1.PodTemplateSpec, rtype kubeclusterorgv1alpha1.ReplicaType, @@ -155,33 +160,33 @@ func (t *TorqueClusterSchemaReconciler) SetClusterSpec( configMap *corev1.ConfigMap, ) error { - if err := setPodEnv(kcluster, podTemplate, t.GetDefaultContainerName(), rtype, index); err != nil { + if err := setPodEnv(kcluster, podTemplate, p.GetDefaultContainerName(), rtype, index); err != nil { return err } if err := setInitContainer(kcluster, podTemplate, rtype); err != nil { return err } - setVolumes(podTemplate, t.GetDefaultContainerName(), rtype, configMap.Name) + setVolumes(podTemplate, p.GetDefaultContainerName(), rtype, configMap.Name) setPodNetwork(podTemplate) - setCmd(kcluster, podTemplate, t.GetDefaultContainerName(), rtype) - setSecurity(podTemplate, t.GetDefaultContainerName(), rtype) + setCmd(kcluster, podTemplate, p.GetDefaultContainerName(), rtype) + setSecurity(podTemplate, p.GetDefaultContainerName(), rtype) return nil } -func (t *TorqueClusterSchemaReconciler) GetDefaultContainerName() string { +func (p *pbsproClusterSchemaReconciler) GetDefaultContainerName() string { return 
kubeclusterorgv1alpha1.ClusterDefaultContainerName } -func (t *TorqueClusterSchemaReconciler) ValidateV1KubeCluster(kcluster *kubeclusterorgv1alpha1.KubeCluster) error { +func (p *pbsproClusterSchemaReconciler) ValidateV1KubeCluster(kcluster *kubeclusterorgv1alpha1.KubeCluster) error { for replicaType, spec := range kcluster.Spec.ClusterReplicaSpec { if SchemaReplicaTypeServer != replicaType { continue } if *spec.Replicas != 1 { - return fmt.Errorf("torque clusetr server replica must be 1") + return fmt.Errorf("pbspro clusetr server replica must be 1") } return nil } - return fmt.Errorf("torque cluster need a replica named %v", SchemaReplicaTypeServer) + return fmt.Errorf("pbspro cluster need a replica named %v", SchemaReplicaTypeServer) } diff --git a/pkg/controller/cluster_schema/torque_schema/util.go b/pkg/controller/cluster_schema/pbspro_schema/util.go similarity index 85% rename from pkg/controller/cluster_schema/torque_schema/util.go rename to pkg/controller/cluster_schema/pbspro_schema/util.go index 5b52c65..1eb012e 100644 --- a/pkg/controller/cluster_schema/torque_schema/util.go +++ b/pkg/controller/cluster_schema/pbspro_schema/util.go @@ -1,10 +1,9 @@ -package torque_schema +package pbspro_schema import ( "fmt" kubeclusterorgv1alpha1 "github.com/chriskery/kubecluster/apis/kubecluster.org/v1alpha1" corev1 "k8s.io/api/core/v1" - "strings" ) func setPodNetwork(template *corev1.PodTemplateSpec) { @@ -63,6 +62,12 @@ func setVolumes(template *corev1.PodTemplateSpec, defaultContainerName string, r SubPath: PBSWorkerConfKey, ReadOnly: false, }) + template.Spec.Containers[i].VolumeMounts = append(template.Spec.Containers[i].VolumeMounts, corev1.VolumeMount{ + Name: configMapName, + MountPath: WorkerEntryPointMountPath, + SubPath: WorkerEntrypoint, + ReadOnly: false, + }) } template.Spec.Containers[i].VolumeMounts = append(template.Spec.Containers[i].VolumeMounts, corev1.VolumeMount{ @@ -102,41 +107,40 @@ func setCmd(_ *kubeclusterorgv1alpha1.KubeCluster, 
podTemplateSpec *corev1.PodTe podTemplateSpec.Spec.Containers[i].Command = make([]string, 0) } if rtype == SchemaReplicaTypeServer { - podTemplateSpec.Spec.Containers[i].Command = []string{"/bin/bash", "-c", genServerCommand()} + podTemplateSpec.Spec.Containers[i].Command = []string{"/bin/bash", "-c", ServerEntryPointMountPath} } else { - podTemplateSpec.Spec.Containers[i].Command = []string{"/bin/bash", "-c", genWorkerCommand()} + podTemplateSpec.Spec.Containers[i].Command = []string{"/bin/bash", "-c", WorkerEntryPointMountPath} } } } -func genServerCommand() string { +func genServerCommand() []string { serverCmds := make([]string, 0) serverCmds = append(serverCmds, "rm -rf /var/spool/pbs") cmds := getGeneralCommand() serverCmds = append(serverCmds, cmds...) - serverCmds = append(serverCmds, fmt.Sprintf("sh %s", ServerEntryPointMountPath)) - return strings.Join(serverCmds, " && ") + return serverCmds } -func genWorkerCommand() string { +func genWorkerCommand() []string { cmds := getGeneralCommand() cmds = append(cmds, "sleep infinity") - return strings.Join(cmds, " && ") + return cmds } func getGeneralCommand() []string { cpPBSProCmd := fmt.Sprintf("if [ -d %s ];then cp -r -p -n %s/* %s;fi", EmptyVolumeMountPathInMainContainer, EmptyVolumeMountPathInMainContainer, "/opt") //cpMomConfigCmd := fmt.Sprintf("mkdir -p /var/spool/pbs/mom_priv && if [ -e %s ];then cp -n -p %s %s;fi", PBSMonPrivMountPath, PBSMonPrivMountPath, PBSMonPrivConfig) + assignPbsSTagCmd := fmt.Sprintf("if [ -d %s ];then chmod a+s %s %s;fi ", PBSExec, PBSIff, PBSRcp) initCmd := fmt.Sprintf("if [ -e %s ];then sh %s;fi", PBSInitShell, PBSInitShell) - pbsSSHStartCmd := fmt.Sprintf(" if [ -e %s ];then chmod +x %s && %s start;fi ", PBSSH, PBSSH, PBSSH) + pbsSSHStartCmd := fmt.Sprintf("if [ -e %s ];then chmod +x %s && . 
%s;fi ", PBSSH, PBSSH, PBSSH) pbsStatustCmd := fmt.Sprintf("%s status ", PBSCmd) var cmds []string //cmds = append(cmds, "sleep 100") - cmds = append(cmds, cpPBSProCmd, initCmd) - cmds = append(cmds, pbsSSHStartCmd) - cmds = append(cmds, pbsStatustCmd) + cmds = append(cmds, cpPBSProCmd, assignPbsSTagCmd, initCmd) + cmds = append(cmds, pbsSSHStartCmd, pbsStatustCmd) return cmds } diff --git a/pkg/controller/cluster_schema/register_cluster_schema.go b/pkg/controller/cluster_schema/register_cluster_schema.go index ab81311..637a265 100644 --- a/pkg/controller/cluster_schema/register_cluster_schema.go +++ b/pkg/controller/cluster_schema/register_cluster_schema.go @@ -4,8 +4,8 @@ import ( "context" "fmt" "github.com/chriskery/kubecluster/pkg/common" + "github.com/chriskery/kubecluster/pkg/controller/cluster_schema/pbspro_schema" "github.com/chriskery/kubecluster/pkg/controller/cluster_schema/slurm_schema" - "github.com/chriskery/kubecluster/pkg/controller/cluster_schema/torque_schema" ctrl "sigs.k8s.io/controller-runtime" "strings" ) @@ -20,8 +20,8 @@ var SupportedClusterSchemaReconciler = map[ClusterSchema]ClusterSchemaFactory{ slurm_schema.ClusterSchemaKind: func(ctx context.Context, mgr ctrl.Manager) (common.ClusterSchemaReconciler, error) { return slurm_schema.NewSlurmClusterReconciler(ctx, mgr) }, - torque_schema.ClusterSchemaKind: func(ctx context.Context, mgr ctrl.Manager) (common.ClusterSchemaReconciler, error) { - return torque_schema.NewTorqueClusterReconciler(ctx, mgr) + pbspro_schema.ClusterSchemaKind: func(ctx context.Context, mgr ctrl.Manager) (common.ClusterSchemaReconciler, error) { + return pbspro_schema.NewpbsproClusterReconciler(ctx, mgr) }, } diff --git a/pkg/controller/cluster_schema/torque_schema/config.go b/pkg/controller/cluster_schema/torque_schema/config.go deleted file mode 100644 index b131d35..0000000 --- a/pkg/controller/cluster_schema/torque_schema/config.go +++ /dev/null @@ -1,31 +0,0 @@ -package torque_schema - -import "flag" - -// Config 
is the global configuration for the training operator. -var config struct { - TorqueSchemaInitContainerTemplateFile string - TorqueSchemaInitContainerImage string - TorqueSchemaInitContainerMaxTries int -} - -const ( - // TorqueSchemaInitContainerImageDefault is the default image for the TorqueSchema - // init container. - TorqueSchemaInitContainerImageDefault = "registry.cn-shanghai.aliyuncs.com/eflops-bcp/pbs-minimal:v1" - // TorqueSchemaInitContainerTemplateFileDefault is the default template file for - // the TorqueSchema init container. - TorqueSchemaInitContainerTemplateFileDefault = "/etc/config/initContainer.yaml" - // TorqueSchemaInitContainerMaxTriesDefault is the default number of tries for the TorqueSchema init container. - TorqueSchemaInitContainerMaxTriesDefault = 100 -) - -func init() { - // TorqueSchema related flags - flag.StringVar(&config.TorqueSchemaInitContainerImage, "TorqueSchema-init-container-image", - TorqueSchemaInitContainerImageDefault, "The image for TorqueSchema init container") - flag.StringVar(&config.TorqueSchemaInitContainerTemplateFile, "TorqueSchema-init-container-template-file", - TorqueSchemaInitContainerTemplateFileDefault, "The template file for TorqueSchema init container") - flag.IntVar(&config.TorqueSchemaInitContainerMaxTries, "TorqueSchema-init-container-max-tries", - TorqueSchemaInitContainerMaxTriesDefault, "The number of tries for the TorqueSchema init container") -}