From 2346c454ff74b3f5f9540e37dc5cf0663d1e674e Mon Sep 17 00:00:00 2001 From: john lockman Date: Wed, 19 Feb 2020 11:49:57 -0600 Subject: [PATCH 1/7] First commit --- README.md | 57 +- build-kubernetes-cluster.yml | 35 ++ create_users/create_user.sh | 54 ++ create_users/users | 6 + device-plugin.yaml | 43 ++ example.yaml | 30 + host_inventory_file | 25 + k8s-rdma-sriov-node-config.yaml | 12 + k8s-sriov-cni-installer.yaml | 38 ++ roles/common/files/k8s.conf | 3 + roles/common/files/kubernetes.repo | 8 + roles/common/files/nvidia | 3 + roles/common/handlers/main.yml | 21 + roles/common/tasks/main.yml | 134 +++++ roles/common/vars/main.yml | 10 + roles/computeGPU/files/k8s.conf | 3 + roles/computeGPU/files/kubernetes.repo | 8 + roles/computeGPU/files/nvidia | 3 + roles/computeGPU/handlers/main.yml | 21 + roles/computeGPU/tasks/main.yml | 64 +++ roles/computeGPU/vars/main.yml | 10 + roles/master/files/k8s.conf | 3 + roles/master/files/kubernetes.repo | 8 + roles/master/files/nvidia | 3 + roles/master/tasks/main.yml | 31 + .../startmaster/files/create_admin_user.yaml | 5 + .../files/create_clusterRoleBinding.yaml | 12 + roles/startmaster/files/data-pv.yaml | 20 + roles/startmaster/files/data2-pv.yaml | 20 + roles/startmaster/files/data3-pv.yaml | 20 + roles/startmaster/files/data4-pv.yaml | 20 + roles/startmaster/files/enable_gpu_k8s.sh | 1 + roles/startmaster/files/flannel_net.sh | 3 + roles/startmaster/files/katib-pv.yaml | 16 + roles/startmaster/files/kube-flannel.yaml | 536 ++++++++++++++++++ .../files/kubeflow_persistent_volumes.yaml | 51 ++ roles/startmaster/files/minio-pvc.yaml | 16 + roles/startmaster/files/mysql-pv.yaml | 17 + roles/startmaster/files/nfs-class.yaml | 7 + roles/startmaster/files/nfs-deployment.yaml | 32 ++ .../startmaster/files/nfs-serviceaccount.yaml | 4 + roles/startmaster/files/nfs_clusterrole.yaml | 20 + .../files/nfs_clusterrolebinding.yaml | 12 + roles/startmaster/files/notebook-pv.yaml | 17 + .../startmaster/files/persistent_volumes.yaml | 
20 + roles/startmaster/files/pvc.yaml | 12 + roles/startmaster/files/tiller_config.sh | 3 + roles/startmaster/tasks/main.yml | 145 +++++ roles/startworkers/tasks/main.yml | 33 ++ scuttle | 19 + 50 files changed, 1692 insertions(+), 2 deletions(-) create mode 100644 build-kubernetes-cluster.yml create mode 100755 create_users/create_user.sh create mode 100644 create_users/users create mode 100644 device-plugin.yaml create mode 100644 example.yaml create mode 100644 host_inventory_file create mode 100644 k8s-rdma-sriov-node-config.yaml create mode 100644 k8s-sriov-cni-installer.yaml create mode 100644 roles/common/files/k8s.conf create mode 100644 roles/common/files/kubernetes.repo create mode 100644 roles/common/files/nvidia create mode 100644 roles/common/handlers/main.yml create mode 100644 roles/common/tasks/main.yml create mode 100644 roles/common/vars/main.yml create mode 100644 roles/computeGPU/files/k8s.conf create mode 100644 roles/computeGPU/files/kubernetes.repo create mode 100644 roles/computeGPU/files/nvidia create mode 100644 roles/computeGPU/handlers/main.yml create mode 100644 roles/computeGPU/tasks/main.yml create mode 100644 roles/computeGPU/vars/main.yml create mode 100644 roles/master/files/k8s.conf create mode 100644 roles/master/files/kubernetes.repo create mode 100644 roles/master/files/nvidia create mode 100644 roles/master/tasks/main.yml create mode 100644 roles/startmaster/files/create_admin_user.yaml create mode 100644 roles/startmaster/files/create_clusterRoleBinding.yaml create mode 100755 roles/startmaster/files/data-pv.yaml create mode 100755 roles/startmaster/files/data2-pv.yaml create mode 100755 roles/startmaster/files/data3-pv.yaml create mode 100755 roles/startmaster/files/data4-pv.yaml create mode 100755 roles/startmaster/files/enable_gpu_k8s.sh create mode 100755 roles/startmaster/files/flannel_net.sh create mode 100755 roles/startmaster/files/katib-pv.yaml create mode 100644 roles/startmaster/files/kube-flannel.yaml create mode 
100644 roles/startmaster/files/kubeflow_persistent_volumes.yaml create mode 100755 roles/startmaster/files/minio-pvc.yaml create mode 100755 roles/startmaster/files/mysql-pv.yaml create mode 100644 roles/startmaster/files/nfs-class.yaml create mode 100644 roles/startmaster/files/nfs-deployment.yaml create mode 100644 roles/startmaster/files/nfs-serviceaccount.yaml create mode 100644 roles/startmaster/files/nfs_clusterrole.yaml create mode 100644 roles/startmaster/files/nfs_clusterrolebinding.yaml create mode 100755 roles/startmaster/files/notebook-pv.yaml create mode 100755 roles/startmaster/files/persistent_volumes.yaml create mode 100644 roles/startmaster/files/pvc.yaml create mode 100755 roles/startmaster/files/tiller_config.sh create mode 100644 roles/startmaster/tasks/main.yml create mode 100644 roles/startworkers/tasks/main.yml create mode 100755 scuttle diff --git a/README.md b/README.md index 8a0279668c..22e625c363 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,55 @@ -# omnia -Software tools for standing up Slurm/Kubernetes clusters on Dell EMC PowerEdge servers from factory OS images +Dancing to the beat of a different drum. 
+ +# Short Version: + +Install Kubernetes and all dependencies +``` +ansible-playbook -i host_inventory_file build-kubernetes-cluster.yml +``` + +Initialize K8S cluster +``` +ansible-playbook -i host_inventory_file build-kubernetes-cluster.yml --tags "init" +``` + + +# What this does: + +## Build/Install + +### Add additional repositories: + +- Kubernetes (Google) +- El Repo (nvidia drivers) +- Nvidia (nvidia-docker) +- EPEL (Extra Packages for Enterprise Linux) + +### Install common packages + - gcc + - python-pip + - docker + - kubelet + - kubeadm + - kubectl + - nvidia-detect + - kmod-nvidia + - nvidia-x11-drv + - nvidia-container-runtime + - ksonnet (CLI framework for K8S configs) + +### Enable GPU Device Plugins (nvidia-container-runtime-hook) + +### Modify kubeadm config to allow GPUs as schedulable resource + +### Start and enable services + - Docker + - Kubelet + +## Initialize Cluster +### Head/master +- Start K8S and pass the startup token to compute/slaves +- Initialize networking (Currently using WeaveNet) +- Setup K8S Dashboard +- Create dynamic/persistent volumes +### Compute/slaves +- Join k8s cluster diff --git a/build-kubernetes-cluster.yml b/build-kubernetes-cluster.yml new file mode 100644 index 0000000000..bfaf8ec5e8 --- /dev/null +++ b/build-kubernetes-cluster.yml @@ -0,0 +1,35 @@ +--- +#Playbook for kubernetes cluster + +#collect info from everything +- hosts: all + +# Apply Common Installation and Config +- hosts: cluster + gather_facts: false + roles: + - common + +# Apply GPU Node Config +- hosts: gpus + gather_facts: false + roles: + - computeGPU + +# Apply Master Config +- hosts: master + gather_facts: false + roles: + - master + +# Start K8s on master server +- hosts: master + gather_facts: false + roles: + - startmaster + +# Start K8s worker servers +- hosts: compute,gpus + gather_facts: false + roles: + - startworkers diff --git a/create_users/create_user.sh b/create_users/create_user.sh new file mode 100755 index 0000000000..c583bb22b0 --- 
/dev/null +++ b/create_users/create_user.sh @@ -0,0 +1,54 @@ +SLURM=0 +FILENAME='' +DEFAULT='' +while [[ $# -gt 0 ]] # loop until every argument is consumed, including a trailing flag +do +key="$1" + +case $key in + -s|--slurm) + SLURM=1 + ;; + -f|--file) + FILENAME="$2" + shift # past argument + ;; + --default) + DEFAULT=YES + ;; + *) + # unknown option + ;; +esac +shift # past argument or value +done +echo Add Slurm Account = "${SLURM}" +echo FILENAME = "${FILENAME}" + +#input file is in the form: +#username First Last +INFILE=${FILENAME} + +while IFS='' read -r line; do + IFS=" " read -ra ACCOUNT <<< "$line" + user=${ACCOUNT[0]} + password="changeme" + pass=$(perl -e 'print crypt($ARGV[0], "password")' $password) + + echo "Creating account for $user" + useradd -m -p $pass $user + pdsh "useradd -m -p $pass $user" + #force reset on login + chage -d 0 $user + #useradd -m -p $pass $user + + #become user to create home directory + sudo su - $user "exit" + #generate ssh-keys + sudo -u $user ssh-keygen -N "" -t rsa -f /home/$user/.ssh/id_rsa + sudo -u $user cat /home/$user/.ssh/id_rsa.pub > /home/$user/.ssh/authorized_keys + chown $user:$user /home/$user/.ssh/authorized_keys + sudo -u $user chmod 0644 /home/$user/.ssh/authorized_keys + +done < $INFILE + diff --git a/create_users/users b/create_users/users new file mode 100644 index 0000000000..e6d9af2dad --- /dev/null +++ b/create_users/users @@ -0,0 +1,6 @@ +john.lockman +don_smith2 +lwilson +gundev1 +pei_yang +srinivas diff --git a/device-plugin.yaml b/device-plugin.yaml new file mode 100644 index 0000000000..9b3c5927b2 --- /dev/null +++ b/device-plugin.yaml @@ -0,0 +1,43 @@ +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: rdma-sriov-dp-ds + namespace: kube-system +spec: + template: + metadata: + # Mark this pod as a critical add-on; when enabled, the critical add-on scheduler + # reserves resources for critical add-on pods so that they can be rescheduled after + # a failure. This annotation works in tandem with the toleration below. 
+ annotations: + scheduler.alpha.kubernetes.io/critical-pod: "" + labels: + name: rdma-sriov-dp-ds + spec: + hostNetwork: true + tolerations: + # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode. + # This, along with the annotation above marks this pod as a critical add-on. + - key: CriticalAddonsOnly + operator: Exists + containers: + - image: rdma/k8s-rdma-sriov-dev-plugin + name: k8s-rdma-sriov-dp-ds + imagePullPolicy: IfNotPresent + securityContext: + privileged: true + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: config + mountPath: /k8s-rdma-sriov-dev-plugin + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins + - name: config + configMap: + name: rdma-devices + items: + - key: config.json + path: config.json diff --git a/example.yaml b/example.yaml new file mode 100644 index 0000000000..7d7bb7254c --- /dev/null +++ b/example.yaml @@ -0,0 +1,30 @@ +apiVersion: "kubeflow.org/v1alpha2" +kind: "TFJob" +metadata: + name: "example-job" +spec: + replicaSpecs: + - replicas: 1 + tfReplicaType: MASTER + template: + spec: + containers: + - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff + name: tensorflow + restartPolicy: OnFailure + - replicas: 1 + tfReplicaType: WORKER + template: + spec: + containers: + - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff + name: tensorflow + restartPolicy: OnFailure + - replicas: 2 + tfReplicaType: PS + template: + spec: + containers: + - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff + name: tensorflow + restartPolicy: OnFailure diff --git a/host_inventory_file b/host_inventory_file new file mode 100644 index 0000000000..db0df5c04b --- /dev/null +++ b/host_inventory_file @@ -0,0 +1,25 @@ +[master] +friday + +[compute] +#compute001 +#compute002 +compute000 +compute[002:005] +#compute[201:204] +#compute[301:304] +#compute[401:404] + +[gpus] +#compute001 +compute002 +compute004 +compute005 + +[workers:children] 
+compute +gpus + +[cluster:children] +master +workers diff --git a/k8s-rdma-sriov-node-config.yaml b/k8s-rdma-sriov-node-config.yaml new file mode 100644 index 0000000000..3b418810b2 --- /dev/null +++ b/k8s-rdma-sriov-node-config.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: rdma-devices + namespace: kube-system +data: + # JSON has no comment syntax, so the Ethernet alternative is kept outside the payload: "pfNetdevices": [ "eth0" , "eth1" ] + config.json: | + { + "mode" : "sriov", + "pfNetdevices": [ "ib0" ] + } diff --git a/k8s-sriov-cni-installer.yaml b/k8s-sriov-cni-installer.yaml new file mode 100644 index 0000000000..470080dc7d --- /dev/null +++ b/k8s-sriov-cni-installer.yaml @@ -0,0 +1,38 @@ +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: kube-sriov-cni-ds-installer + namespace: kube-system +spec: + template: + metadata: + labels: + name: sriov-cni-ds + spec: + hostNetwork: true + tolerations: + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + initContainers: + - name: install-cni + image: rdma/k8s-sriov-cni-installer + imagePullPolicy: IfNotPresent + command: [ "/installer/installer.sh" ] + volumeMounts: + - name: host-cni-etc + mountPath: /host-cni-etc + - name: host-cni-bin + mountPath: /host-cni-bin + containers: + - name: install-cni-sleep + image: rdma/k8s-sriov-cni-installer + imagePullPolicy: IfNotPresent + command: [ "/installer/installer_sleep.sh" ] + volumes: + - name: host-cni-etc + hostPath: + path: /etc/cni/net.d/ + - name: host-cni-bin + hostPath: + path: /opt/cni/bin diff --git a/roles/common/files/k8s.conf b/roles/common/files/k8s.conf new file mode 100644 index 0000000000..9994b14823 --- /dev/null +++ b/roles/common/files/k8s.conf @@ -0,0 +1,3 @@ +net.bridge.bridge-nf-call-ip6tables = 1 +net.bridge.bridge-nf-call-iptables = 1 + diff --git a/roles/common/files/kubernetes.repo b/roles/common/files/kubernetes.repo new file mode 100644 index 0000000000..476b99cb2b --- /dev/null +++ b/roles/common/files/kubernetes.repo @@ -0,0 +1,8 @@ +[kubernetes] 
+name=Kubernetes +baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64 +enabled=1 +gpgcheck=1 +repo_gpgcheck=1 +gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg + diff --git a/roles/common/files/nvidia b/roles/common/files/nvidia new file mode 100644 index 0000000000..f22e77e0bb --- /dev/null +++ b/roles/common/files/nvidia @@ -0,0 +1,3 @@ +#!/bin/sh +PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" exec nvidia-container-runtime-hook "$@" + diff --git a/roles/common/handlers/main.yml b/roles/common/handlers/main.yml new file mode 100644 index 0000000000..4fdc7000ed --- /dev/null +++ b/roles/common/handlers/main.yml @@ -0,0 +1,21 @@ +--- + +#- name: Enable docker service + #service: + #name: docker + #enabled: yes +# +- name: Start and Enable docker service + service: + name: docker + state: restarted + enabled: yes + #tags: install + +- name: Start and Enable Kubernetes - kubelet + service: + name: kubelet + state: started + enabled: yes + #tags: install + diff --git a/roles/common/tasks/main.yml b/roles/common/tasks/main.yml new file mode 100644 index 0000000000..adcf258073 --- /dev/null +++ b/roles/common/tasks/main.yml @@ -0,0 +1,134 @@ +--- + +- name: add kubernetes repo + copy: src=kubernetes.repo dest=/etc/yum.repos.d/ owner=root group=root mode=644 + tags: install + +# add ElRepo GPG Key +- rpm_key: + state: present + key: https://www.elrepo.org/RPM-GPG-KEY-elrepo.org + tags: install + +- name: add ElRepo (Nvidia kmod drivers) + yum: + name: http://www.elrepo.org/elrepo-release-7.0-3.el7.elrepo.noarch.rpm + state: present + tags: install + +- name: update sysctl to handle incorrectly routed traffic when iptables is bypassed + copy: src=k8s.conf dest=/etc/sysctl.d/ owner=root group=root mode=644 + tags: install + +- name: update sysctl + command: /sbin/sysctl --system + tags: install + +- name: Install EPEL Repository + yum: name=epel-release 
state=present + tags: install + +#likely need to add a reboot hook in here +#- name: update kernel and all other system packages + #yum: name=* state=latest + #tags: install + +- name: disable swap + command: /sbin/swapoff -a + tags: install + +# Disable selinux +- selinux: + state: disabled + tags: install + +- name: install common packages + yum: + name: + - gcc + - nfs-utils + - python-pip + - docker + - bash-completion + - kubelet + - kubeadm + - kubectl + - nvidia-detect + state: present + tags: install + +- name: install InfiniBand Support + yum: + name: "@Infiniband Support" + state: present + +- name: Install KSonnet + unarchive: + src: https://github.com/ksonnet/ksonnet/releases/download/v0.13.1/ks_0.13.1_linux_amd64.tar.gz + dest: /usr/bin/ + extra_opts: [--strip-components=1] + remote_src: yes + exclude: # directory prefix must carry the same version as the archive named in src + - ks_0.13.1_linux_amd64/CHANGELOG.md + - ks_0.13.1_linux_amd64/CODE-OF-CONDUCT.md + - ks_0.13.1_linux_amd64/CONTRIBUTING.md + - ks_0.13.1_linux_amd64/LICENSE + - ks_0.13.1_linux_amd64/README.md + tags: install + +- name: upgrade pip + command: /bin/pip install --upgrade pip + tags: install + +#- name: Enable DevicePlugins for all GPU nodes (nvidia-container-runtime-hook) + #copy: src=nvidia dest=/usr/libexec/oci/hooks.d/ owner=root group=root mode=755 + #tags: install + +- name: Add KUBE_EXTRA_ARGS to enable GPUs + lineinfile: + path: /etc/systemd/system/kubelet.service.d/10-kubeadm.conf + line: 'Environment="KUBELET_EXTRA_ARGS=--feature-gates=DevicePlugins=true"' + insertbefore: 'KUBELET_KUBECONFIG_ARGS=' + tags: install + +- name: Start and Enable docker service + service: + name: docker + state: restarted + enabled: yes + tags: install + +- name: Start and Enable Kubernetes - kubelet + service: + name: kubelet + state: restarted + enabled: yes + tags: install + +- name: Start and rpcbind service + service: + name: rpcbind + state: restarted + enabled: yes + tags: install + +- name: Start and nfs-server service + service: + name: nfs-server + state: 
restarted + enabled: yes + tags: install + +- name: Start and nfs-lock service + service: + name: nfs-lock + state: restarted + enabled: yes + tags: install + +- name: Start and nfs-idmap service + service: + name: nfs-idmap + state: restarted + enabled: yes + tags: install diff --git a/roles/common/vars/main.yml b/roles/common/vars/main.yml new file mode 100644 index 0000000000..1bdf70a4c5 --- /dev/null +++ b/roles/common/vars/main.yml @@ -0,0 +1,10 @@ +--- + +common_packages: + - epel-release + - python-pip + - docker + - bash-completion + - kubelet + - kubeadm + - kubectl diff --git a/roles/computeGPU/files/k8s.conf b/roles/computeGPU/files/k8s.conf new file mode 100644 index 0000000000..9994b14823 --- /dev/null +++ b/roles/computeGPU/files/k8s.conf @@ -0,0 +1,3 @@ +net.bridge.bridge-nf-call-ip6tables = 1 +net.bridge.bridge-nf-call-iptables = 1 + diff --git a/roles/computeGPU/files/kubernetes.repo b/roles/computeGPU/files/kubernetes.repo new file mode 100644 index 0000000000..476b99cb2b --- /dev/null +++ b/roles/computeGPU/files/kubernetes.repo @@ -0,0 +1,8 @@ +[kubernetes] +name=Kubernetes +baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64 +enabled=1 +gpgcheck=1 +repo_gpgcheck=1 +gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg + diff --git a/roles/computeGPU/files/nvidia b/roles/computeGPU/files/nvidia new file mode 100644 index 0000000000..f22e77e0bb --- /dev/null +++ b/roles/computeGPU/files/nvidia @@ -0,0 +1,3 @@ +#!/bin/sh +PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" exec nvidia-container-runtime-hook "$@" + diff --git a/roles/computeGPU/handlers/main.yml b/roles/computeGPU/handlers/main.yml new file mode 100644 index 0000000000..4fdc7000ed --- /dev/null +++ b/roles/computeGPU/handlers/main.yml @@ -0,0 +1,21 @@ +--- + +#- name: Enable docker service + #service: + #name: docker + #enabled: yes +# +- name: Start and Enable docker service 
+ service: + name: docker + state: restarted + enabled: yes + #tags: install + +- name: Start and Enable Kubernetes - kubelet + service: + name: kubelet + state: started + enabled: yes + #tags: install + diff --git a/roles/computeGPU/tasks/main.yml b/roles/computeGPU/tasks/main.yml new file mode 100644 index 0000000000..fce5f562d3 --- /dev/null +++ b/roles/computeGPU/tasks/main.yml @@ -0,0 +1,64 @@ +--- +- name: install Nvidia driver + yum: + name: + - kmod-nvidia + #- nvidia-x11-drv + state: present + tags: install + +#- name: add Nvidia container runtime support + #get_url: + #url: https://nvidia.github.io/nvidia-docker/centos7/nvidia-docker.repo + #dest: /etc/yum.repos.d/nvidia-docker.repo + #tags: install + +- name: add Nvidia container runtime support + get_url: + url: https://nvidia.github.io/nvidia-container-runtime/centos7/nvidia-container-runtime.repo + dest: /etc/yum.repos.d/nvidia-container-runtime.repo + tags: install, testing + +# disable gpg key (because Nvidia doesn't know how to make that work yet for some reason) +- replace: + path: /etc/yum.repos.d/nvidia-container-runtime.repo + regexp: 'repo_gpgcheck=1' + replace: 'repo_gpgcheck=0' + backup: yes + tags: testing + +- name: install Nvidia-container-runtime-hook + yum: + name: + #- nvidia-detect + #- kmod-nvidia-410.73-1.el7_5.elrepo + - nvidia-container-runtime-hook + state: present + tags: install + + +# This needs to be done on GPU nodes +#- name: Enable DevicePlugins for all GPU nodes (nvidia-container-runtime-hook) + #copy: src=nvidia dest=/usr/libexec/oci/hooks.d/ owner=root group=root mode=755 + #tags: install + +#- name: Add KUBE_EXTRA_ARGS to enable Plugins (GPU support) --III alreday done in common + #lineinfile: + #path: /etc/systemd/system/kubelet.service.d/10-kubeadm.conf + #line: 'Environment="KUBELET_EXTRA_ARGS=--feature-gates=DevicePlugins=true"' + #insertbefore: 'KUBELET_KUBECONFIG_ARGS=' + #tags: install + +- name: Restart and Enable docker service + service: + name: docker + 
state: restarted + enabled: yes + tags: install + +- name: Restart and Enable Kubernetes - kubelet + service: + name: kubelet + state: restarted + enabled: yes + tags: install diff --git a/roles/computeGPU/vars/main.yml b/roles/computeGPU/vars/main.yml new file mode 100644 index 0000000000..1bdf70a4c5 --- /dev/null +++ b/roles/computeGPU/vars/main.yml @@ -0,0 +1,10 @@ +--- + +common_packages: + - epel-release + - python-pip + - docker + - bash-completion + - kubelet + - kubeadm + - kubectl diff --git a/roles/master/files/k8s.conf b/roles/master/files/k8s.conf new file mode 100644 index 0000000000..9994b14823 --- /dev/null +++ b/roles/master/files/k8s.conf @@ -0,0 +1,3 @@ +net.bridge.bridge-nf-call-ip6tables = 1 +net.bridge.bridge-nf-call-iptables = 1 + diff --git a/roles/master/files/kubernetes.repo b/roles/master/files/kubernetes.repo new file mode 100644 index 0000000000..476b99cb2b --- /dev/null +++ b/roles/master/files/kubernetes.repo @@ -0,0 +1,8 @@ +[kubernetes] +name=Kubernetes +baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64 +enabled=1 +gpgcheck=1 +repo_gpgcheck=1 +gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg + diff --git a/roles/master/files/nvidia b/roles/master/files/nvidia new file mode 100644 index 0000000000..f22e77e0bb --- /dev/null +++ b/roles/master/files/nvidia @@ -0,0 +1,3 @@ +#!/bin/sh +PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" exec nvidia-container-runtime-hook "$@" + diff --git a/roles/master/tasks/main.yml b/roles/master/tasks/main.yml new file mode 100644 index 0000000000..ace5f1d01c --- /dev/null +++ b/roles/master/tasks/main.yml @@ -0,0 +1,31 @@ +--- +- name: Firewall Rule K8s:6443/tcp + command: firewall-cmd --zone=internal --add-port=6443/tcp --permanent + tags: master + +- name: Firewall Rule K8s:10250/tcp + command: firewall-cmd --zone=internal --add-port=10250/tcp --permanent + tags: master + +- name: 
Firewall Reload + command: firewall-cmd --reload + tags: master + +- name: Create /root/bin (if it doesn't exist) + file: + path: /root/bin + state: directory + mode: "0755" + +- name: Get Helm Installer + get_url: + url: https://raw.githubusercontent.com/kubernetes/helm/master/scripts/get + dest: /root/bin/get_helm.sh + mode: "0700" # quoted octal string; a bare 700 is read as a decimal integer and yields the wrong permissions + tags: master + +- name: Install Helm + command: /root/bin/get_helm.sh + tags: master + +# install and start up OpenSM - III diff --git a/roles/startmaster/files/create_admin_user.yaml b/roles/startmaster/files/create_admin_user.yaml new file mode 100644 index 0000000000..27b6bb8020 --- /dev/null +++ b/roles/startmaster/files/create_admin_user.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: admin-user + namespace: kube-system diff --git a/roles/startmaster/files/create_clusterRoleBinding.yaml b/roles/startmaster/files/create_clusterRoleBinding.yaml new file mode 100644 index 0000000000..acb5dd6c00 --- /dev/null +++ b/roles/startmaster/files/create_clusterRoleBinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: admin-user +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-admin +subjects: +- kind: ServiceAccount + name: admin-user + namespace: kube-system diff --git a/roles/startmaster/files/data-pv.yaml b/roles/startmaster/files/data-pv.yaml new file mode 100755 index 0000000000..12bcefd041 --- /dev/null +++ b/roles/startmaster/files/data-pv.yaml @@ -0,0 +1,20 @@ +# yaml file contents +apiVersion: v1 +kind: PersistentVolume +metadata: + name: data-pv +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + - ReadOnlyMany + - ReadWriteMany + nfs: + server: 10.0.0.1 + path: /work/k8s/data1 + persistentVolumeReclaimPolicy: Recycle + #storageClassName: local-storage + #hostPath: + #path: /home/k8s/data1 + diff --git a/roles/startmaster/files/data2-pv.yaml b/roles/startmaster/files/data2-pv.yaml new 
file mode 100755 index 0000000000..67a2ce77d5 --- /dev/null +++ b/roles/startmaster/files/data2-pv.yaml @@ -0,0 +1,20 @@ +# yaml file contents +apiVersion: v1 +kind: PersistentVolume +metadata: + name: data2-pv +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + - ReadOnlyMany + - ReadWriteMany + nfs: + server: 10.0.0.1 + path: /work/k8s/data2 + persistentVolumeReclaimPolicy: Recycle + #storageClassName: local-storage + #hostPath: + #path: /home/k8s/ + diff --git a/roles/startmaster/files/data3-pv.yaml b/roles/startmaster/files/data3-pv.yaml new file mode 100755 index 0000000000..2d433f7ad9 --- /dev/null +++ b/roles/startmaster/files/data3-pv.yaml @@ -0,0 +1,20 @@ +# yaml file contents +apiVersion: v1 +kind: PersistentVolume +metadata: + name: data3-pv +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + - ReadOnlyMany + - ReadWriteMany + nfs: + server: 10.0.0.1 + path: /work/k8s/data3 + persistentVolumeReclaimPolicy: Recycle + #storageClassName: local-storage + #hostPath: + #path: /home/k8s/ + diff --git a/roles/startmaster/files/data4-pv.yaml b/roles/startmaster/files/data4-pv.yaml new file mode 100755 index 0000000000..c45daaebcd --- /dev/null +++ b/roles/startmaster/files/data4-pv.yaml @@ -0,0 +1,20 @@ +# yaml file contents +apiVersion: v1 +kind: PersistentVolume +metadata: + name: data4-pv +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + - ReadOnlyMany + - ReadWriteMany + nfs: + server: 10.0.0.1 + path: /work/k8s/data4 + persistentVolumeReclaimPolicy: Recycle + #storageClassName: local-storage + #hostPath: + #path: /home/k8s/ + diff --git a/roles/startmaster/files/enable_gpu_k8s.sh b/roles/startmaster/files/enable_gpu_k8s.sh new file mode 100755 index 0000000000..f733d0ad76 --- /dev/null +++ b/roles/startmaster/files/enable_gpu_k8s.sh @@ -0,0 +1 @@ +kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v1.11/nvidia-device-plugin.yml diff --git 
a/roles/startmaster/files/flannel_net.sh b/roles/startmaster/files/flannel_net.sh new file mode 100755 index 0000000000..ab51e024ff --- /dev/null +++ b/roles/startmaster/files/flannel_net.sh @@ -0,0 +1,3 @@ +kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/v0.10.0/Documentation/kube-flannel.yml +kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/master/Documentation/k8s-manifests/kube-flannel-rbac.yml + diff --git a/roles/startmaster/files/katib-pv.yaml b/roles/startmaster/files/katib-pv.yaml new file mode 100755 index 0000000000..dc39388c66 --- /dev/null +++ b/roles/startmaster/files/katib-pv.yaml @@ -0,0 +1,16 @@ +# yaml file contents +apiVersion: v1 +kind: PersistentVolume +metadata: + name: katib-pv +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + - ReadOnlyMany + - ReadWriteMany + persistentVolumeReclaimPolicy: Recycle + #storageClassName: local-storage + hostPath: + path: /home/k8s/katibsql diff --git a/roles/startmaster/files/kube-flannel.yaml b/roles/startmaster/files/kube-flannel.yaml new file mode 100644 index 0000000000..c69fb26a77 --- /dev/null +++ b/roles/startmaster/files/kube-flannel.yaml @@ -0,0 +1,536 @@ +--- +apiVersion: extensions/v1beta1 +kind: PodSecurityPolicy +metadata: + name: psp.flannel.unprivileged + annotations: + seccomp.security.alpha.kubernetes.io/allowedProfileNames: docker/default + seccomp.security.alpha.kubernetes.io/defaultProfileName: docker/default + apparmor.security.beta.kubernetes.io/allowedProfileNames: runtime/default + apparmor.security.beta.kubernetes.io/defaultProfileName: runtime/default +spec: + privileged: false + volumes: + - configMap + - secret + - emptyDir + - hostPath + allowedHostPaths: + - pathPrefix: "/etc/cni/net.d" + - pathPrefix: "/etc/kube-flannel" + - pathPrefix: "/run/flannel" + readOnlyRootFilesystem: false + # Users and groups + runAsUser: + rule: RunAsAny + supplementalGroups: + rule: RunAsAny + fsGroup: + rule: RunAsAny + # Privilege Escalation + 
allowPrivilegeEscalation: false + defaultAllowPrivilegeEscalation: false + # Capabilities + allowedCapabilities: ['NET_ADMIN'] + defaultAddCapabilities: [] + requiredDropCapabilities: [] + # Host namespaces + hostPID: false + hostIPC: false + hostNetwork: true + hostPorts: + - min: 0 + max: 65535 + # SELinux + seLinux: + # SELinux is unsed in CaaSP + rule: 'RunAsAny' +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1beta1 +metadata: + name: flannel +rules: + - apiGroups: ['extensions'] + resources: ['podsecuritypolicies'] + verbs: ['use'] + resourceNames: ['psp.flannel.unprivileged'] + - apiGroups: + - "" + resources: + - pods + verbs: + - get + - apiGroups: + - "" + resources: + - nodes + verbs: + - list + - watch + - apiGroups: + - "" + resources: + - nodes/status + verbs: + - patch +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1beta1 +metadata: + name: flannel +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: flannel +subjects: +- kind: ServiceAccount + name: flannel + namespace: kube-system +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flannel + namespace: kube-system +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: kube-flannel-cfg + namespace: kube-system + labels: + tier: node + app: flannel +data: + cni-conf.json: | + { + "name": "cbr0", + "plugins": [ + { + "type": "flannel", + "delegate": { + "hairpinMode": true, + "isDefaultGateway": true + } + }, + { + "type": "portmap", + "capabilities": { + "portMappings": true + } + } + ] + } + net-conf.json: | + { + "Network": "10.244.0.0/16", + "Backend": { + "Type": "vxlan" + } + } +--- +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: kube-flannel-ds-amd64 + namespace: kube-system + labels: + tier: node + app: flannel +spec: + template: + metadata: + labels: + tier: node + app: flannel + spec: + hostNetwork: true + nodeSelector: + beta.kubernetes.io/arch: amd64 + tolerations: + - operator: Exists + effect: 
NoSchedule + serviceAccountName: flannel + initContainers: + - name: install-cni + image: quay.io/coreos/flannel:v0.11.0-amd64 + command: + - cp + args: + - -f + - /etc/kube-flannel/cni-conf.json + - /etc/cni/net.d/10-flannel.conflist + volumeMounts: + - name: cni + mountPath: /etc/cni/net.d + - name: flannel-cfg + mountPath: /etc/kube-flannel/ + containers: + - name: kube-flannel + image: quay.io/coreos/flannel:v0.11.0-amd64 + command: + - /opt/bin/flanneld + args: + - --ip-masq + - --kube-subnet-mgr + - --iface=ib0 + resources: + requests: + cpu: "100m" + memory: "50Mi" + limits: + cpu: "100m" + memory: "50Mi" + securityContext: + privileged: false + capabilities: + add: ["NET_ADMIN"] + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: run + mountPath: /run/flannel + - name: flannel-cfg + mountPath: /etc/kube-flannel/ + volumes: + - name: run + hostPath: + path: /run/flannel + - name: cni + hostPath: + path: /etc/cni/net.d + - name: flannel-cfg + configMap: + name: kube-flannel-cfg +--- +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: kube-flannel-ds-arm64 + namespace: kube-system + labels: + tier: node + app: flannel +spec: + template: + metadata: + labels: + tier: node + app: flannel + spec: + hostNetwork: true + nodeSelector: + beta.kubernetes.io/arch: arm64 + tolerations: + - operator: Exists + effect: NoSchedule + serviceAccountName: flannel + initContainers: + - name: install-cni + image: quay.io/coreos/flannel:v0.11.0-arm64 + command: + - cp + args: + - -f + - /etc/kube-flannel/cni-conf.json + - /etc/cni/net.d/10-flannel.conflist + volumeMounts: + - name: cni + mountPath: /etc/cni/net.d + - name: flannel-cfg + mountPath: /etc/kube-flannel/ + containers: + - name: kube-flannel + image: quay.io/coreos/flannel:v0.11.0-arm64 + command: + - /opt/bin/flanneld + args: + - --ip-masq + - --kube-subnet-mgr + - 
--iface=ib0 + resources: + requests: + cpu: "100m" + memory: "50Mi" + limits: + cpu: "100m" + memory: "50Mi" + securityContext: + privileged: false + capabilities: + add: ["NET_ADMIN"] + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: run + mountPath: /run/flannel + - name: flannel-cfg + mountPath: /etc/kube-flannel/ + volumes: + - name: run + hostPath: + path: /run/flannel + - name: cni + hostPath: + path: /etc/cni/net.d + - name: flannel-cfg + configMap: + name: kube-flannel-cfg +--- +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: kube-flannel-ds-arm + namespace: kube-system + labels: + tier: node + app: flannel +spec: + template: + metadata: + labels: + tier: node + app: flannel + spec: + hostNetwork: true + nodeSelector: + beta.kubernetes.io/arch: arm + tolerations: + - operator: Exists + effect: NoSchedule + serviceAccountName: flannel + initContainers: + - name: install-cni + image: quay.io/coreos/flannel:v0.11.0-arm + command: + - cp + args: + - -f + - /etc/kube-flannel/cni-conf.json + - /etc/cni/net.d/10-flannel.conflist + volumeMounts: + - name: cni + mountPath: /etc/cni/net.d + - name: flannel-cfg + mountPath: /etc/kube-flannel/ + containers: + - name: kube-flannel + image: quay.io/coreos/flannel:v0.11.0-arm + command: + - /opt/bin/flanneld + args: + - --ip-masq + - --kube-subnet-mgr + - --iface=ib0 + resources: + requests: + cpu: "100m" + memory: "50Mi" + limits: + cpu: "100m" + memory: "50Mi" + securityContext: + privileged: false + capabilities: + add: ["NET_ADMIN"] + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: run + mountPath: /run/flannel + - name: flannel-cfg + mountPath: /etc/kube-flannel/ + volumes: + - name: run + hostPath: + path: /run/flannel + - name: cni 
+ hostPath: + path: /etc/cni/net.d + - name: flannel-cfg + configMap: + name: kube-flannel-cfg +--- +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: kube-flannel-ds-ppc64le + namespace: kube-system + labels: + tier: node + app: flannel +spec: + template: + metadata: + labels: + tier: node + app: flannel + spec: + hostNetwork: true + nodeSelector: + beta.kubernetes.io/arch: ppc64le + tolerations: + - operator: Exists + effect: NoSchedule + serviceAccountName: flannel + initContainers: + - name: install-cni + image: quay.io/coreos/flannel:v0.11.0-ppc64le + command: + - cp + args: + - -f + - /etc/kube-flannel/cni-conf.json + - /etc/cni/net.d/10-flannel.conflist + volumeMounts: + - name: cni + mountPath: /etc/cni/net.d + - name: flannel-cfg + mountPath: /etc/kube-flannel/ + containers: + - name: kube-flannel + image: quay.io/coreos/flannel:v0.11.0-ppc64le + command: + - /opt/bin/flanneld + args: + - --ip-masq + - --kube-subnet-mgr + - --iface=ib0 + resources: + requests: + cpu: "100m" + memory: "50Mi" + limits: + cpu: "100m" + memory: "50Mi" + securityContext: + privileged: false + capabilities: + add: ["NET_ADMIN"] + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: run + mountPath: /run/flannel + - name: flannel-cfg + mountPath: /etc/kube-flannel/ + volumes: + - name: run + hostPath: + path: /run/flannel + - name: cni + hostPath: + path: /etc/cni/net.d + - name: flannel-cfg + configMap: + name: kube-flannel-cfg +--- +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: kube-flannel-ds-s390x + namespace: kube-system + labels: + tier: node + app: flannel +spec: + template: + metadata: + labels: + tier: node + app: flannel + spec: + hostNetwork: true + nodeSelector: + beta.kubernetes.io/arch: s390x + tolerations: + - operator: Exists + effect: NoSchedule + serviceAccountName: flannel + initContainers: + - 
name: install-cni + image: quay.io/coreos/flannel:v0.11.0-s390x + command: + - cp + args: + - -f + - /etc/kube-flannel/cni-conf.json + - /etc/cni/net.d/10-flannel.conflist + volumeMounts: + - name: cni + mountPath: /etc/cni/net.d + - name: flannel-cfg + mountPath: /etc/kube-flannel/ + containers: + - name: kube-flannel + image: quay.io/coreos/flannel:v0.11.0-s390x + command: + - /opt/bin/flanneld + args: + - --ip-masq + - --kube-subnet-mgr + - --iface=ib0 + resources: + requests: + cpu: "100m" + memory: "50Mi" + limits: + cpu: "100m" + memory: "50Mi" + securityContext: + privileged: false + capabilities: + add: ["NET_ADMIN"] + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: run + mountPath: /run/flannel + - name: flannel-cfg + mountPath: /etc/kube-flannel/ + volumes: + - name: run + hostPath: + path: /run/flannel + - name: cni + hostPath: + path: /etc/cni/net.d + - name: flannel-cfg + configMap: + name: kube-flannel-cfg diff --git a/roles/startmaster/files/kubeflow_persistent_volumes.yaml b/roles/startmaster/files/kubeflow_persistent_volumes.yaml new file mode 100644 index 0000000000..d438284a00 --- /dev/null +++ b/roles/startmaster/files/kubeflow_persistent_volumes.yaml @@ -0,0 +1,51 @@ +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: data1-pv +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + - ReadOnlyMany + - ReadWriteMany + nfs: + server: 10.0.0.1 + path: /work/k8s/data1 + persistentVolumeReclaimPolicy: Recycle + +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: data2-pv +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + - ReadOnlyMany + - ReadWriteMany + nfs: + server: 10.0.0.1 + path: /work/k8s/data2 + persistentVolumeReclaimPolicy: Recycle + +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: data3-pv +spec: + capacity: + storage: 50Gi + 
accessModes: + - ReadWriteOnce + - ReadOnlyMany + - ReadWriteMany + nfs: + server: 10.0.0.1 + path: /work/k8s/data3 + persistentVolumeReclaimPolicy: Recycle + diff --git a/roles/startmaster/files/minio-pvc.yaml b/roles/startmaster/files/minio-pvc.yaml new file mode 100755 index 0000000000..e21c44921e --- /dev/null +++ b/roles/startmaster/files/minio-pvc.yaml @@ -0,0 +1,16 @@ +# yaml file contents +apiVersion: v1 +kind: PersistentVolume +metadata: + name: minio-pv +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + - ReadOnlyMany + - ReadWriteMany + persistentVolumeReclaimPolicy: Recycle + #storageClassName: local-storage + hostPath: + path: /home/k8s diff --git a/roles/startmaster/files/mysql-pv.yaml b/roles/startmaster/files/mysql-pv.yaml new file mode 100755 index 0000000000..9ecce2a83f --- /dev/null +++ b/roles/startmaster/files/mysql-pv.yaml @@ -0,0 +1,17 @@ +# yaml file contents +apiVersion: v1 +kind: PersistentVolume +metadata: + name: mysql-pv +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + - ReadOnlyMany + - ReadWriteMany + persistentVolumeReclaimPolicy: Recycle + #storageClassName: local-storage + hostPath: + path: /home/k8s/ + diff --git a/roles/startmaster/files/nfs-class.yaml b/roles/startmaster/files/nfs-class.yaml new file mode 100644 index 0000000000..4d3b480578 --- /dev/null +++ b/roles/startmaster/files/nfs-class.yaml @@ -0,0 +1,7 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: managed-nfs-storage +provisioner: fuseim.pri/ifs # or choose another name, must match deployment's env PROVISIONER_NAME' +parameters: + archiveOnDelete: "false" diff --git a/roles/startmaster/files/nfs-deployment.yaml b/roles/startmaster/files/nfs-deployment.yaml new file mode 100644 index 0000000000..c6b399dbc4 --- /dev/null +++ b/roles/startmaster/files/nfs-deployment.yaml @@ -0,0 +1,32 @@ +kind: Deployment +apiVersion: extensions/v1beta1 +metadata: + name: nfs-client-provisioner +spec: + replicas: 1 + 
strategy: + type: Recreate + template: + metadata: + labels: + app: nfs-client-provisioner + spec: + serviceAccountName: nfs-client-provisioner + containers: + - name: nfs-client-provisioner + image: quay.io/external_storage/nfs-client-provisioner:latest + volumeMounts: + - name: nfs-client-root + mountPath: /persistentvolumes + env: + - name: PROVISIONER_NAME + value: fuseim.pri/ifs + - name: NFS_SERVER + value: 10.0.0.1 + - name: NFS_PATH + value: /work/k8s + volumes: + - name: nfs-client-root + nfs: + server: 10.0.0.1 + path: /work/k8s diff --git a/roles/startmaster/files/nfs-serviceaccount.yaml b/roles/startmaster/files/nfs-serviceaccount.yaml new file mode 100644 index 0000000000..edead9ade1 --- /dev/null +++ b/roles/startmaster/files/nfs-serviceaccount.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: nfs-client-provisioner diff --git a/roles/startmaster/files/nfs_clusterrole.yaml b/roles/startmaster/files/nfs_clusterrole.yaml new file mode 100644 index 0000000000..0ecb088bd5 --- /dev/null +++ b/roles/startmaster/files/nfs_clusterrole.yaml @@ -0,0 +1,20 @@ +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: nfs-client-provisioner-runner +rules: + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["get", "list", "watch", "update"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["events"] + verbs: ["create", "update", "patch"] + - apiGroups: [""] + resources: ["endpoints"] + verbs: ["get", "list", "watch", "create", "update", "patch"] \ No newline at end of file diff --git a/roles/startmaster/files/nfs_clusterrolebinding.yaml b/roles/startmaster/files/nfs_clusterrolebinding.yaml new file mode 100644 index 0000000000..0e949a27f5 --- /dev/null +++ b/roles/startmaster/files/nfs_clusterrolebinding.yaml 
@@ -0,0 +1,12 @@ +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: run-nfs-client-provisioner +subjects: + - kind: ServiceAccount + name: nfs-client-provisioner + namespace: default +roleRef: + kind: ClusterRole + name: nfs-client-provisioner-runner + apiGroup: rbac.authorization.k8s.io diff --git a/roles/startmaster/files/notebook-pv.yaml b/roles/startmaster/files/notebook-pv.yaml new file mode 100755 index 0000000000..e03aa6d8d9 --- /dev/null +++ b/roles/startmaster/files/notebook-pv.yaml @@ -0,0 +1,17 @@ +# yaml file contents +apiVersion: v1 +kind: PersistentVolume +metadata: + name: notebooks-pv +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + - ReadOnlyMany + - ReadWriteMany + #persistentVolumeReclaimPolicy: Recycle + #storageClassName: local-storage + hostPath: + path: /home/k8s/ + diff --git a/roles/startmaster/files/persistent_volumes.yaml b/roles/startmaster/files/persistent_volumes.yaml new file mode 100755 index 0000000000..dc03de0cee --- /dev/null +++ b/roles/startmaster/files/persistent_volumes.yaml @@ -0,0 +1,20 @@ +# yaml file contents +apiVersion: v1 +kind: PersistentVolume +metadata: + name: nfs-pv +spec: + capacity: + storage: 20Gi + accessModes: + #- ReadWriteOnce + #- ReadOnlyMany + - ReadWriteMany + nfs: + server: 10.0.0.1 + path: /work/k8s + #persistentVolumeReclaimPolicy: Recycle + #storageClassName: local-storage + #hostPath: + #path: /home/k8s + diff --git a/roles/startmaster/files/pvc.yaml b/roles/startmaster/files/pvc.yaml new file mode 100644 index 0000000000..abf513db58 --- /dev/null +++ b/roles/startmaster/files/pvc.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: pets-pvc + namespace: kubeflow +spec: + accessModes: + - ReadWriteMany + storageClassName: "" + resources: + requests: + storage: 20Gi diff --git a/roles/startmaster/files/tiller_config.sh b/roles/startmaster/files/tiller_config.sh new file mode 100755 index 0000000000..ff1fef79da 
--- /dev/null +++ b/roles/startmaster/files/tiller_config.sh @@ -0,0 +1,3 @@ +kubectl create serviceaccount --namespace kube-system tiller +kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller +kubectl patch deploy --namespace kube-system tiller-deploy -p '{"spec":{"template":{"spec":{"serviceAccount":"tiller"}}}}' diff --git a/roles/startmaster/tasks/main.yml b/roles/startmaster/tasks/main.yml new file mode 100644 index 0000000000..97800e39c9 --- /dev/null +++ b/roles/startmaster/tasks/main.yml @@ -0,0 +1,145 @@ +--- +- name: Turn Swap OFF (if not already disabled) + command: /usr/sbin/swapoff -a + tags: init + +- name: Initialize kubeadm + command: /bin/kubeadm init --pod-network-cidr=10.244.0.0/16 --apiserver-advertise-address=10.0.0.1 + #command: /bin/kubeadm init + register: init_output + tags: init + +- name: Setup Directory for Kubernetes environment for root + file: path=/root/.kube state=directory + tags: init + +- name: Copy Kubernetes Config for root #do this for other users too? 
+ copy: src=/etc/kubernetes/admin.conf dest=/root/.kube/config owner=root group=root mode=644 + tags: init + +- name: Cluster token + shell: kubeadm token list | cut -d ' ' -f1 | sed -n '2p' + register: K8S_TOKEN + tags: init + +- name: CA Hash + shell: openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | sed 's/^.* //' + register: K8S_MASTER_CA_HASH + tags: init + +- name: Add K8S Master IP, Token, and Hash to dummy host + add_host: + name: "K8S_TOKEN_HOLDER" + token: "{{ K8S_TOKEN.stdout }}" + hash: "{{ K8S_MASTER_CA_HASH.stdout }}" + #ip: "{{ ansible_ib0.ipv4.address }}" + ip: "{{ ansible_p3p1.ipv4.address }}" + tags: init + +- name: + debug: + msg: "[Master] K8S_TOKEN_HOLDER K8S token is {{ hostvars['K8S_TOKEN_HOLDER']['token'] }}" + tags: init + +- name: + debug: + msg: "[Master] K8S_TOKEN_HOLDER K8S Hash is {{ hostvars['K8S_TOKEN_HOLDER']['hash'] }}" + tags: init + +- name: + debug: + msg: "[Master] K8S_MASTER_IP is {{ hostvars['K8S_TOKEN_HOLDER']['ip'] }}" + tags: init + + +- name: Setup Flannel SDN network + shell: kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml + tags: init + +- name: Enabled GPU support in Kubernetes + #script: enable_gpu_k8s.sh + shell: kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/1.0.0-beta4/nvidia-device-plugin.yml + #https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v1.11/nvidia-device-plugin.yml + register: gpu_enable + tags: init + + +- name: Create yaml repo for setup + file: + path: /root/k8s + state: directory + tags: init + + +#- name: Persistent Volume Setup Files + #copy: src=persistent_volumes.yaml dest=/root/k8s/persistent_volumes.yaml owner=root group=root mode=655 + #tags: init +# +#- name: Persistent Volume Setup - Apply + #shell: kubectl apply -f /root/k8s/persistent_volumes.yaml + #tags: init + # + +#- name: Copy Service Account (NFS Setup) + #copy: 
src=nfs-serviceaccount.yaml dest=/root/k8s/nfs-serviceaccount.yaml owner=root group=root mode=655 + #tags: init +# +#- name: Copy Cluster Role (NFS Setup) + #copy: src=nfs_clusterrole.yaml dest=/root/k8s/nfs_clusterrole.yaml owner=root group=root mode=655 + #tags: init +# +#- name: Copy Cluster Role Binding (NFS Setup) + #copy: src=nfs_clusterrolebinding.yaml dest=/root/k8s/nfs_clusterrolebinding.yaml owner=root group=root mode=655 + #tags: init +# +#- name: Copy NFS Storage Deployment (NFS Setup) + #copy: src=nfs-deployment.yaml dest=/root/k8s/nfs-deployment.yaml owner=root group=root mode=655 + #tags: init +# +#- name: Copy NFS Storage Class (NFS Setup) + #copy: src=nfs-class.yaml dest=/root/k8s/nfs-class.yaml owner=root group=root mode=655 + #tags: init +# +#- name: Deploy NFS (NFS Setup) + #shell: kubectl create -f /root/k8s/nfs-deployment.yaml -f /root/k8s/nfs-class.yaml -f /root/k8s/nfs-serviceaccount.yaml -f /root/k8s/nfs_clusterrole.yaml -f /root/k8s/nfs_clusterrolebinding.yaml + #tags: init + +#- name: Patch NFS Setup (NFS Setup) + #shell: kubectl patch deployment nfs-client-provisioner -p '{"spec":{"template":{"spec":{"serviceAccount":"nfs-client-provisioner"}}}}' + #tags: init + +#- name: Patch NFS Setup (NFS Setup) + #shell: "kubectl patch storageclass managed-nfs-storage -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'" + #tags: init + + +- name: Create Service Account (K8S Dashboard) Files + copy: src=create_admin_user.yaml dest=/root/k8s/create_admin_user.yaml owner=root group=root mode=655 + tags: init + +- name: Create Service Account (K8S Dashboard) - Create + shell: kubectl create -f /root/k8s/create_admin_user.yaml + tags: init + +- name: Create ClusterRoleBinding (K8S Dashboard) Files + copy: src=create_clusterRoleBinding.yaml dest=/root/k8s/create_clusterRoleBinding.yaml owner=root group=root mode=655 + tags: init + +- name: Create ClusterRoleBinding (K8S Dashboard) - Apply + shell: kubectl create -f 
/root/k8s/create_clusterRoleBinding.yaml + tags: init + +- name: Start K8S Dashboard + shell: kubectl create -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.0.0-beta6/aio/deploy/recommended.yaml + tags: init + +- name: Dump Bearer Token for K8S Dashboard Login + shell: kubectl -n kube-system describe secret $(kubectl -n kube-system get secret | grep admin-user | awk '{print $1}') > /root/k8s/token + tags: init + +#- name: Proxy K8S Dashboard to 8001 on localhost + #shell: nohup kubectl proxy /dev/null 2>&1 & + #tags: init + +#- debug: var=init_output.stdout_lines + #tags: init diff --git a/roles/startworkers/tasks/main.yml b/roles/startworkers/tasks/main.yml new file mode 100644 index 0000000000..b74f0167bc --- /dev/null +++ b/roles/startworkers/tasks/main.yml @@ -0,0 +1,33 @@ +--- + +- name: Turn Swap OFF (if not already disabled) + command: /usr/sbin/swapoff -a + tags: init + +#- name: + #debug: + #msg: "[Worker] K8S_TOKEN_HOLDER K8S token is {{ hostvars['K8S_TOKEN_HOLDER']['token'] }}" + #tags: init + +#- name: + #debug: + #msg: "[Worker] K8S_TOKEN_HOLDER K8S Hash is {{ hostvars['K8S_TOKEN_HOLDER']['hash'] }}" + #tags: init + +#- name: + #debug: + #msg: "[Worker] K8S_MASTER_IP is {{ hostvars['K8S_TOKEN_HOLDER']['ip'] }}" + #tags: init + +- name: "Kubeadmn join" + shell: > + kubeadm join --token={{ hostvars['K8S_TOKEN_HOLDER']['token'] }} + --discovery-token-ca-cert-hash sha256:{{ hostvars['K8S_TOKEN_HOLDER']['hash'] }} + {{ hostvars['K8S_TOKEN_HOLDER']['ip'] }}:6443 + tags: init + + +#- name: Join Computes to pool +# command: "{{ kubeJoinCommand }}" +# tags: init + diff --git a/scuttle b/scuttle new file mode 100755 index 0000000000..796b7274ab --- /dev/null +++ b/scuttle @@ -0,0 +1,19 @@ +#!/bin/bash + +kubeadm reset -f +clush -ab "kubeadm reset -f" +rm -rf /var/lib/etcd/* +clush -ab "rm -rf /var/lib/etcd/*" +rm -rf /var/lib/cni/ +clush -ab "rm -rf /var/lib/cni/" +rm -rf /run/flannel/ +clush -ab "rm -rf /run/flannel/" +rm -rf /etc/cni/ +clush -ab 
"rm -rf /etc/cni/" +ifconfig cni0 down +clush -ab "ifconfig cni0 down" +ifconfig flannel.1 down +clush -ab "ifconfig flannel.1 down" +brctl delbr flannel.1 +clush -ab "brctl delbr flannel.1" +clush -ab "brctl delbr cni0" From 4498f2c35f70b886f022823e1781d353a6b7e7c0 Mon Sep 17 00:00:00 2001 From: John Lockman Date: Wed, 19 Feb 2020 11:52:05 -0600 Subject: [PATCH 2/7] Delete k8s-rdma-sriov-node-config.yaml --- k8s-rdma-sriov-node-config.yaml | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 k8s-rdma-sriov-node-config.yaml diff --git a/k8s-rdma-sriov-node-config.yaml b/k8s-rdma-sriov-node-config.yaml deleted file mode 100644 index 3b418810b2..0000000000 --- a/k8s-rdma-sriov-node-config.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: rdma-devices - namespace: kube-system -data: - config.json: | - { - "mode" : "sriov", - #"pfNetdevices": [ "eth0" , "eth1" ] - "pfNetdevices": [ "ib0" ] - } From b213021b7418517d089b58968793b6c1ccbd18c5 Mon Sep 17 00:00:00 2001 From: John Lockman Date: Wed, 19 Feb 2020 11:52:17 -0600 Subject: [PATCH 3/7] Delete k8s-sriov-cni-installer.yaml --- k8s-sriov-cni-installer.yaml | 38 ------------------------------------ 1 file changed, 38 deletions(-) delete mode 100644 k8s-sriov-cni-installer.yaml diff --git a/k8s-sriov-cni-installer.yaml b/k8s-sriov-cni-installer.yaml deleted file mode 100644 index 470080dc7d..0000000000 --- a/k8s-sriov-cni-installer.yaml +++ /dev/null @@ -1,38 +0,0 @@ -apiVersion: extensions/v1beta1 -kind: DaemonSet -metadata: - name: kube-sriov-cni-ds-installer - namespace: kube-system -spec: - template: - metadata: - labels: - name: sriov-cni-ds - spec: - hostNetwork: true - tolerations: - - key: node-role.kubernetes.io/master - operator: Exists - effect: NoSchedule - initContainers: - - name: install-cni - image: rdma/k8s-sriov-cni-installer - imagePullPolicy: IfNotPresent - command: [ "/installer/installer.sh" ] - volumeMounts: - - name: host-cni-etc - mountPath: 
/host-cni-etc - - name: host-cni-bin - mountPath: /host-cni-bin - containers: - - name: install-cni-sleep - image: rdma/k8s-sriov-cni-installer - imagePullPolicy: IfNotPresent - command: [ "/installer/installer_sleep.sh" ] - volumes: - - name: host-cni-etc - hostPath: - path: /etc/cni/net.d/ - - name: host-cni-bin - hostPath: - path: /opt/cni/bin From 821955934d11c72a6c0aa4599541b297a7ce6159 Mon Sep 17 00:00:00 2001 From: john lockman Date: Wed, 19 Feb 2020 11:53:45 -0600 Subject: [PATCH 4/7] cleaned up host inventory file --- k8s-rdma-sriov-node-config.yaml | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 k8s-rdma-sriov-node-config.yaml diff --git a/k8s-rdma-sriov-node-config.yaml b/k8s-rdma-sriov-node-config.yaml deleted file mode 100644 index 3b418810b2..0000000000 --- a/k8s-rdma-sriov-node-config.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: rdma-devices - namespace: kube-system -data: - config.json: | - { - "mode" : "sriov", - #"pfNetdevices": [ "eth0" , "eth1" ] - "pfNetdevices": [ "ib0" ] - } From aead513529ce9639ac0e9843cb68fe51cb2bb2b5 Mon Sep 17 00:00:00 2001 From: john lockman Date: Wed, 19 Feb 2020 11:54:56 -0600 Subject: [PATCH 5/7] cleaned up example --- host_inventory_file | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/host_inventory_file b/host_inventory_file index db0df5c04b..10d345c9e6 100644 --- a/host_inventory_file +++ b/host_inventory_file @@ -2,19 +2,13 @@ friday [compute] +compute[000:005] +#compute000 #compute001 #compute002 -compute000 -compute[002:005] -#compute[201:204] -#compute[301:304] -#compute[401:404] [gpus] -#compute001 -compute002 -compute004 -compute005 +compute[003:005] [workers:children] compute From 0b20e75a32f778a49771a6899a3b501a1cebc869 Mon Sep 17 00:00:00 2001 From: john lockman Date: Wed, 19 Feb 2020 11:57:42 -0600 Subject: [PATCH 6/7] removed example --- create_users/create_user.sh | 54 ------------------------------------- 
create_users/users | 6 ----- 2 files changed, 60 deletions(-) delete mode 100755 create_users/create_user.sh delete mode 100644 create_users/users diff --git a/create_users/create_user.sh b/create_users/create_user.sh deleted file mode 100755 index c583bb22b0..0000000000 --- a/create_users/create_user.sh +++ /dev/null @@ -1,54 +0,0 @@ -SLURM=0 -FILENAME='' -DEFAULT='' -while [[ $# -gt 1 ]] -do -key="$1" - -case $key in - -s|--slurm) - SLURM=1 - ;; - -f|--file) - FILENAME="$2" - shift # past argument - ;; - --default) - DEFAULT=YES - ;; - *) - # unknown option - ;; -esac -shift # past argument or value -done -echo Add Slurm Account = "${SLURM}" -echo FILENAME = "${FILENAME}" - -#input file is in the form: -#username First Last -INFILE=${FILENAME} - -while IFS='' read -r line; do - IFS=" " read -ra ACCOUNT <<< "$line" - user=${ACCOUNT[0]} - password="changeme" - pass=$(perl -e 'print crypt($ARGV[0], "password")' $password) - - echo "Creating account for $user" - useradd -m -p $pass $user - pdsh "useradd -m -p $pass $user" - #force reset on login - chage -d 0 $user - #useradd -m -p $pass $user - - #become user to create home directory - sudo su - $user "exit" - #generate ssh-keys - sudo -u $user ssh-keygen -N "" -t rsa -f /home/$user/.ssh/id_rsa - sudo -u $user cat /home/$user/.ssh/id_rsa.pub > /home/$user/.ssh/authorized_keys - chown $user:$user /home/$user/.ssh/authorized_keys - sudo -u $user chmod 0644 /home/$user/.ssh/authorized_keys - -done < $INFILE - diff --git a/create_users/users b/create_users/users deleted file mode 100644 index e6d9af2dad..0000000000 --- a/create_users/users +++ /dev/null @@ -1,6 +0,0 @@ -john.lockman -don_smith2 -lwilson -gundev1 -pei_yang -srinivas From 12dba17dbb856c4a7fe494f70e36684a352aa84b Mon Sep 17 00:00:00 2001 From: john lockman Date: Wed, 19 Feb 2020 11:58:50 -0600 Subject: [PATCH 7/7] remove exmaple --- device-plugin.yaml | 43 ------------------------------------------- 1 file changed, 43 deletions(-) delete mode 100644 
device-plugin.yaml diff --git a/device-plugin.yaml b/device-plugin.yaml deleted file mode 100644 index 9b3c5927b2..0000000000 --- a/device-plugin.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: extensions/v1beta1 -kind: DaemonSet -metadata: - name: rdma-sriov-dp-ds - namespace: kube-system -spec: - template: - metadata: - # Mark this pod as a critical add-on; when enabled, the critical add-on scheduler - # reserves resources for critical add-on pods so that they can be rescheduled after - # a failure. This annotation works in tandem with the toleration below. - annotations: - scheduler.alpha.kubernetes.io/critical-pod: "" - labels: - name: rdma-sriov-dp-ds - spec: - hostNetwork: true - tolerations: - # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode. - # This, along with the annotation above marks this pod as a critical add-on. - - key: CriticalAddonsOnly - operator: Exists - containers: - - image: rdma/k8s-rdma-sriov-dev-plugin - name: k8s-rdma-sriov-dp-ds - imagePullPolicy: IfNotPresent - securityContext: - privileged: true - volumeMounts: - - name: device-plugin - mountPath: /var/lib/kubelet/device-plugins - - name: config - mountPath: /k8s-rdma-sriov-dev-plugin - volumes: - - name: device-plugin - hostPath: - path: /var/lib/kubelet/device-plugins - - name: config - configMap: - name: rdma-devices - items: - - key: config.json - path: config.json