From e82b5f1efd24e7e434259832b7be38b6b2f18415 Mon Sep 17 00:00:00 2001 From: I308301 Date: Fri, 25 Jan 2019 16:04:01 +0530 Subject: [PATCH] Short circuit data validation on proper etcd restart. --- chart/templates/etcd-bootstrap-configmap.yaml | 76 +++++++++--- cmd/initializer.go | 14 ++- cmd/miscellaneous.go | 5 + cmd/types.go | 3 + doc/validation.md | 48 ++++++++ example/etcd-statefulset-aws.yaml | 116 +++++++++++++++--- example/etcd-statefulset-azure.yaml | 116 +++++++++++++++--- example/etcd-statefulset-gcp.yaml | 116 +++++++++++++++--- example/etcd-statefulset-local.yaml | 116 +++++++++++++++--- example/etcd-statefulset-openstack.yaml | 116 +++++++++++++++--- hack/templates/etcd-statefulset.yaml.tpl | 114 ++++++++++++++--- pkg/initializer/initializer.go | 4 +- pkg/initializer/types.go | 2 +- pkg/initializer/validator/datavalidator.go | 14 ++- pkg/initializer/validator/types.go | 12 +- pkg/server/httpAPI.go | 13 +- 16 files changed, 750 insertions(+), 135 deletions(-) diff --git a/chart/templates/etcd-bootstrap-configmap.yaml b/chart/templates/etcd-bootstrap-configmap.yaml index 5742ccbc1..0949a5f78 100644 --- a/chart/templates/etcd-bootstrap-configmap.yaml +++ b/chart/templates/etcd-bootstrap-configmap.yaml @@ -9,23 +9,65 @@ metadata: data: bootstrap.sh: |- #!/bin/sh - while true; - do - wget http://localhost:8080/initialization/status -S -O status; - STATUS=`cat status`; - case $STATUS in - "New") - wget http://localhost:8080/initialization/start -S -O - ;; - "Progress") - sleep 1; - continue;; - "Failed") - continue;; - "Successful") - exec etcd --config-file /var/etcd/config/etcd.conf.yaml - ;; - esac; - done + MARKER=/var/etcd/data/etcdmarker + + trap_and_propagate() { + PID=$1 + shift + for sig in "$@" ; do + trap "kill -$sig $PID" "$sig" + done + } + + start_managed_etcd(){ + etcd --config-file /bootstrap/etcd.conf.yml & + ETCDPID=$! + trap_and_propagate $ETCDPID INT TERM + wait $ETCDPID + RET=$? 
+ echo $RET > $MARKER + exit $RET + } + + check_and_start_etcd(){ + while true; + do + wget http://localhost:8080/initialization/status -S -O status; + STATUS=`cat status`; + case $STATUS in + "New") + wget http://localhost:8080/initialization/start?mode=$1 -S -O - ;; + "Progress") + sleep 1; + continue;; + "Failed") + continue;; + "Successful") + start_managed_etcd + break + ;; + esac; + done + } + + if [ ! -f $MARKER ] ; + then + echo "No marker file. Perform complete initialization routine and start and start etcd." + check_and_start_etcd full + else + echo "Marker file present. Check return status and decide on initialization" + run_status=`cat $MARKER` + echo "Marker content: $run_status" + if [ $run_status == '143' ] || [ $run_status == '130' ] || [ $run_status == '0' ] ; then + rm -rf $MARKER + echo "Requesting sidecar to perform sanity validation" + check_and_start_etcd sanity + else + rm -rf $MARKER + echo "Requesting sidecar to perform full validation" + check_and_start_etcd full + fi + fi etcd.conf.yaml: |- # Human-readable name for this member. name: etcd-{{.Values.role}} diff --git a/cmd/initializer.go b/cmd/initializer.go index f5f9e7404..3d77a4e97 100644 --- a/cmd/initializer.go +++ b/cmd/initializer.go @@ -20,6 +20,7 @@ import ( "github.com/coreos/etcd/pkg/types" "github.com/gardener/etcd-backup-restore/pkg/initializer" + "github.com/gardener/etcd-backup-restore/pkg/initializer/validator" "github.com/gardener/etcd-backup-restore/pkg/snapshot/restorer" "github.com/gardener/etcd-backup-restore/pkg/snapstore" "github.com/sirupsen/logrus" @@ -36,6 +37,7 @@ func NewInitializeCommand(stopCh <-chan struct{}) *cobra.Command { Short: "initialize an etcd instance.", Long: fmt.Sprintf(`Initializes an etcd instance. 
Data directory is checked for corruption and restored in case of corruption.`), Run: func(cmd *cobra.Command, args []string) { + var mode validator.Mode logger := logrus.New() clusterUrlsMap, err := types.NewURLsMap(restoreCluster) @@ -48,6 +50,15 @@ func NewInitializeCommand(stopCh <-chan struct{}) *cobra.Command { logger.Fatalf("failed parsing peers urls for restore cluster: %v", err) } + switch validator.Mode(validationMode) { + case validator.Full: + mode = validator.Full + case validator.Sanity: + mode = validator.Sanity + default: + logger.Fatal("validation-mode can only be one of these values [full/sanity]") + } + options := &restorer.RestoreOptions{ RestoreDataDir: path.Clean(restoreDataDir), Name: restoreName, @@ -70,7 +81,7 @@ func NewInitializeCommand(stopCh <-chan struct{}) *cobra.Command { } } etcdInitializer := initializer.NewInitializer(options, snapstoreConfig, logger) - err = etcdInitializer.Initialize() + err = etcdInitializer.Initialize(mode) if err != nil { logger.Fatalf("initializer failed. 
%v", err) } @@ -78,5 +89,6 @@ func NewInitializeCommand(stopCh <-chan struct{}) *cobra.Command { } initializeEtcdFlags(initializeCmd) initializeSnapstoreFlags(initializeCmd) + initializeValidatorFlags(initializeCmd) return initializeCmd } diff --git a/cmd/miscellaneous.go b/cmd/miscellaneous.go index 471ce937c..cdc655fc6 100644 --- a/cmd/miscellaneous.go +++ b/cmd/miscellaneous.go @@ -17,6 +17,7 @@ package cmd import ( "runtime" + "github.com/gardener/etcd-backup-restore/pkg/initializer/validator" ver "github.com/gardener/etcd-backup-restore/pkg/version" "github.com/spf13/cobra" ) @@ -36,3 +37,7 @@ func printVersionInfo() { logger.Infof("Go Version: %s", runtime.Version()) logger.Infof("Go OS/Arch: %s/%s", runtime.GOOS, runtime.GOARCH) } + +func initializeValidatorFlags(cmd *cobra.Command) { + cmd.Flags().StringVar(&validationMode, "validation-mode", string(validator.Full), "mode to do data initialization[full/sanity]") +} diff --git a/cmd/types.go b/cmd/types.go index 459f3e051..9c484e68d 100644 --- a/cmd/types.go +++ b/cmd/types.go @@ -64,6 +64,9 @@ var ( storagePrefix string maxParallelChunkUploads int snapstoreTempDir string + + //initializer flags + validationMode string ) var emptyStruct struct{} diff --git a/doc/validation.md b/doc/validation.md index b4fa0f69f..7e9c9cc70 100644 --- a/doc/validation.md +++ b/doc/validation.md @@ -1 +1,49 @@ # Etcd data validation + + As etcd is being used to store the state of the K8s cluster, it is mandatory that etcd deployment has to be hardened against data loss. Sufficient checks have to be in place to prevent etcd from erroneously starting with stale/corrupt data and taking stale snapshots to the backing store. We have a data validation flow in place which prevents etcd from starting in case of data corruption. 
+ + ## Directory validation +The etcd data directory validation comprises multiple checks, as mentioned below: +### Structure validation +The member directory, snap directory and wal directory are checked to ascertain that they adhere to the directory structure followed by etcd. +### Content validation +#### Corruption check +The contents of the data directory (db file, snap files and wal file) are checked for data corruption. +#### Revision check +The revision of etcd data in the db file is checked against the revision of the latest snapshot in the backing store. If the revision in the backing store is greater than that of etcd data in the db file, etcd data is considered stale. This is to prevent etcd snapshots for stale revisions from overwriting legitimate recent snapshots. + + ## Validation flow +Not all validation steps take the same time to complete. Some validation steps are dependent on the size of etcd data (e.g. the db file). If the db file is checked for data corruption before etcd startup, it would take longer for etcd to become serviceable. Therefore, the time-consuming validation checks need to be performed only on abnormal etcd events, like an etcd restart after a crash. The validation flow mentioned below is modeled with the aforementioned rationale in mind. + + * Check if etcd resulted in an abnormal exit. Is it possible to identify previous etcd run status? + * No + * Do directory structure validation. + * Do directory content validation. + * Start etcd + * Yes + * Check if previous exit was normal + * Yes + * Do revision check + * Do directory structure validation. + * Start etcd + * No + * Do directory structure validation. + * Do directory content validation. + * Start etcd + + ## Additional design decisions to be made + Currently, we have the validation check triggered from a bash script in the etcd container. The status of the validation check is polled until it is completed and, based on the validation status, it is decided whether it is safe to start etcd. 
During validation, if the etcd data directory is found to be corrupt or stale, the latest snapshot in the backing store is used to restore etcd data to the latest revision. + + ### Question 1: Should the sidecar container be able to act on the status of the previous etcd run? + + * **Option 1**: Yes. Information about the previous etcd run may be made available to the sidecar container via configmaps. The idea is that the `validate` REST endpoint shall check the shared configmap for status, and perform the necessary validation and restore steps before etcd starts. + + * **Option 2**: No. If the above-mentioned level of granularity is to be available for validation checks, we would need to modify the REST endpoints to trigger the validation sub-checks. Should we modify the bash script to handle the cases and let the sidecar be agnostic to the status of the previous etcd run? + + We have chosen the approach where the script decides, based on the previous exit status of etcd, which validation step to call. If etcd terminated normally, sanity validation is performed; otherwise, we perform a full etcd data validation. + + ### Question 2: How should the status of the previous etcd run be identified? +* **Option 1**: The error logs of the etcd run can be dumped to a log file on the persistent disk. This can be checked on subsequent validation steps to identify the status of the previous etcd run. +* **Option 2**: Via the exit code stored in a file on the persistent disk. This can be checked on subsequent validation steps to identify the status of the previous etcd run. + +Since we do not analyze the logs at this point in time, we have chosen Option 2; the log dump and subsequent analysis steps can be taken care of in a separate PR. 
\ No newline at end of file diff --git a/example/etcd-statefulset-aws.yaml b/example/etcd-statefulset-aws.yaml index f85bad92c..87695d903 100644 --- a/example/etcd-statefulset-aws.yaml +++ b/example/etcd-statefulset-aws.yaml @@ -12,7 +12,7 @@ spec: protocol: TCP clusterIP: None selector: - app: etcd + app: etcd-aws --- apiVersion: v1 kind: ConfigMap @@ -23,23 +23,103 @@ metadata: data: bootstrap.sh: |- #!/bin/sh - while true; - do - wget http://localhost:8080/initialization/status -S -O status; - STATUS=`cat status`; - case $STATUS in - "New") - wget http://localhost:8080/initialization/start -S -O - ;; - "Progress") - sleep 1; - continue;; - "Failed") - continue;; - "Successful") - exec etcd --data-dir=/var/etcd/data/new.etcd --name=etcd --advertise-client-urls=http://0.0.0.0:2379 --listen-client-urls=http://0.0.0.0:2379 --initial-cluster-state=new --initial-cluster-token=new - ;; - esac; - done + MARKER=/var/etcd/data/etcdmarker + + trap_and_propagate() { + PID=$1 + shift + for sig in "$@" ; do + trap "kill -$sig $PID" "$sig" + done + } + + start_managed_etcd(){ + etcd --config-file /bootstrap/etcd.conf.yml & + ETCDPID=$! + trap_and_propagate $ETCDPID INT TERM + wait $ETCDPID + RET=$? + echo $RET > $MARKER + exit $RET + } + + check_and_start_etcd(){ + while true; + do + wget http://localhost:8080/initialization/status -S -O status; + STATUS=`cat status`; + case $STATUS in + "New") + wget http://localhost:8080/initialization/start?mode=$1 -S -O - ;; + "Progress") + sleep 1; + continue;; + "Failed") + continue;; + "Successful") + start_managed_etcd + break + ;; + esac; + done + } + + if [ ! -f $MARKER ] ; + then + echo "No marker file. Perform complete initialization routine and start and start etcd." + check_and_start_etcd full + else + echo "Marker file present. 
Check return status and decide on initialization" + run_status=`cat $MARKER` + echo "Marker content: $run_status" + if [ $run_status == '143' ] || [ $run_status == '130' ] || [ $run_status == '0' ] ; then + rm -rf $MARKER + echo "Requesting sidecar to perform sanity validation" + check_and_start_etcd sanity + else + rm -rf $MARKER + echo "Requesting sidecar to perform full validation" + check_and_start_etcd full + fi + fi + etcd.conf.yml: |- + # This is the configuration file for the etcd server. + + # Human-readable name for this member. + name: etcd-new + + # Path to the data directory. + data-dir: /var/etcd/data/new.etcd + + # List of this member's client URLs to advertise to the public. + # The URLs needed to be a comma-separated list. + advertise-client-urls: http://0.0.0.0:2379 + + # List of comma separated URLs to listen on for client traffic. + listen-client-urls: http://0.0.0.0:2379 + + # Initial cluster token for the etcd cluster during bootstrap. + initial-cluster-token: 'new' + + # Initial cluster state ('new' or 'existing'). + initial-cluster-state: 'new' + + # Number of committed transactions to trigger a snapshot to disk. + snapshot-count: 75000 + + # Raise alarms when backend size exceeds the given quota. 0 means use the + # default quota. 
+ quota-backend-bytes: 8589934592 + + # Accept etcd V2 client requests + enable-v2: false + + # keep one day of history + auto-compaction-mode: periodic + auto-compaction-retention: "24" + + + --- apiVersion: apps/v1beta1 kind: StatefulSet diff --git a/example/etcd-statefulset-azure.yaml b/example/etcd-statefulset-azure.yaml index 65c74e318..15d011214 100644 --- a/example/etcd-statefulset-azure.yaml +++ b/example/etcd-statefulset-azure.yaml @@ -12,7 +12,7 @@ spec: protocol: TCP clusterIP: None selector: - app: etcd + app: etcd-azure --- apiVersion: v1 kind: ConfigMap @@ -23,23 +23,103 @@ metadata: data: bootstrap.sh: |- #!/bin/sh - while true; - do - wget http://localhost:8080/initialization/status -S -O status; - STATUS=`cat status`; - case $STATUS in - "New") - wget http://localhost:8080/initialization/start -S -O - ;; - "Progress") - sleep 1; - continue;; - "Failed") - continue;; - "Successful") - exec etcd --data-dir=/var/etcd/data/new.etcd --name=etcd --advertise-client-urls=http://0.0.0.0:2379 --listen-client-urls=http://0.0.0.0:2379 --initial-cluster-state=new --initial-cluster-token=new - ;; - esac; - done + MARKER=/var/etcd/data/etcdmarker + + trap_and_propagate() { + PID=$1 + shift + for sig in "$@" ; do + trap "kill -$sig $PID" "$sig" + done + } + + start_managed_etcd(){ + etcd --config-file /bootstrap/etcd.conf.yml & + ETCDPID=$! + trap_and_propagate $ETCDPID INT TERM + wait $ETCDPID + RET=$? + echo $RET > $MARKER + exit $RET + } + + check_and_start_etcd(){ + while true; + do + wget http://localhost:8080/initialization/status -S -O status; + STATUS=`cat status`; + case $STATUS in + "New") + wget http://localhost:8080/initialization/start?mode=$1 -S -O - ;; + "Progress") + sleep 1; + continue;; + "Failed") + continue;; + "Successful") + start_managed_etcd + break + ;; + esac; + done + } + + if [ ! -f $MARKER ] ; + then + echo "No marker file. Perform complete initialization routine and start and start etcd." 
+ check_and_start_etcd full + else + echo "Marker file present. Check return status and decide on initialization" + run_status=`cat $MARKER` + echo "Marker content: $run_status" + if [ $run_status == '143' ] || [ $run_status == '130' ] || [ $run_status == '0' ] ; then + rm -rf $MARKER + echo "Requesting sidecar to perform sanity validation" + check_and_start_etcd sanity + else + rm -rf $MARKER + echo "Requesting sidecar to perform full validation" + check_and_start_etcd full + fi + fi + etcd.conf.yml: |- + # This is the configuration file for the etcd server. + + # Human-readable name for this member. + name: etcd-new + + # Path to the data directory. + data-dir: /var/etcd/data/new.etcd + + # List of this member's client URLs to advertise to the public. + # The URLs needed to be a comma-separated list. + advertise-client-urls: http://0.0.0.0:2379 + + # List of comma separated URLs to listen on for client traffic. + listen-client-urls: http://0.0.0.0:2379 + + # Initial cluster token for the etcd cluster during bootstrap. + initial-cluster-token: 'new' + + # Initial cluster state ('new' or 'existing'). + initial-cluster-state: 'new' + + # Number of committed transactions to trigger a snapshot to disk. + snapshot-count: 75000 + + # Raise alarms when backend size exceeds the given quota. 0 means use the + # default quota. 
+ quota-backend-bytes: 8589934592 + + # Accept etcd V2 client requests + enable-v2: false + + # keep one day of history + auto-compaction-mode: periodic + auto-compaction-retention: "24" + + + --- apiVersion: apps/v1beta1 kind: StatefulSet diff --git a/example/etcd-statefulset-gcp.yaml b/example/etcd-statefulset-gcp.yaml index 657a58dc0..30dac3e79 100644 --- a/example/etcd-statefulset-gcp.yaml +++ b/example/etcd-statefulset-gcp.yaml @@ -12,7 +12,7 @@ spec: protocol: TCP clusterIP: None selector: - app: etcd + app: etcd-gcp --- apiVersion: v1 kind: ConfigMap @@ -23,23 +23,103 @@ metadata: data: bootstrap.sh: |- #!/bin/sh - while true; - do - wget http://localhost:8080/initialization/status -S -O status; - STATUS=`cat status`; - case $STATUS in - "New") - wget http://localhost:8080/initialization/start -S -O - ;; - "Progress") - sleep 1; - continue;; - "Failed") - continue;; - "Successful") - exec etcd --data-dir=/var/etcd/data/new.etcd --name=etcd --advertise-client-urls=http://0.0.0.0:2379 --listen-client-urls=http://0.0.0.0:2379 --initial-cluster-state=new --initial-cluster-token=new - ;; - esac; - done + MARKER=/var/etcd/data/etcdmarker + + trap_and_propagate() { + PID=$1 + shift + for sig in "$@" ; do + trap "kill -$sig $PID" "$sig" + done + } + + start_managed_etcd(){ + etcd --config-file /bootstrap/etcd.conf.yml & + ETCDPID=$! + trap_and_propagate $ETCDPID INT TERM + wait $ETCDPID + RET=$? + echo $RET > $MARKER + exit $RET + } + + check_and_start_etcd(){ + while true; + do + wget http://localhost:8080/initialization/status -S -O status; + STATUS=`cat status`; + case $STATUS in + "New") + wget http://localhost:8080/initialization/start?mode=$1 -S -O - ;; + "Progress") + sleep 1; + continue;; + "Failed") + continue;; + "Successful") + start_managed_etcd + break + ;; + esac; + done + } + + if [ ! -f $MARKER ] ; + then + echo "No marker file. Perform complete initialization routine and start and start etcd." 
+ check_and_start_etcd full + else + echo "Marker file present. Check return status and decide on initialization" + run_status=`cat $MARKER` + echo "Marker content: $run_status" + if [ $run_status == '143' ] || [ $run_status == '130' ] || [ $run_status == '0' ] ; then + rm -rf $MARKER + echo "Requesting sidecar to perform sanity validation" + check_and_start_etcd sanity + else + rm -rf $MARKER + echo "Requesting sidecar to perform full validation" + check_and_start_etcd full + fi + fi + etcd.conf.yml: |- + # This is the configuration file for the etcd server. + + # Human-readable name for this member. + name: etcd-new + + # Path to the data directory. + data-dir: /var/etcd/data/new.etcd + + # List of this member's client URLs to advertise to the public. + # The URLs needed to be a comma-separated list. + advertise-client-urls: http://0.0.0.0:2379 + + # List of comma separated URLs to listen on for client traffic. + listen-client-urls: http://0.0.0.0:2379 + + # Initial cluster token for the etcd cluster during bootstrap. + initial-cluster-token: 'new' + + # Initial cluster state ('new' or 'existing'). + initial-cluster-state: 'new' + + # Number of committed transactions to trigger a snapshot to disk. + snapshot-count: 75000 + + # Raise alarms when backend size exceeds the given quota. 0 means use the + # default quota. 
+ quota-backend-bytes: 8589934592 + + # Accept etcd V2 client requests + enable-v2: false + + # keep one day of history + auto-compaction-mode: periodic + auto-compaction-retention: "24" + + + --- apiVersion: apps/v1beta1 kind: StatefulSet diff --git a/example/etcd-statefulset-local.yaml b/example/etcd-statefulset-local.yaml index 6bd79d101..5967792e7 100644 --- a/example/etcd-statefulset-local.yaml +++ b/example/etcd-statefulset-local.yaml @@ -12,7 +12,7 @@ spec: protocol: TCP clusterIP: None selector: - app: etcd + app: etcd-local --- apiVersion: v1 kind: ConfigMap @@ -23,23 +23,103 @@ metadata: data: bootstrap.sh: |- #!/bin/sh - while true; - do - wget http://localhost:8080/initialization/status -S -O status; - STATUS=`cat status`; - case $STATUS in - "New") - wget http://localhost:8080/initialization/start -S -O - ;; - "Progress") - sleep 1; - continue;; - "Failed") - continue;; - "Successful") - exec etcd --data-dir=/var/etcd/data/new.etcd --name=etcd --advertise-client-urls=http://0.0.0.0:2379 --listen-client-urls=http://0.0.0.0:2379 --initial-cluster-state=new --initial-cluster-token=new - ;; - esac; - done + MARKER=/var/etcd/data/etcdmarker + + trap_and_propagate() { + PID=$1 + shift + for sig in "$@" ; do + trap "kill -$sig $PID" "$sig" + done + } + + start_managed_etcd(){ + etcd --config-file /bootstrap/etcd.conf.yml & + ETCDPID=$! + trap_and_propagate $ETCDPID INT TERM + wait $ETCDPID + RET=$? + echo $RET > $MARKER + exit $RET + } + + check_and_start_etcd(){ + while true; + do + wget http://localhost:8080/initialization/status -S -O status; + STATUS=`cat status`; + case $STATUS in + "New") + wget http://localhost:8080/initialization/start?mode=$1 -S -O - ;; + "Progress") + sleep 1; + continue;; + "Failed") + continue;; + "Successful") + start_managed_etcd + break + ;; + esac; + done + } + + if [ ! -f $MARKER ] ; + then + echo "No marker file. Perform complete initialization routine and start and start etcd." 
+ check_and_start_etcd full + else + echo "Marker file present. Check return status and decide on initialization" + run_status=`cat $MARKER` + echo "Marker content: $run_status" + if [ $run_status == '143' ] || [ $run_status == '130' ] || [ $run_status == '0' ] ; then + rm -rf $MARKER + echo "Requesting sidecar to perform sanity validation" + check_and_start_etcd sanity + else + rm -rf $MARKER + echo "Requesting sidecar to perform full validation" + check_and_start_etcd full + fi + fi + etcd.conf.yml: |- + # This is the configuration file for the etcd server. + + # Human-readable name for this member. + name: etcd-new + + # Path to the data directory. + data-dir: /var/etcd/data/new.etcd + + # List of this member's client URLs to advertise to the public. + # The URLs needed to be a comma-separated list. + advertise-client-urls: http://0.0.0.0:2379 + + # List of comma separated URLs to listen on for client traffic. + listen-client-urls: http://0.0.0.0:2379 + + # Initial cluster token for the etcd cluster during bootstrap. + initial-cluster-token: 'new' + + # Initial cluster state ('new' or 'existing'). + initial-cluster-state: 'new' + + # Number of committed transactions to trigger a snapshot to disk. + snapshot-count: 75000 + + # Raise alarms when backend size exceeds the given quota. 0 means use the + # default quota. 
+ quota-backend-bytes: 8589934592 + + # Accept etcd V2 client requests + enable-v2: false + + # keep one day of history + auto-compaction-mode: periodic + auto-compaction-retention: "24" + + + --- apiVersion: apps/v1beta1 kind: StatefulSet diff --git a/example/etcd-statefulset-openstack.yaml b/example/etcd-statefulset-openstack.yaml index dc5b30d1c..3ddd49bdb 100644 --- a/example/etcd-statefulset-openstack.yaml +++ b/example/etcd-statefulset-openstack.yaml @@ -12,7 +12,7 @@ spec: protocol: TCP clusterIP: None selector: - app: etcd + app: etcd-openstack --- apiVersion: v1 kind: ConfigMap @@ -23,23 +23,103 @@ metadata: data: bootstrap.sh: |- #!/bin/sh - while true; - do - wget http://localhost:8080/initialization/status -S -O status; - STATUS=`cat status`; - case $STATUS in - "New") - wget http://localhost:8080/initialization/start -S -O - ;; - "Progress") - sleep 1; - continue;; - "Failed") - continue;; - "Successful") - exec etcd --data-dir=/var/etcd/data/new.etcd --name=etcd --advertise-client-urls=http://0.0.0.0:2379 --listen-client-urls=http://0.0.0.0:2379 --initial-cluster-state=new --initial-cluster-token=new - ;; - esac; - done + MARKER=/var/etcd/data/etcdmarker + + trap_and_propagate() { + PID=$1 + shift + for sig in "$@" ; do + trap "kill -$sig $PID" "$sig" + done + } + + start_managed_etcd(){ + etcd --config-file /bootstrap/etcd.conf.yml & + ETCDPID=$! + trap_and_propagate $ETCDPID INT TERM + wait $ETCDPID + RET=$? + echo $RET > $MARKER + exit $RET + } + + check_and_start_etcd(){ + while true; + do + wget http://localhost:8080/initialization/status -S -O status; + STATUS=`cat status`; + case $STATUS in + "New") + wget http://localhost:8080/initialization/start?mode=$1 -S -O - ;; + "Progress") + sleep 1; + continue;; + "Failed") + continue;; + "Successful") + start_managed_etcd + break + ;; + esac; + done + } + + if [ ! -f $MARKER ] ; + then + echo "No marker file. Perform complete initialization routine and start and start etcd." 
+ check_and_start_etcd full + else + echo "Marker file present. Check return status and decide on initialization" + run_status=`cat $MARKER` + echo "Marker content: $run_status" + if [ $run_status == '143' ] || [ $run_status == '130' ] || [ $run_status == '0' ] ; then + rm -rf $MARKER + echo "Requesting sidecar to perform sanity validation" + check_and_start_etcd sanity + else + rm -rf $MARKER + echo "Requesting sidecar to perform full validation" + check_and_start_etcd full + fi + fi + etcd.conf.yml: |- + # This is the configuration file for the etcd server. + + # Human-readable name for this member. + name: etcd-new + + # Path to the data directory. + data-dir: /var/etcd/data/new.etcd + + # List of this member's client URLs to advertise to the public. + # The URLs needed to be a comma-separated list. + advertise-client-urls: http://0.0.0.0:2379 + + # List of comma separated URLs to listen on for client traffic. + listen-client-urls: http://0.0.0.0:2379 + + # Initial cluster token for the etcd cluster during bootstrap. + initial-cluster-token: 'new' + + # Initial cluster state ('new' or 'existing'). + initial-cluster-state: 'new' + + # Number of committed transactions to trigger a snapshot to disk. + snapshot-count: 75000 + + # Raise alarms when backend size exceeds the given quota. 0 means use the + # default quota. 
+ quota-backend-bytes: 8589934592 + + # Accept etcd V2 client requests + enable-v2: false + + # keep one day of history + auto-compaction-mode: periodic + auto-compaction-retention: "24" + + + --- apiVersion: apps/v1beta1 kind: StatefulSet diff --git a/hack/templates/etcd-statefulset.yaml.tpl b/hack/templates/etcd-statefulset.yaml.tpl index 14965e5e2..ae83cb7b5 100644 --- a/hack/templates/etcd-statefulset.yaml.tpl +++ b/hack/templates/etcd-statefulset.yaml.tpl @@ -44,23 +44,103 @@ metadata: data: bootstrap.sh: |- #!/bin/sh - while true; - do - wget http://localhost:8080/initialization/status -S -O status; - STATUS=`cat status`; - case $STATUS in - "New") - wget http://localhost:8080/initialization/start -S -O - ;; - "Progress") - sleep 1; - continue;; - "Failed") - continue;; - "Successful") - exec etcd --data-dir=/var/etcd/data/new.etcd --name=etcd --advertise-client-urls=http://0.0.0.0:2379 --listen-client-urls=http://0.0.0.0:2379 --initial-cluster-state=new --initial-cluster-token=new - ;; - esac; - done + MARKER=/var/etcd/data/etcdmarker + + trap_and_propagate() { + PID=$1 + shift + for sig in "$@" ; do + trap "kill -$sig $PID" "$sig" + done + } + + start_managed_etcd(){ + etcd --config-file /bootstrap/etcd.conf.yml & + ETCDPID=$! + trap_and_propagate $ETCDPID INT TERM + wait $ETCDPID + RET=$? + echo $RET > $MARKER + exit $RET + } + + check_and_start_etcd(){ + while true; + do + wget http://localhost:8080/initialization/status -S -O status; + STATUS=`cat status`; + case $STATUS in + "New") + wget http://localhost:8080/initialization/start?mode=$1 -S -O - ;; + "Progress") + sleep 1; + continue;; + "Failed") + continue;; + "Successful") + start_managed_etcd + break + ;; + esac; + done + } + + if [ ! -f $MARKER ] ; + then + echo "No marker file. Perform complete initialization routine and start etcd." + check_and_start_etcd full + else + echo "Marker file present. 
Check return status and decide on initialization" + run_status=`cat $MARKER` + echo "Marker content: $run_status" + if [ $run_status == '143' ] || [ $run_status == '130' ] || [ $run_status == '0' ] ; then + rm -rf $MARKER + echo "Requesting sidecar to perform sanity validation" + check_and_start_etcd sanity + else + rm -rf $MARKER + echo "Requesting sidecar to perform full validation" + check_and_start_etcd full + fi + fi + etcd.conf.yml: |- + # This is the configuration file for the etcd server. + + # Human-readable name for this member. + name: etcd-new + + # Path to the data directory. + data-dir: /var/etcd/data/new.etcd + + # List of this member's client URLs to advertise to the public. + # The URLs needed to be a comma-separated list. + advertise-client-urls: http://0.0.0.0:2379 + + # List of comma separated URLs to listen on for client traffic. + listen-client-urls: http://0.0.0.0:2379 + + # Initial cluster token for the etcd cluster during bootstrap. + initial-cluster-token: 'new' + + # Initial cluster state ('new' or 'existing'). + initial-cluster-state: 'new' + + # Number of committed transactions to trigger a snapshot to disk. + snapshot-count: 75000 + + # Raise alarms when backend size exceeds the given quota. 0 means use the + # default quota. + quota-backend-bytes: 8589934592 + + # Accept etcd V2 client requests + enable-v2: false + + # keep one day of history + auto-compaction-mode: periodic + auto-compaction-retention: "24" + + + --- apiVersion: apps/v1beta1 kind: StatefulSet diff --git a/pkg/initializer/initializer.go b/pkg/initializer/initializer.go index aa036f21e..797cb6296 100644 --- a/pkg/initializer/initializer.go +++ b/pkg/initializer/initializer.go @@ -42,9 +42,9 @@ const ( // * Check if Latest snapshot available. // - Try to perform an Etcd data restoration from the latest snapshot. // - No snapshots are available, start etcd as a fresh installation. 
-func (e *EtcdInitializer) Initialize() error { +func (e *EtcdInitializer) Initialize(mode validator.Mode) error { start := time.Now() - dataDirStatus, err := e.Validator.Validate() + dataDirStatus, err := e.Validator.Validate(mode) if err != nil && dataDirStatus != validator.DataDirectoryNotExist { metrics.ValidationDurationSeconds.With(prometheus.Labels{metrics.LabelSucceeded: metrics.ValueSucceededFalse}).Observe(time.Now().Sub(start).Seconds()) err = fmt.Errorf("error while initializing: %v", err) diff --git a/pkg/initializer/types.go b/pkg/initializer/types.go index 877204a07..f93b782c7 100644 --- a/pkg/initializer/types.go +++ b/pkg/initializer/types.go @@ -38,5 +38,5 @@ type EtcdInitializer struct { // Initializer is the interface for etcd initialization actions. type Initializer interface { - Initialize() error + Initialize(validator.Mode) error } diff --git a/pkg/initializer/validator/datavalidator.go b/pkg/initializer/validator/datavalidator.go index 0ef924bf0..ebc6ca8d9 100644 --- a/pkg/initializer/validator/datavalidator.go +++ b/pkg/initializer/validator/datavalidator.go @@ -76,7 +76,7 @@ func (d *DataValidator) backendPath() string { return filepath.Join(d.snapDir(), // - If data directory structure is invalid return DataDirectoryInvStruct status. // * Check for data corruption. // - return data directory corruption status. -func (d *DataValidator) Validate() (DataDirStatus, error) { +func (d *DataValidator) Validate(mode Mode) (DataDirStatus, error) { dataDir := d.Config.DataDir dirExists, err := directoryExist(dataDir) if err != nil { @@ -106,10 +106,14 @@ func (d *DataValidator) Validate() (DataDirStatus, error) { d.Logger.Info("Skipping check for revision consistency, since no snapstore configured.") } - d.Logger.Info("Checking for data directory files corruption...") - if err = d.checkForDataCorruption(); err != nil { - d.Logger.Infof("Data directory corrupt. 
%v", err) - return DataDirectoryCorrupt, nil + switch mode { + case Full: + d.Logger.Info("Checking for data directory files corruption...") + err = d.checkForDataCorruption() + if err != nil { + d.Logger.Infof("Data directory corrupt. %v", err) + return DataDirectoryCorrupt, nil + } } d.Logger.Info("Data directory valid.") diff --git a/pkg/initializer/validator/types.go b/pkg/initializer/validator/types.go index e5a95cd70..56e069b1d 100644 --- a/pkg/initializer/validator/types.go +++ b/pkg/initializer/validator/types.go @@ -41,6 +41,16 @@ const ( snapSuffix = ".snap" ) +// Mode is the Validation mode passed on to the DataValidator +type Mode string + +const ( + // Full Mode does complete validation including the data directory contents for corruption. + Full Mode = "full" + // Sanity Mode validates only the data revision against the revision in the backup store. + Sanity Mode = "sanity" +) + // Config store configuration for DataValidator. type Config struct { DataDir string @@ -55,5 +65,5 @@ type DataValidator struct { // Validator is the interface for data validation actions. type Validator interface { - Validate() error + Validate(Mode) error } diff --git a/pkg/server/httpAPI.go b/pkg/server/httpAPI.go index 549b81b48..05662db93 100644 --- a/pkg/server/httpAPI.go +++ b/pkg/server/httpAPI.go @@ -22,6 +22,7 @@ import ( "sync/atomic" "github.com/gardener/etcd-backup-restore/pkg/initializer" + "github.com/gardener/etcd-backup-restore/pkg/initializer/validator" "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/sirupsen/logrus" ) @@ -136,12 +137,22 @@ func (h *HTTPHandler) serveInitialize(rw http.ResponseWriter, req *http.Request) h.initializationStatus = initializationStatusProgress go func() { // This is needed to stop snapshotter. 
+ var mode validator.Mode atomic.StoreUint32(&h.AckState, HandlerAckWaiting) h.Logger.Info("Changed handler state.") h.ReqCh <- emptyStruct h.Logger.Info("Waiting for acknowledgment...") <-h.AckCh - err := h.EtcdInitializer.Initialize() + switch modeVal := req.URL.Query().Get("mode"); modeVal { + case string(validator.Full): + mode = validator.Full + case string(validator.Sanity): + mode = validator.Sanity + default: + mode = validator.Full + } + h.Logger.Infof("Validation mode: %s", mode) + err := h.EtcdInitializer.Initialize(mode) h.initializationStatusMutex.Lock() defer h.initializationStatusMutex.Unlock() if err != nil {