addMember.sh: |
  #!/bin/bash -x
  #
  # Registers this pod as an etcd member.
  #   * Waits until both CA bundle files exist.
  #   * Ordinal 0, first start only: starts a local etcd, creates the root
  #     user, enables authentication, then exits.
  #   * Otherwise: adds this member to the cluster (or updates its peer URL)
  #     and exits 0 once that succeeded; retries every 2 seconds.
  #
  # Environment read here: ETCD_DATA_DIR, ETCD_NAME,
  # ETCD_INITIAL_ADVERTISE_PEER_URLS, ACL_ROOT_PASSWORD.

  # change file ownership of pv
  chown -R 250422:250422 "${ETCD_DATA_DIR}"

  # the ordinal is whatever follows the last '-' in the member name
  ordinal=${ETCD_NAME##*-}
  OPERATIONAL_CA_CERT_FILE=/run/sec/cas/clientca/client-cacertbundle.pem
  SIPTLS_CA_CERT_FILE=/run/sec/cas/siptlsca/cacertbundle.pem

  echo "Waiting for operational certificates to be available"
  while [[ ! -f "${OPERATIONAL_CA_CERT_FILE}" || ! -f "${SIPTLS_CA_CERT_FILE}" ]]; do
    sleep 2
  done
  echo "operational certificates found"

  # '[' (not '[[') is used for the ordinal tests on purpose: a non-numeric
  # ordinal makes the test fail instead of being evaluated as 0.
  if [ "${ordinal}" -ne 0 ]; then
    sleep 5
  fi

  while true; do
    # enable authentication very first time that etcd-0 is started
    if [ "${ordinal}" -eq 0 ] && [ ! -f "${ETCD_DATA_DIR}/auth_successful" ]; then
      # errors are handled with explicit '||' fallbacks below
      set +e
      # start etcd locally without any cert validation
      ETCDCTL_ENDPOINTS="localhost:2379"
      unset ETCDCTL_CACERT ETCDCTL_CERT ETCDCTL_KEY
      echo "==> Configuring RBAC authentication!"
      /usr/local/bin/etcd &
      PIDETCD=$!
      # wait until the local etcd answers
      while ! /usr/local/bin/etcdctl member list; do
        sleep 1
      done
      # disable command tracing: the next commands carry ACL_ROOT_PASSWORD
      set +x
      /usr/local/bin/etcdctl --user "root:${ACL_ROOT_PASSWORD}" user get root \
        || /usr/local/bin/etcdctl user add "root:${ACL_ROOT_PASSWORD}"
      /usr/local/bin/etcdctl auth enable \
        || /usr/local/bin/etcdctl --user "root:${ACL_ROOT_PASSWORD}" auth enable
      set -x
      # NOTE(review): the marker is written even if enabling auth failed;
      # confirm whether it should depend on the commands above succeeding.
      touch "${ETCD_DATA_DIR}/auth_successful"
      # there's nothing else to do here
      kill "${PIDETCD}"
      exit 0
    fi

    /usr/local/bin/scripts/switch_ca_cert.sh -c
    chown -R 250422:250422 /data/combinedca/

    # get all members in the cluster (queried once, reused below)
    member_list=$(/usr/local/bin/etcdctl member list)
    member_list_return_code=$?

    # check if the new node is already in the cluster
    my_member_line=$(grep -- "${ETCD_NAME}." <<<"${member_list}")
    member_line_return_code=$?

    # get member_id: first comma-separated field of the matching line
    member_id=${my_member_line%%,*}

    if [ "${member_list_return_code}" -eq 0 ]; then
      # the cluster exists
      # disable command tracing because of the sensitive ACL_ROOT_PASSWORD
      set +x
      if [ "${member_line_return_code}" -ne 0 ]; then
        # new node not yet in cluster: add it (retry with credentials)
        member_add_line=$(/usr/local/bin/etcdctl member add "${ETCD_NAME}" \
            --peer-urls="${ETCD_INITIAL_ADVERTISE_PEER_URLS}" \
          || /usr/local/bin/etcdctl member add --user "root:${ACL_ROOT_PASSWORD}" \
            "${ETCD_NAME}" --peer-urls="${ETCD_INITIAL_ADVERTISE_PEER_URLS}")
        member_add_line_return_code=$?
      elif ! grep -qF -- "${ETCD_INITIAL_ADVERTISE_PEER_URLS}" <<<"${member_list}"; then
        # already in cluster: update the peer url in case it changed
        member_add_line=$(/usr/local/bin/etcdctl member update "${member_id}" \
            --peer-urls="${ETCD_INITIAL_ADVERTISE_PEER_URLS}" \
          || /usr/local/bin/etcdctl member update --user "root:${ACL_ROOT_PASSWORD}" \
            "${member_id}" --peer-urls="${ETCD_INITIAL_ADVERTISE_PEER_URLS}")
        member_add_line_return_code=$?
      else
        # already a member with the expected peer url
        exit 0
      fi
      # enable command tracing again
      set -x

      # done if the member was added/updated correctly, otherwise retry
      if [ "${member_add_line_return_code}" -eq 0 ]; then
        exit 0
      fi
    else
      # the cluster could not be reached
      if [ "${ordinal}" -ne 0 ]; then
        if [ -d "${ETCD_DATA_DIR}/member" ]; then
          # we were a member previously but we cannot reach the cluster,
          # start with whatever config we have
          exit 0
        else
          echo "cluster id is not 0, but there is not cluster and this instance has not been initialized previously"
        fi
      else
        exit 0
      fi
    fi

    sleep 2
  done
defragmentation.sh: |
  #!/bin/bash
  #
  # Runs "etcdctl defrag" against the local endpoint every
  # DEFRAGMENT_PERIODIC_INTERVAL minutes, forever.

  # Without an interval "sleep m" fails and the loop would defragment
  # back-to-back, so refuse to start instead.
  : "${DEFRAGMENT_PERIODIC_INTERVAL:?DEFRAGMENT_PERIODIC_INTERVAL must be set}"

  while true; do
    # ETCDCTL_ENDPOINTS is unset in a child shell so only --endpoints applies;
    # a failed defrag is deliberately ignored
    bash -c 'unset ETCDCTL_ENDPOINTS; /usr/local/bin/etcdctl defrag --endpoints=:2379 --insecure-skip-tls-verify || true'
    sleep "${DEFRAGMENT_PERIODIC_INTERVAL}m"
  done
entrypoint.sh: |
  #!/bin/bash
  #
  # Starts etcd_runner.sh in the background, relays its log lines from a
  # named pipe to stdout and restarts the runner when it is gone. The file
  # ${ETCD_DATA_DIR}/etcd.liveness is set to "dead" after the 12th restart.
  #
  # Environment read here: ETCD_FIFO_DIR, ETCD_DATA_DIR, DEFRAGMENT_ENABLE.

  LOGS=${ETCD_FIFO_DIR}/etcd.fifo

  # this is used by the liveness probe to check if the pod is alive
  echo "alive" > "${ETCD_DATA_DIR}/etcd.liveness"

  # Create the named pipe etcd_runner.sh writes to. A pipe is used to avoid
  # filling disk space with logs.
  if [[ ! -e "${LOGS}" ]] && ! mkfifo "${LOGS}"; then
    echo "Cannot open logging pipe"
    exit 1
  fi

  # run the script in background and keep its pid to check for it later
  nohup /usr/local/bin/scripts/etcd_runner.sh &
  PIDRUNNER=$!

  # start periodic defragmentation and alarm monitoring only when enabled
  if [[ "${DEFRAGMENT_ENABLE}" == "true" ]]; then
    nohup /usr/local/bin/scripts/defragmentation.sh &
    nohup /usr/local/bin/scripts/monitorAlarm.sh &
  fi

  restart=true
  restart_count=0

  # checking for etcd health
  while true; do
    # remember the start time to know the time spent reading the pipe
    start=$(date +"%s")

    # Relay log lines; stops on end-of-file or after 5 seconds without a line.
    # NOTE(review): the pipe is opened read-only, which blocks until a writer
    # opens it; an earlier comment mentioned read-write mode - confirm.
    while IFS= read -r -t 5 line; do
      printf '%s\n' "${line}"
    done < "${LOGS}"

    # calculate time spent reading the pipe
    end=$(date +"%s")
    time_spent=$((end - start))

    # if we spent less than 5 seconds, sleep for the remainder to avoid
    # burning cpu cycles
    if [[ ${time_spent} -le 5 ]]; then
      sleep $((5 - time_spent))
    fi

    # if the runner is gone and restart is true, restore or restart etcd,
    # otherwise mark the pod as dead
    if ! kill -0 "${PIDRUNNER}" >/dev/null 2>&1; then
      if [[ "${restart}" == "true" ]]; then
        backupFile=${ETCD_DATA_DIR}/member/snap/backup.db
        if [[ -f "${backupFile}" ]]; then
          # a backup is waiting: restore it; the runner is restarted on a
          # later pass once the backup file has been removed
          nohup /usr/local/bin/scripts/etcd_restore.sh
        else
          echo "Restarting ETCD"
          nohup /usr/local/bin/scripts/etcd_runner.sh &
          PIDRUNNER=$!
          ((restart_count++))
          if [[ "${restart_count}" == 12 ]]; then
            echo "dead" > "${ETCD_DATA_DIR}/etcd.liveness"
          fi
        fi
      else
        echo "dead" > "${ETCD_DATA_DIR}/etcd.liveness"
      fi
    fi
  done
etcd_restore.sh: |
  #!/bin/bash
  #
  # Restores etcd data from ${ETCD_DATA_DIR}/member/snap/backup.db.
  #   * ordinal 0: runs "etcdctl snapshot restore" into a scratch directory,
  #     stops etcd and copies the restored member directory into place.
  #   * other ordinals: removes the local member directory.
  # The backup file is removed in both cases.
  #
  # Environment read here: ETCD_DATA_DIR, ETCD_NAME, HOSTNAME,
  # ETCD_INITIAL_ADVERTISE_PEER_URLS.

  # Prints a message prefixed with a nanosecond-resolution timestamp.
  log() {
    echo "$(date +'%Y-%m-%dT%H:%M:%S.%N%z'): $*"
  }

  log "restore starts"

  # every path removed below is derived from this variable; refuse to run
  # with an empty value rather than deleting from /
  : "${ETCD_DATA_DIR:?ETCD_DATA_DIR must be set}"

  ordinal=${ETCD_NAME##*-}
  RESTORE_DIR=${ETCD_DATA_DIR}/restore_data
  SNAPSHOT_DIR=${ETCD_DATA_DIR}/member/snap
  BACKUPFILE=backup.db

  if [[ "${ordinal}" -eq 0 ]]; then
    # node 0
    rm -rf -- "${RESTORE_DIR}"
    mkdir -p -- "${SNAPSHOT_DIR}"
    log "Launch command etcdctl snapshot restore"
    /usr/local/bin/etcdctl snapshot restore "${SNAPSHOT_DIR}/${BACKUPFILE}" \
      --name "${HOSTNAME}" \
      --initial-cluster "${HOSTNAME}=${ETCD_INITIAL_ADVERTISE_PEER_URLS}" \
      --initial-advertise-peer-urls "${ETCD_INITIAL_ADVERTISE_PEER_URLS}" \
      --data-dir="${RESTORE_DIR}"
    log "kill etcd"
    pkill --exact etcd
    log "sleep a while so that agent can detect that the service is down"
    sleep 2
    log "copy member directory from backup to /data"
    cp -Rf -- "${RESTORE_DIR}/member" "${ETCD_DATA_DIR}"
    rm -f -- "${SNAPSHOT_DIR}/${BACKUPFILE}"
  else
    # other nodes
    log "Remove /data/member directory"
    rm -Rf -- "${ETCD_DATA_DIR}/member"
    log "/data/member directory removed."
    sleep 3
    # the backup lives below the directory removed above, so it is normally
    # already gone; -f keeps this from reporting an error
    rm -f -- "${SNAPSHOT_DIR}/${BACKUPFILE}"
  fi

  log "restore ends"
etcd_runner.sh: |
  #!/bin/bash -xe
  #
  # Builds the etcd bootstrap environment for this pod and runs etcd in the
  # foreground with its output redirected to ${ETCD_FIFO_DIR}/etcd.fifo.
  #
  # Environment read here: ETCD_NAME, ETCD_INITIAL_ADVERTISE_PEER_URLS,
  # ETCD_FIFO_DIR.

  # get ordinal: whatever follows the last '-' in the member name
  ordinal=${ETCD_NAME##*-}
  # get service name: the member name without the "-<ordinal>" suffix
  base_name=${ETCD_NAME%"-${ordinal}"}

  if [[ "${ordinal}" = "0" ]]; then
    export ETCD_INITIAL_CLUSTER_STATE="new"
  else
    export ETCD_INITIAL_CLUSTER_STATE="existing"
  fi

  # create ETCD_INITIAL_CLUSTER:
  # for peer 0 this is dced-service-0=url-0
  # for peer 1 this is dced-service-0=url-0,dced-service-1=url-1
  # for peer 2 this is dced-service-0=url-0,dced-service-1=url-1,dced-service-2=url-2
  # Each entry is this pod's own "name=url" pair with every occurrence of
  # the pod name replaced by the name of peer <c>.
  peer_template="${ETCD_NAME}=${ETCD_INITIAL_ADVERTISE_PEER_URLS}"
  ETCD_INITIAL_CLUSTER="${peer_template//"${ETCD_NAME}"/${base_name}-0}"
  for (( c = 1; c <= ${ordinal}; c++ )); do
    ETCD_INITIAL_CLUSTER="${ETCD_INITIAL_CLUSTER},${peer_template//"${ETCD_NAME}"/${base_name}-${c}}"
  done
  export ETCD_INITIAL_CLUSTER

  ETCD_PEER_CERT_FILE=/run/sec/certs/peer/srvcert.pem
  ETCD_PEER_KEY_FILE=/run/sec/certs/peer/srvprivkey.pem
  AUTO_TLS=true
  CA_FILE=/data/combinedca/cacertbundle.pem

  # build the combined CA bundle and make sure the certificate watcher runs
  /usr/local/bin/scripts/switch_ca_cert.sh -cs

  echo "Setup peer certs for ${ETCD_NAME} "
  ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380
  export ETCD_LISTEN_PEER_URLS

  # without auto TLS both the peer certificate and its key are needed, so
  # wait until neither of them is missing or empty
  while [[ "${AUTO_TLS}" == "false" ]] \
    && [[ ! -s "${ETCD_PEER_CERT_FILE}" || ! -s "${ETCD_PEER_KEY_FILE}" ]]; do
    echo "Peer certs empty ${ETCD_PEER_CERT_FILE} ${ETCD_PEER_KEY_FILE},sleep 2 seconds. "
    sleep 2
  done

  if [[ "${AUTO_TLS}" == "false" ]]; then
    echo "Auto TLS disabled , setup SIP-TLS certs for peer communication"
    ETCD_PEER_CLIENT_CERT_AUTH=true
    ETCD_PEER_TRUSTED_CA_FILE=${CA_FILE}
    export ETCD_PEER_CLIENT_CERT_AUTH
    export ETCD_PEER_TRUSTED_CA_FILE
    export ETCD_PEER_CERT_FILE
    export ETCD_PEER_KEY_FILE
  else
    echo "Auto TLS enable , using ETCD auto generated certs for peer communication"
  fi

  # redirect etcd output to the named pipe opened by entrypoint.sh so we can
  # continue to stream the etcd log to k8s in real time.
  /usr/local/bin/etcd > "${ETCD_FIFO_DIR}/etcd.fifo"
handleAlarm.sh: |
  #!/bin/bash
  #
  # Invoked by monitorAlarm.sh when a NOSPACE alarm is listed: compacts,
  # defragments the local endpoint, waits DISARM_ALARM_PEER_INTERVAL minutes
  # and then disarms the alarm.

  # runs compact command once the nospace alarm raised
  echo "Running Compaction"
  # "endpoint status" prints one revision per endpoint while "compact" takes
  # exactly one, so only the first revision found is used
  revision=$(/usr/local/bin/etcdctl endpoint status --write-out="json" \
    | grep -Eo '"revision":[0-9]+' \
    | grep -Eo '[0-9]+' \
    | head -n 1)
  if [[ -n "${revision}" ]]; then
    /usr/local/bin/etcdctl compact "${revision}"
  else
    echo "Could not determine the current revision, skipping compaction" >&2
  fi

  # runs defragmentation command
  echo "Running Defragmentation"
  bash -c 'unset ETCDCTL_ENDPOINTS; /usr/local/bin/etcdctl defrag --endpoints=:2379 --insecure-skip-tls-verify'

  echo "Endpoint after compaction and defragmentation -"
  /usr/local/bin/etcdctl --write-out=table endpoint status --endpoints=:2379 --insecure-skip-tls-verify

  # wait for delay mins for all pod defragging
  echo "Delay Alarm Removal for (minutes):+${DISARM_ALARM_PEER_INTERVAL}"
  sleep "${DISARM_ALARM_PEER_INTERVAL}m"

  # remove nospace alarm
  /usr/local/bin/etcdctl alarm disarm
liveness.sh: |
  #!/bin/bash -xe
  #
  # Liveness check: exits 0 when the control file contains "alive"
  # (case-insensitive), 1 otherwise. The control file is
  # ${ETCD_DATA_DIR}/etcd.liveness, or ~/etcd.liveness when that is missing.

  liveness_file="${ETCD_DATA_DIR}/etcd.liveness"
  if [ ! -e "${liveness_file}" ]; then
    liveness_file=~/etcd.liveness
  fi

  # grep the control file for pod status
  if grep -qi 'alive' "${liveness_file}"; then
    exit 0
  fi
  exit 1
monitorAlarm.sh: |
  #!/bin/bash
  #
  # Lists etcd alarms every MONITOR_ALARM_INTERVAL minutes and runs
  # handleAlarm.sh (in the foreground) whenever a NOSPACE alarm is listed.

  # Without an interval "sleep m" fails and the loop would poll etcd
  # back-to-back, so refuse to start instead.
  : "${MONITOR_ALARM_INTERVAL:?MONITOR_ALARM_INTERVAL must be set}"

  while true; do
    alarm=$(/usr/local/bin/etcdctl alarm list | grep 'NOSPACE')
    if [[ -n "${alarm}" ]]; then
      echo "alarm detected:+${alarm}"
      echo "Endpoint status before compaction and defragmentation -"
      /usr/local/bin/etcdctl --write-out=table endpoint status --endpoints=:2379 --insecure-skip-tls-verify
      nohup /usr/local/bin/scripts/handleAlarm.sh
    fi
    sleep "${MONITOR_ALARM_INTERVAL}m"
  done
restart.sh: |
  #!/bin/bash
  #
  # Kills etcd so that it is started again with reloaded certificates.
  # Nothing is killed unless both CA bundle files exist.

  echo "$(date +%Y-%m-%dT%T.%N) Restarting ETCD while reloading certificates"
  OPERATIONAL_CA_CERT_FILE=/run/sec/cas/clientca/client-cacertbundle.pem
  SIPTLS_CA_CERT_FILE=/run/sec/cas/siptlsca/cacertbundle.pem

  if [[ -f "${OPERATIONAL_CA_CERT_FILE}" && -f "${SIPTLS_CA_CERT_FILE}" ]]; then
    echo "$(date +%Y-%m-%dT%T.%N) Killing ETCD process"
    # NOTE(review): without --exact the pattern also matches other process
    # names containing "etcd" (etcd_restore.sh uses --exact) - confirm that
    # this is intended.
    pkill etcd &
  fi
switch_ca_cert.sh: |
  #!/bin/bash -xe
  #
  # Concatenates every CA bundle below /run/sec/cas into
  # /data/combinedca/cacertbundle.pem.
  #
  # options
  # -c exit with an error if the combined CA file does not exist afterwards
  # -s start watch_cert.sh (unless one is already running) so that etcd is
  #    restarted when the CA certificates change
  # Any other option is rejected by getopts and otherwise ignored.

  OPERATIONAL_CA_CERT_FILE=/run/sec/cas/clientca/client-cacertbundle.pem
  SIPTLS_CA_CERT_FILE=/run/sec/cas/siptlsca/cacertbundle.pem
  CA_FILE=/data/combinedca/cacertbundle.pem

  while getopts cs option; do
    case "${option}" in
      c) CHECK=true ;;
      s) SUICIDE=true ;;
    esac
  done

  mkdir -p /data/combinedca/

  if [[ -v SUICIDE ]]; then
    # etcd automatically reloads the server and client certs
    # but it does not reload the CA
    # we need to restart etcd to make this happen
    # if an existing watch_cert process is detected, skip this step.
    if ! pgrep -f watch_cert >/dev/null; then
      nohup /usr/local/bin/scripts/watch_cert.sh /run/sec/cas/*ca &
    fi
  fi

  # put all CAs in one file, ensure CA is not empty
  rm -f -- "${CA_FILE}"
  while [[ ! -s "${CA_FILE}" ]]; do
    # "awk 1" prints every input line followed by a newline, so bundles that
    # lack a trailing newline are still joined correctly.
    # NOTE(review): with -e from the shebang a failing awk (for example when
    # the glob matches nothing) ends the script here instead of retrying -
    # confirm that this is intended.
    awk 1 /run/sec/cas/*/*cacertbundle.pem > "${CA_FILE}"
    if [[ ! -s "${CA_FILE}" ]]; then
      sleep 2
    fi
  done

  # exit if ca file does not exist (if -c is given as argument)
  # NOTE(review): the loop above only ends once the file is non-empty, so
  # this looks unreachable - confirm.
  if [[ -v CHECK && ! -f "${CA_FILE}" ]]; then
    echo "No content in ${CA_FILE}"
    exit 1
  fi
watch_cert.sh: |
  #!/bin/bash
  #
  # Usage: watch_cert.sh PATH...
  # Polls the modification time of every PATH every 5 seconds and runs
  # restart.sh in the background whenever any of them changed.

  if [[ $# -eq 0 ]]; then
    echo "usage: watch_cert.sh PATH..." >&2
    exit 1
  fi

  # Prints the modification time (epoch seconds) of each given path, one per
  # line; paths that cannot be read are skipped.
  mtimes() {
    stat -c %Y -- "$@" 2>/dev/null
  }

  # start from the current state so that only later changes are reported
  last_modified=$(mtimes "$@")

  while true; do
    current_modified=$(mtimes "$@")
    if [[ "${current_modified}" != "${last_modified}" ]]; then
      last_modified=${current_modified}
      echo "$(date +%Y-%m-%dT%T.%N) [ $* ] Caught Certificate renewed event."
      bash /usr/local/bin/scripts/restart.sh &
    fi
    sleep 5
  done