Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP

Comparing changes

Choose two branches to see what's changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
base fork: ebrandi/route53-failover
base: af7a68289c
...
head fork: ebrandi/route53-failover
compare: c250d24d7a
  • 2 commits
  • 2 files changed
  • 0 commit comments
  • 2 contributors
Commits on Dec 04, 2012
@raineralves raineralves Add multi site probing and mail notifications
+ Include support for multi site probing (see README for more info)
+ Include support for mail notifications
6375abe
@ebrandi Merge pull request #3 from raineralves/newfeatures
Add multi site probing and mail notifications
c250d24
Showing with 446 additions and 95 deletions.
  1. +37 −1 README.md
  2. +409 −94 route53-failover.sh
View
38 README.md
@@ -3,4 +3,40 @@ route53-failover
Shell scripts to implement server failover using Amazon Route53, for more information visit:
-http://blog.ebrandi.eti.br/2012/11/como-implementar-um-traffic-manager-com-funcionalidade-de-failover-baseado-no-amazon-route53/
+http://blog.ebrandi.eti.br/2012/11/como-implementar-um-traffic-manager-com-funcionalidade-de-failover-baseado-no-amazon-route53/
+
+
+**********************
+Additional information
+**********************
+
+I) Multi-site probing
+
+This script supports two different probing methods: single site and multi site.
+You should use the multi site option on mission critical DNS zones related to a URL where
+maximum uptime and availability are absolutely required.
+
+In order to use the multi site method you must choose 3 different servers:
+one will be the master node, responsible for doing webserver probes and updating the
+Route53 API whenever a host goes down.
+The other two nodes will act as slave nodes: they'll check your hosts and
+send their result back to the master node.
+
+The master node will update your DNS zone if (and only if) any given host
+is reported "down" or "up" from at least 2 different locations (out of 3).
+
+How to configure: set "multisiteprobe" to "1" on all three locations,
+set "probeonly" to "0" on the master node and on the slave nodes set
+"probeonly" to "1".
+
+On the master node, set a friendly name for each slave node using "remoteprobe[1-2]".
+For instance, give each slave node the name of the hosting company where that
+server is located.
+
+Then set "remoteprobefile[1-2]" to point to the "proberesult" file on each slave node.
+You may use an SCP URL such as scp://username@my.server.com:/home/route53-failover/probe/proberesult
+An HTTP URL or a UNIX path is also accepted (a UNIX path is useful for NFS exports between servers).
+
+Please note that each "proberesult" file has a timestamp and the master node
+requires that these files have been generated less than 5 minutes in the past.
+
View
503 route53-failover.sh
@@ -41,14 +41,55 @@ test_file=status
test_string="Error 200 OK"
connect_timeout=2
retries=3
+mailnotification=0 # [1]=on / [0]=off
+mailfrom=noreply@yourdomain.com
+mailto=noc@yourdomain.com,admin@anotherdomain.com
+mailsmtp=smtp.yourdomain.com
+multisiteprobe=0 # [1]=on / [0]=off
+probeonly=0
+remoteprobe[1]=isp1
+remoteprobefile[1]=scp://route53@server1.yourdomain.com:/usr/local/route53-failover/probe/proberesult
+remoteprobe[2]=isp2
+remoteprobefile[2]=http://server2.yourdomain.com/probe/proberesult
###############################################################
# You should not need to change anything bellow this point #
###############################################################
+# Create lockfile and avoid more than one script execution ('lockfile(1)' is used to avoid race conditions)
+if ! lockfile -r 0 $lockfile; then
+ echo "Error: script already running, exiting..."
+ exit 1
+fi
+
+# Remove lockfile if some other error causes the script to exit
+trap "{ rm -f "$lockfile"; exit $?; }" HUP INT TERM EXIT
+
+# Test if this script has write permission on $script_path
+if [ -z $script_path ]; then
+ echo "Error: Please set the \$script_path variable"
+ exit 1
+else
+ if [ ! -w $script_path ]; then
+ echo "Error: I don't have write permission on $script_path, please fix and try again"
+ exit 1
+ fi
+fi
+
+# Make sure some tools are installed correctly
+for i in dig awk curl diff openssl lockfile xmllint; do
+ which $i >/dev/null 2>&1 || { echo "Error: Please install $i before proceeding" && exit 1; }
+done
+
+if [[ $mailnotification -eq 1 ]] && [[ $probeonly -ne 1 ]]; then
+ which nail >/dev/null 2>&1 || { echo "Error: Please install \"nail\" before proceeding (usually found in the \"mailx\" package)" && exit 1; }
+fi
+
# Enable some bash traps in order to avoid problems
-set -o nounset # avoid breaking everything in case of an uninitialised variable
-set -o pipefail # always set exit code to 1 when a piped subcommand fails
+set -o nounset # avoid breaking everything in case of an uninitialised variable
+set -o pipefail # always set exit code to 1 when a piped subcommand fails
+
+mkdir $script_path/log/ >/dev/null 2>&1
# Our logging function requires GNU awk [`gawk(1)'].
# The code bellow tries to find a suitable binary on non-Linux platforms.
@@ -69,44 +110,210 @@ fi
log() {
echo "$2" | $gawk '{ print "[" strftime("%Y-%m-%d %H:%M:%S") "]" "\t" $0; }' | tee -a $logfile
if [[ $1 == "error" ]]; then
+ mailNotification
echo && exit 1
fi
}
-# Make sure some tools are installed correctly
-for i in dig awk curl diff openssl lockfile xmllint; do
- which $i >/dev/null 2>&1 || log error "Error: Please install $i before proceeding"
+# Initialize the mail notification variables
+for i in $(seq 1 8); do
+ mailNotificationStatus[i]=0
done
+# Mail notification function
+
+mailNotification() {
+
+# During script execution we set special status codes (described below)
+# depending on which problems were encountered.
+# These status codes are later processed to decide which notifications should be sent.
+#
+# mailNotificationStatus[1] = DNS resolution problems
+# mailNotificationStatus[2] = Failed to generate AWS signature or validate AWS credentials
+# mailNotificationStatus[3] = Failed to submit AWS zoneset update
+# mailNotificationStatus[4] = Host down
+# mailNotificationStatus[5] = Host up
+# mailNotificationStatus[6] = Failover activated (problems)
+# mailNotificationStatus[7] = Failover disabled (back to normal)
+# mailNotificationStatus[8] = All hosts down, failover also down
+#
+# For statuses 4 and 5 (hosts down/up) we also store the number of affected hosts.
+#
+# For example:
+#
+# mailNotificationStatus[4]=2 means we have 2 hosts down.
+# The IP address for the affected hosts will be stored on mailNotificationStatus[4*10+1] and mailNotificationStatus[4*10+2]
+# The reason for each problem will be stored on mailNotificationStatus[4*100+1] and mailNotificationStatus[4*100+2]
+# The string test for each host will be stored on mailNotificationStatus[4*1000+1] and mailNotificationStatus[4*1000+2]
+#
+# mailNotificationStatus[5]=3 means 3 hosts are back online (up)
+# The IP address for each host back online will be stored on mailNotificationStatus[5*10+1] until mailNotificationStatus[5*10+3]
+#
+
+ if [[ $mailnotification -eq 1 ]] && [[ $probeonly -ne 1 ]]; then
+
+ rm -f "$script_path/mail.out" >/dev/null 2>&1
+
+ if [[ -z $mailto ]]; then
+ echo "Error: Please provide a comma separated list of email recipients (\$mailto)"
+ exit 1
+ fi
+ if [[ -z $mailsmtp ]]; then
+ echo "Error: Please provide a valid SMTP server (\$mailsmtp)"
+ exit 1
+ fi
+
+ serviceProblems=0
+ serviceOK=0
+
+ for i in 4 6; do
+ if [[ ${mailNotificationStatus[i]} -ne 0 ]] && [[ $probeonly -ne 1 ]]; then
+ serviceProblems=1
+ fi
+ done
+
+ for i in 5 7; do
+ if [[ ${mailNotificationStatus[i]} -ne 0 ]] && [[ $probeonly -ne 1 ]]; then
+ serviceOK=1
+ fi
+ done
+
+ if [[ $serviceProblems -eq 1 ]]; then
+
+ mailsubject="[route53-failover] $Hostname.$Domain service trouble"
+
+ echo "There has been a change in your monitored service at $Hostname.$Domain" > $script_path/mail.out
+ echo "Current status: trouble" >> $script_path/mail.out
+ echo >> $script_path/mail.out
+ if [[ ${mailNotificationStatus[6]} -ne 0 ]]; then
+ echo "All hosts are DOWN, switching to failover host" >> $script_path/mail.out
+ echo "==> $fail_host" >> $script_path/mail.out
+ echo >> $script_path/mail.out
+ fi
+ if [[ ${mailNotificationStatus[4]} -ne 0 ]]; then
+ for i in $(seq 1 ${mailNotificationStatus[4]}); do
+ printf "%s reported down; " "${mailNotificationStatus[4*10+i]}" >> $script_path/mail.out
+ case "${mailNotificationStatus[4*100+i]}" in
+ 000) printf "connection error\n" >> $script_path/mail.out
+ ;;
+ 200) if [[ ${mailNotificationStatus[4*1000+i]} -eq 0 ]]; then
+ printf "test string not found\n" >> $script_path/mail.out
+ else
+ printf "connection error\n" >> $script_path/mail.out
+ fi
+ ;;
+ 400) printf "bad request (error 400)\n" >> $script_path/mail.out
+ ;;
+ 403) printf "access forbidden (error 403)\n" >> $script_path/mail.out
+ ;;
+ 404) printf "page not found (error 404)\n" >> $script_path/mail.out
+ ;;
+ 503) printf "internal server error (503)\n" >> $script_path/mail.out
+ ;;
+ *) printf "connection error\n" >> $script_path/mail.out
+ ;;
+ esac
+ done
+ if [[ ${mailNotificationStatus[5]} -ne 0 ]]; then
+ for i in $(seq 1 ${mailNotificationStatus[5]}); do
+ printf "%s reported up" "${mailNotificationStatus[5*10+i]}" >> $script_path/mail.out
+ done
+ fi
+ echo >> $script_path/mail.out
+ echo "*** Current production hosts (up):" >> $script_path/mail.out
+ if [[ -n $NewRecordSorted ]]; then
+ for i in $NewRecordSorted; do
+ echo "$i" >> $script_path/mail.out
+ done
+ else
+ echo "none" >> $script_path/mail.out
+ fi
+ echo >> $script_path/mail.out
+ echo "*** Current impacted hosts (down):" >> $script_path/mail.out
+ HostsDownList="$(echo $HostsDownList | sed -e 's;^,;;' | tr ',' '\n')"
+ if [[ -n $HostsDownList ]]; then
+ echo "$HostsDownList" >> $script_path/mail.out
+ else
+ echo "none" >> $script_path/mail.out
+ fi
+ fi
+ fi
+
+ if [[ $serviceOK -eq 1 ]] && [[ $serviceProblems -eq 0 ]]; then
+
+ mailsubject="[route53-failover] $Hostname.$Domain service ok"
+
+ echo "There has been a change in your monitored service at $Hostname.$Domain" > $script_path/mail.out
+ echo "Current status: ok" >> $script_path/mail.out
+ echo >> $script_path/mail.out
+ if [[ ${mailNotificationStatus[7]} -ne 0 ]]; then
+ echo "Disabling failover state, returning to normal operation" >> $script_path/mail.out
+ echo >> $script_path/mail.out
+ fi
+ if [[ ${mailNotificationStatus[5]} -ne 0 ]]; then
+ for i in $(seq 1 ${mailNotificationStatus[5]}); do
+ printf "%s reported up\n" "${mailNotificationStatus[5*10+i]}" >> $script_path/mail.out
+ done
+ echo >> $script_path/mail.out
+ echo "*** Current production hosts (up):" >> $script_path/mail.out
+ if [[ -n $NewRecordSorted ]]; then
+ for i in $NewRecordSorted; do
+ echo "$i" >> $script_path/mail.out
+ done
+ else
+ echo "none" >> $script_path/mail.out
+ fi
+ echo >> $script_path/mail.out
+ echo "*** Current impacted hosts (down):" >> $script_path/mail.out
+ HostsDownList="$(echo $HostsDownList | sed -e 's;^,;;' | tr ',' '\n')"
+ if [[ -n $HostsDownList ]]; then
+ echo "$HostsDownList" >> $script_path/mail.out
+ else
+ echo "none" >> $script_path/mail.out
+ fi
+ fi
+ fi
+
+ if [[ -s $script_path/mail.out ]] && [[ -n $mailsubject ]]; then
+
+ if [[ -n "$AWSResult" ]] && [[ -n "$AWSChangeset" ]]; then
+ echo >> $script_path/mail.out
+ echo "*** Route53 API Output:" >> $script_path/mail.out
+ echo >> $script_path/mail.out
+ echo "$AWSResult" >> $script_path/mail.out
+ echo >> $script_path/mail.out
+ echo "*** Changeset submited:" >> $script_path/mail.out
+ echo >> $script_path/mail.out
+ echo "$AWSChangeset" >> $script_path/mail.out
+ echo >> $script_path/mail.out
+ fi
+
+ nail -r "Route53 Failover <${mailfrom}>" -s "$mailsubject" -S smtp="$mailsmtp" "$mailto" < $script_path/mail.out
+
+ if [[ $? -eq 0 ]]; then
+ log info "Mail sent to $mailto"
+ else
+ log info "Error while sending mail to $mailto"
+ fi
+
+ rm -f "$script_path/mail.out" >/dev/null 2>&1
+ fi
+ fi
+
+}
+
# Enforce script permission to 700 for improved security (avoid leaking AWS credentials)
if [[ -z "$(find $script_path/$(basename $0) -perm 700)" ]]; then
log error "Error: This script should NOT be accessible to other users since it contains sensitive information, please chmod it to 700"
fi
-# Test if this script has write permission on $script_path
-if [ -z $script_path ]; then
- log error "Error: Please set the \$script_path variable"
-else
- if [ ! -w $script_path ]; then
- log error "Error: I don't have write permission on $script_path, please fix and try again"
- fi
+# The "ips.master" file cannot be empty
+if [ ! -s $script_path/ips.master ]; then
+ log error "Please create the \"ips.master\" file according to the documentation. Aborting..."
fi
-# Create lockfile and avoid more than one script execution ('lockfile(1)' is used to avoid race conditions)
-if ! lockfile -r 0 $lockfile; then
- log error "Error: script already running, exiting..."
-fi
-
-# Remove lockfile if some other error causes the script to exit
-trap 'rm -f "$lockfile"; exit $?' INT TERM EXIT
-
-# Initialize the mail notification variables
-for i in $(seq 1 8); do
- mailNotificationStatus[i]=0
-done
-
# Set variables with DNS Record Values to create DELETE API request
-AuthServer=$(dig NS $Domain | awk "/^$Domain/ { print \$5 }" | head -1) || (mailNotificationStatus[1]=1 && log error "Error retrieving domain info, check dns resolution")
+AuthServer=$(dig NS $Domain | awk "/^$Domain/ { print \$5 }" | head -1) || { mailNotificationStatus[1]=1 && log error "Error retrieving domain info, check dns resolution"; }
# Test DNS resolution and check if our domain is actually hosted on Route53
if [ -z $AuthServer ]; then
@@ -122,8 +329,8 @@ awssignature() {
if [ -z $AWSZoneID ] || [ -z $AWSAccesskeyID ] || [ -z $AWSSecretAPIKey ]; then
log error "Error: Please provide a valid set of AWS credentials"
else
- AWSCurrentDate="$(curl -sS -I --connect-timeout $connect_timeout --retry $retries --retry-delay 5 --stderr /dev/null https://route53.amazonaws.com/date | grep Date | sed 's/.*Date: //' | tr -d '\r')" || (mailNotificationStatus[2]=1 && log error "Error retrieving current date from AWS")
- AWSSignature=$(printf "$AWSCurrentDate" | openssl dgst -binary -sha256 -hmac $AWSSecretAPIKey | openssl enc -base64) || (mailNotificationStatus[2]=1 && log error "Error generating AWS signature")
+ AWSCurrentDate="$(curl -sS -I --connect-timeout $connect_timeout --retry $retries --retry-delay 5 --stderr /dev/null https://route53.amazonaws.com/date | grep Date | sed 's/.*Date: //' | tr -d '\r')" || { mailNotificationStatus[2]=1 && log error "Error retrieving current date from AWS"; }
+ AWSSignature=$(printf "$AWSCurrentDate" | openssl dgst -binary -sha256 -hmac $AWSSecretAPIKey | openssl enc -base64) || { mailNotificationStatus[2]=1 && log error "Error generating AWS signature"; }
AWSDateHeader="Date: $AWSCurrentDate"
AWSAuthHeader="X-Amzn-Authorization: AWS3-HTTPS AWSAccessKeyId=$AWSAccesskeyID,Algorithm=HmacSHA256,Signature=$AWSSignature"
fi
@@ -184,7 +391,7 @@ submitroute53() {
touch $script_path/awslastvalidation || log error "Error manipulating temporary files"
-if [[ ! "$(date +"%H")" = "$(cat $script_path/awslastvalidation)" ]]; then
+if [[ ! "$(date +"%H")" = "$(cat $script_path/awslastvalidation)" ]] && [[ $probeonly -ne 1 ]]; then
awssignature # call our signature generation function
AWSResult=$(curl -sS -w ";;%{http_code}" --connect-timeout $connect_timeout --retry $retries --retry-delay 5 -H "$AWSDateHeader" -H "$AWSAuthHeader" -H "Content-Type: text/xml; charset=UTF-8" https://route53.amazonaws.com/2012-02-29/hostedzone?marker=$AWSZoneID)
@@ -214,9 +421,9 @@ if [[ ! "$(date +"%H")" = "$(cat $script_path/awslastvalidation)" ]]; then
esac
fi
-OldType=$(dig @$AuthServer A $Hostname.$Domain | awk "/^$Hostname.$Domain/ { print \$4 }" | head -1) || (mailNotificationStatus[1]=1 && log error "Error while running dig")
-OldTTL=$(dig @$AuthServer A $Hostname.$Domain | awk "/^$Hostname.$Domain/ { print \$2 }" | head -1) || (mailNotificationStatus[1]=1 && log error "Error while running dig")
-OldRecord=$(dig @$AuthServer A $Hostname.$Domain | awk "/^$Hostname.$Domain/ { print \$5 }" | sed s/\ //g) || (mailNotificationStatus[1]=1 && log error "Error while running dig")
+OldType=$(dig @$AuthServer A $Hostname.$Domain | awk "/^$Hostname.$Domain/ { print \$4 }" | head -1) || { mailNotificationStatus[1]=1 && log error "Error while running dig"; }
+OldTTL=$(dig @$AuthServer A $Hostname.$Domain | awk "/^$Hostname.$Domain/ { print \$2 }" | head -1) || { mailNotificationStatus[1]=1 && log error "Error while running dig"; }
+OldRecord=$(dig @$AuthServer A $Hostname.$Domain | awk "/^$Hostname.$Domain/ { print \$5 }" | sed s/\ //g) || { mailNotificationStatus[1]=1 && log error "Error while running dig"; }
# Create temporary files needed by this script
@@ -225,12 +432,111 @@ touch $script_path/ips.tmp || log error "Error manipulating temporary files"
mv -f $script_path/ips.tmp $script_path/ips.tmp.old || log error "Error manipulating temporary files"
touch $script_path/ips.tmp || log error "Error manipulating temporary files"
mkdir $script_path/probe/ >/dev/null 2>&1
-touch $script_path/probe/proberesult.old || log error "Error manipulating temporary files"
-touch $script_path/probe/proberesult || log error "Error manipulating temporary files"
-mv -f $script_path/probe/proberesult $script_path/probe/proberesult.old || log error "Error manipulating temporary files"
-touch $script_path/probe/proberesult || log error "Error manipulating temporary files"
touch $script_path/awsresult || log error "Error manipulating temporary files"
+# Fetch remote probe file if $multisiteprobe is enabled
+
+if [[ $multisiteprobe -eq 1 ]] && [[ $probeonly -ne 1 ]]; then
+
+ for i in 1 2; do
+
+ if [[ -z ${remoteprobe[i]} ]] || [[ -z ${remoteprobefile[i]} ]]; then
+ log error "Error: You have enabled multisite probe/health check therefore you must provide two valid probe names (\$remoteprobe) and two result URLs (\$remoteprobefile)"
+ fi
+
+ touch "$script_path/probe/proberesult.${remoteprobe[i]}"
+ mv -f "$script_path/probe/proberesult.${remoteprobe[i]}" "$script_path/probe/proberesult.${remoteprobe[i]}.old"
+ touch "$script_path/probe/proberesult.${remoteprobe[i]}"
+
+ remoteprobefiletype[i]=${remoteprobefile[i]:0:1} # unix file, scp or http? let's see the first letter
+
+ case "${remoteprobefiletype[i]}" in
+ /) if [[ -s "${remoteprobefile[i]}" ]]; then
+ cp -f "${remoteprobefile[i]}" "$script_path/probe/proberesult.${remoteprobe[i]}" >/dev/null 2>&1
+ else
+ log error "Error: ${remoteprobefile[i]} not found"
+ fi
+ ;;
+ s) scpusername=$(echo ${remoteprobefile[i]} | sed -e "s;scp://\(.*\)\@.*:/.*;\1;") # scp://username@myserver.com:/path/to/file
+ scphost=$(echo ${remoteprobefile[i]} | sed -e "s;scp://.*\@\(.*\):/.*;\1;")
+ scpfile=$(echo ${remoteprobefile[i]} | sed -e "s;scp://.*\@.*:\(.*\);\1;")
+ if [[ -n $scpusername ]] && [[ -n $scphost ]] && [[ -n $scpfile ]]; then
+ scp -q -B $scpusername@$scphost:$scpfile "$script_path/probe/proberesult.${remoteprobe[i]}" >/dev/null 2>&1
+ if [[ $? -ne 0 ]]; then
+ log error "Error during SCP copy. Please test the connection manually first and also make sure you set the variable like this: \$remoteprobefile[N]=scp://username@myserver.com:/path/to/file"
+ fi
+ else
+ log error "Error: If you want to use SCP please set \$remoteprobefile[N]=scp://username@myserver.com:/path/to/file"
+ fi
+ ;;
+ h) curl -sS --connect-timeout $connect_timeout --retry $retries --retry-delay 5 -o "$script_path/probe/proberesult.${remoteprobe[i]}" ${remoteprobefile[i]} >/dev/null 2>&1
+ if [[ $? -ne 0 ]]; then
+ log error "Error fetching HTTP file: ${remoteprobefile[i]}"
+ fi
+ ;;
+ *) log error "Please use a valid format for \$remoteprobefile: unix file, scp or http"
+ ;;
+ esac
+
+ if [[ -s "$script_path/probe/proberesult.${remoteprobe[i]}" ]]; then
+ echo "ok" > "$script_path/probe/fetchstatus.remote.${remoteprobe[i]}"
+ remoteprobetimestamp[i]=$(awk "/Timestamp/ { print \$3 }" < "$script_path/probe/proberesult.${remoteprobe[i]}")
+ if [[ -n ${remoteprobetimestamp[i]} ]]; then
+ timestampdelta[i]=$(($(date +%s)-${remoteprobetimestamp[i]}))
+ else
+ log error "Error: no timestamp found on remote probe file"
+ fi
+ if [[ ${timestampdelta[i]} -gt 30000 ]]; then
+ log error "Error: remote timestamp difference is greater than 5 minutes for ${remoteprobe[i]}, make sure the script is running on the remote host and also double check the clock/NTP settings"
+ fi
+ else
+ echo "error" > "$script_path/probe/fetchstatus.remote.${remoteprobe[i]}"
+ log error "Error fetching remote proberesult (${remoteprobefile[i]})"
+ fi
+ done
+fi
+
+# Compare probe results
+# If multisite is enabled, compare all three results
+# If multisite is disabled, no comparison is done
+
+multiSiteCompare () {
+
+ webserverip="$1"
+ webserverstatus="$2"
+
+ if [[ $multisiteprobe -eq 1 ]] && [[ $probeonly -ne 1 ]]; then
+ multisite_up_webserver=0
+ if [[ $webserverstatus == "up" ]]; then
+ multisite_up_webserver=$(( $multisite_up_webserver + 1 ))
+ fi
+ for z in 1 2; do
+ if [[ ! -s "$script_path/probe/proberesult.${remoteprobe[z]}" ]]; then
+ log error "Error reading proberesult.${remoteprobe[z]}"
+ fi
+ remoteprobe[z*10]=$(grep "^[0-9]" "$script_path/probe/proberesult.${remoteprobe[z]}" | grep "^$webserverip:200:1$" | uniq | wc -l | cut -d ' ' -f 8)
+ if [[ ${remoteprobe[z*10]} -ne 1 ]]; then # 1=up / 0=down
+ remoteprobe[z*100]=$(grep "^[0-9]" "$script_path/probe/proberesult.${remoteprobe[z]}" | grep "^$webserverip" | awk -F \: "{ print \$2 }")
+ log info "${remoteprobe[z]} is reporting $webserverip as DOWN"
+ else
+ multisite_up_webserver=$(( $multisite_up_webserver + 1 ))
+ fi
+ done
+ if [[ $multisite_up_webserver -ge 2 ]]; then
+ return 1 #up
+ elif [[ $multisite_up_webserver -lt 2 ]]; then
+ return 0 #down
+ fi
+ else
+ if [[ $webserverstatus == "up" ]]; then
+ return 1 #up
+ elif [[ $webserverstatus == "down" ]]; then
+ return 0 #down
+ fi
+ fi
+}
+
+
# Connect to webserver and search for a specific string to
# check if webserver are up and running for each address
# listed in ips.master file. Than print multiple lines
@@ -239,32 +545,63 @@ touch $script_path/awsresult || log error "Error manipulating temporary files"
HostsUpAmount=0
HostsDownAmount=0
HostsDisabledAmount=$(cat $script_path/ips.master | egrep "^#.*[0-9]" | wc -l | cut -d ' ' -f 8)
-
-for i in $(cat $script_path/ips.master | grep -v "#")
-do
- ip=$(echo $i | awk -F":" '{print $2}')
- webserverprobe=$(curl -sS -w ";;%{http_code}" --connect-timeout $connect_timeout --retry $retries --retry-delay 5 http://$ip/$test_file 2>&1)
- webserverprobeHTTPcode=$(echo $webserverprobe | awk -F ";;" "{ print \$2 }")
- webserverprobecondition=$(echo "$webserverprobe" | grep "$test_string" | wc -l | cut -d ' ' -f 8)
- echo "# Timestamp: $(date +%s) [$(date)]" > $script_path/probe/proberesult
- echo "# Format: <Webserver IP>:<Returned HTTP Code>:<String Found>" >> $script_path/probe/proberesult
- echo "# HTTP Code \"000\" means timeout or connection refused" >> $script_path/probe/proberesult
- echo "$ip:$webserverprobeHTTPcode:$webserverprobecondition" >> $script_path/probe/proberesult
- if [ "$webserverprobecondition" -eq "1" ]
- then
- HostsUpAmount=$(( $HostsUpAmount + 1 ))
- peso=$(echo $i | awk -F":" '{print $1}')
- counter=1
- while [ $counter -le $peso ]
- do
- echo $ip >> $script_path/ips.tmp
- counter=$(( $counter + 1 ))
- done
- else
- HostsDownAmount=$(( $HostsDownAmount + 1 ))
- fi
+HostsDownList=""
+
+echo "# Timestamp: $(date +%s) [$(date)]" > $script_path/probe/proberesult.tmp
+echo "# Format: <Webserver IP>:<Returned HTTP Code>:<String Found>" >> $script_path/probe/proberesult.tmp
+echo "# HTTP Code \"000\" means timeout or connection refused" >> $script_path/probe/proberesult.tmp
+
+for i in $(cat $script_path/ips.master | grep -v "#"); do
+ ip=$(echo $i | awk -F":" '{print $2}')
+ webserverprobe=$(curl -sS -w ";;%{http_code}" --connect-timeout $connect_timeout --retry $retries --retry-delay 5 http://$ip/$test_file 2>&1)
+ webserverprobeHTTPcode=$(echo $webserverprobe | awk -F ";;" "{ print \$2 }")
+ webserverprobecondition=$(echo "$webserverprobe" | grep "$test_string" | wc -l | cut -d ' ' -f 8)
+ echo "$ip:$webserverprobeHTTPcode:$webserverprobecondition" >> $script_path/probe/proberesult.tmp
+ if [ "$webserverprobecondition" -eq "1" ]; then
+ multiSiteCompare "$ip" up
+ if [[ $? -eq 1 ]]; then
+ HostsUpAmount=$(( $HostsUpAmount + 1 ))
+ peso=$(echo $i | awk -F":" '{print $1}')
+ counter=1
+ while [ $counter -le $peso ]; do
+ echo $ip >> $script_path/ips.tmp
+ counter=$(( $counter + 1 ))
+ done
+ else
+ log info "Host $ip is UP for me, but is DOWN from both other locations"
+ HostsDownAmount=$(( $HostsDownAmount + 1 ))
+ HostsDownList=$HostsDownList",$ip"
+ fi
+ else
+ multiSiteCompare "$ip" down
+ if [[ $? -eq 1 ]]; then
+ log info "Host $ip is DOWN for me, but is UP from both other locations"
+ HostsUpAmount=$(( $HostsUpAmount + 1 ))
+ peso=$(echo $i | awk -F":" '{print $1}')
+ counter=1
+ while [ $counter -le $peso ]; do
+ echo $ip >> $script_path/ips.tmp
+ counter=$(( $counter + 1 ))
+ done
+ else
+ HostsDownAmount=$(( $HostsDownAmount + 1 ))
+ HostsDownList=$HostsDownList",$ip"
+ log info "Host $ip is DOWN"
+ fi
+ fi
done
+# Move tmp file to its correct location (avoid race condition)
+mv -f $script_path/probe/proberesult.tmp $script_path/probe/proberesult
+
+# Exit if we detect this is a probe-only host. Probing has finished, nothing else needs to be done.
+if [[ $probeonly -eq 1 ]]; then
+ log info "Probe result: [up:$HostsUpAmount][down:$HostsDownAmount][disabled:$HostsDisabledAmount]"
+ echo "# Last successful execution: $(date +%s) [$(date)]" > $script_path/monit.status
+ rm -f "$lockfile"
+ exit 0
+fi
+
# Check if file ips.tmp are empty (empty file = no webserver available)
if [ -s "$script_path/ips.tmp" ]
@@ -288,7 +625,9 @@ then
echo
echo "You may force an update to Route53 by using the '--force' argument"
echo
- exit 0
+ echo "# Last successful execution: $(date +%s) [$(date)]" > $script_path/monit.status
+ rm -f "$lockfile"
+ exit 0
fi
if [[ -z $(diff $script_path/ips.tmp.old $script_path/ips.tmp >/dev/null) ]] && [[ -n $(grep ok $script_path/awsresult) ]] && [[ "$OldRecordSorted" == "$NewRecordSorted" ]] && [[ ! "${1:-unset}" = "--force" ]]
@@ -314,7 +653,6 @@ then
fi
# Show which hosts have been added or removed
- # TODO: Send this information by email
addHosts[0]="$(diff -u $script_path/ips.tmp.old $script_path/ips.tmp | sort -u | awk "/^\+[0-9]+/ {printf \$0 \" \";}" | sed -e s/\+//g -e s/\ $//)"
j=1
for i in ${addHosts[0]}; do
@@ -344,8 +682,8 @@ then
for i in $(seq 1 $(($j-1))); do
if [ ${addHosts[i*1000+4]} -eq 0 ]; then
showAddedHosts=$showAddedHosts", "${addHosts[i*1000+1]}
- mailNotificationStatus[5*10+i]=${addHosts[i*1000+1]} # ip address for each added host
mailNotificationStatus[5]=$((${mailNotificationStatus[5]}+1)) # increase host added count
+ mailNotificationStatus[5*10+i]=${addHosts[i*1000+1]} # ip address for each added host
else
increasedHostWeight=$increasedHostWeight"| "${addHosts[i*1000+1]}"|"${addHosts[i*1000+2]}"|"${addHosts[i*1000+3]}
increasedHostWeightCount=$(($increasedHostWeightCount+1))
@@ -395,8 +733,10 @@ then
for i in $(seq 1 $(($j-1))); do
if [ ${removeHosts[i*1000+4]} -eq 0 ]; then
showRemovedHosts=$showRemovedHosts", "${removeHosts[i*1000+1]}
- mailNotificationStatus[4*10+i]=${removeHosts[i*1000+1]} # ip address for each removed host
mailNotificationStatus[4]=$((${mailNotificationStatus[4]}+1)) # increase host removed count
+ mailNotificationStatus[4*10+i]=${removeHosts[i*1000+1]} # ip address for each removed host
+ mailNotificationStatus[4*100+i]=$(awk -F \: "/^${removeHosts[i*1000+1]}/ { print \$2 }" < $script_path/probe/proberesult) # http error code (reason)
+ mailNotificationStatus[4*1000+i]=$(awk -F \: "/^${removeHosts[i*1000+1]}/ { print \$3 }" < $script_path/probe/proberesult) # string test
else
decreasedHostWeight=$decreasedHostWeight"| "${removeHosts[i*1000+1]}"|"${removeHosts[i*1000+2]}"|"${removeHosts[i*1000+3]}
decreasedHostWeightCount=$(($decreasedHostWeightCount+1))
@@ -565,37 +905,12 @@ then
fi
fi
-mailNotification() {
-
-# During script execution we set special status codes (described bellow)
-# depending on which problems were encountered.
-# These status codes are later processed to decide which notifications should be sent.
-#
-# mailNotificationStatus[1] = DNS resolution problems
-# mailNotificationStatus[2] = Failed to generate AWS signature or validate AWS credentials
-# mailNotificationStatus[3] = Failed to submit AWS zoneset update
-# mailNotificationStatus[4] = Hosts down
-# mailNotificationStatus[5] = Hosts up
-# mailNotificationStatus[6] = Failover activated (problems)
-# mailNotificationStatus[7] = Failover disabled (back to normal)
-# mailNotificationStatus[8] = All hosts down, failover also down
-#
-# For status 4 and 5 (hosts up/down) we also store the amount of affected hosts.
-#
-# For example:
-#
-# mailNotificationStatus[4]=2 means we have 2 hosts down.
-# The IP address for the affected hosts will be stored on mailNotificationStatus[4*10+1] and mailNotificationStatus[4*10+2]
-# The reason for each problem will be stored on mailNotificationStatus[4*100+1] and mailNotificationStatus[4*100+2]
-# The string test for each host will be stored on mailNotificationStatus[4*1000+1] and mailNotificationStatus[4*1000+2]
-#
-# mailNotificationStatus[5]=3 means 3 hosts are back online (up)
-# The IP address for each host back online will be stored on mailNotificationStatus[5*10+1] until mailNotificationStatus[5*10+3]
-#
-
-echo
+mailNotification
-}
+# Update monitoring file.
+# You should use a monitoring tool to check if this file was updated in the last 5 minutes.
+# (and generate a critical alarm otherwise)
+echo "# Last successful execution: $(date +%s) [$(date)]" > $script_path/monit.status
# Remove lockfile
rm -f "$lockfile"

No commit comments for this range

Something went wrong with that request. Please try again.