Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Loading…

Add multi site probing and mail notifications #3

Merged
merged 1 commit into from

2 participants

raineralves Edson Brandi
raineralves
  • Include support for multi site probing (see README for more info)
  • Include support for mail notifications
raineralves raineralves Add multi site probing and mail notifications
+ Include support for multi site probing (see README for more info)
+ Include support for mail notifications
6375abe
Edson Brandi
Owner

Thks for your contribution :)

Edson Brandi ebrandi merged commit c250d24 into from
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Dec 4, 2012
  1. raineralves

    Add multi site probing and mail notifications

    raineralves authored
    + Include support for multi site probing (see README for more info)
    + Include support for mail notifications
This page is out of date. Refresh to see the latest.
Showing with 446 additions and 95 deletions.
  1. +37 −1 README.md
  2. +409 −94 route53-failover.sh
38 README.md
View
@@ -3,4 +3,40 @@ route53-failover
Shell scripts to implement server failover using Amazon Route53, for more information visit:
-http://blog.ebrandi.eti.br/2012/11/como-implementar-um-traffic-manager-com-funcionalidade-de-failover-baseado-no-amazon-route53/
+http://blog.ebrandi.eti.br/2012/11/como-implementar-um-traffic-manager-com-funcionalidade-de-failover-baseado-no-amazon-route53/
+
+
+**********************
+Additional information
+**********************
+
+I) Multi-site probing
+
+This script supports two different probing methods: single site and multi site.
+You should use the multi site option on mission critical DNS zones related to a URL where
+maximum uptime and availability are absolutely required.
+
+In order to use the multi site method you must choose 3 different servers:
+one will be the master node, responsible for doing webserver probes and updating the
+Route53 API whenever a host goes down.
+The other two nodes will act as slave nodes: they'll check your hosts and
+send their result back to the master node.
+
+The master node will update your DNS zone if (and only if) any given host
+is reported "down" or "up" from at least 2 different locations (out of 3).
+
+How to configure: set "multisiteprobe" to "1" on all three locations,
+set "probeonly" to "0" on the master node and on the slave nodes set
+"probeonly" to "1".
+
+On the master node set a friendly name for each slave node using "remoteprobe[1-2]"
+For instance, give each slave node the name of the hosting company where each
+server is located.
+
+Then set "remoteprobefile[1-2]" pointing to the "proberesult" file on each node.
+You may use an SCP URL such as scp://username@my.server.com:/home/route53-failover/probe/proberesult
+An HTTP URL or a UNIX path is also accepted (useful for NFS exports between servers).
+
+Please note that each "proberesult" file has a timestamp and the master node
+requires that these files have been generated less than 5 minutes in the past.
+
503 route53-failover.sh
View
@@ -41,14 +41,55 @@ test_file=status
test_string="Error 200 OK"
connect_timeout=2
retries=3
+mailnotification=0 # [1]=on / [0]=off
+mailfrom=noreply@yourdomain.com
+mailto=noc@yourdomain.com,admin@anotherdomain.com
+mailsmtp=smtp.yourdomain.com
+multisiteprobe=0 # [1]=on / [0]=off
+probeonly=0
+remoteprobe[1]=isp1
+remoteprobefile[1]=scp://route53@server1.yourdomain.com:/usr/local/route53-failover/probe/proberesult
+remoteprobe[2]=isp2
+remoteprobefile[2]=http://server2.yourdomain.com/probe/proberesult
###############################################################
# You should not need to change anything below this point #
###############################################################
+# Pre-flight checks. These run BEFORE the lock is taken: nothing needs to be
+# cleaned up if they fail, and a missing lockfile(1) binary is reported as a
+# missing tool instead of being mistaken for "script already running".
+
+# Test if this script has write permission on $script_path
+if [ -z "$script_path" ]; then
+  echo "Error: Please set the \$script_path variable"
+  exit 1
+elif [ ! -w "$script_path" ]; then
+  echo "Error: I don't have write permission on $script_path, please fix and try again"
+  exit 1
+fi
+
+# Make sure some tools are installed correctly
+for i in dig awk curl diff openssl lockfile xmllint; do
+  command -v "$i" >/dev/null 2>&1 || { echo "Error: Please install $i before proceeding"; exit 1; }
+done
+
+# nail(1) is only required where mail is actually sent (never on probe-only nodes)
+if [[ $mailnotification -eq 1 ]] && [[ $probeonly -ne 1 ]]; then
+  command -v nail >/dev/null 2>&1 || { echo "Error: Please install \"nail\" before proceeding (usually found in the \"mailx\" package)"; exit 1; }
+fi
+
+# Create lockfile and avoid more than one script execution ('lockfile(1)' is used to avoid race conditions)
+if ! lockfile -r 0 "$lockfile"; then
+  echo "Error: script already running, exiting..."
+  exit 1
+fi
+
+# Remove lockfile on every exit path from here on.
+# The EXIT trap is single-quoted and does not call 'exit' itself, so the
+# script's real exit status is preserved. (A double-quoted trap expands "$?"
+# once, when the trap is installed, which forces every later exit to status 0.)
+trap 'rm -f "$lockfile"' EXIT
+# Signals only exit with the conventional 128+N status; the EXIT trap above
+# then performs the cleanup.
+trap 'exit 129' HUP
+trap 'exit 130' INT
+trap 'exit 143' TERM
+
# Enable some bash traps in order to avoid problems
-set -o nounset # avoid breaking everything in case of an uninitialised variable
-set -o pipefail # always set exit code to 1 when a piped subcommand fails
+set -o nounset # avoid breaking everything in case of an uninitialised variable
+set -o pipefail # always set exit code to 1 when a piped subcommand fails
+
+mkdir $script_path/log/ >/dev/null 2>&1
# Our logging function requires GNU awk [`gawk(1)'].
# The code below tries to find a suitable binary on non-Linux platforms.
@@ -69,44 +110,210 @@ fi
log() {
echo "$2" | $gawk '{ print "[" strftime("%Y-%m-%d %H:%M:%S") "]" "\t" $0; }' | tee -a $logfile
if [[ $1 == "error" ]]; then
+ mailNotification
echo && exit 1
fi
}
-# Make sure some tools are installed correctly
-for i in dig awk curl diff openssl lockfile xmllint; do
- which $i >/dev/null 2>&1 || log error "Error: Please install $i before proceeding"
+# Initialize the mail notification variables
+for i in $(seq 1 8); do
+ mailNotificationStatus[i]=0
done
+# Mail notification function
+
+# Print the reason text for one host that was reported down.
+#   $1 - HTTP status code recorded for the host
+#   $2 - string-test result recorded for the host (only consulted for HTTP 200)
+mailNotificationDownReason() {
+  case "$1" in
+    200)
+      if [[ $2 -eq 0 ]]; then
+        printf "test string not found\n"
+      else
+        printf "connection error\n"
+      fi
+      ;;
+    400) printf "bad request (error 400)\n" ;;
+    403) printf "access forbidden (error 403)\n" ;;
+    404) printf "page not found (error 404)\n" ;;
+    503) printf "service unavailable (error 503)\n" ;;
+    *)   printf "connection error\n" ;;   # includes "000"
+  esac
+}
+
+# Print the "production hosts (up)" and "impacted hosts (down)" sections of
+# the mail body, preceded by a blank line.
+# Reads the globals $NewRecordSorted and $HostsDownList; modifies neither.
+mailNotificationHostSummary() {
+  local host downList
+
+  echo
+  echo "*** Current production hosts (up):"
+  if [[ -n $NewRecordSorted ]]; then
+    # Deliberately unquoted: one entry per whitespace-separated word
+    for host in $NewRecordSorted; do
+      echo "$host"
+    done
+  else
+    echo "none"
+  fi
+  echo
+  echo "*** Current impacted hosts (down):"
+  # Drop the leading comma and print one entry per line
+  downList="$(echo $HostsDownList | sed -e 's;^,;;' | tr ',' '\n')"
+  if [[ -n $downList ]]; then
+    echo "$downList"
+  else
+    echo "none"
+  fi
+}
+
+# Mail notification function.
+#
+# Builds $script_path/mail.out from the status codes collected during this run
+# and sends it with nail(1). Does nothing unless $mailnotification is 1 and
+# this is not a probe-only node.
+#
+# During script execution we set special status codes (described below)
+# depending on which problems were encountered.
+# These status codes are later processed to decide which notifications should be sent.
+#
+# mailNotificationStatus[1] = DNS resolution problems
+# mailNotificationStatus[2] = Failed to generate AWS signature or validate AWS credentials
+# mailNotificationStatus[3] = Failed to submit AWS zoneset update
+# mailNotificationStatus[4] = Host down
+# mailNotificationStatus[5] = Host up
+# mailNotificationStatus[6] = Failover activated (problems)
+# mailNotificationStatus[7] = Failover disabled (back to normal)
+# mailNotificationStatus[8] = All hosts down, failover also down
+#
+# Only codes 4, 5, 6 and 7 are examined by this function.
+#
+# For status 4 and 5 (hosts up/down) we also store the amount of affected hosts.
+#
+# For example:
+#
+# mailNotificationStatus[4]=2 means we have 2 hosts down.
+# The IP address for the affected hosts will be stored on mailNotificationStatus[4*10+1] and mailNotificationStatus[4*10+2]
+# The reason for each problem will be stored on mailNotificationStatus[4*100+1] and mailNotificationStatus[4*100+2]
+# The string test for each host will be stored on mailNotificationStatus[4*1000+1] and mailNotificationStatus[4*1000+2]
+#
+# mailNotificationStatus[5]=3 means 3 hosts are back online (up)
+# The IP address for each host back online will be stored on mailNotificationStatus[5*10+1] until mailNotificationStatus[5*10+3]
+#
+mailNotification() {
+  # 'i' is local so that a caller's own loop variable is not clobbered
+  local i count mailout
+
+  if [[ $mailnotification -ne 1 ]] || [[ $probeonly -eq 1 ]]; then
+    return 0
+  fi
+
+  mailout="$script_path/mail.out"
+  rm -f "$mailout" >/dev/null 2>&1
+
+  if [[ -z $mailto ]]; then
+    echo "Error: Please provide a comma separated list of email recipients (\$mailto)"
+    exit 1
+  fi
+  if [[ -z $mailsmtp ]]; then
+    echo "Error: Please provide a valid SMTP server (\$mailsmtp)"
+    exit 1
+  fi
+
+  mailsubject=""
+  serviceProblems=0
+  serviceOK=0
+
+  # Probe-only nodes were already ruled out above, so only the codes matter here
+  if [[ ${mailNotificationStatus[4]} -ne 0 ]] || [[ ${mailNotificationStatus[6]} -ne 0 ]]; then
+    serviceProblems=1
+  fi
+  if [[ ${mailNotificationStatus[5]} -ne 0 ]] || [[ ${mailNotificationStatus[7]} -ne 0 ]]; then
+    serviceOK=1
+  fi
+
+  if [[ $serviceProblems -eq 1 ]]; then
+
+    mailsubject="[route53-failover] $Hostname.$Domain service trouble"
+
+    {
+      echo "There has been a change in your monitored service at $Hostname.$Domain"
+      echo "Current status: trouble"
+      echo
+      if [[ ${mailNotificationStatus[6]} -ne 0 ]]; then
+        echo "All hosts are DOWN, switching to failover host"
+        echo "==> $fail_host"
+        echo
+      fi
+      if [[ ${mailNotificationStatus[4]} -ne 0 ]]; then
+        count=${mailNotificationStatus[4]}
+        for ((i = 1; i <= count; i++)); do
+          printf "%s reported down; " "${mailNotificationStatus[4*10+i]}"
+          mailNotificationDownReason "${mailNotificationStatus[4*100+i]}" "${mailNotificationStatus[4*1000+i]:-}"
+        done
+        if [[ ${mailNotificationStatus[5]} -ne 0 ]]; then
+          count=${mailNotificationStatus[5]}
+          for ((i = 1; i <= count; i++)); do
+            # One host per line (the newline was missing, which glued the entries together)
+            printf "%s reported up\n" "${mailNotificationStatus[5*10+i]}"
+          done
+        fi
+        mailNotificationHostSummary
+      fi
+    } > "$mailout"
+
+  elif [[ $serviceOK -eq 1 ]]; then
+
+    mailsubject="[route53-failover] $Hostname.$Domain service ok"
+
+    {
+      echo "There has been a change in your monitored service at $Hostname.$Domain"
+      echo "Current status: ok"
+      echo
+      if [[ ${mailNotificationStatus[7]} -ne 0 ]]; then
+        echo "Disabling failover state, returning to normal operation"
+        echo
+      fi
+      if [[ ${mailNotificationStatus[5]} -ne 0 ]]; then
+        count=${mailNotificationStatus[5]}
+        for ((i = 1; i <= count; i++)); do
+          printf "%s reported up\n" "${mailNotificationStatus[5*10+i]}"
+        done
+        mailNotificationHostSummary
+      fi
+    } > "$mailout"
+
+  fi
+
+  if [[ -s $mailout ]] && [[ -n $mailsubject ]]; then
+
+    # Append the Route53 exchange only when both parts are available
+    if [[ -n "$AWSResult" ]] && [[ -n "$AWSChangeset" ]]; then
+      {
+        echo
+        echo "*** Route53 API Output:"
+        echo
+        echo "$AWSResult"
+        echo
+        echo "*** Changeset submited:"
+        echo
+        echo "$AWSChangeset"
+        echo
+      } >> "$mailout"
+    fi
+
+    # A failed delivery is logged as "info" so that it does not abort the run
+    if nail -r "Route53 Failover <${mailfrom}>" -s "$mailsubject" -S smtp="$mailsmtp" "$mailto" < "$mailout"; then
+      log info "Mail sent to $mailto"
+    else
+      log info "Error while sending mail to $mailto"
+    fi
+
+    rm -f "$mailout" >/dev/null 2>&1
+  fi
+
+}
+
# Enforce script permission to 700 for improved security (avoid leaking AWS credentials)
if [[ -z "$(find $script_path/$(basename $0) -perm 700)" ]]; then
log error "Error: This script should NOT be accessible to other users since it contains sensitive information, please chmod it to 700"
fi
-# Test if this script has write permission on $script_path
-if [ -z $script_path ]; then
- log error "Error: Please set the \$script_path variable"
-else
- if [ ! -w $script_path ]; then
- log error "Error: I don't have write permission on $script_path, please fix and try again"
- fi
+# The "ips.master" file cannot be empty
+if [ ! -s $script_path/ips.master ]; then
+ log error "Please create the \"ips.master\" file according to the documentation. Aborting..."
fi
-# Create lockfile and avoid more than one script execution ('lockfile(1)' is used to avoid race conditions)
-if ! lockfile -r 0 $lockfile; then
- log error "Error: script already running, exiting..."
-fi
-
-# Remove lockfile if some other error causes the script to exit
-trap 'rm -f "$lockfile"; exit $?' INT TERM EXIT
-
-# Initialize the mail notification variables
-for i in $(seq 1 8); do
- mailNotificationStatus[i]=0
-done
-
# Set variables with DNS Record Values to create DELETE API request
-AuthServer=$(dig NS $Domain | awk "/^$Domain/ { print \$5 }" | head -1) || (mailNotificationStatus[1]=1 && log error "Error retrieving domain info, check dns resolution")
+AuthServer=$(dig NS $Domain | awk "/^$Domain/ { print \$5 }" | head -1) || { mailNotificationStatus[1]=1 && log error "Error retrieving domain info, check dns resolution"; }
# Test DNS resolution and check if our domain is actually hosted on Route53
if [ -z $AuthServer ]; then
@@ -122,8 +329,8 @@ awssignature() {
if [ -z $AWSZoneID ] || [ -z $AWSAccesskeyID ] || [ -z $AWSSecretAPIKey ]; then
log error "Error: Please provide a valid set of AWS credentials"
else
- AWSCurrentDate="$(curl -sS -I --connect-timeout $connect_timeout --retry $retries --retry-delay 5 --stderr /dev/null https://route53.amazonaws.com/date | grep Date | sed 's/.*Date: //' | tr -d '\r')" || (mailNotificationStatus[2]=1 && log error "Error retrieving current date from AWS")
- AWSSignature=$(printf "$AWSCurrentDate" | openssl dgst -binary -sha256 -hmac $AWSSecretAPIKey | openssl enc -base64) || (mailNotificationStatus[2]=1 && log error "Error generating AWS signature")
+ AWSCurrentDate="$(curl -sS -I --connect-timeout $connect_timeout --retry $retries --retry-delay 5 --stderr /dev/null https://route53.amazonaws.com/date | grep Date | sed 's/.*Date: //' | tr -d '\r')" || { mailNotificationStatus[2]=1 && log error "Error retrieving current date from AWS"; }
+ AWSSignature=$(printf "$AWSCurrentDate" | openssl dgst -binary -sha256 -hmac $AWSSecretAPIKey | openssl enc -base64) || { mailNotificationStatus[2]=1 && log error "Error generating AWS signature"; }
AWSDateHeader="Date: $AWSCurrentDate"
AWSAuthHeader="X-Amzn-Authorization: AWS3-HTTPS AWSAccessKeyId=$AWSAccesskeyID,Algorithm=HmacSHA256,Signature=$AWSSignature"
fi
@@ -184,7 +391,7 @@ submitroute53() {
touch $script_path/awslastvalidation || log error "Error manipulating temporary files"
-if [[ ! "$(date +"%H")" = "$(cat $script_path/awslastvalidation)" ]]; then
+if [[ ! "$(date +"%H")" = "$(cat $script_path/awslastvalidation)" ]] && [[ $probeonly -ne 1 ]]; then
awssignature # call our signature generation function
AWSResult=$(curl -sS -w ";;%{http_code}" --connect-timeout $connect_timeout --retry $retries --retry-delay 5 -H "$AWSDateHeader" -H "$AWSAuthHeader" -H "Content-Type: text/xml; charset=UTF-8" https://route53.amazonaws.com/2012-02-29/hostedzone?marker=$AWSZoneID)
@@ -214,9 +421,9 @@ if [[ ! "$(date +"%H")" = "$(cat $script_path/awslastvalidation)" ]]; then
esac
fi
-OldType=$(dig @$AuthServer A $Hostname.$Domain | awk "/^$Hostname.$Domain/ { print \$4 }" | head -1) || (mailNotificationStatus[1]=1 && log error "Error while running dig")
-OldTTL=$(dig @$AuthServer A $Hostname.$Domain | awk "/^$Hostname.$Domain/ { print \$2 }" | head -1) || (mailNotificationStatus[1]=1 && log error "Error while running dig")
-OldRecord=$(dig @$AuthServer A $Hostname.$Domain | awk "/^$Hostname.$Domain/ { print \$5 }" | sed s/\ //g) || (mailNotificationStatus[1]=1 && log error "Error while running dig")
+OldType=$(dig @$AuthServer A $Hostname.$Domain | awk "/^$Hostname.$Domain/ { print \$4 }" | head -1) || { mailNotificationStatus[1]=1 && log error "Error while running dig"; }
+OldTTL=$(dig @$AuthServer A $Hostname.$Domain | awk "/^$Hostname.$Domain/ { print \$2 }" | head -1) || { mailNotificationStatus[1]=1 && log error "Error while running dig"; }
+OldRecord=$(dig @$AuthServer A $Hostname.$Domain | awk "/^$Hostname.$Domain/ { print \$5 }" | sed s/\ //g) || { mailNotificationStatus[1]=1 && log error "Error while running dig"; }
# Create temporary files needed by this script
@@ -225,12 +432,111 @@ touch $script_path/ips.tmp || log error "Error manipulating temporary files"
mv -f $script_path/ips.tmp $script_path/ips.tmp.old || log error "Error manipulating temporary files"
touch $script_path/ips.tmp || log error "Error manipulating temporary files"
mkdir $script_path/probe/ >/dev/null 2>&1
-touch $script_path/probe/proberesult.old || log error "Error manipulating temporary files"
-touch $script_path/probe/proberesult || log error "Error manipulating temporary files"
-mv -f $script_path/probe/proberesult $script_path/probe/proberesult.old || log error "Error manipulating temporary files"
-touch $script_path/probe/proberesult || log error "Error manipulating temporary files"
touch $script_path/awsresult || log error "Error manipulating temporary files"
+# Fetch remote probe file if $multisiteprobe is enabled
+# (skipped on probe-only nodes)
+#
+# For each of the two remote probes: rotate the local copy to ".old", fetch a
+# fresh copy (UNIX path, scp:// or http URL, told apart by the first letter),
+# record the fetch status and reject results older than 5 minutes.
+
+if [[ $multisiteprobe -eq 1 ]] && [[ $probeonly -ne 1 ]]; then
+
+  for i in 1 2; do
+
+    if [[ -z ${remoteprobe[i]} ]] || [[ -z ${remoteprobefile[i]} ]]; then
+      log error "Error: You have enabled multisite probe/health check therefore you must provide two valid probe names (\$remoteprobe) and two result URLs (\$remoteprobefile)"
+    fi
+
+    remoteprobecopy="$script_path/probe/proberesult.${remoteprobe[i]}"
+
+    # Keep the previous copy as ".old" and start from an empty file
+    touch "$remoteprobecopy"
+    mv -f "$remoteprobecopy" "$remoteprobecopy.old"
+    touch "$remoteprobecopy"
+
+    remoteprobefiletype[i]=${remoteprobefile[i]:0:1} # unix file, scp or http? let's see the first letter
+
+    case "${remoteprobefiletype[i]}" in
+      /)
+        if [[ -s "${remoteprobefile[i]}" ]]; then
+          cp -f "${remoteprobefile[i]}" "$remoteprobecopy" >/dev/null 2>&1
+        else
+          log error "Error: ${remoteprobefile[i]} not found"
+        fi
+        ;;
+      s)
+        # scp://username@myserver.com:/path/to/file
+        scpusername=$(echo "${remoteprobefile[i]}" | sed -e "s;scp://\(.*\)\@.*:/.*;\1;")
+        scphost=$(echo "${remoteprobefile[i]}" | sed -e "s;scp://.*\@\(.*\):/.*;\1;")
+        scpfile=$(echo "${remoteprobefile[i]}" | sed -e "s;scp://.*\@.*:\(.*\);\1;")
+        if [[ -n $scpusername ]] && [[ -n $scphost ]] && [[ -n $scpfile ]]; then
+          # -B: batch mode, never prompt for a password
+          if ! scp -q -B "$scpusername@$scphost:$scpfile" "$remoteprobecopy" >/dev/null 2>&1; then
+            log error "Error during SCP copy. Please test the connection manually first and also make sure you set the variable like this: \$remoteprobefile[N]=scp://username@myserver.com:/path/to/file"
+          fi
+        else
+          log error "Error: If you want to use SCP please set \$remoteprobefile[N]=scp://username@myserver.com:/path/to/file"
+        fi
+        ;;
+      h)
+        if ! curl -sS --connect-timeout "$connect_timeout" --retry "$retries" --retry-delay 5 -o "$remoteprobecopy" "${remoteprobefile[i]}" >/dev/null 2>&1; then
+          log error "Error fetching HTTP file: ${remoteprobefile[i]}"
+        fi
+        ;;
+      *)
+        log error "Please use a valid format for \$remoteprobefile: unix file, scp or http"
+        ;;
+    esac
+
+    if [[ -s "$remoteprobecopy" ]]; then
+      echo "ok" > "$script_path/probe/fetchstatus.remote.${remoteprobe[i]}"
+      # Header line format: "# Timestamp: <epoch seconds> [<date>]"
+      remoteprobetimestamp[i]=$(awk '/Timestamp/ { print $3 }' < "$remoteprobecopy")
+      if [[ -n ${remoteprobetimestamp[i]} ]]; then
+        timestampdelta[i]=$(( $(date +%s) - ${remoteprobetimestamp[i]} ))
+      else
+        log error "Error: no timestamp found on remote probe file"
+      fi
+      # 300 seconds = the 5 minutes announced by the message below
+      # (the previous limit of 30000 seconds accepted results over 8 hours old)
+      if [[ ${timestampdelta[i]} -gt 300 ]]; then
+        log error "Error: remote timestamp difference is greater than 5 minutes for ${remoteprobe[i]}, make sure the script is running on the remote host and also double check the clock/NTP settings"
+      fi
+    else
+      echo "error" > "$script_path/probe/fetchstatus.remote.${remoteprobe[i]}"
+      log error "Error fetching remote proberesult (${remoteprobefile[i]})"
+    fi
+  done
+fi
+
+# Compare probe results
+# If multisite is enabled, compare all three results
+# If multisite is disabled, no comparison is done
+#
+# Arguments:
+#   $1 - webserver IP address, exactly as written in the proberesult files
+#   $2 - verdict of the local probe: "up" or "down"
+# Returns (NOTE: inverted with respect to the usual shell convention):
+#   1 - host is considered UP   (multisite: at least 2 of the 3 probes agree)
+#   0 - host is considered DOWN
+# Side effects: stores the per-probe verdict in remoteprobe[z*10] and, for a
+# probe reporting the host down, the HTTP code it saw in remoteprobe[z*100].
+
+multiSiteCompare () {
+
+  local probefile
+
+  webserverip="$1"
+  webserverstatus="$2"
+
+  if [[ $multisiteprobe -eq 1 ]] && [[ $probeonly -ne 1 ]]; then
+    multisite_up_webserver=0
+    if [[ $webserverstatus == "up" ]]; then
+      multisite_up_webserver=$(( multisite_up_webserver + 1 ))
+    fi
+    for z in 1 2; do
+      probefile="$script_path/probe/proberesult.${remoteprobe[z]}"
+      if [[ ! -s "$probefile" ]]; then
+        log error "Error reading proberesult.${remoteprobe[z]}"
+      fi
+      # Exact, fixed-string match of the whole line "<ip>:200:1".
+      # A regex match would let the dots of the address match any character,
+      # so another host's line could be counted for this one.
+      if grep -q -F -x -- "$webserverip:200:1" "$probefile"; then
+        remoteprobe[z*10]=1 # 1=up / 0=down
+      else
+        remoteprobe[z*10]=0
+      fi
+      if [[ ${remoteprobe[z*10]} -ne 1 ]]; then
+        # HTTP code seen by that probe; compare the whole first field so that
+        # e.g. 10.0.0.1 does not also pick up the line for 10.0.0.10
+        remoteprobe[z*100]=$(awk -F: -v ip="$webserverip" '$1 == ip { print $2 }' "$probefile")
+        log info "${remoteprobe[z]} is reporting $webserverip as DOWN"
+      else
+        multisite_up_webserver=$(( multisite_up_webserver + 1 ))
+      fi
+    done
+    if [[ $multisite_up_webserver -ge 2 ]]; then
+      return 1 #up
+    fi
+    return 0 #down
+  fi
+
+  # Single-site mode: the local verdict is final
+  if [[ $webserverstatus == "up" ]]; then
+    return 1 #up
+  fi
+  return 0 #down
+}
+
+
# Connect to each webserver and search for a specific string to
# check whether the webserver is up and running for each address
# listed in the ips.master file. Then print multiple lines
@@ -239,32 +545,63 @@ touch $script_path/awsresult || log error "Error manipulating temporary files"
HostsUpAmount=0
HostsDownAmount=0
HostsDisabledAmount=$(cat $script_path/ips.master | egrep "^#.*[0-9]" | wc -l | cut -d ' ' -f 8)
-
-for i in $(cat $script_path/ips.master | grep -v "#")
-do
- ip=$(echo $i | awk -F":" '{print $2}')
- webserverprobe=$(curl -sS -w ";;%{http_code}" --connect-timeout $connect_timeout --retry $retries --retry-delay 5 http://$ip/$test_file 2>&1)
- webserverprobeHTTPcode=$(echo $webserverprobe | awk -F ";;" "{ print \$2 }")
- webserverprobecondition=$(echo "$webserverprobe" | grep "$test_string" | wc -l | cut -d ' ' -f 8)
- echo "# Timestamp: $(date +%s) [$(date)]" > $script_path/probe/proberesult
- echo "# Format: <Webserver IP>:<Returned HTTP Code>:<String Found>" >> $script_path/probe/proberesult
- echo "# HTTP Code \"000\" means timeout or connection refused" >> $script_path/probe/proberesult
- echo "$ip:$webserverprobeHTTPcode:$webserverprobecondition" >> $script_path/probe/proberesult
- if [ "$webserverprobecondition" -eq "1" ]
- then
- HostsUpAmount=$(( $HostsUpAmount + 1 ))
- peso=$(echo $i | awk -F":" '{print $1}')
- counter=1
- while [ $counter -le $peso ]
- do
- echo $ip >> $script_path/ips.tmp
- counter=$(( $counter + 1 ))
- done
- else
- HostsDownAmount=$(( $HostsDownAmount + 1 ))
- fi
+HostsDownList=""
+
+echo "# Timestamp: $(date +%s) [$(date)]" > $script_path/probe/proberesult.tmp
+echo "# Format: <Webserver IP>:<Returned HTTP Code>:<String Found>" >> $script_path/probe/proberesult.tmp
+echo "# HTTP Code \"000\" means timeout or connection refused" >> $script_path/probe/proberesult.tmp
+
+for i in $(cat $script_path/ips.master | grep -v "#"); do
+ ip=$(echo $i | awk -F":" '{print $2}')
+ webserverprobe=$(curl -sS -w ";;%{http_code}" --connect-timeout $connect_timeout --retry $retries --retry-delay 5 http://$ip/$test_file 2>&1)
+ webserverprobeHTTPcode=$(echo $webserverprobe | awk -F ";;" "{ print \$2 }")
+ webserverprobecondition=$(echo "$webserverprobe" | grep "$test_string" | wc -l | cut -d ' ' -f 8)
+ echo "$ip:$webserverprobeHTTPcode:$webserverprobecondition" >> $script_path/probe/proberesult.tmp
+ if [ "$webserverprobecondition" -eq "1" ]; then
+ multiSiteCompare "$ip" up
+ if [[ $? -eq 1 ]]; then
+ HostsUpAmount=$(( $HostsUpAmount + 1 ))
+ peso=$(echo $i | awk -F":" '{print $1}')
+ counter=1
+ while [ $counter -le $peso ]; do
+ echo $ip >> $script_path/ips.tmp
+ counter=$(( $counter + 1 ))
+ done
+ else
+ log info "Host $ip is UP for me, but is DOWN from both other locations"
+ HostsDownAmount=$(( $HostsDownAmount + 1 ))
+ HostsDownList=$HostsDownList",$ip"
+ fi
+ else
+ multiSiteCompare "$ip" down
+ if [[ $? -eq 1 ]]; then
+ log info "Host $ip is DOWN for me, but is UP from both other locations"
+ HostsUpAmount=$(( $HostsUpAmount + 1 ))
+ peso=$(echo $i | awk -F":" '{print $1}')
+ counter=1
+ while [ $counter -le $peso ]; do
+ echo $ip >> $script_path/ips.tmp
+ counter=$(( $counter + 1 ))
+ done
+ else
+ HostsDownAmount=$(( $HostsDownAmount + 1 ))
+ HostsDownList=$HostsDownList",$ip"
+ log info "Host $ip is DOWN"
+ fi
+ fi
done
+# Move tmp file to its correct location (avoid race condition)
+mv -f $script_path/probe/proberesult.tmp $script_path/probe/proberesult
+
+# Exit if we detect this is a probe-only host. Probing has finished, nothing else needs to be done.
+if [[ $probeonly -eq 1 ]]; then
+ log info "Probe result: [up:$HostsUpAmount][down:$HostsDownAmount][disabled:$HostsDisabledAmount]"
+ echo "# Last successful execution: $(date +%s) [$(date)]" > $script_path/monit.status
+ rm -f "$lockfile"
+ exit 0
+fi
+
# Check if file ips.tmp is empty (empty file = no webserver available)
if [ -s "$script_path/ips.tmp" ]
@@ -288,7 +625,9 @@ then
echo
echo "You may force an update to Route53 by using the '--force' argument"
echo
- exit 0
+ echo "# Last successful execution: $(date +%s) [$(date)]" > $script_path/monit.status
+ rm -f "$lockfile"
+ exit 0
fi
if [[ -z $(diff $script_path/ips.tmp.old $script_path/ips.tmp >/dev/null) ]] && [[ -n $(grep ok $script_path/awsresult) ]] && [[ "$OldRecordSorted" == "$NewRecordSorted" ]] && [[ ! "${1:-unset}" = "--force" ]]
@@ -314,7 +653,6 @@ then
fi
# Show which hosts have been added or removed
- # TODO: Send this information by email
addHosts[0]="$(diff -u $script_path/ips.tmp.old $script_path/ips.tmp | sort -u | awk "/^\+[0-9]+/ {printf \$0 \" \";}" | sed -e s/\+//g -e s/\ $//)"
j=1
for i in ${addHosts[0]}; do
@@ -344,8 +682,8 @@ then
for i in $(seq 1 $(($j-1))); do
if [ ${addHosts[i*1000+4]} -eq 0 ]; then
showAddedHosts=$showAddedHosts", "${addHosts[i*1000+1]}
- mailNotificationStatus[5*10+i]=${addHosts[i*1000+1]} # ip address for each added host
mailNotificationStatus[5]=$((${mailNotificationStatus[5]}+1)) # increase host added count
+ mailNotificationStatus[5*10+i]=${addHosts[i*1000+1]} # ip address for each added host
else
increasedHostWeight=$increasedHostWeight"| "${addHosts[i*1000+1]}"|"${addHosts[i*1000+2]}"|"${addHosts[i*1000+3]}
increasedHostWeightCount=$(($increasedHostWeightCount+1))
@@ -395,8 +733,10 @@ then
for i in $(seq 1 $(($j-1))); do
if [ ${removeHosts[i*1000+4]} -eq 0 ]; then
showRemovedHosts=$showRemovedHosts", "${removeHosts[i*1000+1]}
- mailNotificationStatus[4*10+i]=${removeHosts[i*1000+1]} # ip address for each removed host
mailNotificationStatus[4]=$((${mailNotificationStatus[4]}+1)) # increase host removed count
+ mailNotificationStatus[4*10+i]=${removeHosts[i*1000+1]} # ip address for each removed host
+ mailNotificationStatus[4*100+i]=$(awk -F \: "/^${removeHosts[i*1000+1]}/ { print \$2 }" < $script_path/probe/proberesult) # http error code (reason)
+ mailNotificationStatus[4*1000+i]=$(awk -F \: "/^${removeHosts[i*1000+1]}/ { print \$3 }" < $script_path/probe/proberesult) # string test
else
decreasedHostWeight=$decreasedHostWeight"| "${removeHosts[i*1000+1]}"|"${removeHosts[i*1000+2]}"|"${removeHosts[i*1000+3]}
decreasedHostWeightCount=$(($decreasedHostWeightCount+1))
@@ -565,37 +905,12 @@ then
fi
fi
-mailNotification() {
-
-# During script execution we set special status codes (described bellow)
-# depending on which problems were encountered.
-# These status codes are later processed to decide which notifications should be sent.
-#
-# mailNotificationStatus[1] = DNS resolution problems
-# mailNotificationStatus[2] = Failed to generate AWS signature or validate AWS credentials
-# mailNotificationStatus[3] = Failed to submit AWS zoneset update
-# mailNotificationStatus[4] = Hosts down
-# mailNotificationStatus[5] = Hosts up
-# mailNotificationStatus[6] = Failover activated (problems)
-# mailNotificationStatus[7] = Failover disabled (back to normal)
-# mailNotificationStatus[8] = All hosts down, failover also down
-#
-# For status 4 and 5 (hosts up/down) we also store the amount of affected hosts.
-#
-# For example:
-#
-# mailNotificationStatus[4]=2 means we have 2 hosts down.
-# The IP address for the affected hosts will be stored on mailNotificationStatus[4*10+1] and mailNotificationStatus[4*10+2]
-# The reason for each problem will be stored on mailNotificationStatus[4*100+1] and mailNotificationStatus[4*100+2]
-# The string test for each host will be stored on mailNotificationStatus[4*1000+1] and mailNotificationStatus[4*1000+2]
-#
-# mailNotificationStatus[5]=3 means 3 hosts are back online (up)
-# The IP address for each host back online will be stored on mailNotificationStatus[5*10+1] until mailNotificationStatus[5*10+3]
-#
-
-echo
+mailNotification
-}
+# Update monitoring file.
+# You should use a monitoring tool to check if this file was updated in the last 5 minutes.
+# (and generate a critical alarm otherwise)
+echo "# Last successful execution: $(date +%s) [$(date)]" > $script_path/monit.status
# Remove lockfile
rm -f "$lockfile"
Something went wrong with that request. Please try again.