Skip to content

Commit

Permalink
Don't try to use nagios-plugins-contrib package in ubuntu 12.04, and …
Browse files Browse the repository at this point in the history
…add useful plugins
  • Loading branch information
OpenSourceAlchemist committed Apr 19, 2016
1 parent b013eb1 commit b355f75
Show file tree
Hide file tree
Showing 3 changed files with 355 additions and 1 deletion.
229 changes: 229 additions & 0 deletions files/default/plugins/check_postgres_replication
@@ -0,0 +1,229 @@
#!/bin/bash
# ========================================================================================
# Postgres replication lag nagios check using psql and bash.
#
# 2013 Wanelo Inc, Apache License.
# This script expects psql to be in the PATH.
#
# Usage: ./check_postgres_replication [ -h <host> ] [ -m <master> ] [ -U user ] [ -x <units> ]
# [-w <warn_bytes>] [-c <critical_bytes>]
# -h --host replica host (default 127.0.0.1)
# -m --master master fqdn or ip (required)
# -U --user database user (default postgres)
# -x --units units of measurement to display (KB or MB, default MB)
# -w --warning warning threshold in bytes (default 10485760, i.e. 10MB)
# -c --critical critical threshold in bytes (default 15728640, i.e. 15MB)
# ========================================================================================

# Nagios return codes
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# Command line flattened into one space-separated string; main re-splits it.
# "$*" states the joining explicitly (assigning "$@" to a scalar does the same).
readonly ARGS="$*"

# Default thresholds, expressed in bytes of WAL lag
readonly DEFAULT_WARNING_THRESHOLD=10485760
readonly DEFAULT_CRITICAL_THRESHOLD=15728640

readonly DEFAULT_HOST="127.0.0.1"
readonly DEFAULT_USER=postgres
readonly DEFAULT_UNITS=MB

readonly PATH=/opt/local/bin:${PATH}

# Node name shown in the status line. /etc/nodename is used when it is
# readable; otherwise fall back to bash's HOSTNAME so the output is never
# missing the node name on systems without that file.
NODENAME=""
if [[ -r /etc/nodename ]]; then
  NODENAME=$(cat /etc/nodename)
fi
readonly NODENAME=${NODENAME:-${HOSTNAME}}

readonly MASTER_SQL="SELECT pg_current_xlog_location()"
readonly REPLICA_SQL="SELECT pg_last_xlog_replay_location()"
readonly REPLICA_TIME_LAG="select now() - pg_last_xact_replay_timestamp()"

# Scratch file receiving psql's stderr. Prefer an unpredictable name from
# mktemp; keep the historical PID-based path only as a fallback. The path is
# kept under /tmp (no whitespace) because other code expands $ERR unquoted.
ERR=$(mktemp /tmp/repl_check.XXXXXX 2>/dev/null) || ERR=/tmp/repl_chec.$$
readonly ERR

# Remove the scratch file on every exit path (usage errors, --help, results).
trap 'rm -f "$ERR"' EXIT

#######################################
# Print the command-line help text to stdout.
# Thresholds are documented in bytes because they are compared directly with
# the byte difference between the master and replica WAL positions.
# Arguments: none
# Outputs:   help text on stdout
#######################################
usage() {
  # printf with one argument per line avoids a tab-sensitive <<- here-document.
  printf '%s\n' \
    'Usage: ./check_postgres_replication [ -h <host> ] [ -m <master> ] [ -U user ] [ -x <units> ]' \
    '       [-w <warn_bytes>] [-c <critical_bytes>]' \
    '  -h --host      replica host (default 127.0.0.1)' \
    '  -m --master    master fqdn or ip (required)' \
    '  -U --user      database user (default postgres)' \
    '  -x --units     units of measurement to display (KB or MB, default MB)' \
    '  -w --warning   warning threshold in bytes (default 10485760, i.e. 10MB)' \
    '  -c --critical  critical threshold in bytes (default 15728640, i.e. 15MB)' \
    '     --help      show this message' \
    '     --verbose   trace execution (set -x)'
}

# Parse parameters
#######################################
# Parse command-line options and publish the effective settings.
# Long options are first mapped to their short equivalents, then getopts
# processes the normalized list.
# Globals (written, readonly): USER, HOST, UNITS, WARNING_THRESHOLD,
#                              CRITICAL_THRESHOLD, and MASTER when -m is given
# Globals (read): DEFAULT_USER, DEFAULT_HOST, DEFAULT_UNITS,
#                 DEFAULT_WARNING_THRESHOLD, DEFAULT_CRITICAL_THRESHOLD
# Arguments: the script's command-line words
# Exits:     0 after printing usage when --help / -H is given
#######################################
parse_arguments() {
  local arg
  # An array (not a string passed through eval) keeps each argument intact,
  # never executes argument text as shell code, and cannot be polluted by an
  # inherited variable.
  local -a normalized=()
  local host user units warning_threshold critical_threshold master
  local OPTION
  local OPTIND=1

  for arg in "$@"; do
    case "$arg" in
      --host)     normalized+=(-h) ;;
      --master)   normalized+=(-m) ;;
      --user)     normalized+=(-U) ;;
      --units)    normalized+=(-x) ;;
      --warning)  normalized+=(-w) ;;
      --critical) normalized+=(-c) ;;
      --help)     normalized+=(-H) ;;
      --verbose)  normalized+=(-v) ;;
      *)          normalized+=("$arg") ;;
    esac
  done

  set -- "${normalized[@]}"

  while getopts "h:m:U:x:w:c:Hv" OPTION; do
    case $OPTION in
      v)
        set -x
        ;;
      H)
        usage
        exit
        ;;
      h)
        host=$OPTARG
        ;;
      m)
        # Collected locally so a repeated -m does not trip over readonly.
        master=$OPTARG
        ;;
      U)
        user=$OPTARG
        ;;
      x)
        units=$OPTARG
        ;;
      w)
        warning_threshold=$OPTARG
        ;;
      c)
        critical_threshold=$OPTARG
        ;;
    esac
  done

  # MASTER is only defined when -m was supplied; the required-argument check
  # reports its absence.
  if [[ -n "${master}" ]]; then
    readonly MASTER=${master}
  fi
  readonly USER=${user:-$DEFAULT_USER}
  readonly HOST=${host:-$DEFAULT_HOST}
  readonly UNITS=${units:-$DEFAULT_UNITS}
  readonly WARNING_THRESHOLD=${warning_threshold:-$DEFAULT_WARNING_THRESHOLD}
  readonly CRITICAL_THRESHOLD=${critical_threshold:-$DEFAULT_CRITICAL_THRESHOLD}
}

#######################################
# Abort when the mandatory master host was not supplied.
# Globals:   MASTER (read), STATE_UNKNOWN (read)
# Outputs:   a one-line hint on stdout when MASTER is empty
# Exits:     with the Nagios UNKNOWN status on missing input, so a usage
#            mistake is not reported as a replication WARNING (exit 1)
#######################################
check_required_arguments() {
  if [ -z "$MASTER" ]; then
    echo "pass master host in parameters via -m flag"
    exit "${STATE_UNKNOWN:-3}"
  fi
}

#######################################
# Translate the display unit into the divisor used to scale byte counts.
# Globals:   UNITS (read), STATE_UNKNOWN (read), DIVISOR (written, readonly)
# Outputs:   an error line plus the usage text on stdout for unsupported units
# Exits:     with the Nagios UNKNOWN status for unsupported units, so a usage
#            mistake is not reported as a replication WARNING (exit 1)
#######################################
normalize_units() {
  case "$UNITS" in
    KB)
      readonly DIVISOR=1024
      ;;
    MB)
      readonly DIVISOR=1048576
      ;;
    *)
      echo "Incorrect unit of measurement"
      usage
      exit "${STATE_UNKNOWN:-3}"
      ;;
  esac
}

#######################################
# Emit the Nagios status line with performance data, then terminate.
# Globals:   ERR, STATE_CRITICAL, UNITS, NODENAME, WARNING_THRESHOLD,
#            CRITICAL_THRESHOLD (all read)
# Arguments: $1 - status label (e.g. OK, WARNING, CRITICAL)
#            $2 - exit status
#            $3 - replication lag in bytes (may be empty)
#            $4 - time lag text (may be empty)
# Outputs:   one status line on stdout
# Exits:     always, with $2; the $ERR file is removed first
#######################################
result() {
  local label=$1
  local exit_code=$2
  local lag_bytes=$3
  local lag_time=$4
  local captured_error summary scaled_lag

  captured_error=$(cat "$ERR" 2>/dev/null)

  # A critical status with captured stderr reports the error text instead of
  # the lag figures.
  if [[ "${exit_code}" -eq "${STATE_CRITICAL}" && ! -z "${captured_error}" ]]; then
    summary="replication check error ${captured_error}"
  else
    scaled_lag=$(bytes_to_units $lag_bytes)
    summary="replication lag is ${scaled_lag}${UNITS} : time lag is ${lag_time}"
  fi

  printf '%s\n' "REPLICATION $label : ${NODENAME} $summary|repl=${lag_bytes},time_lag=${lag_time};${WARNING_THRESHOLD};${CRITICAL_THRESHOLD}"
  rm -f "$ERR"
  exit $exit_code
}

#######################################
# Run REPLICA_SQL against the replica host and print the result.
# Globals:   USER, HOST, REPLICA_SQL, ERR (all read)
# Outputs:   psql's stdout; psql's stderr is written to $ERR (truncating it)
# Returns:   psql's exit status. Wrapping the call in `echo $(...)` would
#            always return 0 and hide query failures from the caller.
#######################################
get_replica_current_xlog() {
  psql -U "$USER" -Atc "$REPLICA_SQL" -h "$HOST" 2>"$ERR"
}

#######################################
# Run MASTER_SQL against the master host and print the result.
# Globals:   USER, MASTER, MASTER_SQL, ERR (all read)
# Outputs:   psql's stdout; psql's stderr is written to $ERR (truncating it)
# Returns:   psql's exit status. Wrapping the call in `echo $(...)` would
#            always return 0 and hide query failures from the caller.
#######################################
get_master_current_xlog() {
  psql -U "$USER" -Atc "$MASTER_SQL" -h "$MASTER" 2>"$ERR"
}

#######################################
# Run REPLICA_TIME_LAG against the replica host and print the result.
# Globals:   USER, HOST, REPLICA_TIME_LAG, ERR (all read)
# Outputs:   psql's stdout; psql's stderr is written to $ERR (truncating it)
# Returns:   psql's exit status. Wrapping the call in `echo $(...)` would
#            always return 0 and hide query failures from the caller.
#######################################
check_replica_time_lag() {
  psql -U "${USER}" -Atc "${REPLICA_TIME_LAG}" -h "${HOST}" 2>"${ERR}"
}

#######################################
# Report CRITICAL when the given exit status signals a failure.
# Globals:   STATE_CRITICAL (read)
# Arguments: $1 - exit status of the command that just ran
# Returns:   0 when $1 is zero; otherwise hands over to result
#######################################
check_errors() {
  local exit_status=$1

  # Guard clause: nothing to report for a zero status.
  [[ "${exit_status}" -eq 0 ]] && return 0

  result "CRITICAL" "$STATE_CRITICAL"
}

#######################################
# Convert a WAL location of the form "<hex logid>/<hex offset>" into a byte
# position.
# http://eulerto.blogspot.com/2011/11/understanding-wal-nomenclature.html
# Arguments: $1 - location string, e.g. 0/3000060
# Outputs:   the byte position on stdout
# Returns:   0 on success; 1 (printing nothing) when $1 is empty or not two
#            hexadecimal numbers separated by a slash, instead of feeding
#            garbage into the arithmetic expansion
#######################################
xlog_to_bytes() {
  local location=$1

  if [[ ! "${location}" =~ ^[0-9A-Fa-f]+/[0-9A-Fa-f]+$ ]]; then
    return 1
  fi

  local logid="${location%%/*}"
  local offset="${location##*/}"
  # NOTE(review): 0xFF000000 bytes per log id follows the article linked
  # above — confirm it matches the PostgreSQL version being monitored.
  echo $((0xFF000000 * 0x$logid + 0x$offset))
}

#######################################
# Scale a byte count by the configured divisor (integer division).
# Globals:   DIVISOR (read)
# Arguments: $1 - byte count; may be empty
# Outputs:   the scaled value, or an error marker when $1 is empty
#######################################
bytes_to_units() {
  local byte_count=$1

  # Guard clause for the "no measurement" case.
  if [[ -z "${byte_count}" ]]; then
    echo "ERROR: NO DATA AVAILABLE"
    return
  fi

  echo $(( $byte_count / $DIVISOR ))
}

#######################################
# Entry point: parse options, query both servers, report replication lag.
# Globals:   ARGS, ERR, STATE_OK, STATE_WARNING, STATE_CRITICAL,
#            WARNING_THRESHOLD, CRITICAL_THRESHOLD (all read)
# Arguments: optional command-line words; when none are passed the flattened
#            ARGS string is split and used instead
# Exits:     through result with the matching Nagios status
#######################################
main() {
  if [ $# -gt 0 ]; then
    parse_arguments "$@"
  else
    # ARGS is a single flattened string; splitting it is intentional.
    parse_arguments $ARGS
  fi
  check_required_arguments
  normalize_units

  # Declared separately from the assignments: `local var=$(cmd)` would make
  # $? the status of `local` (always 0) and hide a failing query.
  local replica_xlog replica_bytes master_xlog master_bytes diff time_lag

  replica_xlog=$(get_replica_current_xlog)
  check_errors $?
  if [ -z "${replica_xlog}" ]; then
    # Keep any error text the query already left in $ERR.
    [ -s "$ERR" ] || echo -n "Unable to find replica XLOG replay location" > "$ERR"
    result "CRITICAL" "$STATE_CRITICAL"
  fi
  replica_bytes=$(xlog_to_bytes "${replica_xlog}")

  master_xlog=$(get_master_current_xlog)
  check_errors $?
  if [ -z "${master_xlog}" ]; then
    # Without this check an empty master position would yield a negative
    # difference and be reported as OK.
    [ -s "$ERR" ] || echo -n "Unable to find master XLOG location" > "$ERR"
    result "CRITICAL" "$STATE_CRITICAL"
  fi
  master_bytes=$(xlog_to_bytes "${master_xlog}")

  # Calculate xlog diff in bytes
  diff=$((master_bytes - replica_bytes))

  time_lag=$(check_replica_time_lag)

  # time_lag is quoted so values containing spaces reach result intact.
  if [ "$diff" -ge "$CRITICAL_THRESHOLD" ]; then
    result "CRITICAL" "$STATE_CRITICAL" "$diff" "$time_lag"
  elif [ "$diff" -ge "$WARNING_THRESHOLD" ]; then
    result "WARNING" "$STATE_WARNING" "$diff" "$time_lag"
  else
    result "OK" "$STATE_OK" "$diff" "$time_lag"
  fi

  rm -f "$ERR"
}

# Pass the script's positional parameters through to main.
main "$@"
125 changes: 125 additions & 0 deletions files/default/plugins/check_sidekiq_queue
@@ -0,0 +1,125 @@
#!/bin/bash
# ========================================================================================
# Sidekiq Queue Size Nagios Check
#
# (c) Wanelo Inc, Distributed under Apache License
#
# Usage:
# To check a regular queue:
# ./check_sidekiq_queue [ -h <host> ] [ -a <password> ] [ -q <queue> ] [ -n <namespace> ] [ -d <redis-db> ] [-w <warn_count>] [-c <critical_count>]
# Eg: ./check_sidekiq_queue -w 500 -c 2000 # warning at 500 or more queue entries, critical at 2000 or more
#
# To check schedule or retry (system) queue:
# ./check_sidekiq_queue [ -h <host> ] [ -a <password> ] [ -s <schedule|retry> ] [ -n <namespace> ] [ -d <redis-db> ] [-w <warn_count>] [-c <critical_count>]
#
# ========================================================================================

# Exit statuses understood by Nagios
STATE_OK=0 STATE_WARNING=1 STATE_CRITICAL=2 STATE_UNKNOWN=3

# Queue-length thresholds used when -w / -c are not given
WARNING_THRESHOLD=500 CRITICAL_THRESHOLD=1000

# Queue selection: a named queue by default, or a system set when -s is given
QUEUE="default" SYSTEM="" NAMESPACE=""

# Redis connection settings
HOST="127.0.0.1" PASS="" DB=0

#######################################
# Parse command-line options into the global settings.
# Globals (written): DB, HOST, PASS, QUEUE, NAMESPACE, SYSTEM,
#                    WARNING_THRESHOLD, CRITICAL_THRESHOLD
# Globals (read):    STATE_UNKNOWN
# Arguments: the script's command-line words
# Exits:     with the Nagios UNKNOWN status on an unrecognised option, or on
#            an option given without a value (otherwise the empty value would
#            break the numeric comparisons later on)
#######################################
parse_cli_arguments() {
  while [ $# -gt 0 ]; do
    # Every recognised option takes exactly one value.
    case "$1" in
      -d | --db | -h | --hostname | -a | --password | -q | --queue | \
      -n | --namespace | -s | --system | -w | --warning | -c | --critical)
        if [ $# -lt 2 ]; then
          echo "Missing value for argument: $1"
          exit "${STATE_UNKNOWN:-3}"
        fi
        ;;
      *)
        echo "Unknown argument: $1"
        exit "${STATE_UNKNOWN:-3}"
        ;;
    esac

    case "$1" in
      -d | --db)        DB=$2 ;;
      -h | --hostname)  HOST=$2 ;;
      -a | --password)  PASS=$2 ;;
      -q | --queue)     QUEUE=$2 ;;
      -n | --namespace) NAMESPACE=$2 ;;
      -s | --system)    SYSTEM=$2 ;;
      -w | --warning)   WARNING_THRESHOLD=$2 ;;
      -c | --critical)  CRITICAL_THRESHOLD=$2 ;;
    esac
    # Safe: at least two words are known to be present at this point.
    shift 2
  done
}

# Parse parameters
parse_cli_arguments "$@"

PATH=/opt/local/bin:$PATH
NODENAME=$HOSTNAME

# Scratch file receiving redis-cli's stderr. Prefer an unpredictable name from
# mktemp (created empty); fall back to the historical PID-based path, clearing
# any stale file, only when mktemp is unavailable. The path is kept under /tmp
# (no whitespace) because other code expands $ERR unquoted.
if ! ERR=$(mktemp /tmp/redis-cli.error.XXXXXX 2>/dev/null); then
  ERR=/tmp/redis-cli.error.$$
  rm -f "$ERR"
fi

# Remove the scratch file on every exit path from here on.
trap 'rm -f "$ERR"' EXIT

#######################################
# Emit the Nagios status line with performance data, then terminate.
# Globals:   NODENAME, QUEUE, QUEUE_SIZE, WARNING_THRESHOLD,
#            CRITICAL_THRESHOLD, ERR (all read)
# Arguments: $1 - status label (e.g. OK, WARNING, CRITICAL)
#            $2 - exit status
# Outputs:   one status line on stdout
# Exits:     always, with $2; the $ERR file is removed first
#######################################
result() {
  local label=$1
  local exit_code=$2
  local perfdata="sidekiq_queue_${QUEUE}=${QUEUE_SIZE};${WARNING_THRESHOLD};${CRITICAL_THRESHOLD}"

  printf '%s\n' "SIDEKIQ ${label} : ${NODENAME} ${QUEUE_SIZE} on ${QUEUE}|${perfdata}"
  rm -f $ERR
  exit $exit_code
}

# Reject contradictory or unsupported queue selections before querying Redis.
# (The description is passed without a trailing comma so it does not leak
# into the status line.)
if [ "$QUEUE" != "default" ] && [ -n "$SYSTEM" ]; then
  result "CRITICAL invalid usage: pass -q or -s but not both" $STATE_CRITICAL
fi

if [ -n "$SYSTEM" ] && [ "$SYSTEM" != "schedule" ] && [ "$SYSTEM" != "retry" ]; then
  result "CRITICAL invalid usage: -s expect one of schedule or retry" $STATE_CRITICAL
fi

# Optional authentication arguments, held in an array so a password containing
# whitespace stays a single word.
AUTH_ARGS=()
if [ -n "$PASS" ]; then
  AUTH_ARGS=(-a "$PASS")
fi

if [ -n "$NAMESPACE" ]; then
  NAMESPACE="$NAMESPACE:"
fi

# System sets are sized with zcard, regular queues with llen.
if [ -n "$SYSTEM" ]; then
  REDIS_REPLY=$(redis-cli -h "$HOST" "${AUTH_ARGS[@]}" -n "$DB" zcard "${NAMESPACE}${SYSTEM}" 2>"$ERR")
  QUEUE=$SYSTEM
else
  REDIS_REPLY=$(redis-cli -h "$HOST" "${AUTH_ARGS[@]}" -n "$DB" llen "${NAMESPACE}queue:${QUEUE}" 2>"$ERR")
fi
QUEUE_SIZE=$(printf '%s\n' "$REDIS_REPLY" | cut -d " " -f 1)

# Anything redis-cli wrote to stderr is reported as the failure reason.
if [ -s "$ERR" ]; then
  QUEUE_SIZE=$(cat "$ERR")
  result "CRITICAL" $STATE_CRITICAL
fi

# A reply that is not a plain non-negative integer (for example an error
# message printed on stdout) must not reach the numeric comparisons below:
# they would fail and fall through to OK.
case "$QUEUE_SIZE" in
  '' | *[!0-9]*)
    QUEUE_SIZE=${REDIS_REPLY:-no reply from redis-cli}
    result "CRITICAL" $STATE_CRITICAL
    ;;
esac

if [ "$QUEUE_SIZE" -ge "$CRITICAL_THRESHOLD" ]; then
  result "CRITICAL" $STATE_CRITICAL
elif [ "$QUEUE_SIZE" -ge "$WARNING_THRESHOLD" ]; then
  result "WARNING" $STATE_WARNING
else
  result "OK" $STATE_OK
fi

# ensure that output from stderr is cleaned up
rm -f "$ERR"
2 changes: 1 addition & 1 deletion recipes/client_package.rb
Expand Up @@ -24,5 +24,5 @@
nagios-plugins-standard
nagios-plugins-contrib
}.each do |pkg|
package pkg
package pkg unless pkg == 'nagios-plugins-contrib' && node.platform_version == '12.04'
end

0 comments on commit b355f75

Please sign in to comment.