forked from sous-chefs/nagios
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Don't try to use nagios-plugins-contrib package in ubuntu 12.04, and …
…add useful plugins
- Loading branch information
1 parent
b013eb1
commit b355f75
Showing
3 changed files
with
355 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,229 @@ | ||
#!/bin/bash | ||
# ======================================================================================== | ||
# Postgres replication lag nagios check using psql and bash. | ||
# | ||
# 2013 Wanelo Inc, Apache License. | ||
# This script expects psql to be in the PATH. | ||
# | ||
# Usage: ./check_postgres_replication [ -h <host> ] [ -m <master> ] [ -U user ] [ -x <units> ] | ||
# [-w <warn_bytes>] [-c <critical_bytes>] | ||
# -h --host replica host (default 127.0.0.1) | ||
# -m --master master fqdn or ip (required) | ||
# -U --user database user (default postgres) | ||
# -x --units units of measurement to display (KB or MB, default MB) | ||
# -w --warning warning threshold in bytes (default 10485760, i.e. 10MB) | ||
# -c --critical critical threshold in bytes (default 15728640, i.e. 15MB) | ||
# ======================================================================================== | ||
|
||
# Nagios return codes
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# Command-line arguments flattened into one space-separated string; main()
# hands this string to parse_arguments.
readonly ARGS="$*"

# Default thresholds, expressed in bytes of xlog difference (10 MiB / 15 MiB).
readonly DEFAULT_WARNING_THRESHOLD=10485760
readonly DEFAULT_CRITICAL_THRESHOLD=15728640

readonly DEFAULT_HOST="127.0.0.1"
readonly DEFAULT_USER=postgres
readonly DEFAULT_UNITS=MB

readonly PATH=/opt/local/bin:${PATH}

# Node name printed in the plugin output. /etc/nodename is not present on
# every platform (Ubuntu, for one), so fall back to bash's HOSTNAME and then
# to `uname -n` instead of ending up with an empty name.
if [[ -r /etc/nodename ]]; then
  NODENAME=$(</etc/nodename)
else
  NODENAME=${HOSTNAME:-$(uname -n)}
fi
readonly NODENAME

readonly MASTER_SQL="SELECT pg_current_xlog_location()"
readonly REPLICA_SQL="SELECT pg_last_xlog_replay_location()"
readonly REPLICA_TIME_LAG="select now() - pg_last_xact_replay_timestamp()"

# Scratch file that receives psql's stderr. Created with mktemp so the name is
# not predictable, and removed on every exit path by the EXIT trap. If mktemp
# is unavailable, fall back to the historical PID-based name.
ERR=$(mktemp "${TMPDIR:-/tmp}/repl_check.XXXXXX") || ERR=/tmp/repl_chec.$$
readonly ERR
trap 'rm -f -- "$ERR"' EXIT
|
||
# Print the command-line help on stdout.
# The -w/-c values are compared directly against the xlog difference in bytes,
# so the help states the thresholds in bytes rather than percent or MB.
usage() {
  cat <<'EOF'
Usage: ./check_postgres_replication [ -h <host> ] [ -m <master> ] [ -U user ] [ -x <units> ]
                                    [-w <warn_bytes>] [-c <critical_bytes>]
  -h --host       replica host (default 127.0.0.1)
  -m --master     master fqdn or ip (required)
  -U --user       database user (default postgres)
  -x --units      units of measurement to display (KB or MB, default MB)
  -w --warning    warning threshold in bytes (default 10485760 = 10MB)
  -c --critical   critical threshold in bytes (default 15728640 = 15MB)
  --help          show this message
  --verbose       trace execution (set -x)
EOF
}
|
||
# Parse parameters | ||
#######################################
# Parse command-line options and publish the effective settings.
# Long options are first translated to their short equivalents, then handed
# to getopts. The translated list is kept in an array (no eval), so values
# containing spaces, quotes or `$` reach getopts unchanged and are never
# re-evaluated by the shell.
# Globals written (readonly): MASTER (only when -m is given), USER, HOST,
#   UNITS, WARNING_THRESHOLD, CRITICAL_THRESHOLD
# Globals read: DEFAULT_USER, DEFAULT_HOST, DEFAULT_UNITS,
#   DEFAULT_WARNING_THRESHOLD, DEFAULT_CRITICAL_THRESHOLD, STATE_UNKNOWN
# Arguments: the script's command-line arguments
# Exits: 0 after printing usage for --help; STATE_UNKNOWN (3) on an
#   unrecognised option or a missing option value.
#######################################
parse_arguments() {
  local -a normalized=()
  local arg
  for arg in "$@"; do
    case "$arg" in
      --host)     normalized+=(-h) ;;
      --master)   normalized+=(-m) ;;
      --user)     normalized+=(-U) ;;
      --units)    normalized+=(-x) ;;
      --warning)  normalized+=(-w) ;;
      --critical) normalized+=(-c) ;;
      --help)     normalized+=(-H) ;;
      --verbose)  normalized+=(-v) ;;
      *)          normalized+=("$arg") ;;
    esac
  done
  set -- "${normalized[@]}"

  local host="" master="" user="" units=""
  local warning_threshold="" critical_threshold=""
  # OPTIND is function-local so parsing always starts at the first argument.
  local OPTION OPTARG OPTIND=1
  while getopts "h:m:U:x:w:c:Hv" OPTION; do
    case "$OPTION" in
      v) set -x ;;
      H)
        usage
        exit
        ;;
      h) host=$OPTARG ;;
      m) master=$OPTARG ;;
      U) user=$OPTARG ;;
      x) units=$OPTARG ;;
      w) warning_threshold=$OPTARG ;;
      c) critical_threshold=$OPTARG ;;
      *)
        # getopts has already reported the offending option on stderr.
        usage
        exit "${STATE_UNKNOWN:-3}"
        ;;
    esac
  done

  # MASTER has no default; it stays unset unless -m/--master was supplied.
  if [[ -n "$master" ]]; then
    readonly MASTER="$master"
  fi
  readonly USER="${user:-$DEFAULT_USER}"
  readonly HOST="${host:-$DEFAULT_HOST}"
  readonly UNITS="${units:-$DEFAULT_UNITS}"
  readonly WARNING_THRESHOLD="${warning_threshold:-$DEFAULT_WARNING_THRESHOLD}"
  readonly CRITICAL_THRESHOLD="${critical_threshold:-$DEFAULT_CRITICAL_THRESHOLD}"
}
|
||
# Abort when the mandatory master host (-m) was not supplied.
# Globals read: MASTER, STATE_UNKNOWN
# Exits with STATE_UNKNOWN (3): a usage error is not a WARNING, which is what
# a plain `exit 1` would signal to Nagios.
check_required_arguments() {
  if [[ -z "${MASTER:-}" ]]; then
    echo "pass master host in parameters via -m flag"
    exit "${STATE_UNKNOWN:-3}"
  fi
}
|
||
# Translate the display unit into the byte divisor used for reporting.
# Globals read: UNITS, STATE_UNKNOWN
# Globals written (readonly): DIVISOR (1024 for KB, 1048576 for MB)
# Exits with STATE_UNKNOWN (3) after printing usage when UNITS is neither KB
# nor MB; an invalid argument must not be reported as a Nagios WARNING.
normalize_units() {
  case "$UNITS" in
    KB) readonly DIVISOR=1024 ;;
    MB) readonly DIVISOR=1048576 ;;
    *)
      echo "Incorrect unit of measurement"
      usage
      exit "${STATE_UNKNOWN:-3}"
      ;;
  esac
}
|
||
#######################################
# Print the Nagios status line (with perfdata) and terminate the script.
# Globals read: ERR, STATE_CRITICAL, UNITS, NODENAME, WARNING_THRESHOLD,
#   CRITICAL_THRESHOLD
# Arguments:
#   $1  - state label (OK / WARNING / CRITICAL)
#   $2  - exit status
#   $3  - xlog difference in bytes (may be omitted on errors)
#   $4… - replica time lag; all remaining words are joined, so an interval
#         such as "1 day 00:00:05" survives an unquoted call site
# Outputs: one status line on stdout
# Exits: with $2
#######################################
result() {
  local description=$1
  local status=$2
  local diff=$3
  local time_lag="${*:4}"

  local error=""
  if [[ -r "$ERR" ]]; then
    error=$(<"$ERR")
    # Nagios reads status text and perfdata from the first output line, so a
    # multi-line psql error is folded onto one line.
    error=${error//$'\n'/ }
  fi

  local message
  if [[ "$status" -eq "$STATE_CRITICAL" && -n "$error" ]]; then
    message="replication check error ${error}"
  else
    message="replication lag is $(bytes_to_units "$diff")${UNITS} : time lag is ${time_lag}"
  fi

  # Perfdata must be label=value[UOM];warn;crit. The thresholds apply to the
  # byte difference, so they are attached to that metric; "U" marks a value
  # that could not be determined.
  local perfdata="repl=U;${WARNING_THRESHOLD};${CRITICAL_THRESHOLD}"
  if [[ -n "$diff" ]]; then
    perfdata="repl=${diff}B;${WARNING_THRESHOLD};${CRITICAL_THRESHOLD}"
  fi

  echo "REPLICATION $description : ${NODENAME} $message|${perfdata}"
  rm -f -- "$ERR"
  exit "${status:-3}"
}
|
||
# Query the replica for its last replayed xlog location.
# Globals read: USER, HOST, REPLICA_SQL, ERR
# Outputs: the location on stdout; psql's stderr is written to $ERR
# Returns: psql's exit status. psql is run directly rather than inside
#   `echo $(...)`, which always returned 0 and hid connection failures.
get_replica_current_xlog() {
  psql -U "$USER" -Atc "$REPLICA_SQL" -h "$HOST" 2>"$ERR"
}
|
||
# Query the master for its current xlog write location.
# Globals read: USER, MASTER, MASTER_SQL, ERR
# Outputs: the location on stdout; psql's stderr is written to $ERR
# Returns: psql's exit status. psql is run directly rather than inside
#   `echo $(...)`, which always returned 0 and hid connection failures.
get_master_current_xlog() {
  psql -U "$USER" -Atc "$MASTER_SQL" -h "$MASTER" 2>"$ERR"
}
|
||
# Ask the replica how long ago its last replayed transaction was committed.
# Globals read: USER, HOST, REPLICA_TIME_LAG, ERR
# Outputs: the interval on stdout; psql's stderr is written to $ERR
# Returns: psql's exit status. psql is run directly rather than inside
#   `echo $(...)`, which always returned 0 and hid connection failures.
check_replica_time_lag() {
  psql -U "$USER" -Atc "$REPLICA_TIME_LAG" -h "$HOST" 2>"$ERR"
}
|
||
# Report CRITICAL (via result, which exits) when the given exit status is
# non-zero.
# Arguments: $1 - exit status of the preceding command; a missing argument is
#   treated as 0 instead of producing a `[` syntax error.
# Globals read: STATE_CRITICAL
check_errors() {
  local rc=${1:-0}
  if [[ "$rc" -ne 0 ]]; then
    result "CRITICAL" "$STATE_CRITICAL"
  fi
}
|
||
# Convert an xlog location of the form "<logid>/<offset>" (both hexadecimal)
# into a byte position.
# http://eulerto.blogspot.com/2011/11/understanding-wal-nomenclature.html
# Arguments: $1 - xlog location, e.g. "0/3000060"
# Outputs: the byte position on stdout
# Returns: 1, printing nothing, when $1 is empty or not in that format; the
#   arithmetic would otherwise abort with a bash syntax error.
# NOTE(review): each log id is weighted as 0xFF000000 bytes; confirm this
# matches the PostgreSQL version being monitored.
xlog_to_bytes() {
  local location=$1
  if [[ ! "$location" =~ ^[0-9A-Fa-f]+/[0-9A-Fa-f]+$ ]]; then
    return 1
  fi
  local logid=${location%%/*}
  local offset=${location##*/}
  echo $(( 0xFF000000 * 0x$logid + 0x$offset ))
}
|
||
# Scale a byte count down to the display unit chosen by DIVISOR.
# Arguments: $1 - byte count (may be empty)
# Outputs: the integer quotient, or a fixed error text when $1 is empty
bytes_to_units() {
  local byte_count=$1
  if [ -n "$byte_count" ]; then
    echo $(( $byte_count / $DIVISOR ))
    return
  fi
  echo "ERROR: NO DATA AVAILABLE"
}
|
||
#######################################
# Entry point: compare the master and replica xlog positions and report the
# replication lag as a Nagios state.
# Globals read: ARGS, ERR, WARNING_THRESHOLD, CRITICAL_THRESHOLD, STATE_OK,
#   STATE_WARNING, STATE_CRITICAL
# Exits: always through result (or check_errors, which calls result).
#######################################
main() {
  # ARGS is a single flattened string, so it is split on whitespace here.
  # shellcheck disable=SC2086
  parse_arguments $ARGS
  check_required_arguments
  normalize_units

  # Declared separately from the assignments below: `local x=$(cmd)` would
  # make $? the status of `local` (always 0) and hide a failed query.
  local replica_xlog master_xlog time_lag
  local replica_bytes master_bytes diff

  replica_xlog=$(get_replica_current_xlog)
  check_errors $?
  if [[ -z "$replica_xlog" ]]; then
    echo -n "Unable to find replica XLOG replay location" > "$ERR"
    result "CRITICAL" "$STATE_CRITICAL"
  fi
  replica_bytes=$(xlog_to_bytes "$replica_xlog")

  master_xlog=$(get_master_current_xlog)
  check_errors $?
  if [[ -z "$master_xlog" ]]; then
    echo -n "Unable to find master XLOG location" > "$ERR"
    result "CRITICAL" "$STATE_CRITICAL"
  fi
  master_bytes=$(xlog_to_bytes "$master_xlog")

  # Calculate xlog diff in bytes
  diff=$(( master_bytes - replica_bytes ))

  time_lag=$(check_replica_time_lag)

  # Output response; the time lag is quoted so a multi-word interval reaches
  # result as one argument.
  if [[ "$diff" -ge "$CRITICAL_THRESHOLD" ]]; then
    result "CRITICAL" "$STATE_CRITICAL" "$diff" "$time_lag"
  elif [[ "$diff" -ge "$WARNING_THRESHOLD" ]]; then
    result "WARNING" "$STATE_WARNING" "$diff" "$time_lag"
  else
    result "OK" "$STATE_OK" "$diff" "$time_lag"
  fi
}
|
||
main |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
#!/bin/bash | ||
# ======================================================================================== | ||
# Sidekiq Queue Size Nagios Check | ||
# | ||
# (c) Wanelo Inc, Distributed under Apache License | ||
# | ||
# Usage: | ||
# To check a regular queue: | ||
# ./check_sidekiq_queue [ -h <host> ] [ -a <password> ] [ -q <queue> ] [ -n <namespace> ] [ -d <redis-db> ] [ -w <warn_size> ] [ -c <critical_size> ] | ||
# Eg: ./check_sidekiq_queue -w 500 -c 2000 # warning at a queue size of 500 or more, critical at 2000 or more | ||
# | ||
# To check schedule or retry (system) queue: | ||
# ./check_sidekiq_queue [ -h <host> ] [ -a <password> ] [ -s <schedule|retry> ] [ -n <namespace> ] [ -d <redis-db> ] [ -w <warn_size> ] [ -c <critical_size> ] | ||
# | ||
# ======================================================================================== | ||
|
||
# Nagios return codes. These never change, so they are marked readonly.
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# Defaults for every setting the command line can override.
WARNING_THRESHOLD=500     # queue size at which WARNING is reported
CRITICAL_THRESHOLD=1000   # queue size at which CRITICAL is reported
QUEUE="default"           # queue name used in the queue:<name> key
SYSTEM=""                 # "schedule" or "retry"; empty means a regular queue
NAMESPACE=""              # optional key prefix
HOST="127.0.0.1"          # host passed to redis-cli -h
PASS=""                   # password passed to redis-cli -a
DB=0                      # database number passed to redis-cli -n
|
||
#######################################
# Parse parameters.
# Every recognised flag takes exactly one value. A flag given without a value
# is rejected instead of silently storing an empty setting.
# Globals written: DB, HOST, PASS, QUEUE, NAMESPACE, SYSTEM,
#   WARNING_THRESHOLD, CRITICAL_THRESHOLD
# Globals read: STATE_UNKNOWN
# Arguments: the script's command-line arguments
# Exits: STATE_UNKNOWN on an unknown flag or a missing value
#######################################
parse_parameters() {
  local flag target
  while [ $# -gt 0 ]; do
    flag=$1
    shift
    case "$flag" in
      -d | --db)        target=DB ;;
      -h | --hostname)  target=HOST ;;
      -a | --password)  target=PASS ;;
      -q | --queue)     target=QUEUE ;;
      -n | --namespace) target=NAMESPACE ;;
      -s | --system)    target=SYSTEM ;;
      -w | --warning)   target=WARNING_THRESHOLD ;;
      -c | --critical)  target=CRITICAL_THRESHOLD ;;
      *)
        echo "Unknown argument: $flag"
        exit "${STATE_UNKNOWN:-3}"
        ;;
    esac
    if [ $# -eq 0 ]; then
      echo "Missing value for argument: $flag"
      exit "${STATE_UNKNOWN:-3}"
    fi
    # Assign by name; the value is stored verbatim, never re-parsed.
    printf -v "$target" '%s' "$1"
    shift
  done
}

parse_parameters "$@"
|
||
PATH=/opt/local/bin:$PATH
NODENAME=$HOSTNAME

# Scratch file that receives redis-cli's stderr. mktemp gives it an
# unpredictable name and the EXIT trap removes it on every exit path. If
# mktemp is unavailable, fall back to the historical PID-based name and clear
# any stale file of that name.
if ! ERR=$(mktemp "${TMPDIR:-/tmp}/redis-cli.error.XXXXXX"); then
  ERR=/tmp/redis-cli.error.$$
  rm -f -- "$ERR"
fi
trap 'rm -f -- "$ERR"' EXIT
|
||
#######################################
# Print the Nagios status line and terminate the script.
# Globals read: NODENAME, QUEUE, QUEUE_SIZE, WARNING_THRESHOLD,
#   CRITICAL_THRESHOLD, ERR
# Arguments:
#   $1 - state label / description (e.g. OK, WARNING, CRITICAL)
#   $2 - exit status
# Outputs: one status line on stdout. Perfdata is appended only when
#   QUEUE_SIZE is a plain number; on error paths QUEUE_SIZE holds a message,
#   which is not a valid perfdata value.
# Exits: with $2
#######################################
result() {
  local description=$1
  local status=$2
  local line="SIDEKIQ ${description} : ${NODENAME} ${QUEUE_SIZE} on ${QUEUE}"
  if [[ "$QUEUE_SIZE" =~ ^[0-9]+$ ]]; then
    line+="|sidekiq_queue_${QUEUE}=${QUEUE_SIZE};${WARNING_THRESHOLD};${CRITICAL_THRESHOLD}"
  fi
  echo "$line"
  rm -f -- "$ERR"
  exit "${status:-3}"
}
|
||
# Map a queue size onto a Nagios state label.
# Arguments: $1 - queue size, $2 - warning threshold, $3 - critical threshold
# Outputs: CRITICAL when size >= critical, WARNING when size >= warning,
#   otherwise OK
classify_queue_size() {
  local size=$1 warn=$2 crit=$3
  if [ "$size" -ge "$crit" ]; then
    echo "CRITICAL"
  elif [ "$size" -ge "$warn" ]; then
    echo "WARNING"
  else
    echo "OK"
  fi
}

# -q and -s are mutually exclusive.
if [ "$QUEUE" != "default" ] && [ -n "$SYSTEM" ]; then
  result "CRITICAL invalid usage: pass -q or -s but not both" "$STATE_CRITICAL"
fi

if [ -n "$SYSTEM" ] && [ "$SYSTEM" != "schedule" ] && [ "$SYSTEM" != "retry" ]; then
  result "CRITICAL invalid usage: -s expect one of schedule or retry" "$STATE_CRITICAL"
fi

# The password goes into an array so it reaches redis-cli as one argument
# even when it contains spaces or glob characters.
auth_args=()
if [ -n "$PASS" ]; then
  auth_args=(-a "$PASS")
fi

if [ -n "$NAMESPACE" ]; then
  NAMESPACE="$NAMESPACE:"
fi

# System queues are read with zcard, regular queues with llen.
if [ -n "$SYSTEM" ]; then
  redis_command=(zcard "${NAMESPACE}${SYSTEM}")
  QUEUE=$SYSTEM
else
  redis_command=(llen "${NAMESPACE}queue:${QUEUE}")
fi
redis_reply=$(redis-cli -h "$HOST" "${auth_args[@]}" -n "$DB" "${redis_command[@]}" 2>"$ERR")
# Keep only the first space-separated field of the reply.
QUEUE_SIZE=${redis_reply%% *}

# Anything on stderr is reported as the failure reason.
if [ -s "$ERR" ]; then
  QUEUE_SIZE=$(cat "$ERR")
  result "CRITICAL" "$STATE_CRITICAL"
fi

# A reply that is not a number must not reach the comparisons below: the
# integer tests would fail on it and the check would fall through to OK.
if ! [[ "$QUEUE_SIZE" =~ ^[0-9]+$ ]]; then
  QUEUE_SIZE=${redis_reply:-no reply from redis-cli}
  result "CRITICAL" "$STATE_CRITICAL"
fi

case "$(classify_queue_size "$QUEUE_SIZE" "$WARNING_THRESHOLD" "$CRITICAL_THRESHOLD")" in
  CRITICAL) result "CRITICAL" "$STATE_CRITICAL" ;;
  WARNING)  result "WARNING" "$STATE_WARNING" ;;
  *)        result "OK" "$STATE_OK" ;;
esac

# ensure that output from stderr is cleaned up
rm -f -- "$ERR"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters