Permalink
Browse files

Adding a pagerduty plugin to the nagios cookbook

  • Loading branch information...
1 parent 9cd7b28 commit c41ffb38c90337b978926af671ca85b2d0f6c149 @clumpidy clumpidy committed with jtimberman Nov 8, 2011
@@ -22,6 +22,8 @@
# limitations under the License.
#
+default['nagios']['pagerduty_key'] = ""
+
case node['platform']
when "ubuntu","debian"
set['nagios']['server']['install_method'] = 'package'
@@ -31,7 +31,7 @@
source "#{params[:name]}.cfg.erb"
mode 0644
variables params[:variables]
- notifies :reload, "service[nagios]"
+ notifies :restart, "service[nagios]"
backup 0
end
end
@@ -0,0 +1,280 @@
+#!/usr/bin/env perl
+
+
+# Nagios plugin that sends Nagios events to PagerDuty.
+#
+# Copyright (c) 2011, PagerDuty, Inc. <info@pagerduty.com>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of PagerDuty Inc nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PAGERDUTY INC BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+use Pod::Usage;
+use Getopt::Long;
+use Sys::Syslog;
+use HTTP::Request::Common qw(POST);
+use HTTP::Status qw(is_client_error);
+use LWP::UserAgent;
+use File::Path;
+use Fcntl qw(:flock);
+
+
+=head1 NAME
+
+pagerduty_nagios -- Send Nagios events to the PagerDuty alert system
+
+=head1 SYNOPSIS
+
+pagerduty_nagios enqueue [options]
+
+pagerduty_nagios flush [options]
+
+=head1 DESCRIPTION
+
+ This script passes events from Nagios to the PagerDuty alert system. It's
+ meant to be run as a Nagios notification plugin. For more details, please see
+ the PagerDuty Nagios integration docs at:
+ http://www.pagerduty.com/docs/nagios-integration.
+
+ When called in the "enqueue" mode, the script loads a Nagios notification out
+ of the environment and into the event queue. It then tries to flush the
+ queue by sending any enqueued events to the PagerDuty server. The script is
+ typically invoked in this mode from a Nagios notification handler.
+
+ When called in the "flush" mode, the script simply tries to send any enqueued
+ events to the PagerDuty server. This mode is typically invoked by cron. The
+ purpose of this mode is to retry any events that couldn't be sent to the
+ PagerDuty server for whatever reason when they were initially enqueued.
+
+=head1 OPTIONS
+
+ --api-base URL
+ The base URL used to communicate with PagerDuty. The default option here
+ should be fine, but adjusting it may make sense if your firewall doesn't
+ pass HTTPS traffic for some reason. See the PagerDuty Nagios integration
+ docs for details.
+
+ --field KEY=VALUE
+ Add this key-value pair to the event being passed to PagerDuty. The script
+ automatically gathers Nagios macros out of the environment, so there's no
+ need to specify these explicitly. This option can be repeated as many
+ times as necessary to pass multiple key-value pairs. This option is only
+ useful when an event is being enqueued.
+
+ --help
+ Display documentation for the script.
+
+ --queue-dir DIR
+ Path to the directory to use to store the event queue. By default, we use
+ /tmp/pagerduty_nagios.
+
+ --verbose
+ Turn on extra debugging information. Useful for debugging.
+
+=cut
+
+# This release tested on:
+# Debian Sarge (Perl 5.8.4)
+# Ubuntu 9.04 (Perl 5.10.0)
+
+my $opt_api_base = "http://events.pagerduty.com/nagios/2010-04-15"; # --api-base: URL prefix; "/create_event" is appended when posting
+my %opt_fields; # --field KEY=VALUE pairs merged into the event being enqueued
+my $opt_help; # --help: show the documentation via pod2usage
+my $opt_queue_dir = "/tmp/pagerduty_nagios"; # --queue-dir: directory holding the pd_*.txt event files and the lockfile
+my $opt_verbose; # --verbose: print requests/responses to STDERR and add "perror" to the syslog options
+
+
+# Return the names of the queued event files in $opt_queue_dir, oldest first.
+#
+# Only entries named pd_<timestamp>_<pid>.txt are returned.  They are sorted
+# numerically by the timestamp embedded in the name, with the file name as a
+# tie-breaker so events queued in the same second come back in a stable
+# order.  The returned names are relative to $opt_queue_dir.  If the
+# directory cannot be opened, the error is logged to syslog and the script
+# dies.
+sub get_queue_from_dir {
+    my $dh;
+
+    unless (opendir($dh, $opt_queue_dir)) {
+        # Capture the error text first; syslog() may overwrite $!.
+        my $err = "$!";
+        syslog(LOG_ERR, "opendir %s failed: %s", $opt_queue_dir, $err);
+        die $err;
+    }
+
+    my @files;
+    # Test with defined(): a bare readdir() in the loop condition stops early
+    # on an entry named "0" on the older Perls this script is tested with.
+    while (defined(my $f = readdir($dh))) {
+        next unless $f =~ /^pd_(\d+)_\d+\.txt$/;
+        push @files, [int($1), $f];
+    }
+
+    closedir($dh);
+
+    @files = sort { $a->[0] <=> $b->[0] or $a->[1] cmp $b->[1] } @files;
+    return map { $_->[1] } @files;
+}
+
+
+# Send every queued event to the PagerDuty server, oldest first.
+#
+# Each queue file holds one event as KEY=VALUE lines.  The event is POSTed to
+# "$opt_api_base/create_event".  A file is removed once the server accepts it
+# (success response) or rejects it with a client error.  On any other outcome
+# the file is kept and flushing stops, so the remaining events are retried,
+# in order, on the next run.
+#
+# Returns 1 if the whole queue was processed, 0 if flushing was deferred.
+# Dies (after logging to syslog) if a queue file cannot be opened.
+sub flush_queue {
+    my @files = get_queue_from_dir();
+    my $ua = LWP::UserAgent->new;
+
+    # It's not a big deal if we don't get the message through the first time.
+    # It will get sent the next time cron fires.
+    $ua->timeout(15);
+
+    # Use named loop variables: reading lines into $_ inside a foreach over
+    # $_ would overwrite the elements of @files through the loop alias.
+    foreach my $name (@files) {
+        my $filename = "$opt_queue_dir/$name";
+        my $fd;
+        my %event;
+
+        print STDERR "==== Now processing: $filename\n" if $opt_verbose;
+
+        unless (open($fd, "<", $filename)) {
+            # Capture the error text first; syslog() may overwrite $!.
+            my $err = "$!";
+            syslog(LOG_ERR, "open %s for read failed: %s", $filename, $err);
+            die $err;
+        }
+
+        while (defined(my $line = <$fd>)) {
+            chomp $line;
+            # Split on the first "=" only; values may contain "=" themselves.
+            my ($key, $value) = split("=", $line, 2);
+            $event{$key} = $value;
+        }
+
+        close($fd);
+
+        my $req = POST("$opt_api_base/create_event", \%event);
+
+        if ($opt_verbose) {
+            my $s = $req->as_string;
+            print STDERR "Request:\n$s\n";
+        }
+
+        my $resp = $ua->request($req);
+
+        if ($opt_verbose) {
+            my $s = $resp->as_string;
+            print STDERR "Response:\n$s\n";
+        }
+
+        if ($resp->is_success) {
+            syslog(LOG_INFO, "Nagios event in file %s ACCEPTED by the PagerDuty server.", $filename);
+            # Report a failed removal: otherwise the event is silently
+            # re-sent on every subsequent flush.
+            unlink($filename)
+                or syslog(LOG_WARNING, "unlink %s failed: %s", $filename, $!);
+        }
+        elsif (is_client_error($resp->code)) {
+            # The server will never accept this event, so drop it.
+            syslog(LOG_WARNING, "Nagios event in file %s REJECTED by the PagerDuty server. Server says: %s", $filename, $resp->content);
+            unlink($filename)
+                or syslog(LOG_WARNING, "unlink %s failed: %s", $filename, $!);
+        }
+        else {
+            # Something else went wrong.  Stop here so later events are not
+            # delivered ahead of this one.
+            syslog(LOG_WARNING, "Nagios event in file %s DEFERRED due to network/server problems.", $filename);
+            return 0;
+        }
+    }
+
+    # Everything that needed to be sent was sent.
+    return 1;
+}
+
+
+# Flush the queue while holding an exclusive lock on "$opt_queue_dir/lockfile".
+#
+# The lock serializes flushes, so that two invocations (for example a Nagios
+# notification and the cron job) never send the same queue at once.  The call
+# blocks until the lock is available.  The lock is dropped when the lock file
+# handle is closed after the flush.
+#
+# Returns whatever flush_queue() returns.  Dies (after logging to syslog) if
+# the lock file cannot be opened or locked.
+sub lock_and_flush_queue {
+    my $lock_filename = "$opt_queue_dir/lockfile";
+    my $lock_fd;
+
+    unless (open($lock_fd, ">", $lock_filename)) {
+        # Capture the error text first; syslog() may overwrite $!.
+        my $err = "$!";
+        syslog(LOG_ERR, "open %s for write failed: %s", $lock_filename, $err);
+        die $err;
+    }
+
+    unless (flock($lock_fd, LOCK_EX)) {
+        my $err = "$!";
+        syslog(LOG_ERR, "flock %s failed: %s", $lock_filename, $err);
+        die $err;
+    }
+
+    my $ret = flush_queue();
+
+    # Closing the handle releases the lock.
+    close($lock_fd);
+
+    return $ret;
+}
+
+
+# Write the current Nagios notification into the event queue.
+#
+# The event is built from every NAGIOS_* environment variable (with the
+# "NAGIOS_" prefix removed), then the --field KEY=VALUE pairs, which override
+# the environment, plus a fixed "pd_version" entry.  It is stored as
+# KEY=VALUE lines in "$opt_queue_dir/pd_<time>_<pid>.txt".
+#
+# The file is written under a temporary name and renamed into place, so a
+# flush running at the same time never sees (and sends, then deletes) a
+# half-written event.  Dies (after logging to syslog) if the file cannot be
+# created, written, or renamed.
+sub enqueue_event {
+    my %event;
+
+    # Scoop all the Nagios related stuff out of the environment.
+    while (my ($k, $v) = each %ENV) {
+        next unless $k =~ /^NAGIOS_(.*)$/;
+        $event{$1} = $v;
+    }
+
+    # Apply any other variables that were passed in.
+    %event = (%event, %opt_fields);
+
+    $event{"pd_version"} = "1.0";
+
+    # Right off the bat, enqueue the event.  Nothing time consuming should
+    # come before here (i.e. no locks or remote connections), because we want
+    # to make sure we get the event written out within the Nagios notification
+    # timeout.  If we get killed off after that, it isn't a big deal.
+
+    # Keep the directory out of the format string: a "%" in the path would
+    # otherwise be interpreted as a conversion.
+    my $stamp = sprintf("%u_%u", time(), $$);
+    my $filename = "$opt_queue_dir/pd_${stamp}.txt";
+    # The "tmp_" prefix keeps the file invisible to the queue scanner, which
+    # only picks up names starting with "pd_".
+    my $tmpname = "$opt_queue_dir/tmp_pd_${stamp}.txt";
+    my $fd;
+
+    unless (open($fd, ">", $tmpname)) {
+        # Capture the error text first; syslog() may overwrite $!.
+        my $err = "$!";
+        syslog(LOG_ERR, "open %s for write failed: %s", $tmpname, $err);
+        die $err;
+    }
+
+    # "=" can't occur in the keyname, and "\n" can't occur anywhere.
+    # (Nagios follows this already, so I think we're safe)
+    my $data = join("", map { "$_=$event{$_}\n" } keys %event);
+
+    # Buffered write errors may only surface at close(), so check both.
+    my $written = print $fd $data;
+    my $closed = close($fd);
+
+    unless ($written && $closed) {
+        my $err = "$!";
+        unlink($tmpname);
+        syslog(LOG_ERR, "write to %s failed: %s", $tmpname, $err);
+        die $err;
+    }
+
+    # Publish the complete event under its final name in a single step.
+    unless (rename($tmpname, $filename)) {
+        my $err = "$!";
+        unlink($tmpname);
+        syslog(LOG_ERR, "rename %s to %s failed: %s", $tmpname, $filename, $err);
+        die $err;
+    }
+}
+
+###########
+
+# Main program: parse the command line, set up logging and the queue
+# directory, then run the requested mode ("enqueue" or "flush").
+
+GetOptions("api-base=s" => \$opt_api_base,
+           "field=s%" => \%opt_fields,
+           "help" => \$opt_help,
+           "queue-dir=s" => \$opt_queue_dir,
+           "verbose" => \$opt_verbose
+          ) || pod2usage(2);
+
+# Handle --help before validating the mode, so that "--help" on its own shows
+# the documentation instead of being rejected as a usage error.
+pod2usage(-verbose => 3) if $opt_help;
+
+# Exactly one of the two modes must be given as the first argument.
+pod2usage(2) if @ARGV < 1 ||
+    (($ARGV[0] ne "enqueue") && ($ARGV[0] ne "flush"));
+
+# With --verbose, "perror" is added to the syslog options.
+my @log_mode = ("nofatal", "pid");
+push(@log_mode, "perror") if $opt_verbose;
+
+openlog("pagerduty_nagios", join(",", @log_mode), LOG_LOCAL0);
+
+# This function automatically terminates the program on things like permission
+# errors.
+mkpath($opt_queue_dir);
+
+if ($ARGV[0] eq "enqueue") {
+    # Queue the event first, then try to deliver everything that is pending.
+    enqueue_event();
+    lock_and_flush_queue();
+}
+elsif ($ARGV[0] eq "flush") {
+    lock_and_flush_queue();
+}
@@ -0,0 +1,24 @@
+
+
+package "libwww-perl"
+package "libcrypt-ssleay-perl"
+
+template "#{node['nagios']['config_dir']}/pagerduty_nagios.cfg" do
+ owner "nagios"
+ group "nagios"
+ mode 0644
+ source "pagerduty_nagios.cfg.erb"
+end
+
+cookbook_file "#{node['nagios']['plugin_dir']}/pagerduty_nagios.pl" do
@andreacampi

andreacampi Mar 4, 2012

Contributor

Won't execute[archive-default-nagios-object-definitions] (from the nagios::server recipe) blow this away every time?
This resource will end up changing on every run.

@clumpidy

clumpidy Mar 5, 2012

Contributor

I THINK I already changed this block to use "remote_file" instead of cookbook_file. That's what it seems to be in the current version of the opscode repo. Here's the commit: 5a8595a Let me know if I'm missing something.

@andreacampi

andreacampi Mar 5, 2012

Contributor

You did, sorry for the confusion.

The point is: a file named #{node['nagios']['plugin_dir']}/pagerduty_nagios.pl gets created.

On the next run, execute[archive-default-nagios-object-definitions] will do:

mv #{node['nagios']['config_dir']}/*_nagios*.cfg #{node['nagios']['conf_dir']}/dist

i.e. move away the file you just created. At which point the remote_file will run again, and so on and so forth :/

@clumpidy

clumpidy Mar 5, 2012

Contributor

Ah. Thanks for the clarification.

I don't think this should be a problem unless you set node['nagios']['plugin_dir'] to be the same as node['nagios']['config_dir']. By default node['nagios']['plugin_dir'] is set to /usr/lib/nagios/plugins in nagios/attributes/default.rb and node['nagios']['config_dir'] is set to /etc/nagios in nagios/attributes/client.rb.

So that move command should expand to:

mv /etc/nagios/*_nagios*.cfg /etc/nagios/dist

and that shouldn't touch /usr/lib/nagios/plugins/pagerduty_nagios.pl

@andreacampi

andreacampi Mar 6, 2012

Contributor

Oh my, now I'm embarrassed. I simply got the line wrong :( I should have triple checked.

This comment was meant for line 6:

template "#{node['nagios']['config_dir']}/pagerduty_nagios.cfg" do

where it is precisely node['nagios']['config_dir'] as you point out. That's the file that is changing.

Sorry yet again, and thanks for your precise comment that made me see my mistake :)

@clumpidy

clumpidy Mar 6, 2012

Contributor

Thanks for your persistence. I agree this is a problem with pagerduty_nagios.cfg. I just checked my nagios server and this is exactly what is happening. What about if the name of the config file just changes to "pagerduty.cfg"?

@andreacampi

andreacampi Mar 6, 2012

Contributor

+1, that should work.

+ owner "root"
+ group "root"
+ mode 0755
+ source "plugins/pagerduty_nagios.pl"
+end
+
+cron "Flush Pagerduty" do
+ user "nagios"
+ mailto "root@localhost"
+ command "#{node['nagios']['plugin_dir']}/pagerduty_nagios.pl flush"
+end
@@ -13,7 +13,7 @@ define contact {
define contactgroup {
contactgroup_name admins
alias Nagios Administrators
- members <%= @members.join(',') %>
+ members <%= @members.join(',') %><%= ",pagerduty" if node['recipes'].include?("nagios::pagerduty") %>
}
define contactgroup {
@@ -0,0 +1,21 @@
+define contact {
+ contact_name pagerduty
+ alias PagerDuty Pseudo-Contact
+ service_notification_period 24x7
+ host_notification_period 24x7
+ service_notification_options w,u,c,r
+ host_notification_options d,r
+ service_notification_commands notify-service-by-pagerduty
+ host_notification_commands notify-host-by-pagerduty
+ pager <%= node[:nagios][:pagerduty_key] %>
+}
+
+define command {
+ command_name notify-service-by-pagerduty
+ command_line <%= node['nagios']['plugin_dir'] %>/pagerduty_nagios.pl enqueue -f pd_nagios_object=service
+}
+
+define command {
+ command_name notify-host-by-pagerduty
+ command_line <%= node['nagios']['plugin_dir'] %>/pagerduty_nagios.pl enqueue -f pd_nagios_object=host
+}

0 comments on commit c41ffb3

Please sign in to comment.