Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

executable file 372 lines (337 sloc) 13.756 kB
#!/usr/bin/env perl
#
# euca_watchdog - daemon for monitoring a Eucalyptus cloud, sending
# alerts via email, and killing long-running instances
#
# OPTIONS:
#
# -l --limit: in seconds, how long instances are allowed to run
# default value is set below as a global variable
# -e --exempt-file: name of file containing usernames exempt from
# termination, one per line; this file is consulted
# upon every iteration, so it can be edited as the
# script runs
# -c --checkpoint-file: name of the file where the script saves the list
# of instances and their timestamps so that the
# latter are not lost if script is re-started
# -m --max-instances: the maximum number of instances allowed to
# each (non-exempt) user
# -s --status-file: print node availability to this file, including
# a prediction of how soon, in seconds, an instance
# will be evicted
# -h --high-watermark send a notification if there are this many or
# more instances running (can be max or close to it)
# -a --notify-email send all notifications to this e-mail address
# -n --no-killing don't kill any instances, just print out info
# -d --daemon: run in a loop
# -q --quiet: no messages on the console, only write to files
use diagnostics;
use warnings;
use sigtrap;
use strict;
use Getopt::Long;
use English; # for descriptive predefined var names, such as:
use Fcntl ':flock';
$OUTPUT_AUTOFLUSH = 1; # no output buffering
# globals: built-in defaults, most of which can be overridden on the
# command line by the GetOptions call below
our $limit         = 3600;    # how long, in seconds, an instance may run
our $max_instances = 4;       # instances allowed per non-exempt user
our %def_exempt    = (        # users that are exempt by default
    "eucalyptus" => 1,
    "admin"      => 1,
);
our $chkpt_file = "/tmp/euca_watchdog.checkpoint";
our $status_file;             # no status file is written if unset
our $exempt_file;             # only the default exemptions apply if unset
our %instances;               # instance id => [ username, first-seen timestamp ]
our $verbose        = 1;      # per-iteration progress output; cleared by --quiet
our $quiet          = 0;
our $peace          = 0;      # no killing if set to 1
our $notify_email;            # no notifications if unset
our $high_watermark;          # no high watermark notifications if unset
our $daemon         = 0;      # run in a loop
our $sleep_time     = 30;     # how frequently, in sec, we query Eucalyptus

# process command-line parameters
# GetOptions prints its own diagnostic for every bad option and does not
# set $!, so the fatal message must not interpolate $! (it would show a
# stale, unrelated system error).
GetOptions(
    'l|limit=i'           => \$limit,
    'e|exempt-file=s'     => \$exempt_file,
    'c|checkpoint-file=s' => \$chkpt_file,
    'm|max-instances=i'   => \$max_instances,
    's|status-file=s'     => \$status_file,
    'h|high-watermark=i'  => \$high_watermark,
    'a|notify-email=s'    => \$notify_email,
    'n|no-killing'        => sub { $peace  = 1 },
    'd|daemon'            => sub { $daemon = 1 },
    'q|quiet'             => sub { $quiet  = 1; $verbose = 0 },
) or die "Invalid command-line options (see the option list at the top of this script)\n";
# The EC2 command-line tools are driven by environment variables; refuse to
# start (via error(), which exits) when any of the ones checked here is
# undefined.
sub check_env {
    my ( $name ) = @_;
    unless ( defined $ENV{$name} ) {
        error ("environment variable \$$name is not set!");
    }
}
foreach my $required ( "EC2_HOME", "EC2_PRIVATE_KEY", "EC2_CERT", "EC2_URL" ) {
    check_env ($required);
}
# If a checkpoint file exists, reload the instance table from it so that the
# first-seen timestamps survive a restart of the script.  Lines that do not
# match the expected format are silently skipped.
if ( -e $chkpt_file ) {
    if ( open( my $chkpt_fh, '<', $chkpt_file ) ) {
        unless ( flock( $chkpt_fh, LOCK_EX | LOCK_NB ) ) {
            warning ("file $chkpt_file already locked; waiting...");
            # Wait at most 10 seconds for the lock.  The alarm needs a
            # handler (the default action for SIGALRM terminates the
            # process) and must be cancelled once flock returns, otherwise
            # it would fire later and kill the script even though the lock
            # was obtained.
            my $locked = eval {
                local $SIG{ALRM} = sub { die "lock timeout\n" };
                alarm 10;
                my $got_lock = flock( $chkpt_fh, LOCK_EX );
                alarm 0;
                $got_lock;
            };
            alarm 0;    # make sure no alarm is left pending on any path
            $locked or error ("failed to obtain lock on $chkpt_file");
        }
        while ( my $line = <$chkpt_fh> ) {
            # format: INSTANCEID USERNAME TIMESTAMP
            if ( $line =~ /(^[\w\-]+) ([\w\-]+) (\d+)$/ ) {
                $instances{$1} = [$2, $3];
            }
        }
        close $chkpt_fh; # unlocks safely, too
    }
}
# Tell the operator what was recovered from the checkpoint and which limits
# are in force (suppressed entirely by --quiet).
unless ( $quiet ) {
    if ( %instances ) {
        print "loaded instances from checkpoint file $chkpt_file:\n";
        for my $id ( keys %instances ) {
            my ( $owner, $seen ) = ( $instances{$id}[0], $instances{$id}[1] );
            print "\t$id by $owner noticed on $seen\n";
        }
    }
    print "instance time limit: $limit seconds\n";
    print "maximum instances allowed: $max_instances\n";
}
# Loop state shared across iterations.
our $first = 1; # true only during the first pass, which prints extra information
our $mode;      # result of the previous pass: "down", "low", or "high"
do {
# Determine who is exempt.  The exempt file is re-read on every iteration so
# that it can be edited while the script runs; the built-in defaults are
# always included.
my %exempt = %def_exempt;
if ( defined $exempt_file ) {
    # Three-argument open: the file name comes from the command line and
    # must never be interpreted as an open mode or a pipe.
    if ( open( my $exempt_fh, '<', $exempt_file ) ) {
        while ( my $line = <$exempt_fh> ) {
            # the first run of word characters / dashes on a line is taken
            # as the username; lines without one are ignored
            if ( $line =~ /([\w\-]+)/ ) {
                $exempt{$1} = 1;
            }
        }
        close $exempt_fh;
    }
    elsif ( $first ) {
        # complain only once, not on every iteration
        warning ("exemptions file $exempt_file could not be opened");
    }
}
if ( $first and not $quiet ) {
    print "exempt users:";
    print map { " $_" } sort keys %exempt;    # sorted for a stable listing
    print "\n";
}
print "\n" if $verbose; # to separate output from each iteration
my $now = time;
my $now_str = localtime ($now);
print "now=$now ($now_str)\n" if $verbose;
# Get the list of running instances.  The table is rebuilt from scratch on
# every iteration; an instance that was already known keeps its original
# [user, timestamp] entry, a new one is stamped with the current time.
my %old_instances = %instances;
%instances = ();
print "querying instances...\n" if $verbose;
if ( open( my $inst_fh, '-|', 'ec2-describe-instances' ) ) {
    my $user = "unknown"; # owner from the most recent RESERVATION line
    while ( my $line = <$inst_fh> ) {
        if ( $line =~ /RESERVATION\s+(r-\w+)\s+([\w\-]+)/i ) {
            $user = $2;
        }
        if ( $line =~ /INSTANCE\s+(i-\w+)\s+(emi-\w+)\s+([\d\.]+)\s+([\d\.]+)\s+(\w+)/i ) {
            my ( $i_id, $e_id, $public_ip, $private_ip, $state ) = ( $1, $2, $3, $4, $5 );
            # NOTE(review): $state is only printed; instances are tracked
            # regardless of state -- confirm that is intended.
            if ( not defined $old_instances{$i_id} ) {
                $instances{$i_id} = [$user, $now]; # add new entry
            } else {
                $instances{$i_id} = $old_instances{$i_id}; # keep the timestamp of the old one
            }
            print "\t$i_id $e_id $instances{$i_id}[1] $user $public_ip $private_ip $state\n" if $verbose;
        }
    }
    # close() on a pipe reports the command's exit status.  If the query
    # failed and produced no instances, keep the previous table: otherwise
    # a transient failure would wipe every timestamp from the checkpoint
    # and restart the clock for all running instances.
    if ( not close $inst_fh and not %instances ) {
        warning ("ec2-describe-instances failed (status $?); keeping previous instance list");
        %instances = %old_instances;
    }
} else {
    error ( "failed to run ec2-describe-instances", 1 );
}
# Save the list of instances to the checkpoint file.  The file is opened
# read-write first (falling back to creating it) so that it is truncated
# only after the lock is held.
my $save_fh;
if ( open( $save_fh, '+<', $chkpt_file )
     or open( $save_fh, '>', $chkpt_file ) ) {
    unless ( flock( $save_fh, LOCK_EX | LOCK_NB ) ) {
        warning ("file $chkpt_file already locked; waiting...");
        # Wait at most 10 seconds for the lock.  The alarm needs a handler
        # (the default action for SIGALRM terminates the process) and must
        # be cancelled once flock returns, otherwise it would fire during a
        # later iteration and kill the daemon.
        my $locked = eval {
            local $SIG{ALRM} = sub { die "lock timeout\n" };
            alarm 10;
            my $got_lock = flock( $save_fh, LOCK_EX );
            alarm 0;
            $got_lock;
        };
        alarm 0;    # make sure no alarm is left pending on any path
        $locked or error ("failed to obtain lock on $chkpt_file", 1);
    }
    truncate( $save_fh, 0 ) or error ("failed to truncate file $chkpt_file", 1);
    foreach my $id ( keys %instances ) {
        # format: INSTANCEID USERNAME TIMESTAMP
        print {$save_fh} "$id $instances{$id}[0] $instances{$id}[1]\n";
    }
    # buffered write errors only surface at close, so check it
    close $save_fh or warning ("failed to save a checkpoint in $chkpt_file");
} else {
    warning ("failed to save a checkpoint in $chkpt_file");
}
# Ask the cloud how many instance slots exist and how many are free.  Only
# zones whose status field contains "up" contribute to the two sums.
my ( $available, $total ) = ( 0, 0 );
print "querying availability zones...\n" if $verbose;
open( my $zones_fh, "ec2-describe-availability-zones |" )
    or error ( "failed to run ec2-describe-availability-zones", 1);
while ( defined( my $line = <$zones_fh> ) ) {
    next unless $line =~ /AVAILABILITYZONE[\s\|]+([\w\-]+)[\s\|]+([\w\-]+)[\s\|]+(\d+)\/(\d+)\s+([\w\-]+)[\s\|]+/i;
    # copy the captures before the next match overwrites them
    my ( $zone_name, $zone_status, $zone_available, $zone_total, $zone_instance_type )
        = ( $1, $2, $3, $4, $5 );
    print "\t$zone_name $zone_status $zone_available/$zone_total $zone_instance_type\n" if $verbose;
    next unless $zone_status =~ /up/i;
    $available += $zone_available;
    $total     += $zone_total;
}
close $zones_fh;
# Try to estimate the earliest expiration, if any.  $remains stays 0 when no
# non-exempt instance is running (no expirations expected).
my $remains = 0;
my $earliest; # oldest first-seen timestamp among non-exempt instances
foreach my $id ( keys %instances ) {
    my ( $user, $started ) = @{ $instances{$id} };
    next if defined $exempt{$user};
    # Every non-exempt instance qualifies, including one first noticed in
    # this very iteration (timestamp == $now); only the minimum matters.
    if ( not defined $earliest or $started < $earliest ) {
        $earliest = $started;
    }
}
if ( defined $earliest ) {
    # conservatively, add detection latency and some padding
    $remains = ($earliest + $limit) - $now + $sleep_time + 5;
    # zero or negative means we're close, so report the detection latency
    if ( $remains <= 0 ) { $remains = $sleep_time; }
}
# State transition logic with email notifications for entering "down" mode,
# leaving "down" mode, and entering "high" mode.  Nothing is sent on the
# first iteration (when $mode is still undefined), on the assumption that
# the admin knows what's going on.
if ( $total > 0 ) { # cloud is up
    my $running = $total - $available; # total instances running
    # "low" unless a high watermark is configured and has been reached
    my $next_mode =
        ( defined $high_watermark and $running >= $high_watermark ) ? "high" : "low";
    if ( defined $mode ) {
        if ( $mode ne "high" and $next_mode eq "high" ) {
            notify ("heavy load", "Heavy load on the cloud: $running instances, $available available slots\n", 1);
        } elsif ( $mode eq "down" ) {
            notify ("cloud went up", "Cloud Controller reports $available available out of $total total slots\n", 1);
        }
    }
    # Record the mode unconditionally: if this assignment were skipped while
    # $mode is undefined, $mode would never be set as long as the cloud
    # stays up and no notification could ever be triggered.
    $mode = $next_mode;
} else { # cloud is down
    if ( defined $mode and $mode ne "down" ) {
        notify ("cloud went down", "Cloud Controller could not be contacted or reported 000/000\n", 1);
    }
    $mode = "down";
}
# Report the status on the console (unless --quiet) and, when a status file
# was requested, overwrite that file with the same single line.
my $status = "total=$total available=$available expiration=$remains\n";
print $status unless $quiet;
if ( defined $status_file ) {
    # Three-argument open: the file name comes from the command line and
    # must never be interpreted as an open mode or a pipe.
    if ( open( my $status_fh, '>', $status_file ) ) {
        print {$status_fh} $status;
        close $status_fh;
    }
    elsif ( $first ) {
        # complain only once, not on every iteration
        warning ("failed to write to status file $status_file");
    }
}
# Killin' time: collect every non-exempt instance that has either outlived
# the time limit or pushed its owner past the per-user maximum, then
# terminate them all with a single command (unless --no-killing).
my $to_kill = "";
my %count = (); # instances seen so far per user, exempt users included
foreach my $id ( keys %instances ) {
    my ( $user, $started ) = @{ $instances{$id} };
    $count{$user}++;
    next if defined $exempt{$user};
    my $expired    = ( $started + $limit ) < $now;
    my $over_quota = $count{$user} > $max_instances;
    $to_kill .= " $id" if $expired or $over_quota;
}
foreach my $user ( keys %count ) {
    if ( $count{$user} > $max_instances ) {
        print "user $user has $count{$user} instances, exceeding max-instances=$max_instances\n" unless $quiet;
    }
}
if ( $to_kill ne "" ) {
    print "instances qualifying for termination:$to_kill...\n" unless $quiet;
    if ( not $peace ) {
        my $error = shell_command_status ("ec2-terminate-instances $to_kill");
        # --quiet promises no console messages, so honor it here as well
        print "ec2-terminate-instances returned $error\n" unless $quiet;
    }
}
# From here on, first-iteration-only messages stay suppressed.
$first = 0;
# In daemon mode, pause before polling Eucalyptus again.
if ( $daemon ) {
    $verbose and print "sleeping for $sleep_time seconds...\n";
    sleep $sleep_time;
}
} while ( $daemon );
########################################################################
# Run a shell command synchronously and return everything it wrote to
# STDOUT as one string.  STDERR is not captured unless the command itself
# redirects it (e.g. with 2>&1).
#   $cmd    - command line, passed to the shell
#   returns - the command's output; "" if the command could not be started
#             or printed nothing (the exit status is not examined)
sub shell_command_output {
    my ( $cmd ) = @_;
    my $output = "";
    $verbose and print "executing: $cmd\n";
    if ( open( my $pipe, '-|', $cmd ) ) {
        # slurp mode, restored automatically when this scope is left
        local $/;
        my $text = <$pipe>;
        $output = $text if defined $text;
        close $pipe;
    }
    return $output;
}
# Execute a command through system() and hand back the raw wait status ($?).
# A failure is described on STDERR: inability to execute and death by signal
# are always reported, a plain non-zero exit code only in verbose mode.
sub shell_command_status {
    my $cmd = shift;
    print "executing: $cmd\n" if $verbose;
    # system() returns 0 on success, in which case $? is 0 as well
    return $? unless system ($cmd);
    my $by_signal = $? & 127;
    if ( $? == -1 ) {
        print STDERR "failed to execute: $!\n";
    }
    elsif ( $by_signal ) {
        printf STDERR "child died with signal %d, %s coredump\n",
            $by_signal, ($? & 128) ? 'with' : 'without';
    }
    elsif ( $verbose ) {
        printf STDERR "child exited with value %d\n", $? >> 8;
    }
    return $?;
}
# Send an alert by e-mail through mailx.  Does nothing when no notification
# address was configured ($notify_email is undefined).
#   $subject - short description, appended to the fixed alert prefix
#   $body    - text of the message
#   $notify  - accepted for compatibility with existing callers; not used
sub notify {
    my ( $subject, $body, $notify ) = @_;
    return unless defined $notify_email;
    $verbose and print "notifying $notify_email\n";
    my $full_subject = "Eucalyptus watchdog ALERT: $subject";
    # Whitespace-separated addresses each become one recipient argument,
    # as they did when the address was interpolated into a shell command.
    my @recipients = split ' ', $notify_email;
    # List-form pipe open runs mailx directly, without a shell, so quotes
    # or other metacharacters in the subject or address cannot break or
    # alter the command.
    if ( @recipients
         and open( my $mail, '|-', 'mailx', '-s', $full_subject, @recipients ) ) {
        print {$mail} $body;
        # close() on a pipe waits for mailx and reports its exit status
        close $mail or print STDERR "mailx reported an error while sending notification\n";
    } else {
        print STDERR "failed to run mailx for notification\n";
    }
}
# Report a non-fatal problem: writes the message to STDERR with a
# "WARNING: " prefix and a trailing newline, then carries on.
sub warning {
    my $str = $_[0];
    print {*STDERR} "WARNING: $str\n";
}
# Report a fatal problem and terminate the script with exit status 1.
#   $str    - description of the problem, printed to STDERR after "ERROR: "
#   $notify - optional; when defined and non-zero, the same description is
#             also passed to notify() before exiting
sub error {
    my $str    = shift;
    my $notify = shift;
    print STDERR "ERROR: $str\n";
    my $send_mail = ( defined $notify and $notify != 0 );
    notify ("euca_watchdog.pl died", "euca_watchdog.pl died due to error:\n$str\n") if $send_mail;
    exit 1;
}
Jump to Line
Something went wrong with that request. Please try again.