Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Fetching contributors…

Octocat-spinner-32-eaf2f5

Cannot retrieve contributors at this time

executable file 431 lines (395 sloc) 16.73 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430
#!/usr/bin/perl
#Copyright (c) 2009 Eucalyptus Systems, Inc.
#
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, only version 3 of the License.
#
#This file is distributed in the hope that it will be useful, but WITHOUT
#ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
#FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
#for more details.
#
#You should have received a copy of the GNU General Public License along
#with this program. If not, see <http://www.gnu.org/licenses/>.
#
#Please contact Eucalyptus Systems, Inc., 130 Castilian
#Dr., Goleta, CA 93101 USA or visit <http://www.eucalyptus.com/licenses/>
#if you need additional information or have any questions.
#
#This file may incorporate work covered under the following copyright and
#permission notice:
#
# Software License Agreement (BSD License)
#
# Copyright (c) 2008, Regents of the University of California
#
#
# Redistribution and use of this software in source and binary forms, with
# or without modification, are permitted provided that the following
# conditions are met:
#
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USERS OF
# THIS SOFTWARE ACKNOWLEDGE THE POSSIBLE PRESENCE OF OTHER OPEN SOURCE
# LICENSED MATERIAL, COPYRIGHTED MATERIAL OR PATENTED MATERIAL IN THIS
# SOFTWARE, AND IF ANY SUCH MATERIAL IS DISCOVERED THE PARTY DISCOVERING
# IT MAY INFORM DR. RICH WOLSKI AT THE UNIVERSITY OF CALIFORNIA, SANTA
# BARBARA WHO WILL THEN ASCERTAIN THE MOST APPROPRIATE REMEDY, WHICH IN
# THE REGENTS' DISCRETION MAY INCLUDE, WITHOUT LIMITATION, REPLACEMENT
# OF THE CODE SO IDENTIFIED, LICENSING OF THE CODE SO IDENTIFIED, OR
# WITHDRAWAL OF THE CODE CAPABILITY TO THE EXTENT NEEDED TO COMPLY WITH
# ANY SUCH LICENSES OR RIGHTS.
#

#
# euca_watchdog - daemon for monitoring a Eucalyptus cloud, sending
# alerts via email, and killing long-running instances
#
# OPTIONS:
#
# -l --limit: in seconds, how long instances are allowed to run
# default value is set below as a global variable
# -e --exempt-file: name of file containing usernames exempt from
# termination, one per line; this file is consulted
# upon every iteration, so it can be edited as the
# script runs
# -c --checkpoint-file: name of the file where the script saves the list
# of instances and their timestamps so that the
# latter are not lost if script is re-started
# -m --max-instances: the maximum number of instances allowed to
# each (non-exempt) user
# -s --status-file: print node availability to this file, including
# a prediction of how soon, in seconds, an instance
# will be evicted
# -h --high-watermark send a notification if there are this many or
# more instances running (can be max or close to it)
# -a --notify-email send all notifications to this e-mail address
# -n --no-killing don't kill any instances, just print out info
# -d --daemon: run in a loop
# -q --quiet: no messages on the console, only write to files

use diagnostics;
use warnings;
use sigtrap;
use strict;
use Getopt::Long;
use English; # for descriptive predefined var names, such as:
use Fcntl ':flock';
$OUTPUT_AUTOFLUSH = 1; # no output buffering

# globals: defaults for everything that the command line may override,
# plus the state shared between the main loop and the helper subs

# eviction policy
our $limit          = 3600; # default limit, in seconds
our $max_instances  = 4;    # default max
our %def_exempt     = map { ( $_ => 1 ) } ( "eucalyptus", "admin" ); # default exemptions
our $peace          = 0;    # no killing if set to 1

# files
our $chkpt_file     = "/tmp/euca_watchdog.checkpoint";
our $status_file;           # no status file is written if unset
our $exempt_file;           # only the default exemptions apply if unset

# notifications
our $notify_email;          # no notifications if unset
our $high_watermark;        # no high watermark notifications if unset

# run-time behaviour
our $verbose        = 1;    # set to 1 for debugging
our $quiet          = 0;
our $daemon         = 0;    # run in a loop
our $sleep_time     = 30;   # how frequently, in sec, we query Eucalyptus

# instance id => [ username, timestamp when first noticed ]
our %instances;

# process command-line parameters
#
# GetOptions prints its own message (via warn) for every option it cannot
# parse and then returns false; it does not set $!, so the errno string is
# deliberately not part of the message below.
GetOptions('l|limit=i' => \$limit,
           'e|exempt-file=s' => \$exempt_file,
           'c|checkpoint-file=s' => \$chkpt_file,
           'm|max-instances=i' => \$max_instances,
           's|status-file=s' => \$status_file,
           'h|high-watermark=i' => \$high_watermark,
           'a|notify-email=s' => \$notify_email,
           'n|no-killing' => \$peace,   # plain flags are set to 1 when given
           'd|daemon' => \$daemon,
           'q|quiet' => sub { $quiet = 1; $verbose = 0 } # quiet also turns off debugging output
) or die "Unknown or malformed command-line parameter\n";

# make sure the EC2 command-line tools will find their configuration:
# check_env (NAME) reports a fatal error (via error) when the environment
# variable NAME is not defined, and does nothing otherwise
sub check_env {
    my ( $name ) = @_;

    return if defined $ENV{$name};
    error ("environment variable \$$name is not set!");
}
foreach my $required ( "EC2_HOME", "EC2_PRIVATE_KEY", "EC2_CERT", "EC2_URL" ) {
    check_env ($required);
}

# if checkpoint file exists and non-empty, pick up that info, so that the
# first-seen timestamps of instances survive a restart of the script
if ( -e $chkpt_file ) {
    if ( open ( my $chkpt_fh, "<", $chkpt_file ) ) {
        unless ( flock ( $chkpt_fh, LOCK_EX | LOCK_NB ) ) {
            warning ("file $chkpt_file already locked; waiting...");
            # wait at most 10 seconds for the lock; the handler turns the
            # alarm into an exception instead of letting SIGALRM kill the
            # script, and the alarm is cancelled once the wait is over so
            # that it cannot fire later
            my $locked = eval {
                local $SIG{ALRM} = sub { die "timed out\n" };
                alarm 10;
                my $ok = flock ( $chkpt_fh, LOCK_EX );
                alarm 0;
                $ok;
            };
            alarm 0; # also covers leaving the eval through die
            $locked or error ("failed to obtain lock on $chkpt_file");
        }

        while ( my $line = <$chkpt_fh> ) {
            # format: INSTANCEID USERNAME TIMESTAMP
            if ( $line =~ /(^[\w\-]+) ([\w\-]+) (\d+)$/ ) {
                $instances{$1} = [$2, $3];
            }
        }
        close $chkpt_fh; # unlocks safely, too
    } else {
        # the file is there but unreadable: carry on without the old timestamps
        warning ("checkpoint file $chkpt_file could not be opened: $!");
    }
}
if ( ( scalar keys %instances ) > 0 ) {
    print "loaded instances from checkpoint file $chkpt_file:\n" unless $quiet;
    foreach my $key ( keys %instances ) {
        print "\t$key by $instances{$key}[0] noticed on $instances{$key}[1]\n" unless $quiet;
    }
}

# announce the effective policy, unless asked to keep quiet
if ( not $quiet ) {
    print "instance time limit: $limit seconds\n";
    print "maximum instances allowed: $max_instances\n";
}

# state carried from one iteration of the main loop to the next
our $first = 1; # true only during the first iteration, when some extra output is printed
our $mode;      # what the previous iteration saw: "down", "low", or "high" (undefined before the first one)

# Main monitoring loop: the body runs once, or over and over with a pause
# of $sleep_time seconds when --daemon was given.  Every iteration
#   - re-reads the exemptions file,
#   - lists the instances (keeping the first-seen timestamp of known ones),
#   - saves that list to the checkpoint file,
#   - adds up the total/available slots of all zones that are up,
#   - estimates how soon the next instance will be evicted,
#   - sends e-mail when the cloud goes down, comes back, or gets busy,
#   - writes the status file, and
#   - terminates instances that are over the time limit or over the
#     per-user maximum (unless --no-killing was given).
do {

    # determine who is exempt by re-reading the exempt file every time, in case it's changed
    my %exempt = %def_exempt;
    if ( defined $exempt_file ) {
        if ( open ( my $exempt_fh, "<", $exempt_file ) ) {
            while ( my $line = <$exempt_fh> ) {
                # the first run of word characters and dashes on a line is the username
                if ( $line =~ /([\w\-]+)/ ) {
                    $exempt{$1} = 1;
                }
            }
            close $exempt_fh;
        } else {
            if ( $first ) {
                warning ("exemptions file $exempt_file could not be opened");
            }
        }
    }
    if ( $first and not $quiet ) {
        print "exempt users:";
        print map { " $_" } keys %exempt;
        print "\n";
    }

    print "\n" if $verbose; # to separate output from each iteration
    my $now = time;
    my $now_str = localtime ($now);
    print "now=$now ($now_str)\n" if $verbose;

    # get list of running instances
    my %old_instances = %instances;
    %instances = (); # we rebuild a new list every time based on what is running
    print "querying instances...\n" if $verbose;
    if ( open ( my $instances_fh, "-|", "ec2-describe-instances" ) ) {
        my $user = "unknown"; # owner named by the most recent RESERVATION line
        while ( my $line = <$instances_fh> ) {
            if ( $line =~ /RESERVATION\s+(r-\w+)\s+([\w\-]+)/i ) {
                $user = $2;
            }
            if ( $line =~ /INSTANCE\s+(i-\w+)\s+(emi-\w+)\s+([\d\.]+)\s+([\d\.]+)\s+(\w+)/i ) {
                my ( $i_id, $e_id, $public_ip, $private_ip, $state ) = ( $1, $2, $3, $4, $5 );
                if ( not defined $old_instances{$i_id} ) {
                    $instances{$i_id} = [$user, $now]; # add new entry
                } else {
                    $instances{$i_id} = $old_instances{$i_id}; # keep the timestamp of the old one
                }
                print "\t$i_id $e_id $instances{$i_id}[1] $user $public_ip $private_ip $state\n" if $verbose;
            }
        }
        close $instances_fh;
    } else {
        error ( "failed to run ec2-describe-instances", 1 );
    }

    # save the list of instances to a checkpoint; an existing file is opened
    # read-write, so that it is emptied only after the lock has been obtained
    my $chkpt_fh;
    if ( open ( $chkpt_fh, "+<", $chkpt_file )
         or open ( $chkpt_fh, ">", $chkpt_file ) ) {
        unless ( flock ( $chkpt_fh, LOCK_EX | LOCK_NB ) ) {
            warning ("file $chkpt_file already locked; waiting...");
            # wait at most 10 seconds for the lock; the handler turns the
            # alarm into an exception instead of letting SIGALRM kill the
            # script, and the alarm is cancelled once the wait is over so
            # that it cannot fire during a later iteration
            my $locked = eval {
                local $SIG{ALRM} = sub { die "timed out\n" };
                alarm 10;
                my $ok = flock ( $chkpt_fh, LOCK_EX );
                alarm 0;
                $ok;
            };
            alarm 0; # also covers leaving the eval through die
            $locked or error ("failed to obtain lock on $chkpt_file", 1);
        }
        truncate ( $chkpt_fh, 0 ) or error ("failed to truncate file $chkpt_file", 1);
        foreach my $key ( keys %instances ) {
            # format: INSTANCEID USERNAME TIMESTAMP
            print {$chkpt_fh} "$key $instances{$key}[0] $instances{$key}[1]\n";
        }
        # buffered write errors only show up when the file is closed
        close $chkpt_fh or warning ("failed to save a checkpoint in $chkpt_file");
    } else {
        warning ("failed to save a checkpoint in $chkpt_file");
    }

    # check on the status
    my $available = 0;
    my $total = 0;
    print "querying availability zones...\n" if $verbose;
    if ( open ( my $zones_fh, "-|", "ec2-describe-availability-zones" ) ) {
        while ( my $line = <$zones_fh> ) {
            if ( $line =~ /AVAILABILITYZONE[\s\|]+([\w\-]+)[\s\|]+([\w\-]+)[\s\|]+(\d+)\/(\d+)\s+([\w\-]+)[\s\|]+/i ) {
                my ( $zone_name, $zone_status, $zone_available, $zone_total, $zone_instance_type )
                    = ( $1, $2, $3, $4, $5 );
                print "\t$zone_name $zone_status $zone_available/$zone_total $zone_instance_type\n" if $verbose;
                if ( $zone_status =~ /up/i ) { # only zones that are up count
                    $available += $zone_available;
                    $total += $zone_total;
                }
            }
        }
        close $zones_fh;
    } else {
        error ( "failed to run ec2-describe-availability-zones", 1);
    }

    # try to estimate the earliest expiration, if any
    my $remains = 0; # no expirations expected
    if ( ( scalar keys %instances ) > 0 ) {
        # find the earliest timestamp
        my $earliest = $now; # no start timestamp is later than this
        my $qualifying_instances = 0; # i.e. not exempt
        foreach my $id ( keys %instances ) {
            my $user = $instances{$id}[0];
            my $started = $instances{$id}[1];
            if ( not defined $exempt{$user} ) {
                # every non-exempt instance qualifies, including one that
                # was first noticed during this very iteration
                $qualifying_instances++;
                if ( $started < $earliest ) {
                    $earliest = $started;
                }
            }
        }

        if ( $qualifying_instances ) { # else, keep remains==0
            # conservatively, add detection latency and some padding
            $remains = ($earliest + $limit) - $now + $sleep_time + 5;
            # negative number means we're close, so return detection latency
            if ( $remains <= 0 ) { $remains = $sleep_time; }
        }
    }

    # state transition logic with email notifications for entering "down" mode,
    # leaving "down" mode, and "high" mode (except on the first iteration, when
    # we assume the admin knows what's going on)
    if ( $total > 0 ) { # cloud is up
        my $running = $total - $available; # total instances running
        my $next_mode = "low"; # assume low, if high_watermark isn't defined
        if ( defined $high_watermark and $running >= $high_watermark ) {
            $next_mode = "high";
        }
        if ( defined $mode ) { # do not notify on the first iteration, when $mode is undefined
            if ( $mode ne "high" and $next_mode eq "high" ) {
                notify ("heavy load", "Heavy load on the cloud: $running instances, $available available slots\n", 1);
            } elsif ( $mode eq "down" ) {
                notify ("cloud went up", "Cloud Controller reports $available available out of $total total slots\n", 1);
            }
        }
        # remember the mode on every iteration, the first one included;
        # otherwise later transitions could never be detected
        $mode = $next_mode;
    } else { # cloud is down
        if ( defined $mode and $mode ne "down" ) {
            notify ("cloud went down", "Cloud Controller could not be contacted or reported 000/000\n", 1);
        }
        $mode = "down";
    }

    # write the status to a file
    my $status = "total=$total available=$available expiration=$remains\n";
    print $status unless $quiet;
    if ( defined $status_file ) {
        if ( open ( my $status_fh, ">", $status_file ) ) {
            print {$status_fh} $status;
            close $status_fh;
        } else {
            if ( $first ) {
                warning ("failed to write to status file $status_file");
            }
        }
    }

    # killin' time...
    my $to_kill = "";
    my %count = (); # username => number of that user's instances seen so far
    ID: foreach my $key ( keys %instances ) {
        my $user = $instances{$key}[0];
        my $started = $instances{$key}[1];
        $count{$user}++;
        if ( defined $exempt{$user} ) { next ID }
        # spare the instance only if it is both within the time limit and
        # within the user's quota
        if ( ( $started + $limit ) >= $now
             and $count{$user} <= $max_instances ) { next ID }
        $to_kill .= " $key";
    }
    foreach my $user ( keys %count ) {
        if ( $count{$user} > $max_instances ) {
            print "user $user has $count{$user} instances, exceeding max-instances=$max_instances\n" unless $quiet;
        }
    }
    if ( $to_kill ne "" ) {
        print "instances qualifying for termination:$to_kill...\n" unless $quiet;
        if ( not $peace ) {
            my $error = shell_command_status ("ec2-terminate-instances $to_kill");
            print "ec2-terminate-instances returned $error\n" unless $quiet;
        }
    }

    $first = 0;
    if ( $daemon ) {
        print "sleeping for $sleep_time seconds...\n" if $verbose;
        sleep ($sleep_time);
    }
} while ( $daemon );

########################################################################

# run a shell command synchronously and return what it wrote to STDOUT
# (STDERR is not captured; add "2>&1" to the command if that is wanted)
#
#   $cmd    - the command line to run
#   returns - the whole output as one string; "" if the command could not
#             be started or printed nothing (the exit status of the command
#             is not examined)
sub shell_command_output {
    my ( $cmd ) = @_;
    my $output = "";

    $verbose and print "executing: $cmd\n";
    if ( open ( my $pipe, "-|", $cmd ) ) {
        # slurp mode for this block only; the record separator is restored
        # automatically however the block is left
        local $/;
        my $slurped = <$pipe>;
        $output = $slurped if defined $slurped;
        close $pipe;
    }
    return $output;
}

# execute a shell command, wait for it to finish, and hand back $? as left
# by system: 0 on success, -1 if the command could not be started,
# otherwise the wait status (exit value in the high byte, signal number
# and core-dump flag in the low byte); failures are described on STDERR
sub shell_command_status {
    my ( $cmd ) = @_;

    print "executing: $cmd\n" if $verbose;

    my $failed = system ($cmd);
    return $? unless $failed;

    if ( $? == -1 ) {
        print STDERR "failed to execute: $!\n";
    } elsif ( $? & 127 ) {
        my $signal = $? & 127;
        my $core = ( $? & 128 ) ? 'with' : 'without';
        printf STDERR "child died with signal %d, %s coredump\n", $signal, $core;
    } elsif ( $verbose ) {
        # a plain non-zero exit is only worth mentioning when debugging
        printf STDERR "child exited with value %d\n", $? >> 8;
    }
    return $?;
}

# send an alert by e-mail through mailx
#
#   $subject - short description; "Eucalyptus watchdog ALERT: " is put in front
#   $body    - text of the message
#   $notify  - accepted for the sake of existing callers; not used
#
# Does nothing unless a notification address was configured.  mailx is
# started without a shell, so neither the subject nor the address needs
# quoting; the address string is split on whitespace, so several
# recipients may still be given.
sub notify {
    my ( $subject, $body, $notify ) = @_;

    return unless defined $notify_email;
    my @recipients = split ' ', $notify_email;
    return unless @recipients;

    $verbose and print "notifying $notify_email\n";
    my $full_subject = "Eucalyptus watchdog ALERT: $subject";

    if ( open ( my $mail, "|-", "mailx", "-s", $full_subject, @recipients ) ) {
        print {$mail} "$body";
        # closing a pipe waits for mailx and fails if it exited with an error
        close $mail
            or print STDERR "mailx failed to send notification (status $?)\n";
    } else {
        print STDERR "failed to run mailx for notification\n";
    }
}

# report a non-fatal problem on STDERR, prefixed with "WARNING: "
sub warning {
    my $str = shift;

    print STDERR "WARNING: " . $str . "\n";
}

# report a fatal problem on STDERR, prefixed with "ERROR: ", and exit with
# status 1
#
#   $str    - description of the problem
#   $notify - optional; when defined and non-zero, the message is also
#             passed to notify before exiting
sub error {
    my ( $str, $notify ) = @_;

    print STDERR "ERROR: $str\n";

    my $send_mail = ( defined $notify and $notify != 0 );
    notify ("euca_watchdog.pl died", "euca_watchdog.pl died due to error:\n$str\n")
        if $send_mail;

    exit 1;
}

Something went wrong with that request. Please try again.