Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100755 431 lines (397 sloc) 16.792 kb
48052e33 » root
2009-08-26 added license header to source files (not clc/)
1 #!/usr/bin/perl
2 #
054a11a1 » gholms
2012-07-17 Update GPL+BSD file headers
3 # Copyright 2009-2012 Eucalyptus Systems, Inc.
48052e33 » root
2009-08-26 added license header to source files (not clc/)
4 #
054a11a1 » gholms
2012-07-17 Update GPL+BSD file headers
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; version 3 of the License.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
48052e33 » root
2009-08-26 added license header to source files (not clc/)
13 #
054a11a1 » gholms
2012-07-17 Update GPL+BSD file headers
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see http://www.gnu.org/licenses/.
48052e33 » root
2009-08-26 added license header to source files (not clc/)
16 #
054a11a1 » gholms
2012-07-17 Update GPL+BSD file headers
17 # Please contact Eucalyptus Systems, Inc., 6755 Hollister Ave., Goleta
18 # CA 93117, USA or visit http://www.eucalyptus.com/licenses/ if you need
19 # additional information or have any questions.
48052e33 » root
2009-08-26 added license header to source files (not clc/)
20 #
054a11a1 » gholms
2012-07-17 Update GPL+BSD file headers
21 # This file may incorporate work covered under the following copyright
22 # and permission notice:
48052e33 » root
2009-08-26 added license header to source files (not clc/)
23 #
054a11a1 » gholms
2012-07-17 Update GPL+BSD file headers
24 # Software License Agreement (BSD License)
48052e33 » root
2009-08-26 added license header to source files (not clc/)
25 #
054a11a1 » gholms
2012-07-17 Update GPL+BSD file headers
26 # Copyright (c) 2008, Regents of the University of California
27 # All rights reserved.
48052e33 » root
2009-08-26 added license header to source files (not clc/)
28 #
054a11a1 » gholms
2012-07-17 Update GPL+BSD file headers
29 # Redistribution and use of this software in source and binary forms,
30 # with or without modification, are permitted provided that the
31 # following conditions are met:
48052e33 » root
2009-08-26 added license header to source files (not clc/)
32 #
054a11a1 » gholms
2012-07-17 Update GPL+BSD file headers
33 # Redistributions of source code must retain the above copyright
34 # notice, this list of conditions and the following disclaimer.
48052e33 » root
2009-08-26 added license header to source files (not clc/)
35 #
054a11a1 » gholms
2012-07-17 Update GPL+BSD file headers
36 # Redistributions in binary form must reproduce the above copyright
37 # notice, this list of conditions and the following disclaimer
38 # in the documentation and/or other materials provided with the
39 # distribution.
40 #
41 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
42 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
43 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
44 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
45 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
46 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
47 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
48 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
49 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
51 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
52 # POSSIBILITY OF SUCH DAMAGE. USERS OF THIS SOFTWARE ACKNOWLEDGE
53 # THE POSSIBLE PRESENCE OF OTHER OPEN SOURCE LICENSED MATERIAL,
54 # COPYRIGHTED MATERIAL OR PATENTED MATERIAL IN THIS SOFTWARE,
55 # AND IF ANY SUCH MATERIAL IS DISCOVERED THE PARTY DISCOVERING
56 # IT MAY INFORM DR. RICH WOLSKI AT THE UNIVERSITY OF CALIFORNIA,
57 # SANTA BARBARA WHO WILL THEN ASCERTAIN THE MOST APPROPRIATE REMEDY,
58 # WHICH IN THE REGENTS' DISCRETION MAY INCLUDE, WITHOUT LIMITATION,
59 # REPLACEMENT OF THE CODE SO IDENTIFIED, LICENSING OF THE CODE SO
60 # IDENTIFIED, OR WITHDRAWAL OF THE CODE CAPABILITY TO THE EXTENT
61 # NEEDED TO COMPLY WITH ANY SUCH LICENSES OR RIGHTS.
48052e33 » root
2009-08-26 added license header to source files (not clc/)
62
7183c5b2 » graziano obertelli
2009-01-06 from CVS
63 #
64 # euca_watchdog - daemon for monitoring a Eucalyptus cloud, sending
65 # alerts via email, and killing long-running instances
66 #
67 # OPTIONS:
68 #
69 # -l --limit: in seconds, how long instances are allowed to run
70 # default value is set below as a global variable
71 # -e --exempt-file: name of file containing usernames exempt from
72 # termination, one per line; this file is consulted
73 # upon every iteration, so it can be edited as the
74 # script runs
75 # -c --checkpoint-file: name of the file where the script saves the list
76 # of instances and their timestamps so that the
77 # latter are not lost if script is re-started
78 # -m --max-instances: the maximum number of instances allowed to
79 # each (non-exempt) user
80 # -s --status-file: print node availability to this file, including
81 # a prediction of how soon, in seconds, an instance
82 # will be evicted
83 # -h --high-watermark send a notification if there are this many or
84 # more instances running (can be max or close to it)
85 # -a --notify-email send all notifications to this e-mail address
86 # -n --no-killing don't kill any instances, just print out info
87 # -d --daemon: run in a loop
88 # -q --quiet: no messages on the console, only write to files
89
90 use diagnostics;
91 use warnings;
92 use sigtrap;
93 use strict;
94 use Getopt::Long;
95 use English; # for descriptive predefined var names, such as:
96 use Fcntl ':flock';
97 $OUTPUT_AUTOFLUSH = 1; # no output buffering
98
# globals: built-in defaults, most of which can be overridden from the
# command line (see the GetOptions call below)
our $limit = 3600; # default limit, in seconds
our $max_instances = 4; # default max
our %def_exempt = ("eucalyptus" => 1, # default exemptions
                   "admin" => 1 );
our $chkpt_file = "/tmp/euca_watchdog.checkpoint";
our $status_file; # no status file is written if unset
our $exempt_file; # only the default exemptions apply if unset
our %instances; # instance id => [username, timestamp when first noticed]
our $verbose = 1; # per-iteration progress output; switched off by --quiet
our $quiet = 0;
our $peace = 0; # no killing if set to 1
our $notify_email; # no notifications if unset
our $high_watermark; # no high watermark notifications if unset
our $daemon = 0; # run in a loop
our $sleep_time = 30; # how frequently, in sec, we query Eucalyptus

# process command-line parameters
# GetOptions reports the offending option on STDERR by itself and does not
# set $!, so the message below deliberately carries no errno text.
GetOptions('l|limit=i' => \$limit,
           'e|exempt-file=s' => \$exempt_file,
           'c|checkpoint-file=s' => \$chkpt_file,
           'm|max-instances=i' => \$max_instances,
           's|status-file=s' => \$status_file,
           'h|high-watermark=i' => \$high_watermark,
           'a|notify-email=s' => \$notify_email,
           'n|no-killing' => \$peace,
           'd|daemon' => \$daemon,
           'q|quiet' => sub { $quiet = 1; $verbose = 0 }
    ) or die "Unknown or invalid command-line parameter\n";
128
# The EC2 command-line tools need these environment variables; bail out
# through error() as soon as one of them turns out to be undefined.
sub check_env {
    my ( $name ) = @_;

    return if defined $ENV{$name};
    error ("environment variable \$$name is not set!");
}
foreach my $required ( "EC2_HOME", "EC2_PRIVATE_KEY", "EC2_CERT", "EC2_URL" ) {
    check_env ($required);
}
135
# if checkpoint file exists and non-empty, pick up that info
if ( -e $chkpt_file ) {
    if ( open ( my $chkpt, '<', $chkpt_file ) ) {
        unless ( flock $chkpt, LOCK_EX | LOCK_NB ) {
            warning ("file $chkpt_file already locked; waiting...");
            # Wait at most 10 seconds for the lock.  The alarm needs a
            # handler (otherwise SIGALRM simply kills the script) and must be
            # cancelled once the lock is held (otherwise it fires later).
            my $locked = eval {
                local $SIG{ALRM} = sub { die "timed out\n" };
                alarm 10;
                my $ok = flock $chkpt, LOCK_EX;
                alarm 0;
                $ok;
            };
            alarm 0; # also cancel when the eval was left through the handler
            $locked or error ("failed to obtain lock on $chkpt_file");
        }

        while ( <$chkpt> ) {
            # format: INSTANCEID USERNAME TIMESTAMP
            if ( /(^[\w\-]+) ([\w\-]+) (\d+)$/ ) {
                $instances{$1} = [$2, $3];
            }
        }
        close $chkpt; # unlocks safely, too
    } else {
        # the file exists but is unreadable: carry on without old timestamps
        warning ("checkpoint file $chkpt_file could not be opened");
    }
}
if ( ( scalar keys %instances ) > 0 ) {
    print "loaded instances from checkpoint file $chkpt_file:\n" unless $quiet;
    foreach my $key ( keys %instances ) {
        print "\t$key by $instances{$key}[0] noticed on $instances{$key}[1]\n" unless $quiet;
    }
}
160
print "instance time limit: $limit seconds\n" unless $quiet;
print "maximum instances allowed: $max_instances\n" unless $quiet;

our $first = 1; # we'll print out some stuff only on first iteration
our $mode; # mode of the previous iteration: "down", "low", or "high" (undef before the first one)

# Main loop: one pass per iteration, repeated every $sleep_time seconds when
# --daemon is given.  Each pass
#   1. re-reads the exemptions file,
#   2. rebuilds %instances from ec2-describe-instances (keeping the timestamp
#      of instances that were already known) and saves it to the checkpoint,
#   3. sums up capacity from ec2-describe-availability-zones,
#   4. sends notifications on mode changes and writes the status file,
#   5. terminates non-exempt instances that are over the time limit or over
#      the per-user instance limit (unless --no-killing).
do {

    # determine who is exempt by re-reading the exempt file every time, in case it's changed
    my %exempt = %def_exempt;
    if ( defined $exempt_file ) {
        if ( open ( my $exempt_fh, '<', $exempt_file ) ) {
            while ( <$exempt_fh> ) {
                if ( /([\w\-]+)/ ) {
                    $exempt{$1} = 1;
                }
            }
            close $exempt_fh;
        } elsif ( $first ) {
            warning ("exemptions file $exempt_file could not be opened");
        }
    }
    if ( $first and not $quiet ) {
        print "exempt users:";
        print map { " $_" } keys %exempt;
        print "\n";
    }

    print "\n" if $verbose; # to separate output from each iteration
    my $now = time;
    my $now_str = localtime ($now);
    print "now=$now ($now_str)\n" if $verbose;

    # get list of running instances
    # NOTE(review): if the query yields no output (e.g. the cloud cannot be
    # reached) %instances ends up empty and the checkpoint below is rewritten
    # empty, so previously recorded timestamps are lost - confirm this is
    # acceptable.
    my %old_instances = %instances;
    %instances = (); # we rebuild a new list every time based on what is running
    print "querying instances...\n" if $verbose;
    if ( open ( my $instances_fh, '-|', 'ec2-describe-instances' ) ) {
        my $user = "unknown";
        while ( <$instances_fh> ) {
            # a RESERVATION line names the owner of the INSTANCE lines after it
            if ( /RESERVATION\s+(r-\w+)\s+([\w\-]+)/i ) {
                $user = $2;
            }
            if ( /INSTANCE\s+(i-\w+)\s+(emi-\w+)\s+([\d\.]+)\s+([\d\.]+)\s+(\w+)/i ) {
                my ( $i_id, $e_id, $public_ip, $private_ip, $state ) = ( $1, $2, $3, $4, $5 );
                if ( not defined $old_instances{$i_id} ) {
                    $instances{$i_id} = [$user, $now]; # add new entry
                } else {
                    $instances{$i_id} = $old_instances{$i_id}; # keep the timestamp of the old one
                }
                print "\t$i_id $e_id $instances{$i_id}[1] $user $public_ip $private_ip $state\n" if $verbose;
            }
        }
        close $instances_fh;
    } else {
        error ( "failed to run ec2-describe-instances", 1 );
    }

    # save the list of instances to a checkpoint
    # (open read-write first so that an existing file is truncated only after
    # it has been locked; create it if it does not exist yet)
    my $chkpt;
    if ( open ( $chkpt, '+<', $chkpt_file )
         or open ( $chkpt, '>', $chkpt_file ) ) {
        unless ( flock $chkpt, LOCK_EX | LOCK_NB ) {
            warning ("file $chkpt_file already locked; waiting...");
            # Wait at most 10 seconds for the lock.  The alarm needs a
            # handler (otherwise SIGALRM simply kills the script) and must be
            # cancelled once the lock is held (otherwise it would go off
            # during the sleep at the end of the iteration).
            my $locked = eval {
                local $SIG{ALRM} = sub { die "timed out\n" };
                alarm 10;
                my $ok = flock $chkpt, LOCK_EX;
                alarm 0;
                $ok;
            };
            alarm 0; # also cancel when the eval was left through the handler
            $locked or error ("failed to obtain lock on $chkpt_file", 1);
        }
        truncate ( $chkpt, 0 ) or error ("failed to truncate file $chkpt_file", 1);
        foreach my $key ( keys %instances ) {
            print {$chkpt} "$key $instances{$key}[0] $instances{$key}[1]\n";
        }
        # buffered write errors only show up at close time
        close $chkpt or warning ("failed to save a checkpoint in $chkpt_file");
    } else {
        warning ("failed to save a checkpoint in $chkpt_file");
    }

    # check on the status
    my $available = 0;
    my $total = 0;
    print "querying availability zones...\n" if $verbose;
    if ( open ( my $zones_fh, '-|', 'ec2-describe-availability-zones' ) ) {
        while ( <$zones_fh> ) {
            if ( /AVAILABILITYZONE[\s\|]+([\w\-]+)[\s\|]+([\w\-]+)[\s\|]+(\d+)\/(\d+)\s+([\w\-]+)[\s\|]+/i ) {
                my ( $zone_name, $zone_status, $zone_available, $zone_total, $zone_instance_type )
                    = ( $1, $2, $3, $4, $5 );
                print "\t$zone_name $zone_status $zone_available/$zone_total $zone_instance_type\n" if $verbose;
                if ( $zone_status =~ /up/i ) {
                    $available += $zone_available;
                    $total += $zone_total;
                }
            }
        }
        close $zones_fh;
    } else {
        error ( "failed to run ec2-describe-availability-zones", 1);
    }

    # try to estimate the earliest expiration, if any
    my $remains = 0; # no expirations expected
    my $earliest = $now; # no start timestamp can be later than this
    my $qualifying_instances = 0; # i.e. not exempt
    foreach my $id ( keys %instances ) {
        my ( $user, $started ) = @{ $instances{$id} };
        next if defined $exempt{$user};
        # count every non-exempt instance, including one first noticed in
        # this very iteration (whose timestamp equals $now)
        $qualifying_instances++;
        $earliest = $started if $started < $earliest;
    }
    if ( $qualifying_instances ) { # else, keep remains==0
        # conservatively, add detection latency and some padding
        $remains = ($earliest + $limit) - $now + $sleep_time + 5;
        # negative number means we're close, so return detection latency
        if ( $remains <= 0 ) { $remains = $sleep_time; }
    }

    # state transition logic with email notifications for entering "down" mode,
    # leaving "down" mode, and "high" mode (except on the first iteration, when
    # we assume the admin knows what's going on)
    if ( $total > 0 ) { # cloud is up
        my $running = $total - $available; # total instances running
        my $next_mode = "low"; # assume low, if high_watermark isn't defined
        if ( defined $high_watermark and $running >= $high_watermark ) {
            $next_mode = "high";
        }
        if ( defined $mode ) { # do not notify on the first iteration, when $mode is undefined
            if ( $mode ne "high" and $next_mode eq "high" ) {
                notify ("heavy load", "Heavy load on the cloud: $running instances, $available available slots\n", 1);
            } elsif ( $mode eq "down" ) {
                notify ("cloud went up", "Cloud Controller reports $available available out of $total total slots\n", 1);
            }
        }
        # always remember the mode, also on the first iteration; otherwise a
        # cloud that is up at start-up would never leave the undefined state
        # and no "heavy load" notification could ever be sent
        $mode = $next_mode;
    } else { # cloud is down
        if ( defined $mode and $mode ne "down" ) {
            notify ("cloud went down", "Cloud Controller could not be contacted or reported 000/000\n", 1);
        }
        $mode = "down";
    }

    # write the status to a file
    my $status = "total=$total available=$available expiration=$remains\n";
    print $status unless $quiet;
    if ( defined $status_file ) {
        if ( open ( my $status_fh, '>', $status_file ) ) {
            print {$status_fh} $status;
            close $status_fh;
        } elsif ( $first ) {
            warning ("failed to write to status file $status_file");
        }
    }

    # killin' time...
    # An instance qualifies when its owner is not exempt and it is either past
    # the time limit or was counted after its owner had already reached
    # max-instances (which of a user's instances those are depends on hash
    # iteration order).
    my $to_kill = "";
    my %count = ();
    ID: foreach my $key ( keys %instances ) {
        my ( $user, $started ) = @{ $instances{$key} };
        $count{$user}++;
        if ( defined $exempt{$user} ) { next ID }
        if ( ( $started + $limit ) >= $now
             and $count{$user} <= $max_instances ) { next ID }
        $to_kill .= " $key";
    }
    foreach my $user ( keys %count ) {
        if ( $count{$user} > $max_instances ) {
            print "user $user has $count{$user} instances, exceeding max-instances=$max_instances\n" unless $quiet;
        }
    }
    if ( $to_kill ne "" ) {
        print "instances qualifying for termination:$to_kill...\n" unless $quiet;
        if ( not $peace ) {
            # the ids come from the i-\w+ capture above, so they are safe to
            # pass on a shell command line
            my $error = shell_command_status ("ec2-terminate-instances $to_kill");
            print "ec2-terminate-instances returned $error\n" unless $quiet;
        }
    }

    $first = 0;
    if ( $daemon ) {
        print "sleeping for $sleep_time seconds...\n" if $verbose;
        sleep ($sleep_time);
    }
} while ( $daemon );
363
364 ########################################################################
365
# run a shell command synchronously and return everything it wrote to STDOUT
# as one string
#
#   $cmd - command line; it is handed to the shell when it contains shell
#          metacharacters
#
# STDERR of the command is NOT captured (it goes wherever this script's
# STDERR goes); append "2>&1" to $cmd if it is wanted in the result.
# Returns "" if the command could not be started or printed nothing.
sub shell_command_output {
    my ( $cmd ) = @_;
    my $output = "";

    $verbose and print "executing: $cmd\n";
    if ( open ( my $pipe, '-|', $cmd ) ) {
        local $/; # slurp mode, restored automatically when the block is left
        my $slurped = <$pipe>;
        $output = $slurped if defined $slurped;
        close $pipe;
    }
    return $output;
}
381
# Execute $cmd synchronously through system() and hand back the raw wait
# status ($?): 0 on success, -1 when the command could not be started,
# otherwise the usual encoding (signal in the low 7 bits, core flag in bit 7,
# exit code in the high byte).  Problems are reported on STDERR; a plain
# non-zero exit code is reported only in verbose mode.
sub shell_command_status {
    my ( $cmd ) = @_;

    print "executing: $cmd\n" if $verbose;
    system ($cmd);
    my $wait_status = $?;
    return $wait_status if $wait_status == 0;

    my $signal = $wait_status & 127;
    if ( $wait_status == -1 ) {
        print STDERR "failed to execute: $!\n";
    } elsif ( $signal ) {
        my $core = ( $wait_status & 128 ) ? 'with' : 'without';
        printf STDERR "child died with signal %d, %s coredump\n", $signal, $core;
    } elsif ( $verbose ) {
        printf STDERR "child exited with value %d\n", $wait_status >> 8;
    }
    return $wait_status;
}
399
# send a notification e-mail through mailx; does nothing unless a
# notification address was configured ($notify_email)
#
#   $subject - short description, appended to "Eucalyptus watchdog ALERT: "
#   $body    - text of the message
#   $notify  - accepted for compatibility with existing callers; not used
#
# mailx is started with an argument list rather than a shell command line, so
# quotes or other shell metacharacters in the subject or in the address
# cannot break the command or inject additional ones.
sub notify {
    my ( $subject, $body, $notify ) = @_;

    if ( defined $notify_email ) {

        $verbose and print "notifying $notify_email\n";
        my $full_subject = "Eucalyptus watchdog ALERT: $subject";

        if ( open ( my $mail, '|-', 'mailx', '-s', $full_subject, $notify_email ) ) {
            print {$mail} "$body";
            # close waits for mailx and fails if it exited with an error
            close $mail or print STDERR "failed to run mailx for notification\n";
        } else {
            print STDERR "failed to run mailx for notification\n";
        }
    }
}
416
# Report a non-fatal problem on STDERR, prefixed with "WARNING: ".
sub warning {
    my $message = shift;

    print STDERR "WARNING: " . $message . "\n";
}
422
# Report a fatal problem on STDERR, prefixed with "ERROR: ", optionally send
# it through notify() as well (when the second argument is defined and
# non-zero), then terminate the script with exit status 1.
sub error {
    my ( $message, $send_mail ) = @_;

    print STDERR "ERROR: $message\n";
    notify ("euca_watchdog.pl died", "euca_watchdog.pl died due to error:\n$message\n")
        if defined $send_mail and $send_mail != 0;
    exit 1;
}
Something went wrong with that request. Please try again.