Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

*** empty log message ***

  • Loading branch information...
commit 3268e75380bd51016371b1cb6928a47cd6c7bbf0 1 parent c8abd2c
massie authored
View
32 monitor-core/gmond/cmdline.c
@@ -47,6 +47,8 @@ cmdline_parser_print_help (void)
printf(" -d, --debug=INT Debug level. If greater than zero, daemon will stay \n in foreground. (default=`0')\n");
printf(" -f, --foreground Run in foreground (don't daemonize) (default=off)\n");
printf(" -t, --default_config Print the default configuration to stdout and exit \n (default=off)\n");
+ printf(" -m, --metrics Print the list of metrics this gmond supports \n (default=off)\n");
+ printf(" -b, --bandwidth Calculate minimum bandwidth use for configuration \n (default=off)\n");
}
@@ -77,12 +79,16 @@ cmdline_parser (int argc, char * const *argv, struct gengetopt_args_info *args_i
args_info->debug_given = 0 ;
args_info->foreground_given = 0 ;
args_info->default_config_given = 0 ;
+ args_info->metrics_given = 0 ;
+ args_info->bandwidth_given = 0 ;
#define clear_args() { \
args_info->conf_arg = gengetopt_strdup("/etc/gmond.conf") ;\
args_info->location_arg = gengetopt_strdup("0,0,0") ;\
args_info->debug_arg = 0 ;\
args_info->foreground_flag = 0;\
args_info->default_config_flag = 0;\
+ args_info->metrics_flag = 0;\
+ args_info->bandwidth_flag = 0;\
}
clear_args();
@@ -105,11 +111,13 @@ cmdline_parser (int argc, char * const *argv, struct gengetopt_args_info *args_i
{ "debug", 1, NULL, 'd' },
{ "foreground", 0, NULL, 'f' },
{ "default_config", 0, NULL, 't' },
+ { "metrics", 0, NULL, 'm' },
+ { "bandwidth", 0, NULL, 'b' },
{ NULL, 0, NULL, 0 }
};
stop_char = 0;
- c = getopt_long (argc, argv, "hVc:l:d:ft", long_options, &option_index);
+ c = getopt_long (argc, argv, "hVc:l:d:ftmb", long_options, &option_index);
if (c == -1) break; /* Exit from `while (1)' loop. */
@@ -184,6 +192,28 @@ cmdline_parser (int argc, char * const *argv, struct gengetopt_args_info *args_i
args_info->default_config_flag = !(args_info->default_config_flag);
break;
+ case 'm': /* Print the list of metrics this gmond supports. */
+ if (args_info->metrics_given)
+ {
+ fprintf (stderr, "%s: `--metrics' (`-m') option given more than once\n", CMDLINE_PARSER_PACKAGE);
+ clear_args ();
+ exit (EXIT_FAILURE);
+ }
+ args_info->metrics_given = 1;
+ args_info->metrics_flag = !(args_info->metrics_flag);
+ break;
+
+ case 'b': /* Calculate minimum bandwidth use for configuration. */
+ if (args_info->bandwidth_given)
+ {
+ fprintf (stderr, "%s: `--bandwidth' (`-b') option given more than once\n", CMDLINE_PARSER_PACKAGE);
+ clear_args ();
+ exit (EXIT_FAILURE);
+ }
+ args_info->bandwidth_given = 1;
+ args_info->bandwidth_flag = !(args_info->bandwidth_flag);
+ break;
+
case 0: /* Long option with no short option */
View
4 monitor-core/gmond/cmdline.h
@@ -29,6 +29,8 @@ struct gengetopt_args_info
int debug_arg; /* Debug level. If greater than zero, daemon will stay in foreground. (default='0'). */
int foreground_flag; /* Run in foreground (don't daemonize) (default=off). */
int default_config_flag; /* Print the default configuration to stdout and exit (default=off). */
+ int metrics_flag; /* Print the list of metrics this gmond supports (default=off). */
+ int bandwidth_flag; /* Calculate minimum bandwidth use for configuration (default=off). */
int help_given ; /* Whether help was given. */
int version_given ; /* Whether version was given. */
@@ -37,6 +39,8 @@ struct gengetopt_args_info
int debug_given ; /* Whether debug was given. */
int foreground_given ; /* Whether foreground was given. */
int default_config_given ; /* Whether default_config was given. */
+ int metrics_given ; /* Whether metrics was given. */
+ int bandwidth_given ; /* Whether bandwidth was given. */
} ;
View
2  monitor-core/gmond/cmdline.sh
@@ -11,6 +11,8 @@ option "location" l "Location of this host in the cluster 'rack,rank,plane'." st
option "debug" d "Debug level. If greater than zero, daemon will stay in foreground." int default="0" no
option "foreground" f "Run in foreground (don't daemonize)" flag off
option "default_config" t "Print the default configuration to stdout and exit" flag off
+option "metrics" m "Print the list of metrics this gmond supports" flag off
+option "bandwidth" b "Calculate minimum bandwidth use for configuration" flag off
#Usage (a little tutorial)
#
View
347 monitor-core/gmond/conf.c
@@ -1,38 +1,325 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
#include <stdio.h>
#include "conf.h"
+#include <apr_pools.h>
+#include <apr_strings.h>
-#if 0
-extern cfg_t *config_file;
+char *default_gmond_configuration = NULL;
-/* This function is necessary only because I need
- * to know if the user set metric thresholds */
-int metric_validate_func(cfg_t *cfg, cfg_opt_t *opt)
-{
- char buf[1024];
- snprintf(buf, 1024,"%s_given", opt->name);
- cfg_setbool(cfg, buf, 1);
- return 0;
-}
+#define BASE_GMOND_CONFIGURATION "\
+/* This configuration is as close to 2.5.x default behavior as possible \n\
+ The values closely match ./gmond/metric.h definitions in 2.5.x */ \n\
+globals { \n\
+ setuid = no \n\
+ user = nobody \n\
+} \n\
+\n\
+/* Feel free to specify as many udp_send_channels as you like. Gmond \n\
+ used to only support having a single channel */ \n\
+udp_send_channel { \n\
+ mcast_join = 239.2.11.71 \n\
+ port = 8649 \n\
+} \n\
+\n\
+/* You can specify as many udp_recv_channels as you like as well. */ \n\
+udp_recv_channel { \n\
+ mcast_join = 239.2.11.71 \n\
+ port = 8649 \n\
+ bind = 239.2.11.71 \n\
+} \n\
+\n\
+/* You can specify as many tcp_accept_channels as you like to share \n\
+ an xml description of the state of the cluster */ \n\
+tcp_accept_channel { \n\
+ port = 8649 \n\
+} \n\
+\n\
+\n\
+/* The old internal 2.5.x metric array has been replaced by the following \n\
+ collection_group directives. What follows is the default behavior for \n\
+ collecting and sending metrics that is as close to 2.5.x behavior as \n\
+ possible. */\n\
+\n\
+/* This collection group will cause a heartbeat (or beacon) to be sent every \n\
+ 20 seconds. In the heartbeat is the GMOND_STARTED data which expresses \n\
+ the age of the running gmond. */ \n\
+collection_group { \n\
+ collect_once = yes \n\
+ time_threshold = 20 \n\
+ metric { \n\
+ name = \"heartbeat\" \n\
+ } \n\
+} \n\
+\n\
+/* This collection group will send general info about this host every 1200 secs. \n\
+ This information doesn't change between reboots and is only collected once. */ \n\
+collection_group { \n\
+ collect_once = yes \n\
+ time_threshold = 1200 \n\
+ metric { \n\
+ name = \"cpu_num\" \n\
+ } \n\
+ metric { \n\
+ name = \"cpu_speed\" \n\
+ } \n\
+ metric { \n\
+ name = \"mem_total\" \n\
+ } \n\
+ /* Should this be here? Swap can be added/removed between reboots. */ \n\
+ metric { \n\
+ name = \"swap_total\" \n\
+ } \n\
+ metric { \n\
+ name = \"boottime\" \n\
+ } \n\
+ metric { \n\
+ name = \"machine_type\" \n\
+ } \n\
+ metric { \n\
+ name = \"os_name\" \n\
+ } \n\
+ metric { \n\
+ name = \"os_release\" \n\
+ } \n\
+ metric { \n\
+ name = \"location\" \n\
+ } \n\
+} \n\
+\n\
+/* This collection group will send the status of gexecd for this host every 300 secs */\n\
+/* Unlike 2.5.x the default behavior is to report gexecd OFF. */ \n\
+collection_group { \n\
+ collect_once = yes \n\
+ time_threshold = 300 \n\
+ metric { \n\
+ name = \"gexec\" \n\
+ } \n\
+} \n\
+\n\
+/* This collection group will collect the CPU and load status info every 20 secs. \n\
+ The time threshold is set to 90 seconds. In honesty, this time_threshold could be \n\
+ set significantly higher to reduce unneccessary network chatter. */ \n\
+collection_group { \n\
+ collect_every = 20 \n\
+ time_threshold = 90 \n\
+ /* CPU status */ \n\
+ metric { \n\
+ name = \"cpu_user\" \n\
+ value_threshold = \"1.0\" \n\
+ } \n\
+ metric { \n\
+ name = \"cpu_system\" \n\
+ value_threshold = \"1.0\" \n\
+ } \n\
+ metric { \n\
+ name = \"cpu_idle\" \n\
+ value_threshold = \"5.0\" \n\
+ } \n\
+ metric { \n\
+ name = \"cpu_nice\" \n\
+ value_threshold = \"1.0\" \n\
+ } \n\
+ metric { \n\
+ name = \"cpu_aidle\" \n\
+ value_threshold = \"5.0\" \n\
+ } \n\
+ /* Load Averages */ \n\
+ metric { \n\
+ name = \"load_one\" \n\
+ value_threshold = \"1.0\" \n\
+ } \n\
+ metric { \n\
+ name = \"load_five\" \n\
+ value_threshold = \"1.0\" \n\
+ } \n\
+ metric { \n\
+ name = \"load_fifteen\" \n\
+ value_threshold = \"1.0\" \n\
+ }\n\
+} \n\
+\n\
+/* This group collects the number of running and total processes */ \n\
+collection_group { \n\
+ collect_every = 80 \n\
+ time_threshold = 950 \n\
+ metric { \n\
+ name = \"proc_run\" \n\
+ value_threshold = \"1.0\" \n\
+ } \n\
+ metric { \n\
+ name = \"proc_total\" \n\
+ value_threshold = \"1.0\" \n\
+ } \n\
+}\n\
+\n\
+/* This collection group grabs the volatile memory metrics every 40 secs and \n\
+ sends them at least every 80 secs. This time_threshold can be increased \n\
+ significantly to reduce unneeded network traffic. */ \n\
+collection_group { \n\
+ collect_every = 40 \n\
+ time_threshold = 180 \n\
+ metric { \n\
+ name = \"mem_free\" \n\
+ value_threshold = \"1024.0\" \n\
+ } \n\
+ metric { \n\
+ name = \"mem_shared\" \n\
+ value_threshold = \"1024.0\" \n\
+ } \n\
+ metric { \n\
+ name = \"mem_buffers\" \n\
+ value_threshold = \"1024.0\" \n\
+ } \n\
+ metric { \n\
+ name = \"mem_cached\" \n\
+ value_threshold = \"1024.0\" \n\
+ } \n\
+ metric { \n\
+ name = \"swap_free\" \n\
+ value_threshold = \"1024.0\" \n\
+ } \n\
+} \n\
+\n\
+"
-void
-init_validate_funcs(void)
-{
- /* This is annoying but necessary. I need to know if the value (a float) was set by the user. */
- cfg_set_validate_func( config_file, "collection_group|metric|absolute_minimum", metric_validate_func);
- cfg_set_validate_func( config_file, "collection_group|metric|absolute_minimum_alert", metric_validate_func);
- cfg_set_validate_func( config_file, "collection_group|metric|absolute_minimum_warning", metric_validate_func);
- cfg_set_validate_func( config_file, "collection_group|metric|absolute_maximum_warning", metric_validate_func);
- cfg_set_validate_func( config_file, "collection_group|metric|absolute_maximum_alert", metric_validate_func);
- cfg_set_validate_func( config_file, "collection_group|metric|absolute_maximum", metric_validate_func);
- cfg_set_validate_func( config_file, "collection_group|metric|relative_change_normal", metric_validate_func);
- cfg_set_validate_func( config_file, "collection_group|metric|relative_change_warning", metric_validate_func);
- cfg_set_validate_func( config_file, "collection_group|metric|relative_change_alert", metric_validate_func);
-}
+#define SOLARIS_SPECIFIC_CONFIGURATION "\
+/* solaris specific metrics begin */ \n\
+collection_group { \n\
+ collect_every = 950 \n\
+ time_threshold = 3800 \n\
+ metric { \n\
+ name = \"cpu_wio\" \n\
+ value_threshold = \"5.0\" \n\
+ } \n\
+} \n\
+\n\
+collection_group { \n\
+ collect_every = 20 \n\
+ time_threshold = 90 \n\
+ metric { \n\
+ name = \"rcache\" \n\
+ value_threshold = 1.0 \n\
+ } \n\
+ metric { \n\
+ name = \"wcache\" \n\
+ value_threshold = 1.0 \n\
+ } \n\
+ metric { \n\
+ name = \"phread_sec\" \n\
+ value_threshold = 1.0 \n\
+ } \n\
+ metric { \n\
+ name = \"phwrite_sec\" \n\
+ value_threshold = 1.0 \n\
+ }\n\
+}\n\
+/* end solaris specific metrics */ \n\
+\n\
+"
+#define LINUX_FREEBSD_COMMON_CONFIG "\
+collection_group { \n\
+ collect_every = 40 \n\
+ time_threshold = 300 \n\
+ metric { \n\
+ name = \"bytes_out\" \n\
+ value_threshold = 4096 \n\
+ } \n\
+ metric { \n\
+ name = \"bytes_in\" \n\
+ value_threshold = 4096 \n\
+ } \n\
+ metric { \n\
+ name = \"pkts_in\" \n\
+ value_threshold = 256 \n\
+ } \n\
+ metric { \n\
+ name = \"pkts_out\" \n\
+ value_threshold = 256 \n\
+ } \n\
+}\n\
+\n\
+/* Different than 2.5.x default since the old config made no sense */ \n\
+collection_group { \n\
+ collect_every = 1800 \n\
+ time_threshold = 3600 \n\
+ metric { \n\
+ name = \"disk_total\" \n\
+ value_threshold = 1.0 \n\
+ } \n\
+}\n\
+\n\
+collection_group { \n\
+ collect_every = 40 \n\
+ time_threshold = 180 \n\
+ metric { \n\
+ name = \"disk_free\" \n\
+ value_threshold = 1.0 \n\
+ } \n\
+ metric { \n\
+ name = \"part_max_used\" \n\
+ value_threshold = 1.0 \n\
+ } \n\
+}\n\
+\n\
+"
+
+#define HPUX_SPECIFIC_CONFIGURATION "\n\
+collection_group { \n\
+ collect_every = 20 \n\
+ time_threshold = 90 \n\
+ metric { \n\
+ name = \"cpu_intr\" \n\
+ value_threshold = 1.0 \n\
+ } \n\
+ metric { \n\
+ name = \"cpu_ssys\" \n\
+ value_threshold = 1.0 \n\
+ } \n\
+ metric { \n\
+ name = \"cpu_wait\" \n\
+ value_threshold = 1.0 \n\
+ } \n\
+} \n\
+\n\
+collection_group { \n\
+ collect_every = 40 \n\
+ time_threshold = 90 \n\
+ metric { \n\
+ name = \"mem_arm\" \n\
+ value_threshold = 1024.0 \n\
+ } \n\
+ metric { \n\
+ name = \"mem_rm\" \n\
+ value_threshold = 1024.0 \n\
+ } \n\
+ metric { \n\
+ name = \"mem_avm\" \n\
+ value_threshold = 1024.0 \n\
+ } \n\
+ metric { \n\
+ name = \"mem_vm\" \n\
+ value_threshold = 1024.0 \n\
+ } \n\
+}\n\
+\n\
+"
-int
-value_callback(cfg_t *cfg, cfg_opt_t *opt, const char *value, void *result)
/* Builds the default gmond configuration text and stores it in the global
 * default_gmond_configuration.  The text starts as a copy of
 * BASE_GMOND_CONFIGURATION; the SOLARIS, LINUX/FREEBSD and HPUX sections are
 * appended only when the matching preprocessor symbol is non-zero at compile
 * time.  All strings are created with apr_pstrdup/apr_pstrcat using the
 * pool passed in as `context`. */
+void
+build_default_gmond_configuration(apr_pool_t *context)
{
- fprintf(stderr,"CALLED\n");
- return 0;
-}
+ default_gmond_configuration = apr_pstrdup(context, BASE_GMOND_CONFIGURATION);
+#if SOLARIS
+ default_gmond_configuration = apr_pstrcat(context, default_gmond_configuration, SOLARIS_SPECIFIC_CONFIGURATION, NULL);
+#endif
+#if LINUX || FREEBSD
+ default_gmond_configuration = apr_pstrcat(context, default_gmond_configuration, LINUX_FREEBSD_COMMON_CONFIG, NULL);
+#endif
+#if HPUX
+ default_gmond_configuration = apr_pstrcat(context, default_gmond_configuration, HPUX_SPECIFIC_CONFIGURATION, NULL);
#endif
+}
+
+
View
49 monitor-core/gmond/conf.h
@@ -7,47 +7,9 @@ in order for the documentation to be in order with the code
****************************/
#include "confuse.h"
+#include "apr_pools.h"
-#define DEFAULT_GMOND_CONFIGURATION "\
-globals { \n\
- setuid = no \n\
- user = nobody \n\
-} \n\
-udp_send_channel { \n\
- mcast_join = 239.2.11.71 \n\
- port = 8649 \n\
-} \n\
-udp_recv_channel { \n\
- mcast_join = 239.2.11.71 \n\
- port = 8649 \n\
- bind = 239.2.11.71 \n\
-} \n\
-tcp_accept_channel { \n\
- port = 8649 \n\
-} \n\
-collection_group { \n\
- collect_once = yes \n\
- time_threshold = 20 \n\
- metric { \n\
- name = \"heartbeat\" \n\
- } \n\
-} \n\
-collection_group { \n\
- collect_every = 60 \n\
- metric { \n\
- name = \"cpu_user\" \n\
- } \n\
- metric { \n\
- name = \"cpu_system\" \n\
- } \n\
- metric { \n\
- name = \"cpu_idle\" \n\
- } \n\
- metric { \n\
- name = \"cpu_nice\" \n\
- } \n\
-} \n\
-"
+void build_default_gmond_configuration(apr_pool_t *context);
static cfg_opt_t cluster_opts[] = {
CFG_STR("name", NULL, CFGF_NONE ),
@@ -57,6 +19,11 @@ static cfg_opt_t cluster_opts[] = {
CFG_END()
};
/* Options accepted inside the "host" configuration section: a single string
 * option "location" whose default value is "unspecified". */
+static cfg_opt_t host_opts[] = {
+ CFG_STR("location", "unspecified", CFGF_NONE ),
+ CFG_END()
+};
+
static cfg_opt_t globals_opts[] = {
CFG_BOOL("daemonize", 1, CFGF_NONE),
CFG_BOOL("setuid", 1, CFGF_NONE),
@@ -67,6 +34,7 @@ static cfg_opt_t globals_opts[] = {
CFG_BOOL("mute", 0, CFGF_NONE),
CFG_BOOL("deaf", 0, CFGF_NONE),
CFG_INT("host_dmax", 0, CFGF_NONE),
+ CFG_BOOL("gexec", 0, CFGF_NONE),
CFG_END()
};
@@ -115,6 +83,7 @@ static cfg_opt_t collection_group_opts[] = {
static cfg_opt_t gmond_opts[] = {
CFG_SEC("cluster", cluster_opts, CFGF_NONE),
+ CFG_SEC("host", host_opts, CFGF_NONE),
CFG_SEC("globals", globals_opts, CFGF_NONE),
CFG_SEC("udp_send_channel", udp_send_channel_opts, CFGF_MULTI),
CFG_SEC("udp_recv_channel", udp_recv_channel_opts, CFGF_MULTI),
View
163 monitor-core/gmond/gmond.c
@@ -1,3 +1,6 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
@@ -41,8 +44,16 @@ int deaf;
int mute;
/* Cluster tag boolean */
int cluster_tag = 0;
+/* This host's location */
+char *host_location = NULL;
+/* Boolean. Will this host receive gexec requests? */
+int gexec_on = 0;
/* This is tweakable by globals{max_udp_msg_len=...} */
int max_udp_message_len = 1472;
+/* The default configuration for gmond. Found in conf.c. */
+extern char *default_gmond_configuration;
+/* The number of seconds to hold "dead" hosts in the hosts hash */
+int host_dmax = 0;
/* The array for outgoing UDP message channels */
apr_array_header_t *udp_send_array = NULL;
@@ -155,7 +166,7 @@ process_configuration_file(void)
}
/* .. otherwise use our default configuration */
fprintf(stderr,"Using defaults.\n");
- if(cfg_parse_buf(config_file, DEFAULT_GMOND_CONFIGURATION) == CFG_PARSE_ERROR)
+ if(cfg_parse_buf(config_file, default_gmond_configuration) == CFG_PARSE_ERROR)
{
fprintf(stderr,"Your default configuration buffer failed to parse. Exiting.\n");
exit(1);
@@ -171,9 +182,14 @@ process_configuration_file(void)
exit(1);
}
- /* Get the maximum UDP message size */
+
tmp = cfg_getsec( config_file, "globals");
+ /* Get the maximum UDP message size */
max_udp_message_len = cfg_getint( tmp, "max_udp_msg_len");
+ /* Get the gexec status requested */
+ gexec_on = cfg_getbool(tmp, "gexec");
+ /* Get the host dmax ... */
+ host_dmax = cfg_getint( tmp, "host_dmax");
/* Free memory for this configuration file at exit */
if(tilde_expanded)
@@ -522,22 +538,31 @@ Ganglia_host_data_get( char *remoteip, apr_sockaddr_t *sa, Ganglia_message *full
hostdata->last_heard_from = apr_time_now();
}
- fprintf(stderr,"Processing a Ganglia_message from %s\n", hostdata->hostname);
+ debug_msg("Processing a Ganglia_message from %s", hostdata->hostname);
if(fullmsg->id == metric_location)
{
- hostdata->location = fullmsg->Ganglia_message_u.str;
+ /* We have to manage this memory here because.. returning NULL
+ * will not cause Ganglia_message_save to be run. Maybe this
+ * could be done better later i.e should these metrics be
+ * in the host->metrics list instead of the host structure? */
+ if(hostdata->location)
+ {
+ /* Free old location */
+ free(hostdata->location);
+ }
+ /* Save new location */
+ hostdata->location = strdup(fullmsg->Ganglia_message_u.str);
+ debug_msg("Got a location message %s\n", hostdata->location);
+ /* Processing is finished */
return NULL;
}
if(fullmsg->id == metric_heartbeat)
{
/* nothing more needs to be done. we handled the timestamps above. */
hostdata->gmond_started = fullmsg->Ganglia_message_u.u_int;
- fprintf(stderr,"Got a heartbeat message %d\n", hostdata->gmond_started);
- return NULL;
- }
- if(fullmsg->id == metric_gexec)
- {
+ debug_msg("Got a heartbeat message %d\n", hostdata->gmond_started);
+ /* Processing is finished */
return NULL;
}
@@ -1100,42 +1125,37 @@ Ganglia_metric_cb_define( char *name, g_val_t (*cb)(void))
return metric;
}
-#if 0
/* Metric callback registered under the name "gexec".  Stores the string
 * "ON" in val.str when the global gexec_on flag is non-zero and "OFF"
 * otherwise, then returns val by value.  snprintf bounds the write to
 * 32 bytes. */
g_val_t
gexec_func ( void )
{
g_val_t val;
-
- if( gmond_config.no_gexec || ( SUPPORT_GEXEC == 0 ) )
- snprintf(val.str, MAX_G_STRING_SIZE, "%s", "OFF");
+ if( gexec_on )
+ snprintf(val.str, 32, "%s", "ON");
else
- snprintf(val.str, MAX_G_STRING_SIZE, "%s", "ON");
-
+ snprintf(val.str, 32, "%s", "OFF");
return val;
}
-#endif
/* Metric callback registered under the name "heartbeat".  Returns the
 * global `started` timestamp divided by APR_USEC_PER_SEC in val.uint32.
 * NOTE(review): `started` is assigned from apr_time_now() in main, so the
 * result is presumably gmond's start time in whole seconds -- confirm. */
g_val_t
heartbeat_func( void )
{
g_val_t val;
-
val.uint32 = started / APR_USEC_PER_SEC;
- debug_msg("my start_time is %d\n", val.uint32);
return val;
}
-#if 0
g_val_t
location_func(void)
{
g_val_t val;
-
- strncpy(val.str, gmond_config.location, MAX_G_STRING_SIZE);
- debug_msg("my location is %s", val.str);
+ if(!host_location)
+ {
+ cfg_t *host = cfg_getsec(config_file, "host");
+ host_location = cfg_getstr( host, "location");
+ }
+ strncpy(val.str, host_location, 32);
return val;
}
-#endif
/* This function imports the metrics from libmetrics right now but in the future
* we could easily do this via DSO. */
@@ -1164,13 +1184,6 @@ setup_metric_callbacks( void )
Ganglia_metric_cb_define("cpu_system", cpu_system_func);
Ganglia_metric_cb_define("cpu_idle", cpu_idle_func);
Ganglia_metric_cb_define("cpu_aidle", cpu_aidle_func);
- Ganglia_metric_cb_define("bytes_in", bytes_in_func);
- Ganglia_metric_cb_define("bytes_out", bytes_out_func);
- Ganglia_metric_cb_define("pkts_in", pkts_in_func);
- Ganglia_metric_cb_define("pkts_out", pkts_out_func);
- Ganglia_metric_cb_define("disk_total", disk_total_func);
- Ganglia_metric_cb_define("disk_free", disk_free_func);
- Ganglia_metric_cb_define("part_max_used", part_max_used_func);
Ganglia_metric_cb_define("load_one", load_one_func);
Ganglia_metric_cb_define("load_five", load_five_func);
Ganglia_metric_cb_define("load_fifteen", load_fifteen_func);
@@ -1182,15 +1195,51 @@ setup_metric_callbacks( void )
Ganglia_metric_cb_define("mem_cached", mem_cached_func);
Ganglia_metric_cb_define("swap_free", swap_free_func);
+ /* These are "internal" metrics for host heartbeat,location,gexec */
Ganglia_metric_cb_define("heartbeat", heartbeat_func);
+ Ganglia_metric_cb_define("location", location_func);
+ Ganglia_metric_cb_define("gexec", gexec_func);
/* Add platform specific metrics here... */
+#if SOLARIS
+ Ganglia_metric_cb_define("bread_sec", bread_sec_func);
+ Ganglia_metric_cb_define("bwrite_sec", bwrite_sec_func);
+ Ganglia_metric_cb_define("lread_sec", lread_sec_func);
+ Ganglia_metric_cb_define("lwrite_sec", lwrite_sec_func);
+ Ganglia_metric_cb_define("phread_sec", phread_sec_func);
+ Ganglia_metric_cb_define("phwrite_sec", phwrite_sec_func);
+ Ganglia_metric_cb_define("rcache", rcache_func);
+ Ganglia_metric_cb_define("wcache", wcache_func);
+ Ganglia_metric_cb_define("cpu_wio", cpu_wio_func);
+#endif
+
+#if LINUX || FREEBSD
+ Ganglia_metric_cb_define("bytes_in", bytes_in_func);
+ Ganglia_metric_cb_define("bytes_out", bytes_out_func);
+ Ganglia_metric_cb_define("pkts_in", pkts_in_func);
+ Ganglia_metric_cb_define("pkts_out", pkts_out_func);
+ Ganglia_metric_cb_define("disk_total", disk_total_func);
+ Ganglia_metric_cb_define("disk_free", disk_free_func);
+ Ganglia_metric_cb_define("part_max_used", part_max_used_func);
+#endif
+
+#if HPUX
+ Ganglia_metric_cb_define("cpu_intr", cpu_intr_func);
+ Ganglia_metric_cb_define("cpu_ssys", cpu_ssys_func);
+ Ganglia_metric_cb_define("cpu_wait", cpu_wait_func);
+ Ganglia_metric_cb_define("mem_arm", mem_arm_func);
+ Ganglia_metric_cb_define("mem_rm", mem_rm_func);
+ Ganglia_metric_cb_define("mem_avm", mem_avm_func);
+ Ganglia_metric_cb_define("mem_vm", mem_vm_func);
+#endif
+
}
-void
+double
setup_collection_groups( void )
{
int i, num_collection_groups = cfg_size( config_file, "collection_group" );
+ double bytes_per_sec = 0;
/* Create the collection group array */
collection_groups = apr_array_make( global_context, num_collection_groups,
@@ -1273,6 +1322,9 @@ setup_collection_groups( void )
}
memset( &(metric_cb->last), 0, sizeof(g_val_t));
+ /* Calculate the bandwidth this metric will use */
+ bytes_per_sec += ( (double)metric_info->msg_size / (double)group->time_threshold );
+
/* Push this metric onto the metric_array for this group */
*(Ganglia_metric_callback **)apr_array_push(group->metric_array) = metric_cb;
}
@@ -1280,6 +1332,8 @@ setup_collection_groups( void )
/* Save the collection group to the collection group array */
*(Ganglia_collection_group **)apr_array_push(collection_groups) = group;
}
+
+ return bytes_per_sec;
}
void
@@ -1453,6 +1507,23 @@ process_collection_groups( apr_time_t now )
return next;
}
+/* Write the name of every metric callback stored in the metric_callbacks
+ * hash to stdout, one name per line. */
+static void
+print_metric_list( void )
+{
+ apr_hash_index_t *cursor = apr_hash_first(global_context, metric_callbacks);
+
+ while(cursor != NULL)
+ {
+ void *entry = NULL;
+ Ganglia_metric_callback *callback;
+
+ /* Only the stored value is needed; key and key length are skipped. */
+ apr_hash_this(cursor, NULL, NULL, &entry);
+ callback = entry;
+ fprintf(stdout, "%s\n", callback->name);
+ cursor = apr_hash_next(cursor);
+ }
+}
+
int
main ( int argc, char *argv[] )
{
@@ -1461,23 +1532,43 @@ main ( int argc, char *argv[] )
/* Mark the time this gmond started */
started = apr_time_now();
+ /* Initializes the apr library in ./srclib/apr */
+ initialize_apr_library();
+
+ /* Builds a default configuration based on platform */
+ build_default_gmond_configuration(global_context);
+
if (cmdline_parser (argc, argv, &args_info) != 0)
exit(1) ;
if(args_info.default_config_flag)
{
- fprintf(stdout, DEFAULT_GMOND_CONFIGURATION);
+ fprintf(stdout, default_gmond_configuration);
+ fflush( stdout );
+ exit(0);
+ }
+
+ if(args_info.metrics_flag)
+ {
+ setup_metric_callbacks();
+ print_metric_list();
fflush( stdout );
exit(0);
}
process_configuration_file();
+ if(args_info.bandwidth_flag)
+ {
+ double bytes_per_sec;
+ setup_metric_callbacks();
+ bytes_per_sec = setup_collection_groups();
+ fprintf(stdout, "%f bytes/sec\n", bytes_per_sec);
+ exit(0);
+ }
+
daemonize_if_necessary( argv );
- /* Initializes the apr library in ./srclib/apr */
- initialize_apr_library();
-
/* Collect my hostname */
apr_gethostname( myname, APRMAXHOSTLEN+1, global_context);
@@ -1511,7 +1602,7 @@ main ( int argc, char *argv[] )
{
for(; mute || now < next_collection;)
{
- poll_listen_channels(mute? -1: next_collection - now);
+ poll_listen_channels(mute? 60 * APR_USEC_PER_SEC: next_collection - now);
now = apr_time_now();
}
}
View
218 monitor-core/gmond/gmond.conf
@@ -1,51 +1,217 @@
-/* Global behavior */
+/* This configuration is as close to 2.5.x default behavior as possible
+ The values closely match ./gmond/metric.h definitions in 2.5.x */
globals {
- setuid = no
- user = nobody
+ setuid = no
+ user = nobody
}
-/* IO Channels */
+/* Feel free to specify as many udp_send_channels as you like. Gmond
+ used to only support having a single channel */
udp_send_channel {
- ip = 127.0.0.1
+ mcast_join = 239.2.11.71
port = 8649
}
+
+/* You can specify as many udp_recv_channels as you like as well. */
udp_recv_channel {
+ mcast_join = 239.2.11.71
port = 8649
+ bind = 239.2.11.71
}
-tcp_accept_channel {
- port = 8649
-}
-/* Metrics */
-collection_group {
- collect_once = true
- time_threshold = 20
- metric {
- name = "heartbeat"
- }
-}
+/* You can specify as many tcp_accept_channels as you like to share
+ an xml description of the state of the cluster */
+tcp_accept_channel {
+ port = 8649
+}
+
+
+/* The old internal 2.5.x metric array has been replaced by the following
+ collection_group directives. What follows is the default behavior for
+ collecting and sending metrics that is as close to 2.5.x behavior as
+ possible. */
+
+/* This collection group will cause a heartbeat (or beacon) to be sent every
+ 20 seconds. In the heartbeat is the GMOND_STARTED data which expresses
+ the age of the running gmond. */
+collection_group {
+ collect_once = yes
+ time_threshold = 20
+ metric {
+ name = "heartbeat"
+ }
+}
+
+/* This collection group will send general info about this host every 1200 secs.
+ This information doesn't change between reboots and is only collected once. */
collection_group {
- collect_every = 2
- time_threshold = 60
+ collect_once = yes
+ time_threshold = 1200
+ metric {
+ name = "cpu_num"
+ }
+ metric {
+ name = "cpu_speed"
+ }
+ metric {
+ name = "mem_total"
+ }
+ /* Should this be here? Swap can be added/removed between reboots. */
+ metric {
+ name = "swap_total"
+ }
+ metric {
+ name = "boottime"
+ }
+ metric {
+ name = "machine_type"
+ }
+ metric {
+ name = "os_name"
+ }
+ metric {
+ name = "os_release"
+ }
+ metric {
+ name = "location"
+ }
+}
+
+/* This collection group will send the status of gexecd for this host every 300 secs */
+/* Unlike 2.5.x the default behavior is to report gexecd OFF. */
+collection_group {
+ collect_once = yes
+ time_threshold = 300
+ metric {
+ name = "gexec"
+ }
+}
+
+/* This collection group will collect the CPU and load status info every 20 secs.
+ The time threshold is set to 90 seconds. In honesty, this time_threshold could be
+ set significantly higher to reduce unnecessary network chatter. */
+collection_group {
+ collect_every = 20
+ time_threshold = 90
+ /* CPU status */
metric {
name = "cpu_user"
+ value_threshold = "1.0"
}
metric {
name = "cpu_system"
+ value_threshold = "1.0"
}
metric {
name = "cpu_idle"
+ value_threshold = "5.0"
}
metric {
name = "cpu_nice"
+ value_threshold = "1.0"
}
-}
-collection_group {
- collect_every = 5
- metric {
- name = "mem_free"
- }
- metric {
- name = "mem_total"
+ metric {
+ name = "cpu_aidle"
+ value_threshold = "5.0"
+ }
+ /* Load Averages */
+ metric {
+ name = "load_one"
+ value_threshold = "1.0"
+ }
+ metric {
+ name = "load_five"
+ value_threshold = "1.0"
+ }
+ metric {
+ name = "load_fifteen"
+ value_threshold = "1.0"
}
+}
+
+/* This group collects the number of running and total processes */
+collection_group {
+ collect_every = 80
+ time_threshold = 950
+ metric {
+ name = "proc_run"
+ value_threshold = "1.0"
+ }
+ metric {
+ name = "proc_total"
+ value_threshold = "1.0"
+ }
+}
+
+/* This collection group grabs the volatile memory metrics every 40 secs and
+ sends them at least every 180 secs. This time_threshold can be increased
+ significantly to reduce unneeded network traffic. */
+collection_group {
+ collect_every = 40
+ time_threshold = 180
+ metric {
+ name = "mem_free"
+ value_threshold = "1024.0"
+ }
+ metric {
+ name = "mem_shared"
+ value_threshold = "1024.0"
+ }
+ metric {
+ name = "mem_buffers"
+ value_threshold = "1024.0"
+ }
+ metric {
+ name = "mem_cached"
+ value_threshold = "1024.0"
+ }
+ metric {
+ name = "swap_free"
+ value_threshold = "1024.0"
+ }
+}
+
+collection_group {
+ collect_every = 40
+ time_threshold = 300
+ metric {
+ name = "bytes_out"
+ value_threshold = 4096
+ }
+ metric {
+ name = "bytes_in"
+ value_threshold = 4096
+ }
+ metric {
+ name = "pkts_in"
+ value_threshold = 256
+ }
+ metric {
+ name = "pkts_out"
+ value_threshold = 256
+ }
}
+
+/* Different than 2.5.x default since the old config made no sense */
+collection_group {
+ collect_every = 1800
+ time_threshold = 3600
+ metric {
+ name = "disk_total"
+ value_threshold = 1.0
+ }
+}
+
+collection_group {
+ collect_every = 40
+ time_threshold = 180
+ metric {
+ name = "disk_free"
+ value_threshold = 1.0
+ }
+ metric {
+ name = "part_max_used"
+ value_threshold = 1.0
+ }
+}
+
View
3  monitor-core/lib/apr_net.h
@@ -14,6 +14,9 @@ APR_DECLARE(apr_status_t)
apr_sockaddr_ip_buffer_get(char *addr, int len, apr_sockaddr_t *sockaddr);
apr_socket_t *
+create_mcast_client(apr_pool_t *context, char *mcast_ip, apr_port_t port, int ttl);
+
+apr_socket_t *
create_mcast_server(apr_pool_t *context, char *mcast_ip, apr_port_t port, char *bind, char *interface);
apr_socket_t *
View
84 monitor-core/lib/protocol.x
@@ -1,3 +1,5 @@
+#define UDP_HEADER_SIZE 28
+
enum Ganglia_value_types {
GANGLIA_VALUE_UNKNOWN,
GANGLIA_VALUE_STRING,
@@ -121,6 +123,7 @@ struct Ganglia_25metric
string units<32>;
string slope<32>;
string fmt<32>;
+ int msg_size;
};
#ifdef RPC_HDR
@@ -130,42 +133,42 @@ struct Ganglia_25metric
#ifdef RPC_XDR
% Ganglia_25metric ganglia_25_metric_array[GANGLIA_NUM_25_METRICS] = {
-% { metric_user_defined, "gmetric", 0, GANGLIA_VALUE_UNKNOWN, "", "" },
-% { metric_cpu_num, "cpu_num", 1200, GANGLIA_VALUE_UNSIGNED_SHORT, "CPUs", "zero", "%hu"},
-% { metric_cpu_speed, "cpu_speed", 1200, GANGLIA_VALUE_UNSIGNED_INT, "MHz", "zero", "%hu"},
-% { metric_mem_total, "mem_total", 1200, GANGLIA_VALUE_UNSIGNED_INT, "KB", "zero", "%u"},
-% { metric_swap_total, "swap_total", 1200, GANGLIA_VALUE_UNSIGNED_INT, "KB", "zero", "%u"},
-% { metric_boottime, "boottime", 1200, GANGLIA_VALUE_UNSIGNED_INT, "s", "zero", "%u"},
-% { metric_sys_clock, "sys_clock", 1200, GANGLIA_VALUE_UNSIGNED_INT, "s", "zero", "%u"},
-% { metric_machine_type, "machine_type", 1200, GANGLIA_VALUE_STRING, "", "zero", "%s"},
-% { metric_os_name, "os_name", 1200, GANGLIA_VALUE_STRING, "", "zero", "%s"},
-% { metric_os_release, "os_release", 1200, GANGLIA_VALUE_STRING, "", "zero", "%s"},
-% { metric_cpu_user, "cpu_user", 90, GANGLIA_VALUE_FLOAT, "%", "both", "%.1f"},
-% { metric_cpu_nice, "cpu_nice", 90, GANGLIA_VALUE_FLOAT, "%", "both", "%.1f"},
-% { metric_cpu_system, "cpu_system", 90, GANGLIA_VALUE_FLOAT, "%", "both", "%.1f"},
-% { metric_cpu_idle, "cpu_idle", 90, GANGLIA_VALUE_FLOAT, "%", "both", "%.1f"},
-% { metric_cpu_aidle, "cpu_aidle", 3800, GANGLIA_VALUE_FLOAT, "%", "both", "%.1f"},
-% { metric_load_one, "load_one", 70, GANGLIA_VALUE_FLOAT, "", "both", "%.2f"},
-% { metric_load_five, "load_five", 325, GANGLIA_VALUE_FLOAT, "", "both", "%.2f"},
-% { metric_load_fifteen, "load_fifteen", 950, GANGLIA_VALUE_FLOAT, "", "both", "%.2f"},
-% { metric_proc_run, "proc_run", 950, GANGLIA_VALUE_UNSIGNED_INT, "", "both", "%u"},
-% { metric_proc_total, "proc_total", 950, GANGLIA_VALUE_UNSIGNED_INT, "", "both", "%u"},
-% { metric_mem_free, "mem_free", 180, GANGLIA_VALUE_UNSIGNED_INT, "KB", "both", "%u"},
-% { metric_mem_shared, "mem_shared", 180, GANGLIA_VALUE_UNSIGNED_INT, "KB", "both", "%u"},
-% { metric_mem_buffers, "mem_buffers", 180, GANGLIA_VALUE_UNSIGNED_INT, "KB", "both", "%u"},
-% { metric_mem_cached, "mem_cached", 180, GANGLIA_VALUE_UNSIGNED_INT, "KB", "both", "%u"},
-% { metric_swap_free, "swap_free", 180, GANGLIA_VALUE_UNSIGNED_INT, "KB", "both", "%u"},
-% { metric_gexec, "gexec", 300, GANGLIA_VALUE_STRING, "", "zero", "%s"},
-% { metric_heartbeat, "heartbeat", 20, GANGLIA_VALUE_UNSIGNED_INT, "", "", "%u"},
-% { metric_mtu, "mtu", 1200, GANGLIA_VALUE_UNSIGNED_INT, "", "both", "%u"},
-% { metric_location, "location", 1200, GANGLIA_VALUE_STRING, "(x,y,z)", "", "%s"},
-% { metric_bytes_out, "bytes_out", 300, GANGLIA_VALUE_FLOAT, "bytes/sec", "both", "%.2f"},
-% { metric_bytes_in, "bytes_in", 300, GANGLIA_VALUE_FLOAT, "bytes/sec", "both", "%.2f"},
-% { metric_pkts_in, "pkts_in", 300, GANGLIA_VALUE_FLOAT, "packets/sec","both", "%.2f"},
-% { metric_pkts_out, "pkts_out", 300, GANGLIA_VALUE_FLOAT, "packets/sec","both", "%.2f"},
-% { metric_disk_total, "disk_total", 1200, GANGLIA_VALUE_DOUBLE, "GB", "both", "%.3f"},
-% { metric_disk_free, "disk_free", 180, GANGLIA_VALUE_DOUBLE, "GB", "both", "%.3f"},
-% { metric_part_max_used,"part_max_used", 180, GANGLIA_VALUE_FLOAT, "%", "both", "%.1f"}
+% {metric_user_defined, "gmetric", 0, GANGLIA_VALUE_UNKNOWN, "", "", 0 },
+% {metric_cpu_num, "cpu_num", 1200, GANGLIA_VALUE_UNSIGNED_SHORT, "CPUs", "zero", "%hu", UDP_HEADER_SIZE+8},
+% {metric_cpu_speed, "cpu_speed", 1200, GANGLIA_VALUE_UNSIGNED_INT, "MHz", "zero", "%hu", UDP_HEADER_SIZE+8},
+% {metric_mem_total, "mem_total", 1200, GANGLIA_VALUE_UNSIGNED_INT, "KB", "zero", "%u", UDP_HEADER_SIZE+8},
+% {metric_swap_total, "swap_total", 1200, GANGLIA_VALUE_UNSIGNED_INT, "KB", "zero", "%u", UDP_HEADER_SIZE+8},
+% {metric_boottime, "boottime", 1200, GANGLIA_VALUE_UNSIGNED_INT, "s", "zero", "%u", UDP_HEADER_SIZE+8},
+% {metric_sys_clock, "sys_clock", 1200, GANGLIA_VALUE_UNSIGNED_INT, "s", "zero", "%u", UDP_HEADER_SIZE+8},
+% {metric_machine_type, "machine_type",1200, GANGLIA_VALUE_STRING, "", "zero", "%s", UDP_HEADER_SIZE+32},
+% {metric_os_name, "os_name", 1200, GANGLIA_VALUE_STRING, "", "zero", "%s", UDP_HEADER_SIZE+32},
+% {metric_os_release, "os_release", 1200, GANGLIA_VALUE_STRING, "", "zero", "%s", UDP_HEADER_SIZE+32},
+% {metric_cpu_user, "cpu_user", 90, GANGLIA_VALUE_FLOAT, "%", "both", "%.1f",UDP_HEADER_SIZE+8},
+% {metric_cpu_nice, "cpu_nice", 90, GANGLIA_VALUE_FLOAT, "%", "both", "%.1f",UDP_HEADER_SIZE+8},
+% {metric_cpu_system, "cpu_system", 90, GANGLIA_VALUE_FLOAT, "%", "both", "%.1f",UDP_HEADER_SIZE+8},
+% {metric_cpu_idle, "cpu_idle", 90, GANGLIA_VALUE_FLOAT, "%", "both", "%.1f",UDP_HEADER_SIZE+8},
+% {metric_cpu_aidle, "cpu_aidle", 3800, GANGLIA_VALUE_FLOAT, "%", "both", "%.1f",UDP_HEADER_SIZE+8},
+% {metric_load_one, "load_one", 70, GANGLIA_VALUE_FLOAT, "", "both", "%.2f",UDP_HEADER_SIZE+8},
+% {metric_load_five, "load_five", 325, GANGLIA_VALUE_FLOAT, "", "both", "%.2f",UDP_HEADER_SIZE+8},
+% {metric_load_fifteen, "load_fifteen",950, GANGLIA_VALUE_FLOAT, "", "both", "%.2f",UDP_HEADER_SIZE+8},
+% {metric_proc_run, "proc_run", 950, GANGLIA_VALUE_UNSIGNED_INT, "", "both", "%u", UDP_HEADER_SIZE+8},
+% {metric_proc_total, "proc_total", 950, GANGLIA_VALUE_UNSIGNED_INT, "", "both", "%u", UDP_HEADER_SIZE+8},
+% {metric_mem_free, "mem_free", 180, GANGLIA_VALUE_UNSIGNED_INT, "KB", "both", "%u", UDP_HEADER_SIZE+8},
+% {metric_mem_shared, "mem_shared", 180, GANGLIA_VALUE_UNSIGNED_INT, "KB", "both", "%u", UDP_HEADER_SIZE+8},
+% {metric_mem_buffers, "mem_buffers", 180, GANGLIA_VALUE_UNSIGNED_INT, "KB", "both", "%u", UDP_HEADER_SIZE+8},
+% {metric_mem_cached, "mem_cached", 180, GANGLIA_VALUE_UNSIGNED_INT, "KB", "both", "%u", UDP_HEADER_SIZE+8},
+% {metric_swap_free, "swap_free", 180, GANGLIA_VALUE_UNSIGNED_INT, "KB", "both", "%u", UDP_HEADER_SIZE+8},
+% {metric_gexec, "gexec", 300, GANGLIA_VALUE_STRING, "", "zero", "%s", UDP_HEADER_SIZE+32},
+% {metric_heartbeat, "heartbeat", 20, GANGLIA_VALUE_UNSIGNED_INT, "", "", "%u", UDP_HEADER_SIZE+8},
+% {metric_mtu, "mtu", 1200, GANGLIA_VALUE_UNSIGNED_INT, "", "both", "%u", UDP_HEADER_SIZE+8},
+% {metric_location, "location", 1200, GANGLIA_VALUE_STRING, "(x,y,z)", "", "%s", UDP_HEADER_SIZE+12},
+% {metric_bytes_out, "bytes_out", 300, GANGLIA_VALUE_FLOAT, "bytes/sec", "both", "%.2f",UDP_HEADER_SIZE+8},
+% {metric_bytes_in, "bytes_in", 300, GANGLIA_VALUE_FLOAT, "bytes/sec", "both", "%.2f",UDP_HEADER_SIZE+8},
+% {metric_pkts_in, "pkts_in", 300, GANGLIA_VALUE_FLOAT, "packets/sec","both", "%.2f",UDP_HEADER_SIZE+8},
+% {metric_pkts_out, "pkts_out", 300, GANGLIA_VALUE_FLOAT, "packets/sec","both", "%.2f",UDP_HEADER_SIZE+8},
+% {metric_disk_total, "disk_total", 1200, GANGLIA_VALUE_DOUBLE, "GB", "both", "%.3f",UDP_HEADER_SIZE+16},
+% {metric_disk_free, "disk_free", 180, GANGLIA_VALUE_DOUBLE, "GB", "both", "%.3f",UDP_HEADER_SIZE+16},
+% {metric_part_max_used,"part_max_used",180, GANGLIA_VALUE_FLOAT, "%", "both", "%.1f",UDP_HEADER_SIZE+8}
% };
%
% Ganglia_25metric *
@@ -195,12 +198,3 @@ struct Ganglia_25metric
% return NULL;
% }
#endif
-
-
-
-
-
-
-
-
-
Please sign in to comment.
Something went wrong with that request. Please try again.