Permalink
Browse files

detect ECC memory correctable and uncorrectable errors; fixes #1508

  • Loading branch information...
1 parent 1887a9a commit 2ecf423c40b6a774781711d2e6843c44aad05f1c @ktsaou ktsaou committed Jan 11, 2017
Showing with 228 additions and 2 deletions.
  1. +1 −1 CMakeLists.txt
  2. +30 −0 conf.d/health.d/memory.conf
  3. +1 −0 src/Makefile.am
  4. +11 −1 src/plugin_proc.c
  5. +1 −0 src/plugin_proc.h
  6. +184 −0 src/sys_devices_system_edac_mc.c
View
@@ -101,7 +101,7 @@ set(NETDATA_SOURCE_FILES
src/registry_person.c
src/registry_person.h
src/registry_machine.c
- src/registry_machine.h src/registry_internals.c src/registry_init.c src/registry_db.c src/registry_log.c src/proc_uptime.c)
+ src/registry_machine.h src/registry_internals.c src/registry_init.c src/registry_db.c src/registry_log.c src/proc_uptime.c src/sys_devices_system_edac_mc.c)
set(APPS_PLUGIN_SOURCE_FILES
src/appconfig.c
@@ -0,0 +1,30 @@
+
+ alarm: 1hour_ecc_memory_correctable
+ on: mem.ecc_ce
+ lookup: sum -10m unaligned
+ units: errors
+ every: 1m
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ info: number of ECC correctable errors during the last 10 minutes
+ to: sysadmin
+
+ alarm: 1hour_ecc_memory_uncorrectable
+ on: mem.ecc_ue
+ lookup: sum -10m unaligned
+ units: errors
+ every: 1m
+ crit: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ info: number of ECC uncorrectable errors during the last 10 minutes
+ to: sysadmin
+
+ alarm: 1hour_memory_hw_corrupted
+ on: mem.hwcorrupt
+ calc: $HardwareCorrupted
+ units: MB
+ every: 10s
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ info: amount of memory corrupted due to a hardware failure
+ to: sysadmin
View
@@ -47,6 +47,7 @@ netdata_SOURCES = \
popen.c popen.h \
socket.c socket.h \
sys_fs_cgroup.c \
+ sys_devices_system_edac_mc.c \
procfile.c procfile.h \
proc_self_mountinfo.c proc_self_mountinfo.h \
registry.c registry.h \
View
@@ -30,7 +30,8 @@ void *proc_main(void *ptr)
int vdo_proc_vmstat = !config_get_boolean("plugin:proc", "/proc/vmstat", 1);
int vdo_proc_net_rpc_nfs = !config_get_boolean("plugin:proc", "/proc/net/rpc/nfs", 1);
int vdo_proc_net_rpc_nfsd = !config_get_boolean("plugin:proc", "/proc/net/rpc/nfsd", 1);
- int vdo_proc_sys_kernel_random_entropy_avail = !config_get_boolean("plugin:proc", "/proc/sys/kernel/random/entropy_avail", 1);
+ int vdo_proc_sys_kernel_random_entropy_avail = !config_get_boolean("plugin:proc", "/proc/sys/kernel/random/entropy_avail", 1);
+ int vdo_proc_sys_devices_system_edac_mc = !config_get_boolean("plugin:proc", "/sys/devices/system/edac/mc", 1);
int vdo_proc_interrupts = !config_get_boolean("plugin:proc", "/proc/interrupts", 1);
int vdo_proc_softirqs = !config_get_boolean("plugin:proc", "/proc/softirqs", 1);
int vdo_proc_net_softnet_stat = !config_get_boolean("plugin:proc", "/proc/net/softnet_stat", 1);
@@ -55,6 +56,7 @@ void *proc_main(void *ptr)
usec_t sutime_proc_net_rpc_nfs = 0ULL;
usec_t sutime_proc_net_rpc_nfsd = 0ULL;
usec_t sutime_proc_sys_kernel_random_entropy_avail = 0ULL;
+ usec_t sutime_proc_sys_devices_system_edac_mc = 0ULL;
usec_t sutime_proc_interrupts = 0ULL;
usec_t sutime_proc_softirqs = 0ULL;
usec_t sutime_proc_net_softnet_stat = 0ULL;
@@ -142,6 +144,14 @@ void *proc_main(void *ptr)
}
if(unlikely(netdata_exit)) break;
+ if(!vdo_proc_sys_devices_system_edac_mc) {
+ debug(D_PROCNETDEV_LOOP, "PROCNETDEV: calling do_proc_sys_devices_system_edac_mc().");
+ now = now_realtime_usec();
+ vdo_proc_sys_devices_system_edac_mc = do_proc_sys_devices_system_edac_mc(rrd_update_every, (sutime_proc_sys_devices_system_edac_mc > 0)?now - sutime_proc_sys_devices_system_edac_mc:0ULL);
+ sutime_proc_sys_devices_system_edac_mc = now;
+ }
+ if(unlikely(netdata_exit)) break;
+
if(!vdo_proc_net_dev) {
debug(D_PROCNETDEV_LOOP, "PROCNETDEV: calling do_proc_net_dev().");
now = now_realtime_usec();
View
@@ -23,5 +23,6 @@ extern int do_proc_loadavg(int update_every, usec_t dt);
extern int do_proc_net_stat_synproxy(int update_every, usec_t dt);
extern int do_proc_net_softnet_stat(int update_every, usec_t dt);
extern int do_proc_uptime(int update_every, usec_t dt);
+extern int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt);
#endif /* NETDATA_PLUGIN_PROC_H */
@@ -0,0 +1,184 @@
+#include "common.h"
+
+struct mc { // one node per EDAC memory controller directory found by find_all_mc()
+ char *name; // directory name of the controller (copied from d_name); also used as the dimension id
+ char ce_updated; // 1 when ce_count was refreshed in the current iteration, 0 otherwise
+ char ue_updated; // 1 when ue_count was refreshed in the current iteration, 0 otherwise
+
+ char *ce_count_filename; // full path of the ce_count file; NULL when the file was not found
+ char *ue_count_filename; // full path of the ue_count file; NULL when the file was not found
+
+ procfile *ce_ff; // handle for ce_count_filename; opened on first read and then reused
+ procfile *ue_ff; // handle for ue_count_filename; opened on first read and then reused
+
+ collected_number ce_count; // last correctable-errors value parsed from ce_count
+ collected_number ue_count; // last uncorrectable-errors value parsed from ue_count
+
+ RRDDIM *ce_rd; // this controller's dimension on the mem.ecc_ce chart
+ RRDDIM *ue_rd; // this controller's dimension on the mem.ecc_ue chart
+
+ struct mc *next; // next node of the singly linked list
+};
+static struct mc *mc_root = NULL; // list head; NULL until find_all_mc() finds at least one controller
+
+// Discover the EDAC memory controllers and build the mc_root list.
+//
+// The directory scanned defaults to <global_host_prefix>/sys/devices/system/edac/mc
+// and can be overridden with the "directory to monitor" option of the
+// [plugin:proc:/sys/devices/system/edac/mc] config section.
+//
+// Every sub-directory whose name starts with "mc" followed by a digit gets a
+// node, provided at least one of its ce_count / ue_count files exists.
+// Nodes are prepended, so the list is in reverse readdir() order.
+// On opendir() failure an error is logged and mc_root is left untouched.
+static void find_all_mc(void) {
+    char name[FILENAME_MAX + 1];
+    snprintfz(name, FILENAME_MAX, "%s%s", global_host_prefix, "/sys/devices/system/edac/mc");
+    char *dirname = config_get("plugin:proc:/sys/devices/system/edac/mc", "directory to monitor", name);
+
+    DIR *dir = opendir(dirname);
+    if(!dir) {
+        error("Cannot read ECC memory errors directory '%s'", dirname);
+        return;
+    }
+
+    struct dirent *de = NULL;
+    while((de = readdir(dir))) {
+        // only directories named mc<digit>... are memory controllers;
+        // the cast keeps isdigit() defined for bytes >= 0x80 when char is signed
+        if(de->d_type != DT_DIR
+           || de->d_name[0] != 'm'
+           || de->d_name[1] != 'c'
+           || !isdigit((unsigned char)de->d_name[2]))
+            continue;
+
+        struct mc *m = callocz(1, sizeof(struct mc));
+        m->name = strdupz(de->d_name);
+
+        struct stat st;
+
+        // remember only the counter files that are really there
+        snprintfz(name, FILENAME_MAX, "%s/%s/ce_count", dirname, de->d_name);
+        if(stat(name, &st) != -1)
+            m->ce_count_filename = strdupz(name);
+
+        snprintfz(name, FILENAME_MAX, "%s/%s/ue_count", dirname, de->d_name);
+        if(stat(name, &st) != -1)
+            m->ue_count_filename = strdupz(name);
+
+        if(!m->ce_count_filename && !m->ue_count_filename) {
+            // nothing to monitor on this controller - drop the node
+            freez(m->name);
+            freez(m);
+            continue;
+        }
+
+        m->next = mc_root;
+        mc_root = m;
+    }
+
+    closedir(dir);
+}
+
+// Read one single-value counter file into *count.
+//
+// *ffp caches the procfile handle between calls: it is opened on first use,
+// and whatever procfile_readall() returns is stored back, so a NULL result
+// makes the next call open the file again.
+//
+// Returns 1 when *count was refreshed, or 0 when the file could not be opened
+// or read, or has no word on its first line (*count is then left untouched).
+static int mc_read_count(const char *filename, procfile **ffp, collected_number *count) {
+    // work on a plain local: the procfile_*() accessors are only ever handed
+    // a simple identifier this way
+    procfile *ff = *ffp;
+
+    if(unlikely(!ff)) {
+        ff = procfile_open(filename, " \t", PROCFILE_FLAG_DEFAULT);
+        if(unlikely(!ff))
+            return 0;
+    }
+
+    ff = procfile_readall(ff);
+    *ffp = ff;
+
+    if(unlikely(!ff || procfile_lines(ff) < 1 || procfile_linewords(ff, 0) < 1))
+        return 0;
+
+    *count = strtoull(procfile_lineword(ff, 0, 0), NULL, 0);
+    return 1;
+}
+
+// Collect ECC correctable (CE) and uncorrectable (UE) error counters of all
+// memory controllers and feed the charts mem.ecc_ce and mem.ecc_ue, with one
+// incremental dimension per controller.
+//
+// update_every: passed to rrdset_create() when a chart is created.
+// dt:           unused.
+//
+// Each chart is controlled by its own on-demand config option: with the
+// default (on-demand) setting a chart is created only once the sum of its
+// counters is above zero, and from then on it is always updated.
+//
+// Returns 1 when no memory controller could be found, 0 otherwise. The caller
+// in proc_main() stores the return value in the flag that gates further
+// calls, so returning 1 disables this collector.
+int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt) {
+    (void)dt;
+
+    if(unlikely(mc_root == NULL)) {
+        find_all_mc();
+        if(unlikely(mc_root == NULL))
+            return 1;
+    }
+
+    static int do_ce = -1, do_ue = -1;
+    calculated_number ce_sum = 0, ue_sum = 0;
+    struct mc *m;
+
+    if(unlikely(do_ce == -1)) {
+        do_ce = config_get_boolean_ondemand("plugin:proc:/sys/devices/system/edac/mc", "enable ECC memory correctable errors", CONFIG_ONDEMAND_ONDEMAND);
+        do_ue = config_get_boolean_ondemand("plugin:proc:/sys/devices/system/edac/mc", "enable ECC memory uncorrectable errors", CONFIG_ONDEMAND_ONDEMAND);
+    }
+
+    // --------------------------------------------------------------------
+    // read the counters
+
+    if(do_ce != CONFIG_ONDEMAND_NO) {
+        for(m = mc_root; m; m = m->next) {
+            if(!m->ce_count_filename)
+                continue;
+
+            m->ce_updated = (char)mc_read_count(m->ce_count_filename, &m->ce_ff, &m->ce_count);
+            if(m->ce_updated)
+                ce_sum += m->ce_count;
+        }
+    }
+
+    if(do_ue != CONFIG_ONDEMAND_NO) {
+        for(m = mc_root; m; m = m->next) {
+            if(!m->ue_count_filename)
+                continue;
+
+            m->ue_updated = (char)mc_read_count(m->ue_count_filename, &m->ue_ff, &m->ue_count);
+            if(m->ue_updated)
+                ue_sum += m->ue_count;
+        }
+    }
+
+    // --------------------------------------------------------------------
+    // correctable errors chart
+
+    if(do_ce == CONFIG_ONDEMAND_YES || (do_ce == CONFIG_ONDEMAND_ONDEMAND && ce_sum > 0)) {
+        do_ce = CONFIG_ONDEMAND_YES;
+
+        static RRDSET *ce_st = NULL;
+
+        if(unlikely(!ce_st))
+            ce_st = rrdset_find("mem.ecc_ce");
+
+        if(unlikely(!ce_st))
+            ce_st = rrdset_create("mem", "ecc_ce", NULL, "ecc", NULL, "ECC Memory Correctable Errors", "errors", 6600
+                                  , update_every, RRDSET_TYPE_LINE);
+        else
+            rrdset_next(ce_st);
+
+        for(m = mc_root; m; m = m->next) {
+            if(!m->ce_count_filename)
+                continue;
+
+            // Resolve the dimension lazily, so a chart that rrdset_find()
+            // returned (instead of being created above) never leaves us with
+            // a NULL dimension pointer.
+            // NOTE(review): assumes rrddim_find(st, id) returns the existing
+            // dimension or NULL - confirm against rrd.h
+            if(unlikely(!m->ce_rd)) {
+                m->ce_rd = rrddim_find(ce_st, m->name);
+                if(!m->ce_rd)
+                    m->ce_rd = rrddim_add(ce_st, m->name, NULL, 1, 1, RRDDIM_INCREMENTAL);
+            }
+
+            if(m->ce_updated && m->ce_rd)
+                rrddim_set_by_pointer(ce_st, m->ce_rd, m->ce_count);
+        }
+
+        rrdset_done(ce_st);
+    }
+
+    // --------------------------------------------------------------------
+    // uncorrectable errors chart
+
+    if(do_ue == CONFIG_ONDEMAND_YES || (do_ue == CONFIG_ONDEMAND_ONDEMAND && ue_sum > 0)) {
+        do_ue = CONFIG_ONDEMAND_YES;
+
+        static RRDSET *ue_st = NULL;
+
+        if(unlikely(!ue_st))
+            ue_st = rrdset_find("mem.ecc_ue");
+
+        if(unlikely(!ue_st))
+            ue_st = rrdset_create("mem", "ecc_ue", NULL, "ecc", NULL, "ECC Memory Uncorrectable Errors", "errors", 6610
+                                  , update_every, RRDSET_TYPE_LINE);
+        else
+            rrdset_next(ue_st);
+
+        for(m = mc_root; m; m = m->next) {
+            if(!m->ue_count_filename)
+                continue;
+
+            // same lazy dimension resolution as for the CE chart
+            if(unlikely(!m->ue_rd)) {
+                m->ue_rd = rrddim_find(ue_st, m->name);
+                if(!m->ue_rd)
+                    m->ue_rd = rrddim_add(ue_st, m->name, NULL, 1, 1, RRDDIM_INCREMENTAL);
+            }
+
+            if(m->ue_updated && m->ue_rd)
+                rrddim_set_by_pointer(ue_st, m->ue_rd, m->ue_count);
+        }
+
+        rrdset_done(ue_st);
+    }
+
+    return 0;
+}

0 comments on commit 2ecf423

Please sign in to comment.