Skip to content

Commit

Permalink
Merge pull request #5610 from scholtalbers/feature/cgroup_metrics
Browse files Browse the repository at this point in the history
Add basic cgroup metric collection
  • Loading branch information
jmchilton committed Mar 9, 2018
2 parents 2195911 + 47caac3 commit c5df43c
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 12 deletions.
29 changes: 17 additions & 12 deletions config/job_metrics_conf.xml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
<!-- Core plugin captures Galaxy slots, start and end of job (in seconds
since epoch) and computes runtime in seconds. -->
<core />

<!-- Uncomment to dump processor count for each job - linux only. -->
<!-- <cpuinfo /> -->
<!-- Uncomment to dump information about all processors for each
Expand All @@ -30,13 +30,18 @@
only. -->
<!-- <uname /> -->

<!-- Uncomment following to enable plugin dumping complete environment
<!-- Uncomment following to enable plugin dumping complete environment
for each job, potentially useful for debugging -->
<!-- <env /> -->
<!-- env plugin can also record more targeted, obviously useful variables
as well. -->
<!-- <env variables="HOSTNAME,SLURM_CPUS_ON_NODE,SLURM_JOBID" /> -->

<!-- If Galaxy jobs are run inside cgroups, as slurm does when memory limits
     are enforced, this plugin can try to gather some information from them. -->
<!-- <cgroup /> -->
<!-- <cgroup verbose="true" /> -->

<!-- <collectl /> -->
<!-- Collectl (http://collectl.sourceforge.net/) is a powerful monitoring
utility capable of gathering numerous system and process level
Expand All @@ -46,7 +51,7 @@
is highly customizable - both using the attributes documented below
or simply hacking up the code in lib/galaxy/jobs/metrics.
Warning: In order to use this plugin collectl must be available on the
Warning: In order to use this plugin collectl must be available on the
compute server the job runs on and on the local Galaxy server as well
(unless in this latter case summarize_process_data is set to False).
Expand All @@ -64,12 +69,12 @@
full time-series data corresponding to a job run.
'subsystems': Comma separated list of collectl subsystems to collect
data for. Plugin doesn't currently expose all of them or offer
summary data for any of them except 'process' but extensions
would be welcome. May seem pointless to include subsystems
data for. Plugin doesn't currently expose all of them or offer
summary data for any of them except 'process' but extensions
would be welcome. May seem pointless to include subsystems
beside process since they won't be processed online by Galaxy -
but if 'saved_logs_path' these files can be played back at anytime.
Available subsystems - 'process', 'cpu', 'memory', 'network',
'disk', 'network'. (Default 'process').
Expand Down Expand Up @@ -108,13 +113,13 @@
'flush': Interval (in seconds I think) between when collectl will
flush its buffer to disk. Galaxy overrides this to disable
flushing by default if not set.
flushing by default if not set.
'local_collectl_path', 'remote_collectl_path', 'collectl_path':
By default, jobs will just assume collectl is on the PATH, but
it can be overridden with 'local_collectl_path' and
'remote_collectl_path' (or simply 'collectl_path' if it is not
on the path but installed in the same location both locally and
By default, jobs will just assume collectl is on the PATH, but
it can be overridden with 'local_collectl_path' and
'remote_collectl_path' (or simply 'collectl_path' if it is not
on the path but installed in the same location both locally and
remotely).
There are more, increasingly obscure options including -
Expand Down
89 changes: 89 additions & 0 deletions lib/galaxy/jobs/metrics/instrumenters/cgroup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""The module describes the ``cgroup`` job metrics plugin."""
import logging

from galaxy.util import asbool, nice_size
from ..instrumenters import InstrumentPlugin
from ...metrics import formatting

log = logging.getLogger(__name__)

# Human-readable labels for the cgroup parameters surfaced to users.
# Keys are cgroup pseudo-file names as reported by ``cgget``.
TITLES = {
    # memory subsystem
    "memory.max_usage_in_bytes": "Max memory usage (MEM)",
    "memory.memsw.max_usage_in_bytes": "Max memory usage (MEM+SWP)",
    "memory.limit_in_bytes": "Memory limit on cgroup (MEM)",
    "memory.memsw.limit_in_bytes": "Memory limit on cgroup (MEM+SWP)",
    "memory.soft_limit_in_bytes": "Memory softlimit on cgroup",
    "memory.failcnt": "Failed to allocate memory count",
    # OOM killer status
    "memory.oom_control": "OOM Control enabled",
    "under_oom": "Was OOM Killer active?",
    # cpuacct subsystem
    "cpuacct.usage": "CPU Time",
}
# Value formatters keyed by cgroup parameter name; parameters without an
# entry here are displayed as the raw string read from the cgroup.
CONVERSION = {
    # cpuacct.usage is reported in nanoseconds - convert to a duration string.
    "cpuacct.usage": lambda nanos: formatting.seconds_to_str(int(nanos) / 10**9),
    # under_oom is "1" when the OOM killer was active for this cgroup.
    "under_oom": lambda flag: "Yes" if flag == "1" else "No",
    # Byte-valued limits/usages are rendered in human-friendly units.
    "memory.limit_in_bytes": nice_size,
    "memory.max_usage_in_bytes": nice_size,
    "memory.memsw.limit_in_bytes": nice_size,
    "memory.memsw.max_usage_in_bytes": nice_size,
    "memory.soft_limit_in_bytes": nice_size,
}


class CgroupPluginFormatter(formatting.JobMetricFormatter):
    """Render raw cgroup parameter values with friendly titles and units."""

    def format(self, key, value):
        """Return a ``(title, display value)`` pair for one cgroup metric.

        Known keys get a human-readable title from ``TITLES``.  Where a
        converter is registered in ``CONVERSION`` the value is formatted
        with it; otherwise any ``*_bytes`` key is rendered with
        ``nice_size`` and everything else is passed through unchanged.
        """
        title = TITLES.get(key, key)
        if key in CONVERSION:
            return title, CONVERSION[key](value)
        elif key.endswith("_bytes"):
            try:
                # Bug fix: format the numeric *value*, not the key name -
                # nice_size(key) could never succeed, so the fallback was dead.
                return title, nice_size(value)
            except ValueError:
                # Non-numeric value - fall through and show it verbatim.
                pass
        return title, value


class CgroupPlugin(InstrumentPlugin):
    """Plugin that collects memory and cpu utilization from within a cgroup.
    """
    plugin_type = "cgroup"
    formatter = CgroupPluginFormatter()

    def __init__(self, **kwargs):
        # When verbose, record every cgroup key/value pair instead of only
        # those with an entry in TITLES.
        self.verbose = asbool(kwargs.get("verbose", False))

    def post_execute_instrument(self, job_directory):
        # Shell snippets appended to the job script, executed after the tool.
        return [
            self.__record_cgroup_cpu_usage(job_directory),
            self.__record_cgroup_memory_usage(job_directory),
        ]

    def job_properties(self, job_id, job_directory):
        # Parse the metrics file the shell snippets produced at job end.
        return self.__read_metrics(self.__cgroup_metrics_file(job_directory))

    def __record_cgroup_cpu_usage(self, job_directory):
        # Resolve this process's cpuacct,cpu cgroup and dump it via cgget;
        # write an empty file when cgget or /proc cgroup info is unavailable.
        template_params = {"metrics": self.__cgroup_metrics_file(job_directory)}
        return """if [ `command -v cgget` ] && [ -e /proc/$$/cgroup ]; then cat /proc/$$/cgroup | awk -F':' '$2=="cpuacct,cpu"{print $2":"$3}' | xargs -I{} cgget -g {} > %(metrics)s ; else echo "" > %(metrics)s; fi""" % template_params

    def __record_cgroup_memory_usage(self, job_directory):
        # Same as above for the memory controller; appends to the cpu dump.
        template_params = {"metrics": self.__cgroup_metrics_file(job_directory)}
        return """if [ `command -v cgget` ] && [ -e /proc/$$/cgroup ]; then cat /proc/$$/cgroup | awk -F':' '$2=="memory"{print $2":"$3}' | xargs -I{} cgget -g {} >> %(metrics)s ; else echo "" > %(metrics)s; fi""" % template_params

    def __cgroup_metrics_file(self, job_directory):
        return self._instrument_file_path(job_directory, "_metrics")

    def __read_metrics(self, path):
        properties = {}
        with open(path, "r") as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                try:
                    key, value = stripped.split(": ")
                except ValueError:
                    # memory.oom_control's "under_oom 0" line has no colon
                    # separator - pick its value out by hand.
                    if stripped.startswith("under_oom"):
                        properties["under_oom"] = stripped.split(" ")[1]
                else:
                    # Keep only recognized keys unless verbose collection is on.
                    if key in TITLES or self.verbose:
                        properties[key] = value
        return properties


__all__ = ('CgroupPlugin', )

0 comments on commit c5df43c

Please sign in to comment.