Skip to content

Commit

Permalink
Merge pull request #5610 from scholtalbers/feature/cgroup_metrics
Browse files Browse the repository at this point in the history
Add basic cgroup metric collection
  • Loading branch information
jmchilton committed Mar 9, 2018
2 parents 2195911 + 47caac3 commit c5df43c
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 12 deletions.
29 changes: 17 additions & 12 deletions config/job_metrics_conf.xml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
<!-- Core plugin captures Galaxy slots, start and end of job (in seconds
since epoch) and computes runtime in seconds. -->
<core />

<!-- Uncomment to dump processor count for each job - linux only. -->
<!-- <cpuinfo /> -->
<!-- Uncomment to dump information about all processors for each
Expand All @@ -30,13 +30,18 @@
only. -->
<!-- <uname /> -->

<!-- Uncomment following to enable plugin dumping complete environment
<!-- Uncomment following to enable plugin dumping complete environment
for each job, potentially useful for debugging -->
<!-- <env /> -->
<!-- env plugin can also record more targeted, obviously useful variables
as well. -->
<!-- <env variables="HOSTNAME,SLURM_CPUS_ON_NODE,SLURM_JOBID" /> -->

<!-- If Galaxy jobs are run inside cgroups, as slurm does when memory limits
     are enforced, this plugin can try to gather some information from them. -->
<!-- <cgroup /> -->
<!-- <cgroup verbose="true" /> -->

<!-- <collectl /> -->
<!-- Collectl (http://collectl.sourceforge.net/) is a powerful monitoring
utility capable of gathering numerous system and process level
Expand All @@ -46,7 +51,7 @@
is highly customizable - both using the attributes documented below
or simply hacking up the code in lib/galaxy/jobs/metrics.
Warning: In order to use this plugin collectl must be available on the
Warning: In order to use this plugin collectl must be available on the
compute server the job runs on and on the local Galaxy server as well
(unless in this latter case summarize_process_data is set to False).
Expand All @@ -64,12 +69,12 @@
full time-series data corresponding to a job run.
'subsystems': Comma separated list of collectl subsystems to collect
data for. Plugin doesn't currently expose all of them or offer
summary data for any of them except 'process' but extensions
would be welcome. May seem pointless to include subsystems
data for. Plugin doesn't currently expose all of them or offer
summary data for any of them except 'process' but extensions
would be welcome. May seem pointless to include subsystems
beside process since they won't be processed online by Galaxy -
but if 'saved_logs_path' these files can be played back at anytime.
Available subsystems - 'process', 'cpu', 'memory', 'network',
'disk', 'network'. (Default 'process').
Expand Down Expand Up @@ -108,13 +113,13 @@
'flush': Interval (in seconds I think) between when collectl will
flush its buffer to disk. Galaxy overrides this to disable
flushing by default if not set.
flushing by default if not set.
'local_collectl_path', 'remote_collectl_path', 'collectl_path':
By default, jobs will just assume collectl is on the PATH, but
it can be overridden with 'local_collectl_path' and
'remote_collectl_path' (or simply 'collectl_path' if it is not
on the path but installed in the same location both locally and
By default, jobs will just assume collectl is on the PATH, but
it can be overridden with 'local_collectl_path' and
'remote_collectl_path' (or simply 'collectl_path' if it is not
on the path but installed in the same location both locally and
remotely).
There are more, increasingly obscure options including -
Expand Down
89 changes: 89 additions & 0 deletions lib/galaxy/jobs/metrics/instrumenters/cgroup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""The module describes the ``cgroup`` job metrics plugin."""
import logging

from galaxy.util import asbool, nice_size
from ..instrumenters import InstrumentPlugin
from ...metrics import formatting

log = logging.getLogger(__name__)

# Human-readable labels for the cgroup parameters surfaced to users.
# Keys are cgroup pseudo-file names as reported by ``cgget``.
TITLES = {
    # memory subsystem
    "memory.max_usage_in_bytes": "Max memory usage (MEM)",
    "memory.memsw.max_usage_in_bytes": "Max memory usage (MEM+SWP)",
    "memory.limit_in_bytes": "Memory limit on cgroup (MEM)",
    "memory.memsw.limit_in_bytes": "Memory limit on cgroup (MEM+SWP)",
    "memory.soft_limit_in_bytes": "Memory softlimit on cgroup",
    "memory.failcnt": "Failed to allocate memory count",
    # OOM killer status
    "memory.oom_control": "OOM Control enabled",
    "under_oom": "Was OOM Killer active?",
    # cpuacct subsystem
    "cpuacct.usage": "CPU Time",
}
# Value formatters keyed by cgroup parameter name; parameters without an
# entry here are displayed as the raw string read from the cgroup.
CONVERSION = {
    # cpuacct.usage is reported in nanoseconds - convert to a duration string.
    "cpuacct.usage": lambda nanos: formatting.seconds_to_str(int(nanos) / 10**9),
    # under_oom is "1" when the OOM killer was active for this cgroup.
    "under_oom": lambda flag: "Yes" if flag == "1" else "No",
    # Byte-valued limits/usages are rendered in human-friendly units.
    "memory.limit_in_bytes": nice_size,
    "memory.max_usage_in_bytes": nice_size,
    "memory.memsw.limit_in_bytes": nice_size,
    "memory.memsw.max_usage_in_bytes": nice_size,
    "memory.soft_limit_in_bytes": nice_size,
}


class CgroupPluginFormatter(formatting.JobMetricFormatter):
    """Render raw cgroup parameter values with friendly titles and units."""

    def format(self, key, value):
        """Return a ``(title, display value)`` pair for one cgroup metric.

        Known keys get a human-readable title from ``TITLES``.  Where a
        converter is registered in ``CONVERSION`` the value is formatted
        with it; otherwise any ``*_bytes`` key is rendered with
        ``nice_size`` and everything else is passed through unchanged.
        """
        title = TITLES.get(key, key)
        if key in CONVERSION:
            return title, CONVERSION[key](value)
        elif key.endswith("_bytes"):
            try:
                # Bug fix: format the numeric *value*, not the key name -
                # nice_size(key) could never succeed, so the fallback was dead.
                return title, nice_size(value)
            except ValueError:
                # Non-numeric value - fall through and show it verbatim.
                pass
        return title, value


class CgroupPlugin(InstrumentPlugin):
    """Plugin that collects memory and cpu utilization from within a cgroup.
    """
    plugin_type = "cgroup"
    formatter = CgroupPluginFormatter()

    def __init__(self, **kwargs):
        # When verbose, record every cgroup key/value pair instead of only
        # those with an entry in TITLES.
        self.verbose = asbool(kwargs.get("verbose", False))

    def post_execute_instrument(self, job_directory):
        # Shell snippets appended to the job script, executed after the tool.
        return [
            self.__record_cgroup_cpu_usage(job_directory),
            self.__record_cgroup_memory_usage(job_directory),
        ]

    def job_properties(self, job_id, job_directory):
        # Parse the metrics file the shell snippets produced at job end.
        return self.__read_metrics(self.__cgroup_metrics_file(job_directory))

    def __record_cgroup_cpu_usage(self, job_directory):
        # Resolve this process's cpuacct,cpu cgroup and dump it via cgget;
        # write an empty file when cgget or /proc cgroup info is unavailable.
        template_params = {"metrics": self.__cgroup_metrics_file(job_directory)}
        return """if [ `command -v cgget` ] && [ -e /proc/$$/cgroup ]; then cat /proc/$$/cgroup | awk -F':' '$2=="cpuacct,cpu"{print $2":"$3}' | xargs -I{} cgget -g {} > %(metrics)s ; else echo "" > %(metrics)s; fi""" % template_params

    def __record_cgroup_memory_usage(self, job_directory):
        # Same as above for the memory controller; appends to the cpu dump.
        template_params = {"metrics": self.__cgroup_metrics_file(job_directory)}
        return """if [ `command -v cgget` ] && [ -e /proc/$$/cgroup ]; then cat /proc/$$/cgroup | awk -F':' '$2=="memory"{print $2":"$3}' | xargs -I{} cgget -g {} >> %(metrics)s ; else echo "" > %(metrics)s; fi""" % template_params

    def __cgroup_metrics_file(self, job_directory):
        return self._instrument_file_path(job_directory, "_metrics")

    def __read_metrics(self, path):
        properties = {}
        with open(path, "r") as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                try:
                    key, value = stripped.split(": ")
                except ValueError:
                    # memory.oom_control's "under_oom 0" line has no colon
                    # separator - pick its value out by hand.
                    if stripped.startswith("under_oom"):
                        properties["under_oom"] = stripped.split(" ")[1]
                else:
                    # Keep only recognized keys unless verbose collection is on.
                    if key in TITLES or self.verbose:
                        properties[key] = value
        return properties


__all__ = ('CgroupPlugin', )

0 comments on commit c5df43c

Please sign in to comment.