Skip to content
This repository has been archived by the owner on Mar 19, 2024. It is now read-only.

Commit

Permalink
Adding a hook to profile memory usage (#175)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #175

Recently, nvidia-smi was removed from the FB cluster, making it hard to see what the memory utilization was. Using PyTorch, we extract the information at various steps of training. The profiling hook is currently experimental; we will use it and adapt it for better usability. Down the line, we can move it to Classy Vision once the hook is trusted to be useful and has been changed accordingly.

Reviewed By: min-xu-ai

Differential Revision: D26284304

fbshipit-source-id: 1a8c3cd12a498fc55999e982a6c072723d54d144
  • Loading branch information
prigoyal authored and facebook-github-bot committed Feb 9, 2021
1 parent b8e30eb commit 1b4d93f
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 2 deletions.
15 changes: 15 additions & 0 deletions vissl/config/defaults.yaml
Expand Up @@ -83,6 +83,21 @@ config:
# valid for some systems.
LOG_GPU_STATS: True

# ----------------------------------------------------------------------------------- #
# HOOKS
# ----------------------------------------------------------------------------------- #
HOOKS:
# ----------------------------------------------------------------------------------- #
# torch.cuda.memory_summary()
# ----------------------------------------------------------------------------------- #
MEMORY_SUMMARY:
# Set this to True to print the memory summary. Useful for profiling
# the memory consumption of the model.
PRINT_MEMORY_SUMMARY: False
# The iteration number at which the memory summary should be printed.
# Usually set to 1 for very large models.
LOG_ITERATION_NUM: 0

# ----------------------------------------------------------------------------------- #
# DATA
# ----------------------------------------------------------------------------------- #
Expand Down
3 changes: 3 additions & 0 deletions vissl/hooks/__init__.py
Expand Up @@ -7,6 +7,7 @@
from vissl.hooks.deepclusterv2_hooks import ClusterMemoryHook, InitMemoryHook # noqa
from vissl.hooks.log_hooks import ( # noqa
LogGpuStatsHook,
LogGpuMemoryHook,
LogLossLrEtaHook,
LogLossMetricsCheckpointHook,
LogPerfTimeMetricsHook,
Expand Down Expand Up @@ -100,6 +101,8 @@ def default_hook_generator(cfg: AttrDict) -> List[ClassyHook]:
hooks.extend([SSLModelComplexityHook()])
if cfg.LOG_GPU_STATS:
hooks.extend([LogGpuStatsHook()])
if cfg.HOOKS.MEMORY_SUMMARY.PRINT_MEMORY_SUMMARY:
hooks.extend([LogGpuMemoryHook(cfg.HOOKS.MEMORY_SUMMARY.LOG_ITERATION_NUM)])
if cfg.TENSORBOARD_SETUP.USE_TENSORBOARD:
assert is_tensorboard_available(), "Tensorboard must be installed to use it."
tb_hook = get_tensorboard_hook(cfg)
Expand Down
59 changes: 57 additions & 2 deletions vissl/hooks/log_hooks.py
Expand Up @@ -22,6 +22,61 @@
from vissl.utils.perf_stats import PerfStats


class LogGpuMemoryHook(ClassyHook):
    """
    Logs ``torch.cuda.memory_summary()`` at several points of a training
    iteration (phase start, after forward, after backward, after the
    parameter update), but only for the iteration number configured at
    construction time and only on the primary CUDA worker.
    """

    # Stages we deliberately do not log at.
    on_start = ClassyHook._noop
    on_loss_and_meter = ClassyHook._noop
    on_step = ClassyHook._noop
    on_phase_end = ClassyHook._noop
    on_end = ClassyHook._noop

    def __init__(self, log_iteration_num: int = 1) -> None:
        """
        Args:
            log_iteration_num: iteration at which the memory summary is
                printed (usually 1 for very large models).
        """
        super().__init__()
        self.log_iteration_num = log_iteration_num

    def on_phase_start(self, task: "tasks.ClassyTask") -> None:
        """Log the memory summary just before the training epoch starts."""
        self._dump_memory_summary(task, "on_phase_start")

    def on_forward(self, task: "tasks.ClassyTask") -> None:
        """Log the memory summary after the model forward pass is done."""
        self._dump_memory_summary(task, "on_forward")

    def on_backward(self, task: "tasks.ClassyTask") -> None:
        """Log the memory summary just after model.backward() is done."""
        self._dump_memory_summary(task, "on_backward")

    def on_update(self, task: "tasks.ClassyTask") -> None:
        """Log the memory summary just after model params are updated."""
        self._dump_memory_summary(task, "on_update")

    def _dump_memory_summary(self, task: "tasks.ClassyTask", stage_name: str) -> None:
        # Guard clauses: only the primary worker logs, only on CUDA
        # devices, and only at the configured iteration — otherwise the
        # summary would flood the logs every step on every rank.
        if not is_primary():
            return
        if task.device.type != "cuda":
            return
        if task.local_iteration_num != self.log_iteration_num:
            return
        logging.info(
            f"========= Memory Summary at {stage_name} ======="
            f"\n{torch.cuda.memory_summary()}\n"
        )


class LogGpuStatsHook(ClassyHook):
"""
Hook executed at the start of training and after every training iteration is done.
Expand Down Expand Up @@ -92,8 +147,8 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
monitoring the stats (optionally) for every N iterations to get better
idea about the batch time and training eta.
Set the btime_freq input using cfg.PERF_STAT_FREQUENCY=N ensuring that
cfg.MONITOR_PERF_STATS = True.
Set the btime_freq input using cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY=N
ensuring that cfg.HOOKS.PERF_STATS.MONITOR_PERF_STATS = True.
"""
phase_type = "train" if task.train else "test"
if is_primary() and phase_type == "train":
Expand Down

0 comments on commit 1b4d93f

Please sign in to comment.