Decentralized Distributed PPO (#245)
Add DD-PPO to habitat-baselines
erikwijmans committed Jan 20, 2020
1 parent 44c8be1 commit 85b7907
Showing 24 changed files with 1,671 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .editorconfig
@@ -20,4 +20,4 @@ include_trailing_comma = true
ensure_newline_before_comments=true
use_parentheses = true
known_first_party = habitat,habitat_sim,habitat_baselines,version
known_third_party = PIL,attr,conf,gym,imageio,matplotlib,mock,numba,numpy,orbslam2,pyrobot,pytest,quaternion,requests,scipy,setuptools,torch,torchvision,tqdm,yacs
known_third_party = PIL,attr,conf,gym,ifcfg,imageio,matplotlib,mock,numba,numpy,orbslam2,pyrobot,pytest,quaternion,requests,scipy,setuptools,torch,torchvision,tqdm,yacs
1 change: 1 addition & 0 deletions habitat_baselines/__init__.py
@@ -5,6 +5,7 @@
# LICENSE file in the root directory of this source tree.

from habitat_baselines.common.base_trainer import BaseRLTrainer, BaseTrainer
from habitat_baselines.rl.ddppo import DDPPOTrainer
from habitat_baselines.rl.ppo.ppo_trainer import PPOTrainer, RolloutStorage

__all__ = ["BaseTrainer", "BaseRLTrainer", "PPOTrainer", "RolloutStorage"]
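
A quick sketch of how the newly imported `DDPPOTrainer` is typically reached at run time. This assumes the standard habitat-baselines entry points (`get_config`, `baseline_registry`) and that the trainer registers itself under the name `"ddppo"`, as the shipped configs' `TRAINER_NAME: "ddppo"` suggests:

```python
from habitat_baselines.common.baseline_registry import baseline_registry
from habitat_baselines.config.default import get_config

# Hypothetical usage; the config path is the one added in this commit.
config = get_config("habitat_baselines/config/pointnav/ddppo_pointnav.yaml")
trainer_cls = baseline_registry.get_trainer(config.TRAINER_NAME)  # "ddppo"
trainer = trainer_cls(config)
trainer.train()
```
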
43 changes: 24 additions & 19 deletions habitat_baselines/common/rollout_storage.py
@@ -55,7 +55,7 @@ def __init__(
self.actions = self.actions.long()
self.prev_actions = self.prev_actions.long()

self.masks = torch.ones(num_steps + 1, num_envs, 1)
self.masks = torch.zeros(num_steps + 1, num_envs, 1)

self.num_steps = num_steps
self.step = 0
@@ -97,21 +97,26 @@ def insert(
self.rewards[self.step].copy_(rewards)
self.masks[self.step + 1].copy_(masks)

self.step = (self.step + 1) % self.num_steps
self.step = self.step + 1

def after_update(self):
for sensor in self.observations:
self.observations[sensor][0].copy_(self.observations[sensor][-1])
self.observations[sensor][0].copy_(
self.observations[sensor][self.step]
)

self.recurrent_hidden_states[0].copy_(self.recurrent_hidden_states[-1])
self.masks[0].copy_(self.masks[-1])
self.prev_actions[0].copy_(self.prev_actions[-1])
self.recurrent_hidden_states[0].copy_(
self.recurrent_hidden_states[self.step]
)
self.masks[0].copy_(self.masks[self.step])
self.prev_actions[0].copy_(self.prev_actions[self.step])
self.step = 0

def compute_returns(self, next_value, use_gae, gamma, tau):
if use_gae:
self.value_preds[-1] = next_value
self.value_preds[self.step] = next_value
gae = 0
for step in reversed(range(self.rewards.size(0))):
for step in reversed(range(self.step)):
delta = (
self.rewards[step]
+ gamma * self.value_preds[step + 1] * self.masks[step + 1]
@@ -120,8 +125,8 @@ def compute_returns(self, next_value, use_gae, gamma, tau):
gae = delta + gamma * tau * self.masks[step + 1] * gae
self.returns[step] = gae + self.value_preds[step]
else:
self.returns[-1] = next_value
for step in reversed(range(self.rewards.size(0))):
self.returns[self.step] = next_value
for step in reversed(range(self.step)):
self.returns[step] = (
self.returns[step + 1] * gamma * self.masks[step + 1]
+ self.rewards[step]
@@ -153,25 +158,25 @@ def recurrent_generator(self, advantages, num_mini_batch):

for sensor in self.observations:
observations_batch[sensor].append(
self.observations[sensor][:-1, ind]
self.observations[sensor][: self.step, ind]
)

recurrent_hidden_states_batch.append(
self.recurrent_hidden_states[0, :, ind]
)

actions_batch.append(self.actions[:, ind])
prev_actions_batch.append(self.prev_actions[:-1, ind])
value_preds_batch.append(self.value_preds[:-1, ind])
return_batch.append(self.returns[:-1, ind])
masks_batch.append(self.masks[:-1, ind])
actions_batch.append(self.actions[: self.step, ind])
prev_actions_batch.append(self.prev_actions[: self.step, ind])
value_preds_batch.append(self.value_preds[: self.step, ind])
return_batch.append(self.returns[: self.step, ind])
masks_batch.append(self.masks[: self.step, ind])
old_action_log_probs_batch.append(
self.action_log_probs[:, ind]
self.action_log_probs[: self.step, ind]
)

adv_targ.append(advantages[:, ind])
adv_targ.append(advantages[: self.step, ind])

T, N = self.num_steps, num_envs_per_batch
T, N = self.step, num_envs_per_batch

# These are all tensors of size (T, N, -1)
for sensor in observations_batch:
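
With these changes, `compute_returns` runs the return/GAE recursion only over the `self.step` transitions actually collected rather than the full buffer. For reference, the recursion itself can be written as a standalone sketch (tensor shapes here are assumptions, not taken from this file):

```python
import torch

def gae_returns(rewards, values, masks, gamma=0.99, tau=0.95):
    """Sketch of the GAE recursion used above.

    rewards: (T, N, 1); values: (T + 1, N, 1), where values[T] is the
    bootstrap value of the state after the last step; masks: (T + 1, N, 1),
    with 0 marking an episode boundary.
    """
    T = rewards.size(0)
    returns = torch.zeros_like(values)
    gae = torch.zeros_like(values[0])
    for step in reversed(range(T)):
        delta = (
            rewards[step]
            + gamma * values[step + 1] * masks[step + 1]
            - values[step]
        )
        gae = delta + gamma * tau * masks[step + 1] * gae
        returns[step] = gae + values[step]
    return returns[:T]
```
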
21 changes: 20 additions & 1 deletion habitat_baselines/config/default.py
@@ -62,13 +62,32 @@
_C.RL.PPO.eps = 1e-5
_C.RL.PPO.max_grad_norm = 0.5
_C.RL.PPO.num_steps = 5
_C.RL.PPO.hidden_size = 512
_C.RL.PPO.use_gae = True
_C.RL.PPO.use_linear_lr_decay = False
_C.RL.PPO.use_linear_clip_decay = False
_C.RL.PPO.gamma = 0.99
_C.RL.PPO.tau = 0.95
_C.RL.PPO.reward_window_size = 50
_C.RL.PPO.use_normalized_advantage = True
_C.RL.PPO.hidden_size = 512
# -----------------------------------------------------------------------------
# DECENTRALIZED DISTRIBUTED PROXIMAL POLICY OPTIMIZATION (DD-PPO)
# -----------------------------------------------------------------------------
_C.RL.DDPPO = CN()
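# Fraction of workers that must finish their rollout before the remaining (straggler) workers are preempted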
_C.RL.DDPPO.sync_frac = 0.6
_C.RL.DDPPO.distrib_backend = "GLOO"
_C.RL.DDPPO.rnn_type = "LSTM"
_C.RL.DDPPO.num_recurrent_layers = 2
_C.RL.DDPPO.backbone = "resnet50"
_C.RL.DDPPO.pretrained_weights = "data/ddppo-models/gibson-2plus-resnet50.pth"
# Loads pretrained weights
_C.RL.DDPPO.pretrained = False
# Loads just the visual encoder backbone weights
_C.RL.DDPPO.pretrained_encoder = False
# Whether or not the visual encoder backbone will be trained
_C.RL.DDPPO.train_encoder = True
# Whether or not to reset the critic linear layer
_C.RL.DDPPO.reset_critic = True
# -----------------------------------------------------------------------------
# ORBSLAM2 BASELINE
# -----------------------------------------------------------------------------
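
The `DDPPO` defaults above are ordinary yacs `CfgNode` entries, so they can be overridden from a YAML file or from dotted key/value pairs. A minimal standalone sketch, reusing only keys added in this commit:

```python
from yacs.config import CfgNode as CN

_C = CN()
_C.RL = CN()
_C.RL.DDPPO = CN()
_C.RL.DDPPO.sync_frac = 0.6
_C.RL.DDPPO.distrib_backend = "GLOO"
_C.RL.DDPPO.backbone = "resnet50"
_C.RL.DDPPO.rnn_type = "LSTM"

cfg = _C.clone()
# Dotted-path overrides, e.g. as passed on the command line.
cfg.merge_from_list(["RL.DDPPO.distrib_backend", "NCCL", "RL.DDPPO.sync_frac", 0.5])
cfg.freeze()
print(cfg.RL.DDPPO)
```
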
60 changes: 60 additions & 0 deletions habitat_baselines/config/pointnav/ddppo_pointnav.yaml
@@ -0,0 +1,60 @@
BASE_TASK_CONFIG_PATH: "configs/tasks/pointnav_gibson.yaml"
TRAINER_NAME: "ddppo"
ENV_NAME: "NavRLEnv"
SIMULATOR_GPU_ID: 0
TORCH_GPU_ID: 0
VIDEO_OPTION: []
TENSORBOARD_DIR: "tb"
VIDEO_DIR: "video_dir"
TEST_EPISODE_COUNT: 994
EVAL_CKPT_PATH_DIR: "data/new_checkpoints"
NUM_PROCESSES: 4
SENSORS: ["DEPTH_SENSOR"]
CHECKPOINT_FOLDER: "data/new_checkpoints"
NUM_UPDATES: 10000
LOG_INTERVAL: 10
CHECKPOINT_INTERVAL: 50

RL:
SUCCESS_REWARD: 2.5
PPO:
# ppo params
clip_param: 0.2
ppo_epoch: 2
num_mini_batch: 2
value_loss_coef: 0.5
entropy_coef: 0.01
lr: 2.5e-4
eps: 1e-5
max_grad_norm: 0.2
num_steps: 128
use_gae: True
gamma: 0.99
tau: 0.95
use_linear_clip_decay: False
use_linear_lr_decay: False
reward_window_size: 50

use_normalized_advantage: False

hidden_size: 512

DDPPO:
sync_frac: 0.6
# The PyTorch distributed backend to use
distrib_backend: GLOO
# Visual encoder backbone
pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth
# Initialize with pretrained weights
pretrained: False
# Initialize just the visual encoder backbone with pretrained weights
pretrained_encoder: False
# Whether or not the visual encoder backbone will be trained.
train_encoder: True
# Whether or not to reset the critic linear layer
reset_critic: True

# Model parameters
backbone: resnet50
rnn_type: LSTM
num_recurrent_layers: 2
57 changes: 57 additions & 0 deletions habitat_baselines/config/test/ddppo_pointnav_test.yaml
@@ -0,0 +1,57 @@
BASE_TASK_CONFIG_PATH: "configs/tasks/pointnav.yaml"
TRAINER_NAME: "ddppo"
ENV_NAME: "NavRLEnv"
SIMULATOR_GPU_ID: 0
TORCH_GPU_ID: 0
VIDEO_OPTION: []
TENSORBOARD_DIR: ""
EVAL_CKPT_PATH_DIR: "data/test_checkpoints/ddppo/pointnav/ckpt.0.pth"
NUM_PROCESSES: 1
CHECKPOINT_FOLDER: "data/test_checkpoints/ddppo/pointnav/"
NUM_UPDATES: 2
LOG_INTERVAL: 100
CHECKPOINT_INTERVAL: 1

RL:
SUCCESS_REWARD: 2.5
PPO:
# ppo params
clip_param: 0.2
ppo_epoch: 2
num_mini_batch: 1
value_loss_coef: 0.5
entropy_coef: 0.01
lr: 2.5e-4
eps: 1e-5
max_grad_norm: 0.2
num_steps: 16
use_gae: True
gamma: 0.99
tau: 0.95
use_linear_clip_decay: False
use_linear_lr_decay: False
reward_window_size: 50

use_normalized_advantage: False

hidden_size: 512

DDPPO:
sync_frac: 0.6
# The PyTorch distributed backend to use
distrib_backend: GLOO
# Visual encoder backbone
pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth
# Initialize with pretrained weights
pretrained: False
# Initialize just the visual encoder backbone with pretrained weights
pretrained_encoder: False
# Whether or not the visual encoder backbone will be trained.
train_encoder: True
# Whether or not to reset the critic linear layer
reset_critic: True

# Model parameters
backbone: resnet50
rnn_type: LSTM
num_recurrent_layers: 2
64 changes: 64 additions & 0 deletions habitat_baselines/rl/ddppo/README.md
@@ -0,0 +1,64 @@
# Decentralized Distributed PPO

Provides changes to the core baseline PPO algorithm and training script to implement Decentralized Distributed PPO (DD-PPO).
DD-PPO leverages distributed data parallelism to seamlessly scale PPO to hundreds of GPUs with no centralized server.

See the [paper](https://arxiv.org/abs/1911.00357) for more detail.

## Running

Two example launch scripts are provided: a single-node script, `single_node.sh`, that leverages `torch.distributed.launch` to create multiple workers on one machine,
and a multi-node script, `multi_node_slurm.sh`, that leverages [SLURM](https://slurm.schedmd.com/documentation.html) to create all the workers across multiple nodes.

The two recommended backends are GLOO and NCCL. Use NCCL if your system has it, and GLOO otherwise.

See [pytorch's distributed docs](https://pytorch.org/docs/stable/distributed.html#backends-that-come-with-pytorch)
and [pytorch's distributed tutorial](https://pytorch.org/tutorials/intermediate/dist_tuto.html) for more information.
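
As a rough illustration (a sketch, not code from the provided scripts), a worker typically picks a backend and joins the process group along these lines; the `env://` init method assumes the launcher sets `MASTER_ADDR`, `MASTER_PORT`, `RANK`, and `WORLD_SIZE`:

```python
import os

import torch
import torch.distributed as distrib


def init_distributed(preferred_backend: str = "GLOO"):
    """Join the torch.distributed process group using env-var initialization."""
    backend = preferred_backend.lower()
    if backend == "nccl" and not torch.cuda.is_available():
        backend = "gloo"  # fall back if no GPU is usable on this worker
    distrib.init_process_group(backend=backend, init_method="env://")
    return distrib.get_rank(), distrib.get_world_size()


if __name__ == "__main__":
    rank, world_size = init_distributed(os.environ.get("BACKEND", "GLOO"))
    print(f"worker {rank} of {world_size} initialized")
```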

## Pretrained Models (PointGoal Navigation with GPS+Compass)


All weights are available as a zip archive [here](https://drive.google.com/open?id=1ueXuIqP2HZ0oxhpDytpc3hpciXSd8H16).

### Depth models

| Architecture | Training Data | Val SPL | Test SPL | URL |
| ------------ | ------------- | ------- | -------- | --- |
| ResNet50 + LSTM512 | Gibson 4+ | 0.922 | 0.917 | |
| ResNet50 + LSTM512 | Gibson 4+ and MP3D(train/val/test)<br/> **Caution:** Trained on MP3D val and test | 0.956 | 0.941 | |
| ResNet50 + LSTM512 | Gibson 2+ | 0.956 | 0.944 | |
| SE-ResNeXt50 + LSTM512 | Gibson 2+ | 0.959 | 0.943 | |
| SE-ResNeXt101 + LSTM1024 | Gibson 2+ | 0.969 | 0.948 | |

### RGB models

| Architecture | Training Data | Val SPL | Test SPL | URL |
| ------------ | ------------- | ------- | -------- | --- |
| ResNet50 + LSTM512 | Gibson 2+ and MP3D(train/val/test)<br/> **Caution:** Trained on MP3D val and test | | | |
| SE-ResNeXt50 + LSTM512 | Gibson 2+ and MP3D(train/val/test)<br/> **Caution:** Trained on MP3D val and test | 0.933 | 0.920 | |


### Blind Models

| Architecture | Training Data | Val SPL | Test SPL | URL |
| ------------ | ------------- | ------- | -------- | --- |
| LSTM512 | Gibson 0+ and MP3D(train/val/test)<br/> **Caution:** Trained on MP3D val and test | 0.729 | 0.676 | |

**Note:** Evaluation was done with *sampled* actions.

All model weights are subject to [Matterport3D Terms-of-Use](http://dovahkiin.stanford.edu/matterport/public/MP_TOS.pdf).


## Citing

If you use DD-PPO or the model weights in your research, please cite the following [paper](https://arxiv.org/abs/1911.00357):

@article{wijmans2020ddppo,
title = {{D}ecentralized {D}istributed {PPO}: {S}olving {P}oint{G}oal {N}avigation},
author = {Erik Wijmans and Abhishek Kadian and Ari Morcos and Stefan Lee and Irfan Essa and Devi Parikh and Manolis Savva and Dhruv Batra},
journal = {International Conference on Learning Representations (ICLR)},
year = {2020}
}
7 changes: 7 additions & 0 deletions habitat_baselines/rl/ddppo/__init__.py
@@ -0,0 +1,7 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from habitat_baselines.rl.ddppo.algo import DDPPOTrainer
7 changes: 7 additions & 0 deletions habitat_baselines/rl/ddppo/algo/__init__.py
@@ -0,0 +1,7 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from habitat_baselines.rl.ddppo.algo.ddppo_trainer import DDPPOTrainer
