Decentralized Distributed PPO (#245)
Add DD-PPO to habitat-baselines
erikwijmans committed Jan 20, 2020
1 parent 44c8be1 commit 85b7907
Showing 24 changed files with 1,671 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .editorconfig
@@ -20,4 +20,4 @@ include_trailing_comma = true
ensure_newline_before_comments=true
use_parentheses = true
known_first_party = habitat,habitat_sim,habitat_baselines,version
known_third_party = PIL,attr,conf,gym,imageio,matplotlib,mock,numba,numpy,orbslam2,pyrobot,pytest,quaternion,requests,scipy,setuptools,torch,torchvision,tqdm,yacs
known_third_party = PIL,attr,conf,gym,ifcfg,imageio,matplotlib,mock,numba,numpy,orbslam2,pyrobot,pytest,quaternion,requests,scipy,setuptools,torch,torchvision,tqdm,yacs
1 change: 1 addition & 0 deletions habitat_baselines/__init__.py
@@ -5,6 +5,7 @@
# LICENSE file in the root directory of this source tree.

from habitat_baselines.common.base_trainer import BaseRLTrainer, BaseTrainer
from habitat_baselines.rl.ddppo import DDPPOTrainer
from habitat_baselines.rl.ppo.ppo_trainer import PPOTrainer, RolloutStorage

__all__ = ["BaseTrainer", "BaseRLTrainer", "PPOTrainer", "RolloutStorage"]
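
A quick sketch of how the newly imported `DDPPOTrainer` is typically reached at run time. This assumes the standard habitat-baselines entry points (`get_config`, `baseline_registry`) and that the trainer registers itself under the name `"ddppo"`, as the shipped configs' `TRAINER_NAME: "ddppo"` suggests:

```python
from habitat_baselines.common.baseline_registry import baseline_registry
from habitat_baselines.config.default import get_config

# Hypothetical usage; the config path is the one added in this commit.
config = get_config("habitat_baselines/config/pointnav/ddppo_pointnav.yaml")
trainer_cls = baseline_registry.get_trainer(config.TRAINER_NAME)  # "ddppo"
trainer = trainer_cls(config)
trainer.train()
```
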
43 changes: 24 additions & 19 deletions habitat_baselines/common/rollout_storage.py
@@ -55,7 +55,7 @@ def __init__(
self.actions = self.actions.long()
self.prev_actions = self.prev_actions.long()

self.masks = torch.ones(num_steps + 1, num_envs, 1)
self.masks = torch.zeros(num_steps + 1, num_envs, 1)

self.num_steps = num_steps
self.step = 0
@@ -97,21 +97,26 @@ def insert(
self.rewards[self.step].copy_(rewards)
self.masks[self.step + 1].copy_(masks)

self.step = (self.step + 1) % self.num_steps
self.step = self.step + 1

def after_update(self):
for sensor in self.observations:
self.observations[sensor][0].copy_(self.observations[sensor][-1])
self.observations[sensor][0].copy_(
self.observations[sensor][self.step]
)

self.recurrent_hidden_states[0].copy_(self.recurrent_hidden_states[-1])
self.masks[0].copy_(self.masks[-1])
self.prev_actions[0].copy_(self.prev_actions[-1])
self.recurrent_hidden_states[0].copy_(
self.recurrent_hidden_states[self.step]
)
self.masks[0].copy_(self.masks[self.step])
self.prev_actions[0].copy_(self.prev_actions[self.step])
self.step = 0

def compute_returns(self, next_value, use_gae, gamma, tau):
if use_gae:
self.value_preds[-1] = next_value
self.value_preds[self.step] = next_value
gae = 0
for step in reversed(range(self.rewards.size(0))):
for step in reversed(range(self.step)):
delta = (
self.rewards[step]
+ gamma * self.value_preds[step + 1] * self.masks[step + 1]
@@ -120,8 +125,8 @@ def compute_returns(self, next_value, use_gae, gamma, tau):
gae = delta + gamma * tau * self.masks[step + 1] * gae
self.returns[step] = gae + self.value_preds[step]
else:
self.returns[-1] = next_value
for step in reversed(range(self.rewards.size(0))):
self.returns[self.step] = next_value
for step in reversed(range(self.step)):
self.returns[step] = (
self.returns[step + 1] * gamma * self.masks[step + 1]
+ self.rewards[step]
@@ -153,25 +158,25 @@ def recurrent_generator(self, advantages, num_mini_batch):

for sensor in self.observations:
observations_batch[sensor].append(
self.observations[sensor][:-1, ind]
self.observations[sensor][: self.step, ind]
)

recurrent_hidden_states_batch.append(
self.recurrent_hidden_states[0, :, ind]
)

actions_batch.append(self.actions[:, ind])
prev_actions_batch.append(self.prev_actions[:-1, ind])
value_preds_batch.append(self.value_preds[:-1, ind])
return_batch.append(self.returns[:-1, ind])
masks_batch.append(self.masks[:-1, ind])
actions_batch.append(self.actions[: self.step, ind])
prev_actions_batch.append(self.prev_actions[: self.step, ind])
value_preds_batch.append(self.value_preds[: self.step, ind])
return_batch.append(self.returns[: self.step, ind])
masks_batch.append(self.masks[: self.step, ind])
old_action_log_probs_batch.append(
self.action_log_probs[:, ind]
self.action_log_probs[: self.step, ind]
)

adv_targ.append(advantages[:, ind])
adv_targ.append(advantages[: self.step, ind])

T, N = self.num_steps, num_envs_per_batch
T, N = self.step, num_envs_per_batch

# These are all tensors of size (T, N, -1)
for sensor in observations_batch:
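
With these changes, `compute_returns` runs the return/GAE recursion only over the `self.step` transitions actually collected rather than the full buffer. For reference, the recursion itself can be written as a standalone sketch (tensor shapes here are assumptions, not taken from this file):

```python
import torch

def gae_returns(rewards, values, masks, gamma=0.99, tau=0.95):
    """Sketch of the GAE recursion used above.

    rewards: (T, N, 1); values: (T + 1, N, 1), where values[T] is the
    bootstrap value of the state after the last step; masks: (T + 1, N, 1),
    with 0 marking an episode boundary.
    """
    T = rewards.size(0)
    returns = torch.zeros_like(values)
    gae = torch.zeros_like(values[0])
    for step in reversed(range(T)):
        delta = (
            rewards[step]
            + gamma * values[step + 1] * masks[step + 1]
            - values[step]
        )
        gae = delta + gamma * tau * masks[step + 1] * gae
        returns[step] = gae + values[step]
    return returns[:T]
```
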
21 changes: 20 additions & 1 deletion habitat_baselines/config/default.py
@@ -62,13 +62,32 @@
_C.RL.PPO.eps = 1e-5
_C.RL.PPO.max_grad_norm = 0.5
_C.RL.PPO.num_steps = 5
_C.RL.PPO.hidden_size = 512
_C.RL.PPO.use_gae = True
_C.RL.PPO.use_linear_lr_decay = False
_C.RL.PPO.use_linear_clip_decay = False
_C.RL.PPO.gamma = 0.99
_C.RL.PPO.tau = 0.95
_C.RL.PPO.reward_window_size = 50
_C.RL.PPO.use_normalized_advantage = True
_C.RL.PPO.hidden_size = 512
# -----------------------------------------------------------------------------
# DECENTRALIZED DISTRIBUTED PROXIMAL POLICY OPTIMIZATION (DD-PPO)
# -----------------------------------------------------------------------------
_C.RL.DDPPO = CN()
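# Fraction of workers that must finish their rollout before the remaining (straggler) workers are preempted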
_C.RL.DDPPO.sync_frac = 0.6
_C.RL.DDPPO.distrib_backend = "GLOO"
_C.RL.DDPPO.rnn_type = "LSTM"
_C.RL.DDPPO.num_recurrent_layers = 2
_C.RL.DDPPO.backbone = "resnet50"
_C.RL.DDPPO.pretrained_weights = "data/ddppo-models/gibson-2plus-resnet50.pth"
# Loads pretrained weights
_C.RL.DDPPO.pretrained = False
# Loads just the visual encoder backbone weights
_C.RL.DDPPO.pretrained_encoder = False
# Whether or not the visual encoder backbone will be trained
_C.RL.DDPPO.train_encoder = True
# Whether or not to reset the critic linear layer
_C.RL.DDPPO.reset_critic = True
# -----------------------------------------------------------------------------
# ORBSLAM2 BASELINE
# -----------------------------------------------------------------------------
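
The `DDPPO` defaults above are ordinary yacs `CfgNode` entries, so they can be overridden from a YAML file or from dotted key/value pairs. A minimal standalone sketch, reusing only keys added in this commit:

```python
from yacs.config import CfgNode as CN

_C = CN()
_C.RL = CN()
_C.RL.DDPPO = CN()
_C.RL.DDPPO.sync_frac = 0.6
_C.RL.DDPPO.distrib_backend = "GLOO"
_C.RL.DDPPO.backbone = "resnet50"
_C.RL.DDPPO.rnn_type = "LSTM"

cfg = _C.clone()
# Dotted-path overrides, e.g. as passed on the command line.
cfg.merge_from_list(["RL.DDPPO.distrib_backend", "NCCL", "RL.DDPPO.sync_frac", 0.5])
cfg.freeze()
print(cfg.RL.DDPPO)
```
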
60 changes: 60 additions & 0 deletions habitat_baselines/config/pointnav/ddppo_pointnav.yaml
@@ -0,0 +1,60 @@
BASE_TASK_CONFIG_PATH: "configs/tasks/pointnav_gibson.yaml"
TRAINER_NAME: "ddppo"
ENV_NAME: "NavRLEnv"
SIMULATOR_GPU_ID: 0
TORCH_GPU_ID: 0
VIDEO_OPTION: []
TENSORBOARD_DIR: "tb"
VIDEO_DIR: "video_dir"
TEST_EPISODE_COUNT: 994
EVAL_CKPT_PATH_DIR: "data/new_checkpoints"
NUM_PROCESSES: 4
SENSORS: ["DEPTH_SENSOR"]
CHECKPOINT_FOLDER: "data/new_checkpoints"
NUM_UPDATES: 10000
LOG_INTERVAL: 10
CHECKPOINT_INTERVAL: 50

RL:
SUCCESS_REWARD: 2.5
PPO:
# ppo params
clip_param: 0.2
ppo_epoch: 2
num_mini_batch: 2
value_loss_coef: 0.5
entropy_coef: 0.01
lr: 2.5e-4
eps: 1e-5
max_grad_norm: 0.2
num_steps: 128
use_gae: True
gamma: 0.99
tau: 0.95
use_linear_clip_decay: False
use_linear_lr_decay: False
reward_window_size: 50

use_normalized_advantage: False

hidden_size: 512

DDPPO:
sync_frac: 0.6
# The PyTorch distributed backend to use
distrib_backend: GLOO
# Visual encoder backbone
pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth
# Initialize with pretrained weights
pretrained: False
# Initialize just the visual encoder backbone with pretrained weights
pretrained_encoder: False
# Whether or not the visual encoder backbone will be trained.
train_encoder: True
# Whether or not to reset the critic linear layer
reset_critic: True

# Model parameters
backbone: resnet50
rnn_type: LSTM
num_recurrent_layers: 2
57 changes: 57 additions & 0 deletions habitat_baselines/config/test/ddppo_pointnav_test.yaml
@@ -0,0 +1,57 @@
BASE_TASK_CONFIG_PATH: "configs/tasks/pointnav.yaml"
TRAINER_NAME: "ddppo"
ENV_NAME: "NavRLEnv"
SIMULATOR_GPU_ID: 0
TORCH_GPU_ID: 0
VIDEO_OPTION: []
TENSORBOARD_DIR: ""
EVAL_CKPT_PATH_DIR: "data/test_checkpoints/ddppo/pointnav/ckpt.0.pth"
NUM_PROCESSES: 1
CHECKPOINT_FOLDER: "data/test_checkpoints/ddppo/pointnav/"
NUM_UPDATES: 2
LOG_INTERVAL: 100
CHECKPOINT_INTERVAL: 1

RL:
SUCCESS_REWARD: 2.5
PPO:
# ppo params
clip_param: 0.2
ppo_epoch: 2
num_mini_batch: 1
value_loss_coef: 0.5
entropy_coef: 0.01
lr: 2.5e-4
eps: 1e-5
max_grad_norm: 0.2
num_steps: 16
use_gae: True
gamma: 0.99
tau: 0.95
use_linear_clip_decay: False
use_linear_lr_decay: False
reward_window_size: 50

use_normalized_advantage: False

hidden_size: 512

DDPPO:
sync_frac: 0.6
# The PyTorch distributed backend to use
distrib_backend: GLOO
# Visual encoder backbone
pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth
# Initialize with pretrained weights
pretrained: False
# Initialize just the visual encoder backbone with pretrained weights
pretrained_encoder: False
# Whether or not the visual encoder backbone will be trained.
train_encoder: True
# Whether or not to reset the critic linear layer
reset_critic: True

# Model parameters
backbone: resnet50
rnn_type: LSTM
num_recurrent_layers: 2
64 changes: 64 additions & 0 deletions habitat_baselines/rl/ddppo/README.md
@@ -0,0 +1,64 @@
# Decentralized Distributed PPO

Provides changes to the core baseline PPO algorithm and training script to implement Decentralized Distributed PPO (DD-PPO).
DD-PPO leverages distributed data parallelism to seamlessly scale PPO to hundreds of GPUs with no centralized server.

See the [paper](https://arxiv.org/abs/1911.00357) for more detail.

## Running

Two example launch scripts are provided: a single-node script, `single_node.sh`, that leverages `torch.distributed.launch` to create multiple workers on one machine,
and a multi-node script, `multi_node_slurm.sh`, that leverages [SLURM](https://slurm.schedmd.com/documentation.html) to create all the workers across multiple nodes.

The two recommended backends are GLOO and NCCL. Use NCCL if your system has it, and GLOO otherwise.

See [pytorch's distributed docs](https://pytorch.org/docs/stable/distributed.html#backends-that-come-with-pytorch)
and [pytorch's distributed tutorial](https://pytorch.org/tutorials/intermediate/dist_tuto.html) for more information.
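
As a rough illustration (a sketch, not code from the provided scripts), a worker typically picks a backend and joins the process group along these lines; the `env://` init method assumes the launcher sets `MASTER_ADDR`, `MASTER_PORT`, `RANK`, and `WORLD_SIZE`:

```python
import os

import torch
import torch.distributed as distrib


def init_distributed(preferred_backend: str = "GLOO"):
    """Join the torch.distributed process group using env-var initialization."""
    backend = preferred_backend.lower()
    if backend == "nccl" and not torch.cuda.is_available():
        backend = "gloo"  # fall back if no GPU is usable on this worker
    distrib.init_process_group(backend=backend, init_method="env://")
    return distrib.get_rank(), distrib.get_world_size()


if __name__ == "__main__":
    rank, world_size = init_distributed(os.environ.get("BACKEND", "GLOO"))
    print(f"worker {rank} of {world_size} initialized")
```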

## Pretrained Models (PointGoal Navigation with GPS+Compass)


All weights are available as a zip archive [here](https://drive.google.com/open?id=1ueXuIqP2HZ0oxhpDytpc3hpciXSd8H16).

### Depth models

| Architecture | Training Data | Val SPL | Test SPL | URL |
| ------------ | ------------- | ------- | -------- | --- |
| ResNet50 + LSTM512 | Gibson 4+ | 0.922 | 0.917 | |
| ResNet50 + LSTM512 | Gibson 4+ and MP3D(train/val/test)<br/> **Caution:** Trained on MP3D val and test | 0.956 | 0.941 | |
| ResNet50 + LSTM512 | Gibson 2+ | 0.956 | 0.944 | |
| SE-ResNeXt50 + LSTM512 | Gibson 2+ | 0.959 | 0.943 | |
| SE-ResNeXt101 + LSTM1024 | Gibson 2+ | 0.969 | 0.948 | |

### RGB models

| Architecture | Training Data | Val SPL | Test SPL | URL |
| ------------ | ------------- | ------- | -------- | --- |
| ResNet50 + LSTM512 | Gibson 2+ and MP3D(train/val/test)<br/> **Caution:** Trained on MP3D val and test | | | |
| SE-ResNeXt50 + LSTM512 | Gibson 2+ and MP3D(train/val/test)<br/> **Caution:** Trained on MP3D val and test | 0.933 | 0.920 | |


### Blind Models

| Architecture | Training Data | Val SPL | Test SPL | URL |
| ------------ | ------------- | ------- | -------- | --- |
| LSTM512 | Gibson 0+ and MP3D(train/val/test)<br/> **Caution:** Trained on MP3D val and test | 0.729 | 0.676 | |

**Note:** Evaluation was done with *sampled* actions.

All model weights are subject to [Matterport3D Terms-of-Use](http://dovahkiin.stanford.edu/matterport/public/MP_TOS.pdf).


## Citing

If you use DD-PPO or the model weights in your research, please cite the following [paper](https://arxiv.org/abs/1911.00357):

@article{wijmans2020ddppo,
title = {{D}ecentralized {D}istributed {PPO}: {S}olving {P}oint{G}oal {N}avigation},
author = {Erik Wijmans and Abhishek Kadian and Ari Morcos and Stefan Lee and Irfan Essa and Devi Parikh and Manolis Savva and Dhruv Batra},
journal = {International Conference on Learning Representations (ICLR)},
year = {2020}
}
7 changes: 7 additions & 0 deletions habitat_baselines/rl/ddppo/__init__.py
@@ -0,0 +1,7 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from habitat_baselines.rl.ddppo.algo import DDPPOTrainer
7 changes: 7 additions & 0 deletions habitat_baselines/rl/ddppo/algo/__init__.py
@@ -0,0 +1,7 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from habitat_baselines.rl.ddppo.algo.ddppo_trainer import DDPPOTrainer
