documentation/policy (#151)
* update autodoc options

* add DeterministicPolicy documentation

* add docs to Gaussian policy

* add docs to Greedy policy

* add documentation for the SoftDeterministicPolicy

* add SoftmaxPolicy documentation

* tweak policy documentation wording

* update Environment documentation

* clean up some other issues with documentation
cpnota committed Jun 7, 2020
1 parent 794ea2d commit 5547867
Showing 10 changed files with 84 additions and 11 deletions.
9 changes: 3 additions & 6 deletions all/environments/abstract.py
@@ -41,14 +41,11 @@ def step(self, action):
Returns
-------
State
The state of the environment after the action is applied
all.environments.State
The State of the environment after the action is applied.
This State object includes both the done flag and any additional "info".
float
The reward achieved by the previous action
done
True if the environment has entered a terminal state and should be reset
info
Diagnostic information useful for debugging
"""

@abstractmethod
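For context, the updated docstring implies that step returns the new State (which now carries the done flag and any "info") together with a float reward. A minimal interaction-loop sketch under that reading, using hypothetical env and agent objects, might look like:

# Hedged sketch based only on the documented return values; `env` and `agent`
# are placeholders, and `state.done` is assumed from the note that the State
# object includes the done flag.
state = env.reset()
episode_return = 0.0
while not state.done:
    action = agent.act(state)          # pick an action for the current State
    state, reward = env.step(action)   # new State plus a float reward
    episode_return += reward
print('episode return:', episode_return)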
3 changes: 1 addition & 2 deletions all/nn/__init__.py
@@ -10,7 +10,6 @@ class RLNetwork(nn.Module):
"""
Wraps a network such that States can be given as input.
"""

def __init__(self, model, _=None):
super().__init__()
self.model = model
@@ -20,7 +19,7 @@ def forward(self, state):
return self.model(state.features.float()) * state.mask.float().unsqueeze(-1)

class Aggregation(nn.Module):
"""len()
"""
Aggregation layer for the Dueling architecture.
https://arxiv.org/abs/1511.06581
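The Aggregation layer mentioned above refers to the dueling combination Q(s, a) = V(s) + A(s, a) - mean_a A(s, a) from the linked paper. A small stand-alone sketch of that combination (illustrative, not the library's exact code):

import torch

def dueling_aggregation(value, advantages):
    # value: (batch, 1) state-value estimates
    # advantages: (batch, num_actions) advantage estimates
    # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a), per https://arxiv.org/abs/1511.06581
    return value + advantages - advantages.mean(dim=1, keepdim=True)

value = torch.tensor([[1.0]])
advantages = torch.tensor([[0.0, 1.0, 2.0]])
print(dueling_aggregation(value, advantages))  # tensor([[0., 1., 2.]])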
2 changes: 2 additions & 0 deletions all/optim/__init__.py
@@ -1 +1,3 @@
from .scheduler import LinearScheduler, Schedulable

__all__ = ['Schedulable', 'LinearScheduler']
13 changes: 13 additions & 0 deletions all/policies/deterministic.py
@@ -4,6 +4,18 @@


class DeterministicPolicy(Approximation):
'''
A DDPG-style deterministic policy.
Args:
model (torch.nn.Module): A Pytorch module representing the policy network.
The input shape should be the same as the shape of the state space,
and the output shape should be the same as the shape of the action space.
optimizer (torch.optim.Optimizer): An optimizer initialized with the
model parameters, e.g. SGD, Adam, RMSprop, etc.
action_space (gym.spaces.Box): The Box representing the action space.
kwargs (optional): Any other arguments accepted by all.approximation.Approximation
'''
def __init__(
self,
model,
@@ -20,6 +32,7 @@ def __init__(
**kwargs
)


class DeterministicPolicyNetwork(RLNetwork):
def __init__(self, model, space):
super().__init__(model)
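A construction sketch based only on the arguments documented above; the layer sizes, optimizer settings, and action bounds are illustrative assumptions rather than a recommended configuration:

import torch
from gym.spaces import Box
from all.policies import DeterministicPolicy

state_dim, action_dim = 8, 2
model = torch.nn.Sequential(
    torch.nn.Linear(state_dim, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, action_dim),   # one output per action dimension
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
action_space = Box(low=-1.0, high=1.0, shape=(action_dim,))
policy = DeterministicPolicy(model, optimizer, action_space)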
18 changes: 18 additions & 0 deletions all/policies/gaussian.py
@@ -6,6 +6,24 @@


class GaussianPolicy(Approximation):
'''
A Gaussian stochastic policy.
This policy will choose actions from a distribution represented by a spherical Gaussian.
The first n outputs of the model will be squashed to [-1, 1] through a tanh function, and then
scaled to the given action_space, and the remaining n outputs will define the amount of noise added.
Args:
model (torch.nn.Module): A Pytorch module representing the policy network.
The input shape should be the same as the shape of the state (or feature) space,
and the output shape should be double the size of the action space.
The first n outputs will be the unscaled mean of the action for each dimension,
and the second n outputs will be the logarithm of the variance.
optimizer (torch.optim.Optimizer): An optimizer initialized with the
model parameters, e.g. SGD, Adam, RMSprop, etc.
action_space (gym.spaces.Box): The Box representing the action space.
kwargs (optional): Any other arguments accepted by all.approximation.Approximation
'''
def __init__(
self,
model,
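Following the documented convention that the model emits 2 * n values (n unscaled means followed by n log-variances), here is a hedged construction sketch with illustrative sizes:

import torch
from gym.spaces import Box
from all.policies import GaussianPolicy

state_dim, action_dim = 8, 2
model = torch.nn.Sequential(
    torch.nn.Linear(state_dim, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 2 * action_dim),  # n means followed by n log-variances
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
action_space = Box(low=-1.0, high=1.0, shape=(action_dim,))
policy = GaussianPolicy(model, optimizer, action_space)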
13 changes: 13 additions & 0 deletions all/policies/greedy.py
@@ -3,6 +3,19 @@
from all.optim import Schedulable

class GreedyPolicy(Schedulable):
'''
An "epsilon-greedy" action selection policy for discrete action spaces.
This policy will usually choose the optimal action according to an approximation
of the action value function (the "q-function"), but with probability epsilon will
choose a random action instead. GreedyPolicy is a Schedulable, meaning that
epsilon can be varied over time by passing a Scheduler object.
Args:
q (all.approximation.QNetwork): The action-value or "q-function"
num_actions (int): The number of available actions.
epsilon (float, optional): The probability of selecting a random action.
'''
def __init__(
self,
q,
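To make the selection rule concrete, here is a library-agnostic sketch of epsilon-greedy action selection (not GreedyPolicy's actual code, which queries a QNetwork and supports a scheduled epsilon):

import numpy as np

def epsilon_greedy(q_values, epsilon):
    # With probability epsilon pick a uniformly random action;
    # otherwise pick the action with the highest estimated value.
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_values))
    return int(np.argmax(q_values))

action = epsilon_greedy(np.array([0.1, 0.5, 0.2]), epsilon=0.1)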
16 changes: 15 additions & 1 deletion all/policies/soft_deterministic.py
@@ -3,6 +3,20 @@
from all.nn import RLNetwork

class SoftDeterministicPolicy(Approximation):
'''
A "soft" deterministic policy compatible with soft actor-critic (SAC).
Args:
model (torch.nn.Module): A Pytorch module representing the policy network.
The input shape should be the same as the shape of the state (or feature) space,
and the output shape should be double the size of the action space.
The first n outputs will be the unscaled mean of the action for each dimension,
and the second n outputs will be the logarithm of the variance.
optimizer (torch.optim.Optimizer): An optimizer initialized with the
model parameters, e.g. SGD, Adam, RMSprop, etc.
action_space (gym.spaces.Box): The Box representing the action space.
kwargs (optional): Any other arguments accepted by all.approximation.Approximation
'''
def __init__(
self,
model,
@@ -32,7 +46,7 @@ def forward(self, state):
return self._squash(normal.loc)

def _normal(self, outputs):
means = outputs[:, 0 : self._action_dim]
means = outputs[:, 0:self._action_dim]
logvars = outputs[:, self._action_dim:]
std = logvars.mul(0.5).exp_()
return torch.distributions.normal.Normal(means, std)
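The _normal helper above splits the 2 * n network outputs into n means and n log-variances. A stand-alone sketch of that convention plus the tanh squashing used by SAC-style policies (an illustrative version; the library's own forward and _squash methods are the authoritative implementation):

import torch

def sample_squashed(outputs, action_dim):
    # Split the 2n outputs into n means and n log-variances,
    # sample from the resulting Gaussian, and squash to (-1, 1) with tanh.
    means = outputs[:, :action_dim]
    logvars = outputs[:, action_dim:]
    std = logvars.mul(0.5).exp()
    normal = torch.distributions.Normal(means, std)
    return torch.tanh(normal.rsample())

outputs = torch.randn(1, 4)           # batch of 1, action_dim = 2
action = sample_squashed(outputs, 2)  # entries lie in (-1, 1)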
11 changes: 11 additions & 0 deletions all/policies/softmax.py
@@ -5,6 +5,17 @@


class SoftmaxPolicy(Approximation):
'''
A softmax (or Boltzmann) stochastic policy for discrete actions.
Args:
model (torch.nn.Module): A Pytorch module representing the policy network.
The input shape should be the same as the shape of the state (or feature) space,
and the output should be a vector the size of the action set.
optimizer (torch.optim.Optimizer): An optimizer initialized with the
model parameters, e.g. SGD, Adam, RMSprop, etc.
kwargs (optional): Any other arguments accepted by all.approximation.Approximation
'''
def __init__(
self,
model,
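For reference, the core of softmax (Boltzmann) action selection over a vector of logits can be sketched independently of the library as:

import torch

def softmax_sample(logits):
    # Turn a vector of action preferences (logits) into a categorical
    # distribution and sample an action index from it.
    dist = torch.distributions.Categorical(logits=logits)
    return dist.sample()

logits = torch.tensor([1.0, 2.0, 0.5])  # one entry per discrete action
action = softmax_sample(logits)         # index of the sampled action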
9 changes: 7 additions & 2 deletions docs/source/conf.py
@@ -40,12 +40,17 @@
]

# Autosummary settings
autodoc_default_flags = ['members']
autodoc_default_options = {
'members': True,
'undoc-members': True,
'show-inheritance': True
}
autosummary_generate = True
autodoc_inherit_docstrings = True

# Mock requirements to save resources during doc build machine setup
autodoc_mock_imports = [
'torch',
# 'torch',
'torchvision',
]

1 change: 1 addition & 0 deletions docs/source/modules/nn.rst
@@ -5,4 +5,5 @@ all.nn
=================

.. automodule:: all.nn
:ignore-module-all:
:members:
