Merge pull request #152 from cpnota/release/0.5.2
release/0.5.2
cpnota committed Jun 7, 2020
2 parents 68d355a + ff83a20 commit 6d1111a
Showing 19 changed files with 186 additions and 35 deletions.
2 changes: 1 addition & 1 deletion all/approximation/q_dist.py
@@ -14,7 +14,7 @@ def __init__(
v_min,
v_max,
name="q_dist",
**kwargs,
**kwargs
):
device = next(model.parameters()).device
self.n_actions = n_actions
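The only functional change here is dropping the trailing comma after **kwargs. A minimal sketch of why that matters, assuming the intent is compatibility with interpreters older than Python 3.6 (the diff itself does not state a reason, and the function name below is illustrative):

# SyntaxError on Python 3.5 and earlier; accepted from Python 3.6 onward:
def make_q_dist(n_actions, **kwargs,):
    pass

# Portable form, matching the updated signature style:
def make_q_dist(n_actions, **kwargs):
    pass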
4 changes: 2 additions & 2 deletions all/approximation/v_network_test.py
@@ -39,10 +39,10 @@ def test_multi_reinforce(self):
mask=torch.tensor([1, 1, 0, 1, 0, 0])
)
result1 = self.v(states[0:2])
result2 = self.v(states[2:4])
result3 = self.v(states[4:6])
self.v.reinforce(loss(result1, torch.tensor([1, 2])).float())
result2 = self.v(states[2:4])
self.v.reinforce(loss(result2, torch.tensor([1, 1])).float())
result3 = self.v(states[4:6])
self.v.reinforce(loss(result3, torch.tensor([1, 2])).float())
with self.assertRaises(Exception):
self.v.reinforce(loss(result3, torch.tensor([1, 2])).float())
9 changes: 3 additions & 6 deletions all/environments/abstract.py
@@ -41,14 +41,11 @@ def step(self, action):
Returns
-------
State
The state of the environment after the action is applied
all.environments.State
The State of the environment after the action is applied.
This State object includes both the done flag and any additional "info"
float
The reward achieved by the previous action
done
True if the environment has entered a terminal state and should be reset
info
Diagnostic information useful for debugging
"""

@abstractmethod
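The revised docstring folds the old done and info entries into the returned State. A minimal usage sketch under that reading, assuming a concrete GymEnvironment subclass, a torch-tensor action, and that step returns the (State, float) pair the docstring lists; the environment id and action here are placeholders:

import torch
from all.environments import GymEnvironment

env = GymEnvironment('CartPole-v0')
env.reset()
action = torch.tensor([env.action_space.sample()])  # placeholder; an Agent normally supplies this
state, reward = env.step(action)
if state.done:  # the terminal flag (and any extra "info") now lives on the State
    env.reset()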
2 changes: 1 addition & 1 deletion all/experiments/parallel_env_experiment.py
@@ -32,7 +32,7 @@ def __init__(

# test state
self._test_episodes = 100
self._test_episodes_started = 0
self._test_episodes_started = self._n_envs
self._test_returns = []
self._should_save_returns = [True] * self._n_envs

2 changes: 1 addition & 1 deletion all/experiments/single_env_experiment.py
@@ -76,7 +76,7 @@ def _run_training_episode(self):
def _run_test_episode(self):
# initialize the episode
self._env.reset()
action = self._agent.act(self._env.state, self._env.reward)
action = self._agent.eval(self._env.state, self._env.reward)
returns = 0

# loop until the episode is finished
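Test episodes now call eval instead of act. A hedged sketch of the distinction this change relies on, inferred from the surrounding experiment code rather than taken from the Agent source: act performs a learning update and may explore, while eval only selects an action.

import random

class SketchAgent:
    # Illustrative only; not the library's Agent implementation.
    def __init__(self, q, epsilon=0.1):
        self.q = q              # callable: state -> list of action values
        self.epsilon = epsilon

    def act(self, state, reward):
        # training path: a learning update would happen here, then possibly explore
        values = self.q(state)
        if random.random() < self.epsilon:
            return random.randrange(len(values))
        return max(range(len(values)), key=values.__getitem__)

    def eval(self, state, reward):
        # evaluation path: greedy action only, no learning and no exploration
        values = self.q(state)
        return max(range(len(values)), key=values.__getitem__)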
4 changes: 2 additions & 2 deletions all/experiments/single_env_experiment_test.py
@@ -76,8 +76,8 @@ def test_writes_test_returns(self):
experiment = MockExperiment(dqn(), self.env, quiet=True)
experiment.train(episodes=5)
returns = experiment.test(episodes=4)
expected_mean = 10.25
expected_std = 1.0897247358851685
expected_mean = 9.5
expected_std = 0.5
np.testing.assert_equal(np.mean(returns), expected_mean)
np.testing.assert_equal(
experiment._writer.data["evaluation/returns-test/mean"]["values"],
5 changes: 1 addition & 4 deletions all/experiments/writer.py
@@ -14,8 +14,6 @@ class ExperimentWriter(SummaryWriter, Writer):
tagging the run with a combination of the agent name, the commit hash of the
current git repo of the working directory (if any), and the current time.
Also writes summary statistics into CSV files.
Args:
experiment (all.experiments.Experiment): The Experiment associated with the Writer object.
agent_name (str): The name of the Agent the Experiment is being performed on
@@ -24,7 +22,7 @@ class ExperimentWriter(SummaryWriter, Writer):
'''
def __init__(self, experiment, agent_name, env_name, loss=True):
self.env_name = env_name
current_time = str(datetime.now())
current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')
os.makedirs(
os.path.join(
"runs", ("%s %s %s" % (agent_name, COMMIT_HASH, current_time)), env_name
@@ -51,7 +49,6 @@ def add_schedule(self, name, value, step="frame"):
def add_scalar(self, name, value, step="frame"):
'''
Log an arbitrary scalar.
Args:
name (str): The tag to associate with the scalar
value (number): The value of the scalar at the current step
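For reference, a quick comparison of the old and new run-directory timestamps (output values are illustrative):

from datetime import datetime

now = datetime.now()
print(str(now))                              # e.g. '2020-06-07 12:34:56.789012'
print(now.strftime('%Y-%m-%d %H:%M:%S %f'))  # e.g. '2020-06-07 12:34:56 789012'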
3 changes: 1 addition & 2 deletions all/nn/__init__.py
@@ -10,7 +10,6 @@ class RLNetwork(nn.Module):
"""
Wraps a network such that States can be given as input.
"""

def __init__(self, model, _=None):
super().__init__()
self.model = model
@@ -20,7 +19,7 @@ def forward(self, state):
return self.model(state.features.float()) * state.mask.float().unsqueeze(-1)

class Aggregation(nn.Module):
"""len()
"""
Aggregation layer for the Dueling architecture.
https://arxiv.org/abs/1511.06581
2 changes: 2 additions & 0 deletions all/optim/__init__.py
@@ -1 +1,3 @@
from .scheduler import LinearScheduler, Schedulable

__all__ = ['Schedulable', 'LinearScheduler']
13 changes: 13 additions & 0 deletions all/policies/deterministic.py
@@ -4,6 +4,18 @@


class DeterministicPolicy(Approximation):
'''
A DDPG-style deterministic policy.
Args:
model (torch.nn.Module): A Pytorch module representing the policy network.
The input shape should be the same as the shape of the state space,
and the output shape should be the same as the shape of the action space.
optimizer (torch.optim.Optimizer): An optimizer initialized with the
model parameters, e.g. SGD, Adam, RMSprop, etc.
action_space (gym.spaces.Box): The Box representing the action space.
kwargs (optional): Any other arguments accepted by all.approximation.Approximation
'''
def __init__(
self,
model,
@@ -20,6 +32,7 @@ def __init__(
**kwargs
)


class DeterministicPolicyNetwork(RLNetwork):
def __init__(self, model, space):
super().__init__(model)
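A minimal construction sketch following the new docstring; the layer sizes, learning rate, and choice of nn.Linear0 are arbitrary illustrations, and it assumes all.nn re-exports the torch.nn layers used here:

import numpy as np
import torch
from gym.spaces import Box
from all import nn
from all.policies import DeterministicPolicy

STATE_DIM, ACTION_DIM = 4, 2
model = nn.Sequential(
    nn.Linear(STATE_DIM, 64),
    nn.ReLU(),
    nn.Linear0(64, ACTION_DIM)  # output shape matches the action space
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
space = Box(np.array([-1., -1.]), np.array([1., 1.]))
policy = DeterministicPolicy(model, optimizer, space)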
14 changes: 5 additions & 9 deletions all/policies/deterministic_test.py
@@ -57,28 +57,24 @@ def test_target(self):
self.space,
target=FixedTarget(3)
)

# choose initial action
state = State(torch.ones(1, STATE_DIM))
action = self.policy(state)
tt.assert_equal(action, torch.zeros(1, ACTION_DIM))

# run update step, make sure target network doesn't change
action.sum().backward(retain_graph=True)
self.policy(state).sum().backward()
self.policy.step()
tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

# again...
action.sum().backward(retain_graph=True)
self.policy(state).sum().backward()
self.policy.step()
tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

# third time, target should be updated
action.sum().backward(retain_graph=True)
self.policy(state).sum().backward()
self.policy.step()
tt.assert_allclose(
self.policy.eval(state),
torch.tensor([[-0.595883, -0.595883, -0.595883]]),
self.policy.target(state),
torch.tensor([[-0.574482, -0.574482, -0.574482]]),
atol=1e-4,
)

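The rewritten assertions depend on FixedTarget(3) holding the target network fixed until the third step. A hedged sketch of that behavior as inferred from this test, not from the FixedTarget source (the class and method names below are hypothetical):

import copy

class FixedTargetSketch:
    def __init__(self, period):
        self.period = period
        self._updates = 0
        self._target = None

    def init(self, model):
        # start with a frozen copy of the online network
        self._target = copy.deepcopy(model)

    def update(self, model):
        # copy the online weights across only every `period` updates
        self._updates += 1
        if self._updates % self.period == 0:
            self._target.load_state_dict(model.state_dict())

    def __call__(self, *inputs):
        return self._target(*inputs)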
18 changes: 18 additions & 0 deletions all/policies/gaussian.py
@@ -6,6 +6,24 @@


class GaussianPolicy(Approximation):
'''
A Gaussian stochastic policy.
This policy will choose actions from a distribution represented by a spherical Gaussian.
The first n outputs of the model will be squashed to [-1, 1] through a tanh function, and then
scaled to the given action_space, and the remaining n outputs will define the amount of noise added.
Args:
model (torch.nn.Module): A Pytorch module representing the policy network.
The input shape should be the same as the shape of the state (or feature) space,
and the output shape should be double the size of the action space.
The first n outputs will be the unscaled mean of the action for each dimension,
and the second n outputs will be the logarithm of the variance.
optimizer (torch.optim.Optimizer): An optimizer initialized with the
model parameters, e.g. SGD, Adam, RMSprop, etc.
action_space (gym.spaces.Box): The Box representing the action space.
kwargs (optional): Any other arguments accepted by all.approximation.Approximation
'''
def __init__(
self,
model,
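A small sketch of the output convention the new docstring describes: split the model output into mean and log-variance, then squash and rescale the mean to the Box (a hedged illustration, not the class's internal code):

import torch

def split_and_scale(outputs, action_dim, low, high):
    # outputs: (batch, 2 * action_dim) tensor produced by the policy model
    means = outputs[:, :action_dim]    # unscaled means
    logvars = outputs[:, action_dim:]  # log of the variance
    std = (0.5 * logvars).exp()        # log-variance -> standard deviation
    center = (high + low) / 2
    half_range = (high - low) / 2
    scaled_mean = torch.tanh(means) * half_range + center  # squash to [-1, 1], then scale to the Box
    return scaled_mean, std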
13 changes: 13 additions & 0 deletions all/policies/greedy.py
@@ -3,6 +3,19 @@
from all.optim import Schedulable

class GreedyPolicy(Schedulable):
'''
An "epsilon-greedy" action selection policy for discrete action spaces.
This policy will usually choose the optimal action according to an approximation
of the action value function (the "q-function"), but with probability epsilon will
choose a random action instead. GreedyPolicy is a Schedulable, meaning that
epsilon can be varied over time by passing a Scheduler object.
Args:
q (all.approximation.QNetwork): The action-value or "q-function"
num_actions (int): The number of available actions.
epsilon (float, optional): The probability of selecting a random action.
'''
def __init__(
self,
q,
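A minimal sketch of the epsilon-greedy rule the docstring describes (illustrative only, not the class's implementation):

import numpy as np
import torch

def epsilon_greedy(q_values, epsilon):
    # q_values: 1-D tensor of estimated action values for the current state
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_values))  # random exploratory action
    return int(torch.argmax(q_values))           # greedy action

# e.g. epsilon_greedy(torch.tensor([0.1, 0.5, 0.2]), epsilon=0.1) usually returns 1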
41 changes: 36 additions & 5 deletions all/policies/soft_deterministic.py
@@ -3,6 +3,20 @@
from all.nn import RLNetwork

class SoftDeterministicPolicy(Approximation):
'''
A "soft" deterministic policy compatible with soft actor-critic (SAC).
Args:
model (torch.nn.Module): A Pytorch module representing the policy network.
The input shape should be the same as the shape of the state (or feature) space,
and the output shape should be double the size of the action space.
The first n outputs will be the unscaled mean of the action for each dimension,
and the second n outputs will be the logarithm of the variance.
optimizer (torch.optim.Optimizer): An optimizer initialized with the
model parameters, e.g. SGD, Adam, RMSprop, etc.
action_space (gym.spaces.Box): The Box representing the action space.
kwargs (optional): Any other arguments accepted by all.approximation.Approximation
'''
def __init__(
self,
model,
@@ -32,18 +46,35 @@ def forward(self, state):
return self._squash(normal.loc)

def _normal(self, outputs):
means = outputs[:, 0 : self._action_dim]
means = outputs[:, 0:self._action_dim]
logvars = outputs[:, self._action_dim:]
std = logvars.mul(0.5).exp_()
return torch.distributions.normal.Normal(means, std)

def _sample(self, normal):
raw = normal.rsample()
action = self._squash(raw)
log_prob = self._log_prob(normal, raw)
return self._squash(raw), log_prob

def _log_prob(self, normal, raw):
'''
Compute the log probability of a raw action after the action is squashed.
Both inputs act on the raw underlying distribution.
Because tanh_mean does not affect the density, we can ignore it.
However, tanh_scale will affect the relative contribution of each component.
See Appendix C of the Soft Actor-Critic paper.
Args:
normal (torch.distributions.normal.Normal): The "raw" normal distribution.
raw (torch.Tensor): The "raw" action.
Returns:
torch.Tensor: The log probability of the raw action, accounting for the effects of tanh.
'''
log_prob = normal.log_prob(raw)
log_prob -= torch.log(1 - action.pow(2) + 1e-6)
log_prob = log_prob.sum(1)
return action, log_prob
log_prob -= torch.log(1 - torch.tanh(raw).pow(2) + 1e-6)
log_prob /= self._tanh_scale
return log_prob.sum(1)

def _squash(self, x):
return torch.tanh(x) * self._tanh_scale + self._tanh_mean
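For context on the reworked _log_prob, a hedged sketch of the standard tanh change-of-variables correction from Appendix C of the Soft Actor-Critic paper; the extra division by _tanh_scale above is this commit's own scale handling and is not reproduced here:

import torch
from torch.distributions.normal import Normal

def squashed_log_prob(normal, raw, eps=1e-6):
    # log-density of the pre-squash sample under the Gaussian...
    log_prob = normal.log_prob(raw)
    # ...minus the log-derivative of tanh, per the change of variables
    log_prob -= torch.log(1 - torch.tanh(raw).pow(2) + eps)
    return log_prob.sum(1)

# example with a two-dimensional action sampled from a unit Gaussian
normal = Normal(torch.zeros(1, 2), torch.ones(1, 2))
raw = normal.rsample()
print(squashed_log_prob(normal, raw))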
68 changes: 68 additions & 0 deletions all/policies/soft_deterministic_test.py
@@ -0,0 +1,68 @@
import unittest
import torch
import numpy as np
import torch_testing as tt
from gym.spaces import Box
from all import nn
from all.environments import State
from all.policies import SoftDeterministicPolicy

STATE_DIM = 2
ACTION_DIM = 3

class TestSoftDeterministic(unittest.TestCase):
def setUp(self):
torch.manual_seed(2)
self.model = nn.Sequential(
nn.Linear0(STATE_DIM, ACTION_DIM * 2)
)
self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]))
self.policy = SoftDeterministicPolicy(
self.model,
self.optimizer,
self.space
)

def test_output_shape(self):
state = State(torch.randn(1, STATE_DIM))
action, log_prob = self.policy(state)
self.assertEqual(action.shape, (1, ACTION_DIM))
self.assertEqual(log_prob.shape, torch.Size([1]))

state = State(torch.randn(5, STATE_DIM))
action, log_prob = self.policy(state)
self.assertEqual(action.shape, (5, ACTION_DIM))
self.assertEqual(log_prob.shape, torch.Size([5]))

def test_step_one(self):
state = State(torch.randn(1, STATE_DIM))
self.policy(state)
self.policy.step()

def test_converge(self):
state = State(torch.randn(1, STATE_DIM))
target = torch.tensor([0.25, 0.5, -0.5])

for _ in range(0, 200):
action, _ = self.policy(state)
loss = ((target - action) ** 2).mean()
loss.backward()
self.policy.step()

self.assertLess(loss, 0.2)

def test_scaling(self):
self.space = Box(np.array([-10, -5, 100]), np.array([10, -2, 200]))
self.policy = SoftDeterministicPolicy(
self.model,
self.optimizer,
self.space
)
state = State(torch.randn(1, STATE_DIM))
action, log_prob = self.policy(state)
tt.assert_allclose(action, torch.tensor([[-3.09055, -4.752777, 188.98222]]))
tt.assert_allclose(log_prob, torch.tensor([-0.397002]), rtol=1e-4)

if __name__ == '__main__':
unittest.main()
11 changes: 11 additions & 0 deletions all/policies/softmax.py
@@ -5,6 +5,17 @@


class SoftmaxPolicy(Approximation):
'''
A softmax (or Boltzmann) stochastic policy for discrete actions.
Args:
model (torch.nn.Module): A Pytorch module representing the policy network.
The input shape should be the same as the shape of the state (or feature) space,
and the output should be a vector the size of the action set.
optimizer (torch.optim.Optimizer): An optimizer initialized with the
model parameters, e.g. SGD, Adam, RMSprop, etc.
kwargs (optional): Any other arguments accepted by all.approximation.Approximation
'''
def __init__(
self,
model,
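A short sketch of the sampling rule the docstring describes: model scores become a categorical distribution over the discrete actions (illustrative, not the class's code):

import torch
import torch.nn.functional as F

def sample_action(scores):
    # scores: (batch, n_actions) unnormalized outputs from the policy model
    probs = F.softmax(scores, dim=-1)
    return torch.distributions.Categorical(probs=probs).sample()

# e.g. sample_action(torch.tensor([[1.0, 2.0, 0.5]])) usually returns tensor([1])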
7 changes: 6 additions & 1 deletion docs/source/conf.py
@@ -40,8 +40,13 @@
]

# Autosummary settings
autodoc_default_flags = ['members']
autodoc_default_options = {
'members': True,
'undoc-members': True,
'show-inheritance': True
}
autosummary_generate = True
autodoc_inherit_docstrings = True

# Mock requirements to save resources during doc build machine setup
autodoc_mock_imports = [
1 change: 1 addition & 0 deletions docs/source/modules/nn.rst
@@ -5,4 +5,5 @@ all.nn
=================

.. automodule:: all.nn
:ignore-module-all:
:members:
