Merge pull request #248 from cpnota/release/0.7.1
Release/0.7.1
cpnota committed Jun 14, 2021
2 parents 67b27aa + 074d0ca commit 01836e0
Showing 46 changed files with 707 additions and 159 deletions.
1 change: 1 addition & 0 deletions .github/workflows/python-package.yml
@@ -30,6 +30,7 @@ jobs:
pip install torch==1.8.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
make install
AutoROM -v
+python -m atari_py.import_roms $(python -c 'import site; print(site.getsitepackages()[0])')/multi_agent_ale_py/ROM
- name: Lint code
run: |
make lint
2 changes: 1 addition & 1 deletion all/agents/a2c.py
@@ -101,7 +101,7 @@ def _make_buffer(self):
)


-class A2CTestAgent(Agent):
+class A2CTestAgent(Agent, ParallelAgent):
    def __init__(self, features, policy):
        self.features = features
        self.policy = policy
10 changes: 3 additions & 7 deletions all/agents/dqn.py
@@ -81,12 +81,8 @@ def _should_train(self):


class DQNTestAgent(Agent):
-    def __init__(self, q, n_actions, exploration=0.):
-        self.q = q
-        self.n_actions = n_actions
-        self.exploration = 0.001
+    def __init__(self, policy):
+        self.policy = policy

    def act(self, state):
-        if np.random.rand() < self.exploration:
-            return np.random.randint(0, self.n_actions)
-        return torch.argmax(self.q.eval(state)).item()
+        return self.policy.eval(state)
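The rewritten test agent delegates action selection entirely to a policy object exposing an eval(state) method. A minimal sketch of such a policy, assuming a q-network whose eval(state) returns a tensor of action values (the wrapper class and its name are illustrative, not part of this commit):

import torch


class GreedyEvalPolicy:
    """Illustrative policy: act greedily with respect to a q-network at evaluation time."""

    def __init__(self, q):
        self.q = q  # any module whose eval(state) returns a tensor of action values

    def eval(self, state):
        # no exploration at test time: pick the highest-valued action
        return torch.argmax(self.q.eval(state)).item()


# usage sketch: agent = DQNTestAgent(GreedyEvalPolicy(q))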
2 changes: 1 addition & 1 deletion all/agents/sac.py
@@ -98,7 +98,7 @@ def _train(self):

        # adjust temperature
        temperature_grad = (_log_probs + self.entropy_target).mean()
-        self.temperature += self.lr_temperature * temperature_grad.detach()
+        self.temperature = max(0, self.temperature + self.lr_temperature * temperature_grad.detach())

        # additional debugging info
        self.writer.add_loss('entropy', -_log_probs.mean())
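The behavioral change here is the clamp: if a gradient step would push the entropy temperature below zero, it is now pinned at zero instead of going negative. A minimal numeric illustration of the update rule (all values are made up):

lr_temperature = 0.01
temperature = 0.005
temperature_grad = -1.2  # hypothetical detached gradient value

# old rule: temperature += lr_temperature * temperature_grad  ->  -0.007 (negative)
# new rule clamps at zero:
temperature = max(0, temperature + lr_temperature * temperature_grad)
print(temperature)  # 0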
7 changes: 6 additions & 1 deletion all/agents/vqn.py
@@ -50,4 +50,9 @@ def _train(self, reward, next_state):
        self.q.reinforce(loss)


-VQNTestAgent = DQNTestAgent
+class VQNTestAgent(Agent, ParallelAgent):
+    def __init__(self, policy):
+        self.policy = policy
+
+    def act(self, state):
+        return self.policy.eval(state)
5 changes: 2 additions & 3 deletions all/agents/vsarsa.py
@@ -1,7 +1,6 @@
from torch.nn.functional import mse_loss
from ._agent import Agent
from ._parallel_agent import ParallelAgent
-from .dqn import DQNTestAgent
+from .vqn import VQNTestAgent


class VSarsa(ParallelAgent):
@@ -47,4 +46,4 @@ def _train(self, reward, next_state, next_action):
        self.q.reinforce(loss)


-VSarsaTestAgent = DQNTestAgent
+VSarsaTestAgent = VQNTestAgent
5 changes: 3 additions & 2 deletions all/bodies/atari.py
@@ -6,7 +6,8 @@

class DeepmindAtariBody(Body):
    def __init__(self, agent, lazy_frames=False, episodic_lives=True, frame_stack=4, clip_rewards=True):
-        agent = FrameStack(agent, lazy=lazy_frames, size=frame_stack)
+        if frame_stack > 1:
+            agent = FrameStack(agent, lazy=lazy_frames, size=frame_stack)
        if clip_rewards:
            agent = ClipRewards(agent)
        if episodic_lives:
@@ -19,7 +20,7 @@ def process_state(self, state):
        if 'life_lost' not in state:
            return state

-        if len(state) == 1:
+        if len(state.shape) == 0:
            if state['life_lost']:
                return state.update('mask', 0.)
            return state
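The new check distinguishes a single State (shape ()) from a StateArray (shape (num_envs,)) by the shape of the container rather than its length. A rough sketch of the distinction, assuming the State/StateArray behavior implied by the checks in this diff:

import torch
from all.core import State

single = State({'observation': torch.zeros(4)})  # one environment's state
batch = State.array([single, single])            # a StateArray stacking two states

print(len(single.shape))  # expected 0 -> handled by the per-state branch above
print(len(batch.shape))   # expected 1 -> falls through to the StateArray handling below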
5 changes: 2 additions & 3 deletions all/bodies/vision.py
@@ -69,10 +69,9 @@ def update(self, key, value):
        x = {}
        for k in self.keys():
            if not k == key:
-                x[k] = super().__getitem__(k)
+                x[k] = dict.__getitem__(self, k)
        x[key] = value
-        state = LazyState(x, device=self.device)
-        state.to_cache = self.to_cache
+        state = LazyState.from_state(x, x['observation'], self.to_cache)
        return state

    def to(self, device):
7 changes: 6 additions & 1 deletion all/environments/__init__.py
@@ -1,9 +1,12 @@
from ._environment import Environment
-from._multiagent_environment import MultiagentEnvironment
+from ._multiagent_environment import MultiagentEnvironment
+from ._vector_environment import VectorEnvironment
from .gym import GymEnvironment
from .atari import AtariEnvironment
from .multiagent_atari import MultiagentAtariEnv
from .multiagent_pettingzoo import MultiagentPettingZooEnv
+from .duplicate_env import DuplicateEnvironment
+from .vector_env import GymVectorEnvironment
from .pybullet import PybulletEnvironment

__all__ = [
@@ -13,5 +16,7 @@
"AtariEnvironment",
"MultiagentAtariEnv",
"MultiagentPettingZooEnv",
"GymVectorEnvironment",
"DuplicateEnvironment",
"PybulletEnvironment",
]
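With these exports in place, the new vector-environment types are importable from the package root. A small sketch (GymVectorEnvironment's constructor is not shown in this diff, so only the imports are illustrated):

from all.environments import (
    DuplicateEnvironment,
    GymVectorEnvironment,
    VectorEnvironment,
)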
116 changes: 116 additions & 0 deletions all/environments/_vector_environment.py
@@ -0,0 +1,116 @@
from abc import ABC, abstractmethod


class VectorEnvironment(ABC):
    """
    A reinforcement learning vector environment.

    Similar to a regular RL environment, except that many environments are stacked
    together in the observations, rewards, and dones, and step() expects one action
    per sub-environment. Because sub-environments finish at different times, there is
    no need to reset them manually: the vector environment automatically resets each
    sub-environment when it is done.
    """

    @property
    @abstractmethod
    def name(self):
        """
        The name of the environment.
        """

    @abstractmethod
    def reset(self):
        """
        Reset the environment and return a new initial state.
        Returns
        -------
        State
            The initial state for the next episode.
        """

    @abstractmethod
    def step(self, action):
        """
        Apply an action and get the next state.
        Parameters
        ----------
        action : Action
            The action to apply at the current time step.
        Returns
        -------
        all.environments.State
            The State of the environment after the action is applied.
            This State object includes both the done flag and any additional "info"
        float
            The reward achieved by the previous action
        """

    @abstractmethod
    def close(self):
        """
        Clean up any extraneous environment objects.
        """

    @property
    @abstractmethod
    def state_array(self):
        """
        A StateArray of the Environments at the current timestep.
        """

    @property
    @abstractmethod
    def state_space(self):
        """
        The Space representing the range of observable states for each environment.
        Returns
        -------
        Space
            An object of type Space that represents possible states the agent may observe
        """

    @property
    def observation_space(self):
        """
        Alias for Environment.state_space.
        Returns
        -------
        Space
            An object of type Space that represents possible states the agent may observe
        """
        return self.state_space

    @property
    @abstractmethod
    def action_space(self):
        """
        The Space representing the range of possible actions for each environment.
        Returns
        -------
        Space
            An object of type Space that represents possible actions the agent may take
        """

    @property
    @abstractmethod
    def device(self):
        """
        The torch device the environment lives on.
        """

    @property
    @abstractmethod
    def num_envs(self):
        """
        Number of environments in the vector. This is the number of actions step() expects
        as input, and the number of observations, dones, etc. returned by the environment.
        """
5 changes: 3 additions & 2 deletions all/environments/atari.py
@@ -8,6 +8,7 @@
    LifeLostEnv,
)
from all.core import State
+from .duplicate_env import DuplicateEnvironment


class AtariEnvironment(GymEnvironment):
@@ -38,6 +39,6 @@ def reset(self):
        return self._state

    def duplicate(self, n):
-        return [
+        return DuplicateEnvironment([
            AtariEnvironment(self._name, *self._args, **self._kwargs) for _ in range(n)
-        ]
+        ])
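Duplicating an Atari environment therefore now yields a ready-to-use vector environment instead of a bare list, e.g. (the game name is illustrative):

from all.environments import AtariEnvironment

venv = AtariEnvironment('Breakout').duplicate(4)  # a DuplicateEnvironment wrapping 4 copies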
73 changes: 73 additions & 0 deletions all/environments/duplicate_env.py
@@ -0,0 +1,73 @@
import gym
import torch
from all.core import State
from ._vector_environment import VectorEnvironment
import numpy as np


class DuplicateEnvironment(VectorEnvironment):
    '''
    Turns a list of ALL Environment objects into a VectorEnvironment object.

    This wrapper takes the list of States generated by the environments and outputs
    a StateArray object containing all of the environment states. Like all vector
    environments, the sub-environments are automatically reset when done.

    Args:
        envs: A list of ALL environments
        device (optional): the device on which tensors will be stored
    '''

    def __init__(self, envs, device=torch.device('cpu')):
        self._name = envs[0].name
        self._envs = envs
        self._state = None
        self._action = None
        self._reward = None
        self._done = True
        self._info = None
        self._device = device

    @property
    def name(self):
        return self._name

    def reset(self):
        self._state = State.array([sub_env.reset() for sub_env in self._envs])
        return self._state

    def step(self, actions):
        states = []
        actions = actions.cpu().detach().numpy()
        for sub_env, action in zip(self._envs, actions):
            state = sub_env.reset() if sub_env.state.done else sub_env.step(action)
            states.append(state)
        self._state = State.array(states)
        return self._state

    def close(self):
        # close every wrapped sub-environment
        for sub_env in self._envs:
            sub_env.close()

    def seed(self, seed):
        for i, env in enumerate(self._envs):
            env.seed(seed + i)

    @property
    def state_space(self):
        return self._envs[0].observation_space

    @property
    def action_space(self):
        return self._envs[0].action_space

    @property
    def state_array(self):
        return self._state

    @property
    def device(self):
        return self._device

    @property
    def num_envs(self):
        return len(self._envs)
54 changes: 54 additions & 0 deletions all/environments/duplicate_env_test.py
@@ -0,0 +1,54 @@
import unittest
import gym
import torch
from all.environments import DuplicateEnvironment, GymEnvironment


def make_vec_env(num_envs=3):
    env = [GymEnvironment('CartPole-v0') for i in range(num_envs)]
    return env


class DuplicateEnvironmentTest(unittest.TestCase):
    def test_env_name(self):
        env = DuplicateEnvironment(make_vec_env())
        self.assertEqual(env.name, 'CartPole-v0')

    def test_num_envs(self):
        num_envs = 5
        env = DuplicateEnvironment(make_vec_env(num_envs))
        self.assertEqual(env.num_envs, num_envs)
        self.assertEqual((num_envs,), env.reset().shape)

    def test_reset(self):
        num_envs = 5
        env = DuplicateEnvironment(make_vec_env(num_envs))
        state = env.reset()
        self.assertEqual(state.observation.shape, (num_envs, 4))
        self.assertTrue((state.reward == torch.zeros(num_envs, )).all())
        self.assertTrue((state.done == torch.zeros(num_envs, )).all())
        self.assertTrue((state.mask == torch.ones(num_envs, )).all())

    def test_step(self):
        num_envs = 5
        env = DuplicateEnvironment(make_vec_env(num_envs))
        env.reset()
        state = env.step(torch.ones(num_envs, dtype=torch.int32))
        self.assertEqual(state.observation.shape, (num_envs, 4))
        self.assertTrue((state.reward == torch.ones(num_envs, )).all())
        self.assertTrue((state.done == torch.zeros(num_envs, )).all())
        self.assertTrue((state.mask == torch.ones(num_envs, )).all())

    def test_step_until_done(self):
        num_envs = 3
        env = DuplicateEnvironment(make_vec_env(num_envs))
        env.seed(5)
        env.reset()
        for _ in range(100):
            state = env.step(torch.ones(num_envs, dtype=torch.int32))
            if state.done[0]:
                break
        self.assertEqual(state[0].observation.shape, (4,))
        self.assertEqual(state[0].reward, 1.)
        self.assertTrue(state[0].done)
        self.assertEqual(state[0].mask, 0)
