# Acme: Quickstart

In [1]:
import copy
import IPython


from acme import environment_loop
from acme.tf import networks
from acme.adders import reverb as adders
from acme.agents.tf import actors as actors
from acme.datasets import reverb as datasets
from acme.wrappers import gym_wrapper
from acme import specs
from acme import wrappers
from acme.agents.tf import d4pg
from acme.agents import agent
from acme.tf import utils as tf2_utils
from acme.utils import loggers

import gym
import dm_env
import matplotlib.pyplot as plt
import numpy as np
import reverb
import sonnet as snt
import tensorflow as tf

# Import dm_control if it exists.
try:
    from dm_control import suite
except (OSError, ModuleNotFoundError):
    pass


## Load an environment

We can now load an environment. In what follows we'll create an environment and grab the environment's specifications.

In [2]:
# Create the 'MountainCarContinuous-v0' gym task and wrap it twice: first in
# GymWrapper, then in SinglePrecisionWrapper. The outermost wrapper is what the
# rest of the notebook uses as `environment`.
environment = wrappers.SinglePrecisionWrapper(
    gym_wrapper.GymWrapper(gym.make('MountainCarContinuous-v0')))

def render(env):
    """Render the environment wrapped by `env` in 'rgb_array' mode.

    Args:
        env: An object exposing the environment it wraps as `.environment`;
            that inner object must provide a `render(mode=...)` method.

    Returns:
        Whatever the inner `render(mode='rgb_array')` call returns
        (presumably an RGB image array -- confirm for the gym version in use).
    """
    inner_env = env.environment
    return inner_env.render(mode='rgb_array')

# Grab the spec of the environment. Later cells read `environment_spec.actions`
# (to size and bound the policy output) and pass the whole spec to the agent.
environment_spec = specs.make_environment_spec(environment)

## Create a D4PG agent

In [3]:
#@title Build agent networks

# Get total number of action dimensions from action spec.
# (Product of the entries of the action shape, computed with an integer dtype;
# it is used below as the output size of the policy's linear layer.)
num_dimensions = np.prod(environment_spec.actions.shape, dtype=int)

# Create the shared observation network; here simply a state-less operation.
# The function object itself is stored (it is not called here) and is handed to
# the agent as `observation_network` in the next cell.
observation_network = tf2_utils.batch_concat

# Create the deterministic policy network.
# Layers are applied in order: a LayerNormMLP with hidden sizes (256, 256, 256),
# a linear layer with `num_dimensions` outputs, then TanhToSpec built from the
# action spec (presumably squashing outputs into the action bounds -- confirm
# in acme.tf.networks).
policy_network = snt.Sequential([
    networks.LayerNormMLP((256, 256, 256), activate_final=True),
    networks.NearZeroInitializedLinear(num_dimensions),
    networks.TanhToSpec(environment_spec.actions),
])

# Create the distributional critic network.
# The head is configured with 51 atoms between vmin=-150 and vmax=150.
# NOTE(review): this value range is hard-coded; confirm it brackets the returns
# expected for the chosen task before reusing this cell with another environment.
critic_network = snt.Sequential([
    # The multiplexer concatenates the observations/actions.
    networks.CriticMultiplexer(),
    networks.LayerNormMLP((512, 512, 256), activate_final=True),
    networks.DiscreteValuedHead(vmin=-150., vmax=150., num_atoms=51),
])


In [4]:
# Create a logger for agent specific diagnostics.
# Output lines are labelled 'agent'; time_delta=10 presumably limits logging to
# one line every 10 seconds -- confirm in acme.utils.loggers.
agent_logger = loggers.TerminalLogger(label='agent', time_delta=10)

# Create the D4PG agent.
# NOTE(review): this assignment rebinds `agent`, which the import cell bound to
# the module from `from acme.agents import agent`. After this cell runs, that
# module is no longer reachable under the name `agent`.
agent = d4pg.D4PG(
    environment_spec=environment_spec,
    policy_network=policy_network,
    critic_network=critic_network,
    observation_network=observation_network,
    logger=agent_logger,
    # Passed explicitly; presumably disables checkpointing -- confirm in d4pg.D4PG.
    checkpoint=False
)

## Run a training loop

In [15]:
# Create a logger for environment-loop diagnostics. It is labelled 'env_loop'
# and is separate from the agent's own logger created earlier.
env_loop_logger = loggers.TerminalLogger(label='env_loop', time_delta=10)

# Tie the environment and the agent together in a loop and run it for 100 episodes.
env_loop = environment_loop.EnvironmentLoop(environment, agent, logger=env_loop_logger)
env_loop.run(num_episodes=100)

[Agent] Critic Loss = 0.417 | Policy Loss = 0.000 | Steps = 12988 | Walltime = 274.950
[Agent] Critic Loss = 0.406 | Policy Loss = 0.000 | Steps = 13961 | Walltime = 284.958
[Env Loop] Episode Length = 999 | Episode Return = -11.474 | Episodes = 8 | Steps = 7992 | Steps Per Second = 826.953
[Agent] Critic Loss = 0.395 | Policy Loss = 0.000 | Steps = 14953 | Walltime = 294.958
[Env Loop] Episode Length = 999 | Episode Return = -9.487 | Episodes = 16 | Steps = 15984 | Steps Per Second = 829.099
[Agent] Critic Loss = 0.403 | Policy Loss = 0.000 | Steps = 15942 | Walltime = 304.967
[Env Loop] Episode Length = 999 | Episode Return = -9.438 | Episodes = 24 | Steps = 23976 | Steps Per Second = 827.074
[Agent] Critic Loss = 0.404 | Policy Loss = 0.000 | Steps = 16934 | Walltime = 314.967
[Env Loop] Episode Length = 999 | Episode Return = -8.877 | Episodes = 32 | Steps = 31968 | Steps Per Second = 830.415
[Agent] Critic Loss = 0.408 | Policy Loss = 0.000 | Steps = 17927 | Walltime = 324.974
[En

## (Optional) Visualize an evaluation loop


In [11]:
# Install and import the necessary dependencies for visualization

# !sudo apt-get install -y xvfb 

import pyvirtualdisplay
import imageio
import base64

# Set up a virtual display for rendering OpenAI gym environments.
# The Display is created with visible=0 and size=(1400, 900); the value returned
# by .start() is kept in `display`.
# NOTE(review): `display` is also the name of IPython's rich-display helper, so
# this assignment hides that helper in the notebook namespace -- consider a more
# specific name such as `virtual_display` if nothing else depends on this one.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()


In [19]:
def display_video(frames, filename='temp.mp4', fps=60):
    """Encode `frames` into a video file and return it as an inline HTML player.

    Args:
        frames: Iterable of frames; each one is passed, in order, to the
            imageio writer's `append_data`.
        filename: Path of the video file to write and then read back.
        fps: Frame rate handed to `imageio.get_writer`. Defaults to 60, the
            value that was previously hard-coded.

    Returns:
        An `IPython.display.HTML` object containing a `<video>` tag whose
        source is the written file embedded as a base64 data URI.
    """
    # Write video. Leaving the `with` block closes the writer before the file
    # is read back below.
    with imageio.get_writer(filename, fps=fps) as video:
        for frame in frames:
            video.append_data(frame)
    # Read the encoded bytes back. The context manager closes the file handle
    # deterministically instead of leaving it to the garbage collector.
    with open(filename, 'rb') as video_file:
        video_bytes = video_file.read()
    b64_video = base64.b64encode(video_bytes)
    video_tag = ('<video  width="320" height="240" controls alt="test" '
                 'src="data:video/mp4;base64,{0}">').format(b64_video.decode())
    return IPython.display.HTML(video_tag)

In [18]:
# Run the actor in the environment for desired number of steps.
frames = []
num_steps = 100
timestep = environment.reset()

for _ in range(num_steps):
    # The frame is captured before acting, so frame i shows the state from
    # which action i is selected; the state after the final step is not rendered.
    frames.append(render(environment))
    action = agent.select_action(timestep.observation)
    # NOTE(review): the loop never checks whether `timestep` ends an episode;
    # confirm that calling step() after a terminal timestep behaves as intended.
    timestep = environment.step(action)

# Save video of the behaviour.
# As the cell's last expression, the object returned by display_video becomes
# the cell output.
display_video(np.array(frames))

