In [None]:
# System packages: xvfb and ffmpeg.
!sudo apt-get install -y xvfb ffmpeg
# Python packages; gym and pyglet are pinned to exact versions.
!pip install 'gym==0.10.11'
!pip install imageio
!pip install PILLOW
!pip install 'pyglet==1.3.2'
!pip install pyvirtualdisplay

# Acme itself, followed by its reverb, tf and envs extras.
!pip install dm-acme
!pip install dm-acme[reverb]
!pip install dm-acme[tf]
!pip install dm-acme[envs]

# Clear the output accumulated by the install commands above.
from IPython.display import clear_output
clear_output()

In [None]:
# tf-agents provides the `tf_agents.networks.q_network` import used by the
# import cell below; -q keeps pip's output quiet.
!pip install -q tf-agents

In [None]:
#@title Import modules.
#python3

%%capture
import copy
import pyvirtualdisplay
import imageio 
import base64
import IPython


from acme import environment_loop
from acme.tf import networks
from acme.adders import reverb as adders
from acme.agents.tf import actors as actors
from acme.datasets import reverb as datasets
from acme.wrappers import gym_wrapper
from acme import specs
from acme import wrappers
from acme.agents.tf import ddpg
from acme.agents.tf import d4pg

from acme.agents import agent
from acme.tf import utils as tf2_utils
from acme.utils import loggers
from tf_agents.networks import q_network

import gym 
import dm_env
import matplotlib.pyplot as plt
import numpy as np
import reverb
import sonnet as snt
import tensorflow as tf


# Import dm_control if it exists.
# The import is best-effort: when it fails, the error is swallowed and `suite`
# stays undefined, so only the gym-based environment can be built afterwards.
try:
  from dm_control import suite
except (ModuleNotFoundError, OSError):
  pass

# Set up a virtual display for rendering OpenAI gym environments.
# The started display object is kept in `display`.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()

In [None]:
# Selects which environment is built; the if/elif chain further down in this
# cell branches on this value.
environment_name = 'gym_mountaincar'  # @param ['dm_cartpole', 'gym_mountaincar']
# task_name = 'balance'  # @param ['swingup', 'balance']

def make_environment(domain_name='cartpole', task='balance'):
  """Loads a dm_control suite task wrapped in `SinglePrecisionWrapper`.

  Args:
    domain_name: Name of the dm_control suite domain, passed to `suite.load`.
    task: Name of the task within that domain, passed to `suite.load`.

  Returns:
    The loaded environment wrapped in `wrappers.SinglePrecisionWrapper`.

  Raises:
    ImportError: If `dm_control.suite` could not be imported in this notebook.
  """
  # The dm_control import in the import cell is optional, so `suite` may be
  # missing here; fail with an actionable message instead of a bare NameError.
  if globals().get('suite') is None:
    raise ImportError(
        'dm_control is not available, so dm_control environments cannot be '
        'created; install dm_control or select a gym environment instead.')
  env = suite.load(domain_name, task)
  return wrappers.SinglePrecisionWrapper(env)

if 'dm_cartpole' in environment_name:
  environment = make_environment('cartpole')

  def render(env):
    """Returns a frame rendered from camera 0 of the environment's physics."""
    return env._physics.render(camera_id=0)  #pylint: disable=protected-access

elif 'gym_mountaincar' in environment_name:
  # Wrap the gym environment for Acme, then apply the single-precision wrapper.
  environment = wrappers.SinglePrecisionWrapper(
      gym_wrapper.GymWrapper(gym.make('MountainCarContinuous-v0')))

  def render(env):
    """Returns the wrapped environment's render in 'rgb_array' mode."""
    return env.environment.render(mode='rgb_array')

else:
  raise ValueError('Unknown environment: {}.'.format(environment_name))

# Render one frame of the selected environment and display it without axes.
# `plt.axis('off')` is the cell's last expression, so its return value is
# echoed as the cell output.
frame = render(environment)
plt.imshow(frame)
plt.axis('off')

(-0.5, 599.5, 399.5, -0.5)

In [None]:
# Derive the environment spec; later cells use it for the replay table
# signature and for sizing the networks.
environment_spec = specs.make_environment_spec(environment)

# Print each component of the spec for inspection.
print('actions:\n', environment_spec.actions, '\n')
print('observations:\n', environment_spec.observations, '\n')
print('rewards:\n', environment_spec.rewards, '\n')
print('discounts:\n', environment_spec.discounts, '\n')

actions:
 BoundedArray(shape=(1,), dtype=dtype('float32'), name='action', minimum=[-1.], maximum=[1.]) 

observations:
 BoundedArray(shape=(2,), dtype=dtype('float32'), name='observation', minimum=[-1.2  -0.07], maximum=[0.6  0.07]) 

rewards:
 Array(shape=(), dtype=dtype('float32'), name='reward') 

discounts:
 BoundedArray(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0) 



In [None]:
#@title Choose the hyperparameters
max_replay_size = 1000000    # `max_size` of the replay table.
n_step = 5                   # `n_step` of the NStepTransitionAdder.
discount = 0.99              # Discount passed to both the adder and the learner.
batch_size = 256             # Batch size of the replay dataset.
prefetch_size = 4            # Prefetch size of the replay dataset.
sigma = 0.3                  # Argument of the ClippedGaussian exploration noise.
target_update_period = 100   # `target_update_period` of the learner.

In [None]:
# Table name shared by the replay table, the adder and the dataset.
replay_table_name = adders.DEFAULT_PRIORITY_TABLE

# Replay table: uniform sampling, FIFO removal, capped at `max_replay_size`,
# with a MinSize(1) rate limiter and the n-step transition signature.
replay_table = reverb.Table(
    name=replay_table_name,
    sampler=reverb.selectors.Uniform(),
    remover=reverb.selectors.Fifo(),
    max_size=max_replay_size,
    rate_limiter=reverb.rate_limiters.MinSize(1),
    signature=adders.NStepTransitionAdder.signature(environment_spec),
)

# The server is created with port=None (presumably letting reverb choose one);
# the address is built from the port the server reports.
replay_server = reverb.Server([replay_table], port=None)
replay_server_address = 'localhost:%d' % replay_server.port

In [None]:
# Adder that writes n-step transitions to the replay table through a client
# connected to the local server; every item gets the constant priority 1.
adder = adders.NStepTransitionAdder(
    client=reverb.Client(replay_server_address),
    n_step=n_step,
    discount=discount,
    priority_fns={replay_table_name: lambda _: 1.},
)

In [None]:
 # Dataset over the replay table on the local reverb server, configured with
 # the batch and prefetch sizes chosen in the hyperparameter cell.
 dataset = datasets.make_reverb_dataset(
        table=replay_table_name,
        server_address=replay_server_address,
        batch_size=batch_size,
        prefetch_size=prefetch_size)



In [None]:
# The observation network is `tf2_utils.batch_concat`, wrapped as a Sonnet
# module via `to_sonnet_module`.
observation_network = tf2_utils.to_sonnet_module(tf2_utils.batch_concat)

In [None]:
# Observation and action specs taken from the environment spec.
obs_spec = environment_spec.observations
act_spec = environment_spec.actions

# Create the observation network's variables from the observation spec; the
# value returned by `create_variables` is kept as the embedding spec.
emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])

In [None]:
# The final policy layer has one unit per action dimension.
action_spec = environment_spec.actions
action_size = np.prod(action_spec.shape, dtype=int)

# Policy pipeline: batch_concat -> LayerNormMLP -> TanhToSpec on the action spec.
policy_modules = [
    tf2_utils.batch_concat,
    networks.LayerNormMLP(layer_sizes=(300, 200, action_size)),
    networks.TanhToSpec(spec=action_spec),
]
policy_network = snt.Sequential(policy_modules)

# Critic for the DDPG learner used in this notebook. DDPGLearner consumes a
# plain Q-value tensor with a trailing dimension of size 1, so the critic ends
# in a single linear unit. A distributional head such as
# `networks.DiscreteValuedHead` is meant for the D4PG learner and makes DDPG
# training fail with a TypeError once the learner takes its first step.
critic_network = snt.Sequential([
    # The multiplexer concatenates observations and actions before the MLP.
    networks.CriticMultiplexer(
        observation_network=tf2_utils.batch_concat,
        action_network=tf.identity,
        critic_network=networks.LayerNormMLP(
            layer_sizes=(400, 300),
            activate_final=True)),
    # Scalar state-action value head.
    snt.Linear(1)])

In [None]:
# Target networks are independent deep copies of the online networks.
target_policy_network, target_critic_network, target_observation_network = map(
    copy.deepcopy, (policy_network, critic_network, observation_network))

In [None]:
# Behaviour policy used for acting: observation network -> policy ->
# ClippedGaussian(sigma) noise -> clipping to the action spec.
behavior_network = snt.Sequential([
    observation_network,
    policy_network,
    networks.ClippedGaussian(sigma),
    networks.ClipToSpec(act_spec),
])

In [None]:
# Create the variables of the online and target networks from their input
# specs: the policies take the embedding spec, the critics take the embedding
# and action specs, and the target observation network takes the raw
# observation spec. The last call's return value is echoed as the cell output.
tf2_utils.create_variables(policy_network, [emb_spec])
tf2_utils.create_variables(critic_network, [emb_spec, act_spec])
tf2_utils.create_variables(target_policy_network, [emb_spec])
tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
tf2_utils.create_variables(target_observation_network, [obs_spec])

TensorSpec(shape=(2,), dtype=tf.float32, name=None)

In [None]:
 # Actor that selects actions with the behaviour network and is given the
 # replay adder.
 actor = actors.FeedForwardActor(behavior_network, adder=adder)

In [None]:
 # Adam optimizers (learning rate 1e-4) for the policy and the critic.
 # NOTE(review): the first statement carries a stray leading space that the
 # second one lacks; confirm and normalise the cell's indentation.
 policy_optimizer = snt.optimizers.Adam(learning_rate=1e-4)
critic_optimizer = snt.optimizers.Adam(learning_rate=1e-4)

In [None]:
# DDPG learner wired with every component built in the cells above. The target
# observation network and the two optimizers are passed explicitly; otherwise
# the learner would fall back to its own defaults and the objects created
# earlier in the notebook would go unused.
learner = ddpg.DDPGLearner(
    policy_network=policy_network,
    critic_network=critic_network,
    target_policy_network=target_policy_network,
    target_critic_network=target_critic_network,
    discount=discount,
    target_update_period=target_update_period,
    dataset=dataset,
    observation_network=observation_network,
    target_observation_network=target_observation_network,
    policy_optimizer=policy_optimizer,
    critic_optimizer=critic_optimizer)

In [None]:
# Combine the actor and the learner into a single agent, configured with
# min_observations=1000 and observations_per_step=8.
ddpg_agent = agent.Agent(
    actor=actor,
    learner=learner,
    min_observations=1000,
    observations_per_step=8.,
)

In [None]:
# Log to the terminal only: writing logs to disk can occasionally cause
# issues in colab.
logger = loggers.TerminalLogger(time_delta=10.)

# Reset the adder in case an earlier episode was cancelled part-way through.
adder.reset()

In [None]:
# Run the agent in the environment for 50 episodes, reporting through `logger`.
loop = environment_loop.EnvironmentLoop(
    environment,
    ddpg_agent,
    logger=logger,
)
loop.run(num_episodes=50)

TypeError: ignored