In [1]:
#@title Install necessary dependencies.

!sudo apt-get install -y xvfb
!pip install 'gym==0.10.11'
!pip install imageio
!pip install PILLOW
!pip install 'pyglet==1.3.2'
!pip install pyvirtualdisplay

!pip install dm-acme
!pip install dm-acme[reverb]
!pip install dm-acme[tf]
!pip install dm-acme[envs]
!pip install -q tf-agents

from IPython.display import clear_output
clear_output()

In [2]:
#@title Importer les modules.
#python3

%%capture
import copy
import pyvirtualdisplay
import imageio 
import base64
import IPython


from acme import environment_loop
from acme.tf import networks
from acme.adders import reverb as adders
from acme.agents.tf import actors as actors
from acme.datasets import reverb as datasets
from acme.wrappers import gym_wrapper
from acme import specs
from acme import wrappers
from acme.agents.tf import dqn
from acme.agents import agent
from acme.tf import utils as tf2_utils
from acme.utils import loggers
from tf_agents.networks import q_network
from tf_agents.environments import tf_py_environment
from tf_agents.environments import suite_gym


import gym 
import dm_env
import matplotlib.pyplot as plt
import numpy as np
import reverb
import sonnet as snt
import tensorflow as tf
import trfl
import torch


# Render une video pour l'environnment.
display = pyvirtualdisplay.Display(visible=0, size=(1920, 1080)).start()

In [3]:
#@title Choisir l'environnement
environment_name = 'gym_mountaincar'

def make_environment(domain_name='cartpole', task='balance'):
  env = suite.load(domain_name, task)
  env = wrappers.SinglePrecisionWrapper(env)
  return env

if 'gym_mountaincar' in environment_name:
  environment = gym_wrapper.GymWrapper(gym.make('MountainCarContinuous-v0'))
  environment = wrappers.SinglePrecisionWrapper(environment)
  def render(env):
    return env.environment.render(mode='rgb_array')
else:
  raise ValueError('Unknown environment: {}.'.format(environment_name))

# Show the frame.
frame = render(environment)
plt.imshow(frame)
plt.axis('off')

(-0.5, 599.5, 399.5, -0.5)

In [4]:
environment_spec = specs.make_environment_spec(environment)

print('actions:\n', environment_spec.actions, '\n')
print('observations:\n', environment_spec.observations, '\n')
print('rewards:\n', environment_spec.rewards, '\n')
print('discounts:\n', environment_spec.discounts, '\n')

actions:
 BoundedArray(shape=(1,), dtype=dtype('float32'), name='action', minimum=[-1.], maximum=[1.]) 

observations:
 BoundedArray(shape=(2,), dtype=dtype('float32'), name='observation', minimum=[-1.2  -0.07], maximum=[0.6  0.07]) 

rewards:
 Array(shape=(), dtype=dtype('float32'), name='reward') 

discounts:
 BoundedArray(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0) 



In [5]:
#@title Choisir hyperparamètres
epsilon = 0.5
batch_size=256
prefetch_size=4
priority_exponent=0.6
max_replay_size=1000000

In [6]:
#@title Définir la fonction display_video
def display_video(frames, filename='temp.mp4'):
  """Save and display video."""
  # Write video
  with imageio.get_writer(filename, fps=60) as video:
    for frame in frames:
      video.append_data(frame)
  # Read video and display the video
  video = open(filename, 'rb').read()
  b64_video = base64.b64encode(video)
  video_tag = ('<video  width="320" height="240" controls alt="test" '
               'src="data:video/mp4;base64,{0}">').format(b64_video.decode())
  return IPython.display.HTML(video_tag)

In [7]:
timestep = environment.reset()

In [8]:
#@title Définition du Q-Network
train_py_env = suite_gym.load('MountainCarContinuous-v0')
eval_py_env = suite_gym.load('MountainCarContinuous-v0')

train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

fc_layer_params = (100,)

network = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params)

In [9]:
replay_table = reverb.Table(name=adders.DEFAULT_PRIORITY_TABLE,
        sampler=reverb.selectors.Prioritized(priority_exponent),
        remover=reverb.selectors.Fifo(),
        max_size=max_replay_size,
        rate_limiter=reverb.rate_limiters.MinSize(1),
        signature=adders.NStepTransitionAdder.signature(environment_spec))
replay_server = reverb.Server([replay_table], port=None)
replay_server_address = 'localhost:%d' % replay_server.port

In [10]:
#@title Adder
# Create a 5-step transition adder where in between those steps a discount of
# 0.99 is used (which should be the same discount used for learning).
adder = adders.NStepTransitionAdder(
    client=reverb.Client(replay_server_address),
    n_step=5,
    discount=0.99)

In [11]:
#@title Dataset
replay_client = reverb.TFClient(replay_server_address)
dataset = datasets.make_reverb_dataset(
        server_address=replay_server_address,
        batch_size=batch_size,
        prefetch_size=prefetch_size)



In [13]:
#@title Policy Network
num_dimensions = np.prod(environment_spec.actions.shape, dtype=int)
policy_network = snt.Sequential([
    networks.LayerNormMLP((256, 256, 256), activate_final=True),
    networks.NearZeroInitializedLinear(num_dimensions),
    networks.TanhToSpec(environment_spec.actions),
  ])
#policy_network = snt.Sequential([
 #         network,
  #        lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
   #   ])

In [14]:
#@title Target network.
target_network = copy.deepcopy(network)

In [15]:
# Ensure that we create the variables before proceeding (maybe not needed).
tf2_utils.create_variables(network=network, input_spec=[environment_spec.observations])
tf2_utils.create_variables(network=target_network,input_spec= [environment_spec.observations])

(TensorSpec(shape=(3,), dtype=tf.float32, name=None), ())

In [16]:
#@title Actor
actor = actors.FeedForwardActor(policy_network=policy_network, adder=adder,)

In [24]:
from typing import Dict, List
@tf.function
def my_step(self) -> Dict[str, tf.Tensor]:
  """Do a step of SGD and update the priorities."""

  # Pull out the data needed for updates/priorities.
  inputs = next(self._iterator)
  o_tm1, a_tm1, r_t, d_t, o_t = inputs.data
  keys, probs = inputs.info[:2]

  #new instruction
  print(inputs.data)
  #print(o_tm1[0, 0])

  with tf.GradientTape() as tape:
    # Evaluate our networks.
    q_tm1 = self._network(o_tm1)
    q_t_value = self._target_network(o_t)
    q_t_selector = self._network(o_t)

    print(q_tm1)
    #print(q_tm1[0])
    q_tm1=q_tm1[0]

    print(r_t.dtype)
    #print(q_tm1.dtype)

    # The rewards and discounts have to have the same type as network values.
    r_t = tf.cast(r_t, q_tm1.dtype.as_numpy_dtype)
    r_t = tf.clip_by_value(r_t, -1., 1.)
    d_t = tf.cast(d_t, q_tm1.dtype) * tf.cast(self._discount, q_tm1.dtype)


    

    # Compute the loss.
    _, extra = trfl.double_qlearning(q_tm1, a_tm1, r_t, d_t, q_t_value,
                                       q_t_selector)
    loss = losses.huber(extra.td_error, self._huber_loss_parameter)

    # Get the importance weights.
    importance_weights = 1. / probs  # [B]
    importance_weights **= self._importance_sampling_exponent
    importance_weights /= tf.reduce_max(importance_weights)

    # Reweight.
    loss *= tf.cast(importance_weights, loss.dtype)  # [B]
    loss = tf.reduce_mean(loss, axis=[0])  # []

  # Do a step of SGD.
  gradients = tape.gradient(loss, self._network.trainable_variables)
  self._optimizer.apply(gradients, self._network.trainable_variables)

  # Update the priorities in the replay buffer.
  if self._replay_client:
    priorities = tf.cast(tf.abs(extra.td_error), tf.float64)
    self._replay_client.update_priorities(
          table=adders.DEFAULT_PRIORITY_TABLE, keys=keys, priorities=priorities)

  # Periodically update the target network.
  if tf.math.mod(self._num_steps, self._target_update_period) == 0:
    for src, dest in zip(self._network.variables,
                           self._target_network.variables):
        dest.assign(src)
  self._num_steps.assign_add(1)

  # Report loss & statistics for logging.
  fetches = {
        'loss': loss,
    }

  return fetches

dqn.DQNLearner._step = my_step

In [18]:
learner=dqn.DQNLearner(network=network, target_network=target_network,discount=0.99, importance_sampling_exponent=1e-3,learning_rate=0.2,target_update_period=100, dataset=dataset)

In [19]:
dqn_agent = agent.Agent(actor=actor,
                         learner=learner,
                         min_observations=1000,
                         observations_per_step=8.)

In [25]:
# This may be necessary if any of the episodes were cancelled above.
adder.reset()

# We also want to make sure the logger doesn't write to disk because that can
# cause issues in colab on occasion.
logger = loggers.TerminalLogger(time_delta=10.)
agent_logger = loggers.TerminalLogger(label='agent', time_delta=10.)
env_loop_logger = loggers.TerminalLogger(label='env_loop', time_delta=10.)

In [21]:
[method_or_attr for method_or_attr in dir(learner)  # pylint: disable=expression-not-assigned
 if not method_or_attr.startswith('_')]

['get_variables', 'run', 'state', 'step']

In [22]:
env_loop = environment_loop.EnvironmentLoop(environment=environment,actor=dqn_agent,logger=env_loop_logger)

In [26]:
env_loop.run(50)

(<tf.Tensor 'IteratorGetNext:4' shape=(256, 2) dtype=float32>, <tf.Tensor 'IteratorGetNext:5' shape=(256, 1) dtype=float32>, <tf.Tensor 'IteratorGetNext:6' shape=(256,) dtype=float32>, <tf.Tensor 'IteratorGetNext:7' shape=(256,) dtype=float32>, <tf.Tensor 'IteratorGetNext:8' shape=(256, 2) dtype=float32>)
(<tf.Tensor 'QNetwork/dense_1/BiasAdd:0' shape=(256, 3) dtype=float32>, ())
<dtype: 'float32'>


AttributeError: ignored