## Installation

In [2]:
#@title Install necessary dependencies.

# System packages installed with apt (presumably for the virtual display and
# the video export used later in the notebook).
!sudo apt-get install -y xvfb ffmpeg
# Python packages: gym and pyglet are pinned to exact versions, the others are
# installed at whatever version pip resolves at run time.
!pip install 'gym==0.10.11'
!pip install imageio
!pip install PILLOW
!pip install 'pyglet==1.3.2'
!pip install pyvirtualdisplay

# Acme itself, followed by its `reverb`, `tf` and `envs` extras.
# NOTE(review): the extras below are unquoted, so a shell with strict globbing
# may try to expand the brackets; quoting them like the pinned packages above
# would be safer.
!pip install dm-acme
!pip install dm-acme[reverb]
!pip install dm-acme[tf]
!pip install dm-acme[envs]

# Clear the cell output once all install commands have run.
from IPython.display import clear_output
clear_output()

In [3]:
#@title Import modules.
#python3

%%capture
import copy
import pyvirtualdisplay
import imageio 
import base64
import IPython


from acme import environment_loop
from acme.tf import networks
from acme.adders import reverb as adders
from acme.agents.tf import actors as actors
from acme.datasets import reverb as datasets
from acme.wrappers import gym_wrapper
from acme import specs
from acme import wrappers
from acme.agents.tf import d4pg
from acme.agents import agent
from acme.tf import utils as tf2_utils
from acme.utils import loggers

import gym 
import dm_env
import matplotlib.pyplot as plt
import numpy as np
import reverb
import sonnet as snt
import tensorflow as tf

# Import dm_control if it exists.
# The import is best-effort: a missing module or an OSError raised while loading
# it is ignored, in which case the name `suite` is simply left undefined.
try:
  from dm_control import suite
except (ModuleNotFoundError, OSError):
  pass

# Set up a virtual display for rendering OpenAI gym environments.
# NOTE(review): this binds the global name `display`; in a notebook that name
# usually refers to IPython's display helper — confirm nothing later needs it.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()

## Environment

In [4]:
# Build the gym 'MountainCarContinuous-v0' task and wrap it with Acme's
# GymWrapper, then apply the SinglePrecisionWrapper on top of that.
environment = gym_wrapper.GymWrapper(gym.make('MountainCarContinuous-v0'))
environment = wrappers.SinglePrecisionWrapper(environment)
def render(env):
  """Render the environment wrapped by `env` in 'rgb_array' mode.

  Args:
    env: a wrapper that exposes the wrapped environment as `.environment`.

  Returns:
    Whatever the wrapped environment's `render(mode='rgb_array')` returns.
  """
  wrapped_env = env.environment
  rendered = wrapped_env.render(mode='rgb_array')
  return rendered

# Show the frame.
frame = render(environment)
plt.imshow(frame)
# Keep the axes visible; as the last expression of the cell, the value returned
# by `plt.axis` is what the cell displays as its output.
plt.axis('on')

(-0.5, 599.5, 399.5, -0.5)

### Environment specs

In [5]:
# Derive the environment spec from the wrapped environment.
environment_spec = specs.make_environment_spec(environment)

# Print each component of the spec under its own label, followed by a blank line.
print('actions:\n', environment_spec.actions, '\n')
print('observations:\n', environment_spec.observations, '\n')
print('rewards:\n', environment_spec.rewards, '\n')
print('discounts:\n', environment_spec.discounts, '\n')

actions:
 BoundedArray(shape=(1,), dtype=dtype('float32'), name='action', minimum=[-1.], maximum=[1.]) 

observations:
 BoundedArray(shape=(2,), dtype=dtype('float32'), name='observation', minimum=[-1.2  -0.07], maximum=[0.6  0.07]) 

rewards:
 Array(shape=(), dtype=dtype('float32'), name='reward') 

discounts:
 BoundedArray(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0) 



# Agent choice

In [6]:
import numpy as np
# 20 evenly spaced values running from the action spec's minimum to its maximum.
# NOTE(review): `actions_discrete` is not referenced again in the visible part
# of the notebook.
actions_discrete = np.linspace(environment_spec.actions.minimum, environment_spec.actions.maximum, 20)
# Bare expression that is not the last line of the cell, so it is not displayed.
actions_discrete.shape
# Last expression of the cell: the observation spec is what gets displayed.
environment_spec.observations


BoundedArray(shape=(2,), dtype=dtype('float32'), name='observation', minimum=[-1.2  -0.07], maximum=[0.6  0.07])

In [7]:
#@title Build agent networks


# Get total number of action dimensions from action spec.
num_dimensions = np.prod(environment_spec.actions.shape, dtype=int)

# Create the shared observation network; here simply a state-less operation.
observation_network = tf2_utils.batch_concat

# Create the deterministic policy network.
# A LayerNormMLP with layer sizes (256, 256, 256) feeds a linear layer with one
# output per action dimension; TanhToSpec receives the action spec (presumably
# to squash the output into the spec's bounds — confirm against the Acme docs).
policy_network = snt.Sequential([
    networks.LayerNormMLP((256, 256, 256), activate_final=True),
    networks.NearZeroInitializedLinear(num_dimensions),
    networks.TanhToSpec(environment_spec.actions),
])

# Create the distributional critic network.
critic_network = snt.Sequential([
    # The multiplexer concatenates the observations/actions.
    networks.CriticMultiplexer(),
    networks.LayerNormMLP((512, 512, 256), activate_final=True),
    # Discrete-valued head configured with 51 atoms, vmin=-150 and vmax=150.
    networks.DiscreteValuedHead(vmin=-150., vmax=150., num_atoms=51),
])


In [8]:
# Create a logger for the agent and environment loop.
# Both are terminal loggers built with time_delta=10. (presumably the minimum
# number of seconds between two printed lines — confirm against the Acme docs).
agent_logger = loggers.TerminalLogger(label='agent', time_delta=10.)
env_loop_logger = loggers.TerminalLogger(label='env_loop', time_delta=10.)

from typing import Optional, Tuple

from acme import adders
from acme import core
from acme import types
# Internal imports.
from acme.tf import utils as tf2_utils
from acme.tf import variable_utils as tf2_variable_utils

import dm_env
import sonnet as snt
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions


def my_function(self, observation: types.NestedArray) -> types.NestedArray:
    """Select an action for `observation` using the policy stored on `self`.

    Written in the shape of an actor's `select_action` method: `self` is the
    actor instance and must expose a callable `_policy` attribute.

    Args:
      self: object whose `_policy` callable maps an observation to an action.
      observation: the observation to act on.

    Returns:
      The policy output after conversion by `tf2_utils.to_numpy_squeeze`.
    """
    # Pass the observation through the policy network. `_policy` is looked up
    # on the instance, so `self` must not be passed to it a second time.
    action = self._policy(observation)
    # Disabled experiment: snap the action onto a fixed grid of 10 values.
    # x = np.linspace(-1.0, 1.0, 10)
    # proximity = 10
    # index = 0
    # for i in range(len(x)):
    #   if np.abs(action - x[i]) < proximity:
    #     proximity = np.abs(action - x[i])
    #     index = i
    # action = x[index]
    # Return a numpy array with squeezed out batch dimension.
    return tf2_utils.to_numpy_squeeze(action)

# Create the D4PG agent.
# NOTE: this rebinds the name `agent`, which the import cell had bound to the
# `acme.agents.agent` module; from here on `agent` is the D4PG instance.
# sigma=1.0 and checkpoint=False are passed straight to the constructor
# (presumably the exploration-noise scale and a switch that disables
# checkpointing — confirm against the Acme D4PG documentation).
agent = d4pg.D4PG(
    environment_spec=environment_spec,
    policy_network=policy_network,
    critic_network=critic_network,
    observation_network=observation_network,
    sigma=1.0,
    logger=agent_logger,
    checkpoint=False
)

# Override the agent's action-selection method.

def actionDecorator(og_function, levels=(-1.0, -0.5, 0.0, 0.5, 1.0)):
  """Wrap an action-selection function so that its output is discretised.

  Every component of the action returned by `og_function` is replaced by the
  closest value in `levels`. The previous implementation located the closest
  level but then returned `abs(round(action)) + level`, which is not one of
  the levels (e.g. 0.9 became 2.0 and -0.9 became 0.0).

  Args:
    og_function: callable returning an array-like action.
    levels: the discrete action values to snap to; must not be empty.

  Returns:
    A callable with the same arguments as `og_function` that returns a numpy
    array of the same shape as the original action, holding values taken from
    `levels`. Floating-point dtypes are kept; other dtypes become float64.

  Raises:
    ValueError: if `levels` is empty.
  """
  import functools  # Local import keeps this block self-contained.

  if len(levels) == 0:
    raise ValueError('levels must contain at least one value')

  @functools.wraps(og_function)
  def nestedFunction(*args, **kwargs):
    results = np.asarray(og_function(*args, **kwargs))
    # Keep the action's floating dtype; fall back to float64 otherwise so that
    # fractional levels are not truncated.
    if np.issubdtype(results.dtype, np.floating):
      grid = np.asarray(levels, dtype=results.dtype)
    else:
      grid = np.asarray(levels, dtype=np.float64)
    # Distance from every action component to every level. argmin returns the
    # first minimum, the same tie-break as a strict `<` scan over the levels.
    nearest = np.abs(results[..., np.newaxis] - grid).argmin(axis=-1)
    return grid[nearest]
  return nestedFunction

# Replace `select_action` on this agent instance with the decorated version, so
# every later call to `agent.select_action` goes through the wrapper.
agent.select_action = actionDecorator(agent.select_action)

# Second option (disabled):
#environment_loop.EnvironmentLoop = ma_function

# Create a loop connecting this agent to the environment created above.
env_loop = environment_loop.EnvironmentLoop(
    environment, agent, logger=env_loop_logger)

# Disabled single-episode smoke test:
#a = env_loop.run_episode()
#print(a['episode_return'])



In [9]:
# Run `num_episodes` training episodes.
# Rerun this cell until the agent has learned the given task.
# Whatever `run` returns is kept in `big_loop`.
big_loop = env_loop.run(num_episodes=50)



[Agent] Critic Loss = 3.884 | Policy Loss = 0.430 | Steps = 31 | Walltime = 0.846
[Env Loop] Episode Length = 999 | Episode Return = -107.35039520263672 | Episodes = 2 | Steps = 1998 | Steps Per Second = 134.004
[Agent] Critic Loss = 3.698 | Policy Loss = 0.013 | Steps = 448 | Walltime = 10.862
[Env Loop] Episode Length = 999 | Episode Return = -78.62532043457031 | Episodes = 6 | Steps = 5994 | Steps Per Second = 328.460
[Agent] Critic Loss = 3.520 | Policy Loss = 0.004 | Steps = 850 | Walltime = 20.864
[Env Loop] Episode Length = 999 | Episode Return = -87.20030975341797 | Episodes = 10 | Steps = 9782 | Steps Per Second = 325.512
[Agent] Critic Loss = 3.340 | Policy Loss = 0.008 | Steps = 1254 | Walltime = 30.869
[Env Loop] Episode Length = 999 | Episode Return = -121.87545776367188 | Episodes = 14 | Steps = 13778 | Steps Per Second = 314.526
[Agent] Critic Loss = 3.153 | Policy Loss = 0.001 | Steps = 1646 | Walltime = 40.892
[Agent] Critic Loss = 2.996 | Policy Loss = 0.002 | Steps =

In [10]:
#print(big_loop)

In [11]:
"""import matplotlib.pyplot as plt
x = [i for i in range(1,301)]
plt.plot(x, big_loop["episode_return"], label= "episode returns")
plt.legend(loc = 'best')
plt.show()"""

'import matplotlib.pyplot as plt\nx = [i for i in range(1,301)]\nplt.plot(x, big_loop["episode_return"], label= "episode returns")\nplt.legend(loc = \'best\')\nplt.show()'

# Display


In [12]:
def display_video(frames, filename='temp.mp4'):
  """Save `frames` as a video file and return an inline HTML player for it.

  Args:
    frames: iterable of frames; each one is appended to the video in order.
    filename: path of the video file that is written and then read back.

  Returns:
    An `IPython.display.HTML` object whose <video> tag embeds the file
    contents as base64 data.
  """
  # Write video
  with imageio.get_writer(filename, fps=60) as video:
    for frame in frames:
      video.append_data(frame)
  # Read the encoded file back. The context manager closes the handle right
  # away instead of leaving it open until garbage collection.
  with open(filename, 'rb') as video_file:
    b64_video = base64.b64encode(video_file.read())
  video_tag = ('<video  width="320" height="240" controls alt="test" '
               'src="data:video/mp4;base64,{0}">').format(b64_video.decode())
  return IPython.display.HTML(video_tag)
  
# Start a new episode and record its first frame.
timestep = environment.reset()
frames = [render(environment)]

# Keep stepping until the environment reports the last timestep of the episode.
while not timestep.last():
  # Simple environment loop.
  action = agent.select_action(timestep.observation)
  timestep = environment.step(action)

  # Render the scene and add it to the frame stack.
  frames.append(render(environment))

# Save and display a video of the behaviour.
display_video(np.array(frames))

