# Proximal Policy Optimization (PPO) playground

Notebook for running PPO on simple environments from OpenAI Gym

## Install dependencies (only on Google Colab)

In [234]:
# Installing our own implementation
! git clone https://github.com/emasquil/ppo.git
! pip install -e /content/ppo

# Visualization stuff
!sudo apt-get update
!sudo apt-get install -y xvfb ffmpeg freeglut3-dev
!pip install 'imageio==2.4.0'
!pip install pyvirtualdisplay

fatal: destination path 'ppo' already exists and is not an empty directory.
[31mERROR: /content/ppo is not a valid editable requirement. It should either be a path to a local project or a VCS URL (beginning with svn+, git+, hg+, or bzr+).[0m
[sudo] password for nviolante: 
[sudo] password for nviolante: 

In [1]:
import base64
import imageio
import IPython
import itertools
import time
import jax
import jax.numpy as jnp

from acme import specs
import pyvirtualdisplay

# Set up a virtual display for rendering.
# The display is created non-visible (visible=0) with a 1400x900 screen and
# started immediately.
# NOTE(review): the name `display` presumably shadows IPython's `display()`
# helper in the notebook namespace — confirm nothing relies on that helper,
# or consider renaming this variable.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()

import ppo.dm_helper as helpers
from ppo.agents import RandomAgent, VanillaPPO, general_advantage_estmation, add_advantage
from ppo.env_wrapper import PendulumEnv
from ppo.networks import PolicyNetwork, ValueNetwork
from ppo.replay_buffers import DataLoader


2022-03-17 15:03:48.905635: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory


In [2]:
# Enable the autoreload extension in mode 2 so that edits to imported modules
# (e.g. the local `ppo` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Visualization functions

In [3]:
def display_video(frames, filename="temp.mp4", frame_repeat=1):
    """Save `frames` as a video file and return an inline HTML player for it.

    Args:
      frames: iterable of frames; each one is handed to the imageio writer's
        `append_data`.
      filename: path of the video file that is written and then read back.
      frame_repeat: number of times every frame is appended to the video
        (the writer itself is opened at a fixed 60 fps).

    Returns:
      An `IPython.display.HTML` object holding a `<video>` tag whose source is
      the written file embedded as a base64 data URI.
    """
    # Write video
    with imageio.get_writer(filename, fps=60) as video:
        for frame in frames:
            for _ in range(frame_repeat):
                video.append_data(frame)
    # Read the video back; the context manager guarantees the file handle is
    # closed instead of being left for the garbage collector.
    with open(filename, "rb") as video_file:
        b64_video = base64.b64encode(video_file.read())
    video_tag = ('<video  width="320" height="240" controls alt="test" ' 'src="data:video/mp4;base64,{0}">').format(
        b64_video.decode()
    )
    return IPython.display.HTML(video_tag)

## Definitions

Definition of all the parts used in the learning loop: environment, agent, etc.

In [4]:
# Environment shared by the training and evaluation cells, together with its
# spec; the spec's `observations` and `actions` entries are used further down
# to build the networks and the agent.
environment = PendulumEnv()
environment_spec = specs.make_environment_spec(environment)

In [5]:
# Create the networks
# We should use our own networks here
# agent_networks = dm_ppo.make_gym_networks(environment_spec)

In [6]:
# Create the agent
# We should use PPO agent
def policy_network(observations):
    """Build a `PolicyNetwork` for the environment's action spec and apply it.

    The network is constructed with the sizes `(10, 20)`, the action spec of
    the notebook-level `environment_spec` and the name "policy", then called
    on `observations`; the result of that call is returned.
    """
    network = PolicyNetwork((10, 20), environment_spec.actions, "policy")
    return network(observations)


def value_network(observations):
    """Build a `ValueNetwork` and apply it to `observations`.

    The network is constructed with the sizes `(10, 20)` and the name "value";
    the result of calling it on `observations` is returned.
    """
    network = ValueNetwork((10, 20), "value")
    return network(observations)


# PRNG key created from a fixed seed (1); it is handed to the agent below.
key = jax.random.PRNGKey(1)

# The agent is built from the observation spec, the two network-building
# functions defined above and the PRNG key.
agent = VanillaPPO(environment_spec.observations, policy_network, value_network, key)



## Interaction loop

In [11]:
def training_loop(
    environment,
    agent,
    num_training_iterations=2,
    num_epochs=2,
    len_rollout=5,
):
    """Alternate rollout collection and agent updates for a fixed number of iterations.

    Every iteration clears the agent's replay buffer, resets the environment,
    collects `len_rollout` environment steps (restarting the environment
    whenever an episode terminates), computes advantages over the collected
    trajectory and finally calls `agent.update()` once per batch yielded by
    the dataloader, for `num_epochs` passes.

    Args:
      environment: environment exposing `reset()` and `step(action)`, both
        returning a timestep with `observation`, `reward` and `last()`.
      agent: agent exposing `replay_buffer`, `observe_first`, `get_value`,
        `select_action_and_prob`, `observe` and `update`.
      num_training_iterations: number of rollout + learning iterations.
      num_epochs: number of passes over the dataloader in each iteration.
      len_rollout: number of environment steps collected in each iteration;
        also passed as the second argument to `DataLoader`.

    Returns:
      A list of accumulated rewards, in order of occurrence: one entry for
      every episode that terminated during a rollout, plus, at the end of
      every iteration, the reward accumulated since the last termination
      (a partial return; 0 when the rollout ended exactly on a termination).
    """
    all_returns = []

    for _ in range(num_training_iterations):

        # Annealing (placeholder: nothing is annealed yet)

        # Rollout phase
        # Reset any counts and start the environment.
        agent.replay_buffer.clear()
        timestep = environment.reset()
        rollout_return = 0

        # Make the first observation.
        agent.observe_first(timestep)

        for _ in range(len_rollout):
            # Value and action are computed from the observation *before*
            # stepping; the agent then observes them with the resulting timestep.
            value = agent.get_value(timestep.observation)
            action, log_prob = agent.select_action_and_prob(timestep.observation)
            timestep = environment.step(action)
            agent.observe(value, log_prob, action, timestep)

            rollout_return += timestep.reward

            # Episode finished: record its return and start a fresh episode
            # so the rollout can continue.
            if timestep.last():
                timestep = environment.reset()
                agent.observe_first(timestep)
                all_returns.append(rollout_return)
                rollout_return = 0

        # Learning phase
        dataloader = DataLoader(agent.replay_buffer, len_rollout)
        trajectory = dataloader.get_full_memory()
        # `timestep` is the latest timestep of the rollout (or the first one of
        # a new episode if the rollout ended on a termination).
        advantages = general_advantage_estmation(trajectory, agent, timestep)
        add_advantage(trajectory, advantages, agent)

        for _ in range(num_epochs):
            dataloader.shuffle()
            # NOTE(review): the batch yielded by the dataloader is not passed
            # to `agent.update()` — confirm the agent obtains its minibatch
            # from elsewhere.
            for _batch in dataloader:
                agent.update()

        # Record whatever reward was accumulated since the last termination
        # (partial return of the unfinished episode).
        all_returns.append(rollout_return)
    return all_returns

In [12]:
def evaluate(environment, agent, evaluation_episodes):
    """Run the agent for several episodes, collecting one rendered frame per step.

    Args:
      environment: environment to roll out; must support `reset()`,
        `step(action)` and `render(mode="rgb_array")`.
      agent: agent whose `select_action(observation)` picks every action.
      evaluation_episodes: number of episodes to run.

    Returns:
      List of the frames rendered before each step, across all episodes. A
      summary line with the return and length of each episode is printed.
    """
    rendered = []

    for episode in range(evaluation_episodes):
        steps, episode_return = 0, 0
        timestep = environment.reset()
        while True:
            if timestep.last():
                break
            # Capture the state the agent is about to act on.
            rendered.append(environment.render(mode="rgb_array"))

            timestep = environment.step(agent.select_action(timestep.observation))
            episode_return += timestep.reward
            steps += 1
        print(f"Episode {episode} ended with reward {episode_return} in {steps} steps")
    return rendered

### Train

In [13]:
# Smoke-test run with the function's default settings (2 iterations, 2 epochs,
# rollouts of 5 steps); the cell output is the list of returns from the loop.
training_loop(agent=agent, environment=environment)

[-10.306608  -12.212279   -3.8972623 -14.094865   -7.421748 ]
[ -7.421748  -14.094865  -12.212279   -3.8972623 -10.306608 ]
[-26.941513 -34.771732  -9.596671 -42.114544 -18.567183]
[-18.567183 -42.114544 -34.771732  -9.596671 -26.941513]




[-14.84803129842938, -47.52309121672059]

### Evaluate

In [17]:
# Roll out 5 evaluation episodes and show the collected frames as an inline video.
display_video(evaluate(agent=agent, environment=environment, evaluation_episodes=5))

Episode 0 ended with reward -945.0801167720851 in 200 steps
Episode 1 ended with reward -1068.5012638233127 in 200 steps
Episode 2 ended with reward -1288.6962668164922 in 200 steps
Episode 3 ended with reward -1254.172315046835 in 200 steps




Episode 4 ended with reward -801.1252990105222 in 200 steps


