# Proximal Policy Optimization (PPO) playground

Notebook for running PPO on simple environments from OpenAI Gym

## Install dependencies (only on Google Colab)

In [None]:
# Installing our own implementation:
# clone the `main-notebook` branch of the repository and install the checkout
# at /content/ppo in editable mode.
! git clone https://github.com/emasquil/ppo.git -b main-notebook
! pip install -e /content/ppo

# Visualization stuff:
# system packages (xvfb, ffmpeg, freeglut3-dev) plus the Python packages
# imported below (imageio pinned to 2.4.0, pyvirtualdisplay).
!sudo apt-get update
!sudo apt-get install -y xvfb ffmpeg freeglut3-dev
!pip install 'imageio==2.4.0'
!pip install pyvirtualdisplay

In [None]:
import base64
import imageio
import IPython
import itertools
import time

from acme import specs
from acme.utils import loggers
import pyvirtualdisplay

# Set up a virtual display for rendering.
# The display is created non-visible (visible=0) with a 1400x900 screen and
# started right away; presumably this is what lets `environment.render` work
# on a machine without a physical screen (e.g. Colab) -- confirm if changed.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()

import ppo.dm_helper as helpers
from ppo.agents import RandomAgent
from ppo.env_wrapper import PendulumEnv

### Visualization functions

In [None]:
def display_video(frames, filename="temp.mp4", frame_repeat=1):
    """Save frames as a video file and return an inline HTML player for it.

    Args:
      frames: iterable of image frames; each one is handed to the imageio
        writer's `append_data`.
      filename: path of the video file that is written and then read back.
      frame_repeat: number of times each frame is appended to the video.
        Values above 1 slow playback down, since the writer runs at a fixed
        60 fps.

    Returns:
      An `IPython.display.HTML` object whose `<video>` tag embeds the file
      contents as a base64 data URI.
    """
    # Write video
    with imageio.get_writer(filename, fps=60) as video:
        for frame in frames:
            for _ in range(frame_repeat):
                video.append_data(frame)
    # Read the encoded file back. The context manager guarantees the handle is
    # closed instead of leaving that to the garbage collector.
    with open(filename, "rb") as video_file:
        b64_video = base64.b64encode(video_file.read())
    video_tag = (
        '<video  width="320" height="240" controls alt="test" '
        'src="data:video/mp4;base64,{0}">'
    ).format(b64_video.decode())
    return IPython.display.HTML(video_tag)

## Definitions

Definition of all the parts used in the learning loop: environment, agent, etc.

In [None]:
# Configuration
# Name of the environment handed to `helpers.make_environment`.
ENV_NAME = "Pendulum-v1"

In [None]:
# Create an environment, grab the spec
# TODO: we should use our own environment here; the commented-out lines below
# show the intended `PendulumEnv` replacement.
environment = helpers.make_environment(ENV_NAME)
environment_spec = specs.make_environment_spec(environment)

# environment = PendulumEnv()
# environment_spec = specs.make_environment_spec(environment)

In [None]:
# Create the networks
# We should use our own networks here
# agent_networks = dm_ppo.make_gym_networks(environment_spec)

In [None]:
# Create the agent
# TODO: we should use the PPO agent; for now a `RandomAgent` built from the
# environment spec stands in for it.
agent = RandomAgent(environment_spec)

## Interaction loop

In [None]:
def training_loop(
    environment,
    agent,
    num_episodes=None,
    num_steps=None,
    logger_time_delta=1.0,
    label="training_loop",
):
    """Perform the run loop.

    We are following the Acme run loop.

    Run the environment loop for `num_episodes` episodes. Each episode is itself
    a loop which interacts first with the environment to get an observation and
    then gives that observation to the agent in order to retrieve an action.
    Upon termination of an episode a new episode will be started. If neither
    the number of episodes nor the number of steps is given then this will
    interact with the environment infinitely.

    Args:
      environment: dm_env used to generate trajectories.
      agent: acme.Actor for selecting actions in the run loop.
      num_episodes: number of episodes to run the loop for. If `None` (default),
        runs without an episode limit; `0` runs no episodes at all.
      num_steps: total number of environment steps (summed over episodes) to
        run the loop for. If `None` (default), runs without a step limit. The
        limit is checked after each step, so the episode in progress is cut
        short once it is reached.
      logger_time_delta: time interval (in seconds) between consecutive logging
        steps.
      label: optional label used at logging steps.

    Returns:
      A list with the return of every episode that was run, in order. An
      episode truncated by `num_steps` is included with its partial return.
    """
    logger = loggers.TerminalLogger(label=label, time_delta=logger_time_delta)
    # Test against None rather than truthiness: `num_episodes=0` must mean
    # "run nothing", not "run forever".
    iterator = range(num_episodes) if num_episodes is not None else itertools.count()
    all_returns = []

    num_total_steps = 0
    for episode in iterator:
        # Reset any counts and start the environment.
        episode_steps = 0
        episode_return = 0

        timestep = environment.reset()

        # Make the first observation.
        agent.observe_first(timestep)

        # Run an episode.
        while not timestep.last():
            # Generate an action from the agent's policy and step the environment.
            action = agent.select_action(timestep.observation)
            timestep = environment.step(action)

            # Have the agent observe the timestep and let the agent update itself.
            agent.observe(action, next_timestep=timestep)
            agent.update()

            # Book-keeping.
            episode_steps += 1
            num_total_steps += 1
            episode_return += timestep.reward

            # Step budget exhausted: leave the episode early.
            if num_steps is not None and num_total_steps >= num_steps:
                break

        # Collect the results and combine with counts.
        result = {
            "episode": episode,
            "episode_length": episode_steps,
            "episode_return": episode_return,
        }
        print(result)

        all_returns.append(episode_return)

        # Log the given results.
        logger.write(result)

        # The inner `break` only leaves the episode; repeat the check here to
        # stop starting new episodes once the step budget is spent.
        if num_steps is not None and num_total_steps >= num_steps:
            break
    return all_returns

In [None]:
def evaluate(environment, agent, evaluation_episodes):
    """Roll out the agent and collect one rendered frame per step taken.

    Args:
      environment: object providing `reset()`, `step(action)` and
        `render(mode=...)`; `reset`/`step` return timesteps exposing `last()`,
        `observation` and `reward`.
      agent: object providing `select_action(observation)`.
      evaluation_episodes: number of episodes to roll out.

    Returns:
      A list with the `rgb_array` render captured before every step, across
      all episodes. The state reached by the final step of an episode is not
      rendered.
    """
    rendered = []

    for episode in range(evaluation_episodes):
        steps = 0
        episode_return = 0
        timestep = environment.reset()

        while True:
            if timestep.last():
                break

            # Capture the current state before acting on it.
            rendered.append(environment.render(mode="rgb_array"))

            chosen_action = agent.select_action(timestep.observation)
            timestep = environment.step(chosen_action)

            episode_return += timestep.reward
            steps += 1

        print(f"Episode {episode} ended with reward {episode_return} in {steps} steps")

    return rendered

### Train

In [None]:
# Run three training episodes; the call evaluates to the list of per-episode returns.
training_loop(agent=agent, environment=environment, num_episodes=3)

### Evaluate

In [None]:
# Roll out one evaluation episode and show its rendered frames as an inline video.
display_video(evaluate(agent=agent, environment=environment, evaluation_episodes=1))