# PPO

We will use the Ray library (RLlib) to implement the PPO model, with code adapted from the official Ray documentation.

Code reference: https://docs.ray.io/en/latest/rllib/rllib-training.html#basic-python-api

In [None]:
#if needed: !pip install -U "ray[rllib]"
import numpy as np
from random import random
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import copy
import os
import time
import torch, torchvision, cv2

import gym
import ray
from ray import tune
import ray.rllib.agents.ppo as ppo
from ray.tune.logger import pretty_print

In [None]:
# Pick the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

In [None]:
def randomize(seed_rng=12345, seed_np=42, seed_torch=42):
    """Seed the NumPy and PyTorch random number generators.

    Args:
        seed_rng: Seed for the ``numpy.random.Generator`` that is created
            and returned.
        seed_np: Seed for NumPy's legacy global RNG (``np.random.seed``).
        seed_torch: Seed for PyTorch (``torch.manual_seed``).

    Returns:
        A ``numpy.random.Generator`` seeded with ``seed_rng``. Callers that
        only need the global seeding may ignore it.
    """
    # PYTHONHASHSEED is read at interpreter start-up, so this only influences
    # Python processes launched afterwards that inherit this environment;
    # the hash seed of the running interpreter is already fixed.
    os.environ['PYTHONHASHSEED'] = '0'
    # Honour the caller-supplied seeds instead of hard-coded constants.
    rng = np.random.default_rng(seed_rng)
    np.random.seed(seed_np)
    torch.manual_seed(seed_torch)
    return rng

In [None]:
# Start Ray; ignore_reinit_error=True is passed so that re-running this cell
# with Ray already initialised is tolerated.
ray.init(ignore_reinit_error=True)

# Begin from a copy of RLlib's PPO default config and override a few entries.
config_ppo = ppo.DEFAULT_CONFIG.copy()
config_ppo.update({
    "num_gpus": 0,
    "num_workers": 4,
    "framework": 'torch',
})

In [None]:
# Build the PPO trainer for the Breakout environment using the config above.
trainer_ppo = ppo.PPOTrainer(env="Breakout-v0", config=config_ppo)

In [None]:
# Show the PPO configuration dict as the cell output for inspection.
config_ppo

In [None]:
# PPO training
# Reset the random seeds/states before training starts.
randomize()

# Mean episode reward reported after each training iteration.
avg_rewards_ppo = []

for iteration in range(1, 3):
    # One call to train() runs a single PPO training iteration.
    result_ppo = trainer_ppo.train()
    # pretty_print(result_ppo) can be printed here to see the full result.
    mean_reward = result_ppo['episode_reward_mean']
    print(mean_reward)
    avg_rewards_ppo.append(mean_reward)

    # Save a checkpoint every 100 iterations (never reached with the
    # 2 iterations run here).
    if iteration % 100 == 0:
        checkpoint_ppo = trainer_ppo.save()
        print("checkpoint saved at", checkpoint_ppo)

In [None]:
# Tune hyperparameters: grid-search the PPO learning rate with Ray Tune.
# Every trial is stopped after 50 training iterations.
stop = tune.stopper.MaximumIterationStopper(max_iter=50)
config_ppo_tune = {
    "env": "Breakout-v0",
    "num_gpus": 0,
    "num_workers": 4,
    "framework": "torch",
    # grid_search: one trial is run for each listed learning rate.
    "lr": tune.grid_search([5e-5, 1e-5, 5e-6, 1e-6]),
}

# Reset the random seeds/states before the search starts.
randomize()

# checkpoint_at_end=True asks Tune to checkpoint each trial when it stops.
# No custom log directory is passed, so Tune's default location is used.
analysis = tune.run(
    ppo.PPOTrainer,
    config=config_ppo_tune,
    stop=stop,
    checkpoint_at_end=True)

# list of lists: one list per checkpoint; each checkpoint list contains
# 1st the path, 2nd the metric value.
# `mode` must be given explicitly: tune.run() above sets no default
# metric/mode, and get_best_trial() cannot rank trials without one.
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=analysis.get_best_trial("episode_reward_mean", mode="max"),
    metric="episode_reward_mean")

# With multiple trials, pick the last checkpoint of the trial that is best
# according to the given metric.
last_checkpoint = analysis.get_last_checkpoint(
    metric="episode_reward_mean", mode="max"
)

In [None]:
# Show element 1 of `last_checkpoint` as the cell output.
# NOTE(review): get_last_checkpoint() appears to return a checkpoint
# location rather than a [path, metric] pair, so indexing with [1] probably
# does not yield a metric value. Confirm against the installed Ray version.
last_checkpoint[1]

In [None]:
# Load the DQN reward history: the first line of the text file is evaluated
# as a Python expression.
# NOTE(review): eval() executes whatever the file contains. Only use this on
# a trusted file, and consider ast.literal_eval if the line is a plain literal.
with open("avg_rewards1_restore.txt", "r") as file:
    first_line = file.readline()
    avg_rewards1_restore = eval(first_line)

In [None]:
# DQN curve: the loaded rewards are plotted against iterations 200..999.
plt.plot(list(range(200, 1000)), avg_rewards1_restore, label='DQN rainbow excl dueling')

# PPO curve: `last_checkpoint` is a checkpoint location, not a reward series,
# so plot the per-iteration mean episode reward logged by the best tuning
# trial instead (trial_dataframes is keyed by trial log directory).
best_logdir = analysis.get_best_logdir("episode_reward_mean", mode="max")
ppo_rewards = analysis.trial_dataframes[best_logdir]["episode_reward_mean"]
plt.plot(ppo_rewards.to_numpy(), label='PPO')

plt.xlabel('Count of Iterations')
plt.ylabel('Reward per Episode')

plt.title('Average Reward in Breakout')
plt.legend(loc='upper left')
plt.show()