# PPO Lunar Lander Example

### Lunar Lander (continuous actions)

Please do note that this example may take a long time to train.

With the default 4 threads running on an 8-core CPU with a GTX 1080 Ti, it will take several hours to train to a decent level of play.

Running on a platform with more GPU power and a larger cluster of CPUs could significantly reduce training time.

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPool2D, Flatten
import gym
from ludus.policies import PPOTrainer
from ludus.env import EnvController, make_lunar_lander_c

In [2]:
# Build the policy and value networks and wrap them in a PPO trainer.

env = make_lunar_lander_c() # This instance of the environment is only used
                              # to read the observation and action space shapes

# Policy head: two 32-unit tanh Dense layers fed directly by the observation
# (fully connected -- there are no convolutional layers in this model)
obs_op = Input(shape=env.observation_space.shape)
dense1 = Dense(32, activation='tanh')(obs_op)
dense2 = Dense(32, activation='tanh')(dense1)
# One linear (no activation) output per action dimension.
# NOTE(review): despite the name, with act_type='continuous' this is presumably
# not a probability vector -- confirm how PPOTrainer interprets this output.
act_probs_op = Dense(env.action_space.shape[0])(dense2)

# Value head: a separate Dense stack that shares only the input layer with the
# policy head and emits a single linear output for the observed state
vdense1 = Dense(32, activation='tanh')(obs_op)
vdense2 = Dense(32, activation='tanh')(vdense1)
value_op = Dense(1)(vdense2)

# Wrap a Proximal Policy Optimization Trainer on top of the network
# NOTE(review): the meaning and scale of ppo_iters and entropy_coef are defined
# by ludus' PPOTrainer and are not visible here -- check its docs before tuning.
network = PPOTrainer(obs_op, act_probs_op, value_op, act_type='continuous', ppo_iters=40, entropy_coef=1.)

In [3]:
# Training budget: total number of episodes of data to collect
n_episodes = 1_000_000
# Upper bound on the number of frames played in a single game
max_steps = 400
# Batch size trade-off: smaller runs faster, larger trains more stably
batch_size = 8
# Number of training updates between two progress printouts
print_freq = 10

In [4]:
# Create the environment controller for generating game data
# (constructed from the make_lunar_lander_c factory with n_threads=4)
ec = EnvController(make_lunar_lander_c, n_threads=4)
# Register an action transform that clips values to the range [-1, 1]
# NOTE(review): presumably applied to each action before it reaches the
# environment -- confirm against EnvController.set_act_transform.
ec.set_act_transform(lambda x: np.clip(x, -1, 1))

In [None]:
# Average reward recorded after every update; used for the windowed progress report
update_rewards = []

n_updates = int(n_episodes / batch_size)  # each update consumes batch_size episodes
for i in range(n_updates):
    # Simulate a fresh batch of episodes with the current network, then log its average reward
    ec.sim_episodes(network, batch_size, max_steps)
    update_rewards.append(ec.get_avg_reward())

    # Feed everything the controller gathered into a PPO training step
    network.train(ec.get_data())

    # Stay quiet on the first iteration and between reporting intervals
    if i == 0 or i % print_freq != 0:
        continue
    print(f'Update #{i}, Avg Reward: {np.mean(update_rewards[-print_freq:])}')

Update #10, Avg Reward: -850.6773962874729
Update #20, Avg Reward: -962.7616625182025
Update #30, Avg Reward: -862.9903716181104
Update #40, Avg Reward: -942.4902358438164
Update #50, Avg Reward: -713.0604259238575
Update #60, Avg Reward: -639.5282811790452
Update #70, Avg Reward: -587.8604149474841
Update #80, Avg Reward: -575.1997295481408
Update #90, Avg Reward: -557.1504359281716
Update #100, Avg Reward: -543.2690545694776
Update #110, Avg Reward: -576.5753169932234
Update #120, Avg Reward: -582.7779439075148
Update #130, Avg Reward: -583.6348714589659
Update #140, Avg Reward: -559.1131551288923
Update #150, Avg Reward: -589.3752964302123
Update #160, Avg Reward: -555.6869767844771
Update #170, Avg Reward: -596.1755354114016
Update #180, Avg Reward: -187.95824379401782
Update #190, Avg Reward: -130.8521085372281
Update #200, Avg Reward: -129.1013015171032
Update #210, Avg Reward: -142.55861677398724
Update #220, Avg Reward: -130.57231005373623
Update #230, Avg Reward: -126.63236973

In [None]:
# Render episodes played by the network to see the result.
# NOTE(review): the 5 is presumably the number of episodes to render (it sits
# where sim_episodes takes its episode count) -- confirm in EnvController.
ec.render_episodes(network, 5, max_steps)