# PPO Lunar Lander (Continuous) Example

### Lunar Lander (continuous control)

Please do note that this example may take a long time to train.

With the default 4 threads running on an 8-core CPU with a GTX 1080 Ti, it will take several hours to train to a decent level of play.

Running on a platform with more GPU power and a larger cluster of CPUs could significantly reduce training time.

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPool2D, Flatten
import gym
from ludus.policies import PPOTrainer
from ludus.env import EnvController, make_lunar_lander_c

In [2]:
# Throwaway environment instance: it is only inspected for its observation
# and action space shapes below.
env = make_lunar_lander_c()
obs_shape = env.observation_space.shape
n_action_dims = env.action_space.shape[0]

# Both heads are small fully connected tanh networks reading the same input
# (no convolutional layers are involved).
obs_op = Input(shape=obs_shape)

# Policy head: two 32-unit hidden layers, then one linear unit per action dimension
policy_hidden1 = Dense(32, activation='tanh')(obs_op)
policy_hidden2 = Dense(32, activation='tanh')(policy_hidden1)
act_probs_op = Dense(n_action_dims)(policy_hidden2)

# Value head: same hidden layout, ending in a single scalar for the observed state
value_hidden1 = Dense(32, activation='tanh')(obs_op)
value_hidden2 = Dense(32, activation='tanh')(value_hidden1)
value_op = Dense(1)(value_hidden2)

# Wrap a Proximal Policy Optimization trainer around the two heads
network = PPOTrainer(
    obs_op,
    act_probs_op,
    value_op,
    act_type='continuous',
    ppo_iters=40,
    entropy_coef=1.0,
)

In [3]:
# Training hyperparameters: tune these to trade run time against stability
n_episodes = 1_000_000  # total number of episodes of data to collect over the run
max_steps = 400         # upper bound on frames simulated in a single game
batch_size = 8          # episodes gathered per update: smaller = faster, larger = stabler
print_freq = 10         # number of training updates between progress printouts

In [4]:
# Create the environment controller for generating game data.
# The factory function itself (not an environment instance) is handed over,
# together with the number of worker threads to use.
ec = EnvController(make_lunar_lander_c, n_threads=4)
# Set the transform for actions (not observations): every action component
# is clipped into [-1, 1].
# NOTE(review): presumably applied before the action reaches the environment;
# confirm against EnvController.set_act_transform.
ec.set_act_transform(lambda x: np.clip(x, -1, 1))

In [None]:
# Average reward reported by the controller after each update, in order
update_rewards = []

# One update consumes `batch_size` episodes, so this many updates cover the run
n_updates = int(n_episodes / batch_size)

for i in range(n_updates):
    # Simulate the environment with the current policy to generate fresh data
    ec.sim_episodes(network, batch_size, max_steps)
    # Record the average reward before pulling the data out of the controller
    update_rewards.append(ec.get_avg_reward())
    # Fetch everything that was gathered and run a PPO training pass on it
    rollout_data = ec.get_data()
    network.train(rollout_data)

    # Progress is reported every `print_freq` updates, skipping the very first one
    if i == 0 or i % print_freq != 0:
        continue
    print(f'Update #{i}, Avg Reward: {np.mean(update_rewards[-print_freq:])}')

Update #10, Avg Reward: -951.5817344531704
Update #20, Avg Reward: -977.9308919496021
Update #30, Avg Reward: -1069.3685113132105
Update #40, Avg Reward: -1181.0752731930672
Update #50, Avg Reward: -985.9168143248814
Update #60, Avg Reward: -1070.654274490439
Update #70, Avg Reward: -1229.3568425367146
Update #80, Avg Reward: -1139.5595003235844
Update #90, Avg Reward: -1129.9825958779925
Update #100, Avg Reward: -1031.7554481750685
Update #110, Avg Reward: -1217.3933648142279
Update #120, Avg Reward: -1131.3286765103576
Update #130, Avg Reward: -1201.2071976323532
Update #140, Avg Reward: -1027.4117211911807
Update #150, Avg Reward: -1017.6186943137994
Update #160, Avg Reward: -1056.6364737022805
Update #170, Avg Reward: -882.8073535089819
Update #180, Avg Reward: -137.12347564251718
Update #190, Avg Reward: -127.97753084625961
Update #200, Avg Reward: -130.21208005978485
Update #210, Avg Reward: -124.86524294932443
Update #220, Avg Reward: -136.08882896017
Update #230, Avg Reward: -1

Update #1830, Avg Reward: -132.436072810815
Update #1840, Avg Reward: -135.19286775250276
Update #1850, Avg Reward: -135.317635565049
Update #1860, Avg Reward: -137.00554115908375
Update #1870, Avg Reward: -139.45730175280167
Update #1880, Avg Reward: -126.81659599457342
Update #1890, Avg Reward: -136.60378637794207
Update #1900, Avg Reward: -136.32408766865723
Update #1910, Avg Reward: -139.21952133055976
Update #1920, Avg Reward: -132.01249972143793
Update #1930, Avg Reward: -135.7232598100644
Update #1940, Avg Reward: -135.11305079415746
Update #1950, Avg Reward: -132.82381343554465
Update #1960, Avg Reward: -127.49525285831169
Update #1970, Avg Reward: -134.27786641431106
Update #1980, Avg Reward: -134.28838613036237
Update #1990, Avg Reward: -141.29897576568595
Update #2000, Avg Reward: -141.94559430687528
Update #2010, Avg Reward: -137.4089898235612
Update #2020, Avg Reward: -132.68127491633157
Update #2030, Avg Reward: -131.94396543184635
Update #2040, Avg Reward: -127.194299825

In [None]:
# Render the trained network playing to see the result.
# NOTE(review): 5 is presumably the number of episodes to render and max_steps
# the per-episode step cap, mirroring sim_episodes; confirm in EnvController.
ec.render_episodes(network, 5, max_steps)