# PPO Breakout Example

### Atari Breakout

Please do note that this example may take a long time to train.

With the default 4 threads running on an 8-core CPU with a GTX 1080 Ti, it will take several hours to train to a decent level of play.

Running on a platform with more GPU power and a larger cluster of CPUs could significantly reduce training time.

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPool2D, Flatten, LSTM
import gym
from ludus.policies import PPOTrainer
from ludus.env import EnvController
from ludus.utils import preprocess_atari, reshape_train_var

In [2]:
# This environment instance is only queried for the size of the discrete
# action space; the simulation environments are created separately.
env = gym.make('Breakout-v0')
in_shape = [84, 84, 1] # Size of reshaped observations

# Convolutional trunk shared by the policy and the value estimator
obs_op = Input(shape=in_shape)
conv1 = Conv2D(16, 8, strides=(4, 4), activation='relu')(obs_op)
max_pool1 = MaxPool2D(pool_size=2, strides=2)(conv1)
conv2 = Conv2D(32, 4, strides=(2, 2), activation='relu')(max_pool1)
max_pool2 = MaxPool2D(pool_size=2, strides=2)(conv2)
flattened = Flatten()(max_pool2)
dense1 = Dense(256, activation='relu')(flattened)

# Both heads read from the hidden dense layer. Previously they were attached
# to `flattened`, which left `dense1` defined but unused by the network.

# Output probability distribution over possible actions
act_probs_op = Dense(env.action_space.n, activation='softmax')(dense1)

# Output value of observed state (single linear unit)
value_op = Dense(1)(dense1)

# Wrap a Proximal Policy Optimization Trainer on top of the network
network = PPOTrainer(obs_op, act_probs_op, value_op, act_type='discrete', ppo_iters=80)

  result = entry_point.load(False)


In [3]:
# Training hyperparameters
n_episodes = 10_000  # Total number of episodes of data to gather
max_steps = 2_048    # Upper bound on frames played in a single game
batch_size = 8       # Smaller values run faster, larger values are more stable
print_freq = 10      # Number of training updates between progress printouts

In [4]:
def make_breakout_env():
    """Return a newly created 'Breakout-v0' gym environment."""
    return gym.make('Breakout-v0')


def transform_obs(obs):
    """Squeeze the observation and pass it through `preprocess_atari`."""
    return preprocess_atari(obs.squeeze())


# Environment controller used to generate game data; it receives the
# environment factory and is configured with 4 threads
ec = EnvController(make_breakout_env, n_threads=4)
# Register the preprocessing function applied to observations
ec.set_obs_transform(transform_obs)

In [5]:
# Average reward recorded after every simulation batch
update_rewards = []

n_updates = int(n_episodes / batch_size)
for update_idx in range(n_updates):
    # Simulate the environment to generate a batch of data
    ec.sim_episodes(network, batch_size, max_steps)
    # Track the average reward reported by the controller
    update_rewards.append(ec.get_avg_reward())

    # Fetch everything that was gathered and run a PPO training pass on it
    batch_data = ec.get_data()
    network.train(batch_data)

    # Report progress every `print_freq` updates, skipping the very first one
    if update_idx == 0 or update_idx % print_freq != 0:
        continue
    recent_avg = np.mean(update_rewards[-print_freq:])
    print(f'Update #{update_idx}, Avg Reward: {recent_avg}')

Exception in thread Thread-4:
Traceback (most recent call last):
  File "/home/ejmejm/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/ejmejm/anaconda3/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ejmejm/MLProjects/ludus/env.py", line 127, in sim_thread
    act = network.gen_act(obs)
  File "/home/ejmejm/MLProjects/ludus/policies.py", line 62, in gen_act
    return self._gen_discrete_act(obs)
  File "/home/ejmejm/MLProjects/ludus/policies.py", line 50, in _gen_discrete_act
    act_probs = self.sess.run(self.out_op, feed_dict={self.in_op: [obs]})
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 929, in run
    run_metadata_ptr)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1128, in _run
    str(subfeed_t.get_shape())))
ValueError: Cannot feed value of shape (1, 84, 84, 1)

IndexError: too many indices for array

In [None]:
# Render the trained policy to see the result.
# NOTE(review): the second argument is presumably the number of episodes to
# render -- confirm against EnvController.render_episodes.
n_render_episodes = 5
ec.render_episodes(network, n_render_episodes, max_steps)