In [1]:
from memory import MemoryBuffer
from policies import PPOTrainer
import tensorflow as tf
from tensorflow.layers import dense, conv2d, max_pooling2d, flatten
from env_control import EnvController
import numpy as np
from utils import preprocess_atari
import time
import gym

In [2]:
# Breakout actor-critic network: a shared convolutional trunk feeding a policy
# head (softmax over actions) and a scalar value head, wrapped in a PPOTrainer.
# NOTE(review): no explicit layer names are given, so checkpoint variable names
# presumably depend on layer creation order -- keep the order below stable.

model_name = 'breakout'
env = gym.make('Breakout-v0')
in_shape = [84, 84, 1]

# Observation batch; the leading (batch) dimension is left unspecified.
obs = tf.placeholder(dtype=tf.float32, shape=[None, *in_shape])

# Shared trunk: two conv + max-pool stages followed by a 256-unit dense layer.
conv1 = conv2d(inputs=obs, filters=16, kernel_size=8, strides=(4, 4),
               activation=tf.nn.relu)
max_pool1 = max_pooling2d(inputs=conv1, pool_size=2, strides=2)
conv2 = conv2d(inputs=max_pool1, filters=32, kernel_size=4, strides=(2, 2),
               activation=tf.nn.relu)
max_pool2 = max_pooling2d(inputs=conv2, pool_size=2, strides=2)
flattened = flatten(inputs=max_pool2)
dense1 = dense(inputs=flattened, units=256, activation=tf.nn.relu)

# Policy head: one unactivated output per action (despite the name, these are
# pre-softmax values), normalized into probabilities by the softmax below.
act_probs = dense(inputs=dense1, units=env.action_space.n)
softmax_probs = tf.nn.softmax(act_probs)

# Value head: a single linear output on top of the shared dense1 features
# (no separate convolutional stack is built for the value function).
value = dense(inputs=dense1, units=1)

network = PPOTrainer(obs, softmax_probs, value, act_type='d', ppo_iters=80)

# Load previously saved weights for this model if a checkpoint can be restored.
saver = tf.train.Saver()
try:
    saver.restore(network.sess, f'models/{model_name}.ckpt')
except Exception as err:
    # Catch Exception instead of using a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate. The cause is reported as well, so a genuine
    # restore failure is not silently mistaken for a missing checkpoint.
    print('No model found, randomly initializing parameters')
    # Only the first line of the message: TensorFlow errors can be very long.
    reason = str(err).split('\n', 1)[0]
    print(f'  (restore failed with {type(err).__name__}: {reason})')

INFO:tensorflow:Restoring parameters from models/breakout.ckpt


In [3]:
# Count the scalar parameters across all trainable variables: each variable
# contributes the product of its dimensions (get_shape() yields tf.Dimension
# objects, whose .value is the size of that dimension).
param_counts = [
    int(np.prod([dim.value for dim in var.get_shape()], dtype=np.int64))
    for var in tf.trainable_variables()
]
total_parameters = sum(param_counts)
print(total_parameters)

43573


In [4]:
# Hyperparameters read by the training loop.
n_episodes, max_steps = 1_000_000, 4096
batch_size, save_freq = 32, 10


def _make_env():
    """Create a new Breakout environment instance for the controller."""
    return gym.make('Breakout-v0')


def _transform_obs(frame):
    """Squeeze the raw observation, then run the Atari preprocessing on it."""
    return preprocess_atari(frame.squeeze())


# Controller that builds its environments from the factory (4 threads) and
# passes every observation through the transform above.
ec = EnvController(_make_env, n_threads=4)
ec.set_obs_transform(_transform_obs)

In [9]:
# Main training loop: each update simulates `batch_size` episodes, records the
# controller's average reward, and trains the network on the collected data.
update_rewards = []

# Build the Saver once. Every tf.train.Saver() construction adds save/restore
# ops to the graph, so creating one per checkpoint grows the graph for the
# whole run. max_to_keep=None keeps every checkpoint on disk, matching the
# previous behavior where each fresh Saver never deleted older files.
# NOTE(review): this Saver covers the variables that exist at this point;
# assumes network.train() does not create new variables -- confirm.
saver = tf.train.Saver(max_to_keep=None)

for i in range(n_episodes // batch_size):
    ec.sim_episodes(network, batch_size, max_steps)
    update_rewards.append(ec.get_avg_reward())
    dat = ec.get_data()
    network.train(dat)
    # Update 0 never triggers a report, so the first window averages
    # save_freq + 1 updates and every later window averages save_freq.
    if i != 0 and i % save_freq == 0:
        avg_reward = np.mean(update_rewards)
        print(f'Update #{i}, Avg Reward: {avg_reward}')
        # Save the model, tagging the file with the update index and reward.
        saver.save(network.sess, f'models/breakout_e{i}_r{avg_reward}.ckpt')
        update_rewards = []

Update #10, Avg Reward: 17.514204545454547
Update #20, Avg Reward: 17.1375
Update #30, Avg Reward: 18.09375


KeyboardInterrupt: 

In [10]:
# Have the controller render a few episodes using `network`, passing the same
# max_steps value that the training loop uses.
n_render_episodes = 10
ec.render_episodes(network, n_render_episodes, max_steps)

In [None]:
# Save the model to the same path the restore step reads from
# (models/<model_name>.ckpt), so changing `model_name` keeps load and save in
# sync instead of always writing to a hard-coded 'breakout' checkpoint.
saver = tf.train.Saver()
saver.save(network.sess, f'models/{model_name}.ckpt')