In [1]:
from memory import MemoryBuffer
from policies import PPOTrainer
import tensorflow as tf
from tensorflow.layers import dense, conv2d, max_pooling2d, flatten
from env_control import EnvController
import numpy as np
from utils import preprocess_atari
import time
import gym

In [2]:
# Breakout actor-critic network: one shared convolutional trunk feeding a
# policy head and a value head, wrapped in a PPOTrainer.

model_name = 'breakout'
env = gym.make('Breakout-v0')
in_shape = [64, 64, 1]

# Shared trunk: conv -> pool -> conv -> pool -> conv -> flatten.
obs = tf.placeholder(tf.float32, shape=[None]+in_shape)
conv1 = conv2d(obs, 8, 3, activation=tf.tanh)
pool2 = max_pooling2d(conv1, 2, 2)
conv2 = conv2d(pool2, 8, 3, activation=tf.tanh)
pool3 = max_pooling2d(conv2, 2, 2)
conv3 = conv2d(pool3, 8, 3, activation=tf.tanh)
flattened = flatten(conv3)

# Policy head. NOTE: despite its name, `act_probs` holds the un-normalised
# logits (the dense layer has no activation); `softmax_probs` is the actual
# distribution over the env's discrete actions.
act_probs = dense(flattened, env.action_space.n)
softmax_probs = tf.nn.softmax(act_probs)

# Value head: a single linear unit on top of the same flattened features as
# the policy head (no separate convolutional tower).
value = dense(flattened, 1)

network = PPOTrainer(obs, softmax_probs, value, act_type='d', ppo_iters=80)

# Load network if previously saved.
# Only a missing checkpoint is treated as "start from scratch": restore()
# raises ValueError for an invalid/nonexistent path and NotFoundError when the
# files or tensors are absent. Anything else (e.g. a shape mismatch against an
# old checkpoint, or a keyboard interrupt) now propagates instead of being
# silently reported as "No model found".
saver = tf.train.Saver()
try:
    saver.restore(network.sess, f'models/{model_name}.ckpt')
except (ValueError, tf.errors.NotFoundError):
    # Presumably PPOTrainer has already initialised the variables in
    # network.sess -- TODO confirm; nothing is initialised here.
    print('No model found, randomly initializing parameters')

  result = entry_point.load(False)


INFO:tensorflow:Restoring parameters from models/breakout.ckpt


In [3]:
# Count the trainable parameters in the graph: multiply out each variable's
# static shape and add the per-variable sizes together.
# dtype=np.int64 keeps the product an integer even for a scalar (empty) shape.
total_parameters = sum(
    int(np.prod(var.get_shape().as_list(), dtype=np.int64))
    for var in tf.trainable_variables()
)
print(total_parameters)

16325


In [4]:
# Rollout / training settings.
n_episodes = 1_000_000  # total episode budget; the training loop divides it by batch_size
max_steps = 512         # passed to ec.sim_episodes
batch_size = 3          # passed to ec.sim_episodes
print_freq = 10         # logging interval -- presumably in training iterations; confirm


def make_env():
    """Create a new Breakout-v0 gym environment (factory handed to EnvController)."""
    return gym.make('Breakout-v0')


def transform_obs(x):
    """Squeeze the raw observation, then run it through preprocess_atari."""
    return preprocess_atari(x.squeeze())


ec = EnvController(make_env, n_threads=3)
ec.set_obs_transform(transform_obs)

In [None]:
# Main training loop: simulate a batch of episodes with the current policy,
# then hand the collected data to the PPOTrainer.
for i in range(int(n_episodes / batch_size)):
    ec.sim_episodes(network, batch_size, max_steps)
    # Query the controller on every iteration, in the original order, so any
    # internal bookkeeping in EnvController behaves exactly as before.
    avg_reward = ec.get_avg_reward()
    dat = ec.get_data()
    # Honour print_freq (previously defined but never used) so that hundreds
    # of thousands of iterations do not flood the cell output.
    if i % print_freq == 0:
        print(f'Avg Reward: {avg_reward}')
        print(dat.shape)
    network.train(dat)

Avg Reward: 5.333333333333333
(1445, 4)


In [None]:
# Save the model.
# The path is derived from `model_name` so it always matches the checkpoint
# the restore step reads (it used to be hard-coded to 'models/breakout.ckpt').
# MakeDirs succeeds if the directory already exists; without it Saver.save
# errors out when the parent directory is missing.
tf.gfile.MakeDirs('models')
saver = tf.train.Saver()
saver.save(network.sess, f'models/{model_name}.ckpt')