In [1]:
import os
import time

import gym
import numpy as np
import tensorflow as tf
from tensorflow.layers import dense, conv2d, max_pooling2d, flatten

from memory import MemoryBuffer
from policies import PPOTrainer
from utils import preprocess_atari

In [2]:
# Breakout network
#
# Two independent conv stacks read the same observation placeholder:
# one produces the action distribution, the other a scalar state value.

model_name = 'breakout'
env = gym.make('Breakout-v0')
# NOTE(review): assumes preprocess_atari yields 64x64 single-channel frames -- confirm.
in_shape = [64, 64, 1]

obs = tf.placeholder(tf.float32, shape=[None]+in_shape)

# Policy head: conv -> pool -> conv -> flatten -> dense -> softmax.
conv1 = conv2d(obs, 16, 3, activation=tf.tanh)
pool2 = max_pooling2d(conv1, 2, 2)
conv2 = conv2d(pool2, 16, 3, activation=tf.tanh)
flattened = flatten(conv2)
# Despite the name, `act_probs` holds unnormalised logits (one per action);
# the softmax below turns them into probabilities.
act_probs = dense(flattened, env.action_space.n)
softmax_probs = tf.nn.softmax(act_probs)

# Value head: same architecture with its own weights, ending in one output unit.
v_conv1 = conv2d(obs, 16, 3, activation=tf.tanh)
v_pool2 = max_pooling2d(v_conv1, 2, 2)
v_conv2 = conv2d(v_pool2, 16, 3, activation=tf.tanh)
v_flattened = flatten(v_conv2)
value = dense(v_flattened, 1)

network = PPOTrainer(obs, softmax_probs, value, act_type='d', ppo_iters=80)

# Load network if previously saved
saver = tf.train.Saver()
try:
    saver.restore(network.sess, f'models/{model_name}.ckpt')
except (tf.errors.NotFoundError, ValueError):
    # `NotFoundError` must be referenced through `tf.errors`; the bare name was
    # never imported, so a missing checkpoint used to surface as a NameError.
    # TF 1.x `Saver.restore` raises ValueError when the path is not a valid
    # checkpoint, so that case is treated as "nothing saved yet" as well.
    print(f'No checkpoint found at models/{model_name}.ckpt; starting from scratch')

INFO:tensorflow:Restoring parameters from models/breakout.ckpt


In [3]:
# Training schedule (consumed by the training-loop cell).
n_episodes = 1_000_000  # number of episodes the outer loop iterates over
max_steps = 1024        # cap on environment steps within a single episode
update_freq = 16        # episodes between calls to network.train
print_freq = 1          # a progress line is printed every `print_freq` updates

# Buffer that the training loop fills with (observation, action, reward) records.
mb = MemoryBuffer()

In [None]:
# Roll out episodes with the current policy, recording every transition in
# `mb`, and run a PPO update once every `update_freq` completed episodes.

RENDER = False  # set to True to watch the agent play (adds a 20 ms pause per step)

all_rewards = []

for episode in range(n_episodes):
    ep_reward = 0

    mb.start_rollout()
    # Named `frame` rather than `obs` so the `obs` placeholder created in the
    # network cell is not overwritten by a numpy array.
    frame = env.reset()
    for step in range(max_steps):
        frame = preprocess_atari(frame.squeeze())
        act = network.gen_act(frame)

        frame_next, rew, done, _ = env.step(act)
        ep_reward += rew

        if RENDER:
            env.render()
            time.sleep(0.02)

        # The preprocessed frame (the one the policy actually saw) is recorded.
        mb.record(frame, act, rew)
        frame = frame_next

        if done:
            break

    all_rewards.append(ep_reward)

    # Count completed episodes so that every update, including the first one,
    # covers exactly `update_freq` episodes. Testing `episode % update_freq`
    # with `episode != 0` made the first batch one episode longer than the rest.
    episodes_done = episode + 1
    if episodes_done % update_freq == 0:
        # NOTE(review): assumes mb.to_data() returns everything recorded since
        # the previous update -- confirm against MemoryBuffer.
        network.train(mb.to_data())

        if episodes_done % (update_freq * print_freq) == 0:
            print(f'Update #{episodes_done // update_freq}, Reward: {np.mean(all_rewards[-update_freq*print_freq:])}')

Update #1, Reward: 6.5625
Update #2, Reward: 6.6875
Update #3, Reward: 7.4375
Update #4, Reward: 7.3125
Update #5, Reward: 8.375
Update #6, Reward: 7.125
Update #7, Reward: 8.0625
Update #8, Reward: 6.3125
Update #9, Reward: 7.0625
Update #10, Reward: 7.125
Update #11, Reward: 7.875
Update #12, Reward: 7.0
Update #13, Reward: 7.8125
Update #14, Reward: 7.0
Update #15, Reward: 6.875
Update #16, Reward: 7.5
Update #17, Reward: 7.3125
Update #18, Reward: 7.0
Update #19, Reward: 6.5625
Update #20, Reward: 4.0625
Update #21, Reward: 4.0
Update #22, Reward: 5.8125
Update #23, Reward: 5.125
Update #24, Reward: 5.375
Update #25, Reward: 4.5
Update #26, Reward: 4.9375
Update #27, Reward: 5.125
Update #28, Reward: 5.0625
Update #29, Reward: 5.3125
Update #30, Reward: 5.3125
Update #31, Reward: 5.6875
Update #32, Reward: 6.25
Update #33, Reward: 6.625
Update #34, Reward: 5.5625
Update #35, Reward: 7.6875
Update #36, Reward: 5.0625
Update #37, Reward: 6.9375


In [None]:
# Save the model
# Write to the same path the network cell restores from, derived from
# `model_name` instead of a hard-coded file name, so the two cannot drift apart.
# TF 1.x Saver.save raises if the parent directory is missing, so create it first.
os.makedirs('models', exist_ok=True)
# Reuse the `saver` built in the network cell; constructing another Saver would
# add a duplicate set of save/restore ops to the graph on every run of this cell.
saver.save(network.sess, f'models/{model_name}.ckpt')