# PPO + ICM Super Mario Bros Example

### Super Mario Bros (curiosity-driven exploration)

Please do note that this example may take a long time to train.

With the default 4 threads running on an 8-core CPU with a GTX 1080 Ti, it will take several hours to train to a decent level of play.

Running on a platform with more GPU power and a larger cluster of CPUs could significantly reduce training time.

Paper (Curiosity-driven Exploration by Self-supervised Prediction): https://arxiv.org/pdf/1705.05363.pdf

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPool2D, Flatten
from tensorflow.keras.backend import categorical_crossentropy
from ludus.policies import BaseTrainer
from ludus.env import EnvController
from ludus.utils import preprocess_atari, reshape_train_var
from ludus.memory import MTMemoryBuffer
import gym
# Super Mario stuff
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

In [2]:
def make_env():
    """
    Environment factory: creates 'SuperMarioBros-v0' and wraps it in
    BinarySpaceToDiscreteSpaceEnv using the SIMPLE_MOVEMENT action list.
    """
    return BinarySpaceToDiscreteSpaceEnv(
        gym_super_mario_bros.make('SuperMarioBros-v0'),
        SIMPLE_MOVEMENT,
    )

In [3]:
class IMTrainer(BaseTrainer):
    """
    Trainer that combines an Intrinsic Curiosity Module (ICM) with a clipped PPO update.

    The update function built by `_create_discrete_trainer` has two modes, selected by
    its `train_type` argument: 0 runs one step of the three ICM updates (inverse model,
    forward model, and the policy update on intrinsic reward); any other value runs the
    combined PPO actor/value/entropy update for up to `ppo_iters` iterations.
    """
    def __init__(self, in_op, out_op, value_out_op, act_type='discrete', sess=None, clip_val=0.2, ppo_iters=80,
                 target_kl=0.01, v_coef=1., entropy_coef=0.01, ro_coef=0.5, beta=0.2, eta=1, r_i_coef=1,
                 r_e_coef=0.2, feature_dim=128):
        """
        Args:
            in_op: Observation input tensor of the policy network.
            out_op: Action-probability output tensor of the policy network.
            value_out_op: State-value output tensor.
            act_type: Action space type, forwarded to BaseTrainer.
            sess: Session, forwarded to BaseTrainer.
            clip_val: PPO ratio clipping range (ratio is clipped to [1 - clip_val, 1 + clip_val]).
            ppo_iters: Maximum number of PPO update iterations per training call.
            target_kl: PPO iterations stop early once KL exceeds 1.5 * target_kl.
            v_coef: Weight of the value loss in the combined PPO loss.
            entropy_coef: Weight of the entropy term in the combined PPO loss.
            ro_coef: Weight of the intrinsic-reward term in the ICM policy loss.
            beta: Forward-model loss weight; the inverse-model loss is weighted (1 - beta).
            eta: Scale of the intrinsic reward.
            r_i_coef: Weight of the intrinsic reward in the combined reward.
            r_e_coef: Weight of the extrinsic reward (stored; the extrinsic term is
                currently disabled in `_create_ICM`).
            feature_dim: Size of the encoded observation feature vector.
        """
        self.value_out_op = value_out_op
        self.clip_val = clip_val
        self.ppo_iters = ppo_iters
        self.target_kl = target_kl
        self.v_coef = v_coef
        self.entropy_coef = entropy_coef

        # ICM parameters
        self.ro_coef = ro_coef
        self.beta = beta
        self.eta = eta
        self.r_i_coef = r_i_coef
        self.r_e_coef = r_e_coef
        self.feature_dim = feature_dim

        # Attributes above are assigned first because the trainer-building methods
        # below read them (presumably invoked from BaseTrainer.__init__ -- verify).
        super().__init__(in_op, out_op, act_type, sess)

    def _create_ICM(self, optimizer=None):
        """
        Builds the ICM graph: feature encoder, forward model, inverse model,
        intrinsic reward, the three losses and their update ops.

        Requires `self.act_masks` to exist (created in `_create_discrete_trainer`).

        Args:
            optimizer: Optimizer used for all three ICM updates. A fresh
                AdamOptimizer is created when None.
        """
        # Created per call rather than as a default argument, which would be
        # instantiated once at definition time and shared by every trainer.
        if optimizer is None:
            optimizer = tf.train.AdamOptimizer()

        feature_dim = self.feature_dim

        # Create placeholder
        self.next_obs_holders = tf.placeholder(tf.float32, shape=self.in_op.shape)

        # Observation feature encoder
        with tf.variable_scope('feature_encoder'):
            enc_layers = [
                Conv2D(32, 3, activation=tf.nn.tanh, name='fe_conv'),
                MaxPool2D(2, name='fe_max_pool'),
                Conv2D(32, 3, activation=tf.nn.tanh, name='fe_conv2'),
                MaxPool2D(2, name='fe_max_pool2'),
                Conv2D(32, 3, activation=tf.nn.tanh, name='fe_conv3'),
                MaxPool2D(2, name='fe_max_pool3'),
                Flatten(name='fe_flattened'),
                Dense(feature_dim, activation=tf.nn.tanh, use_bias=False, name='fe_dense')
            ]

            # Encoding state
            self.f_obs = enc_layers[0](self.in_op)
            for i in range(1, len(enc_layers)):
                self.f_obs = enc_layers[i](self.f_obs)

            # Encoding the next state (same layer objects, so weights are shared)
            self.f_obs_next = enc_layers[0](self.next_obs_holders)
            for i in range(1, len(enc_layers)):
                self.f_obs_next = enc_layers[i](self.f_obs_next)

        # State predictor forward model
        with tf.variable_scope('forward_model'):
            self.state_act_pair = tf.concat([self.out_op, self.f_obs], axis=1)
            self.sp_dense = Dense(64, activation=tf.nn.tanh)(self.state_act_pair)
            self.f_obs_next_pred = Dense(feature_dim, activation=tf.nn.tanh, use_bias=False)(self.sp_dense)

        # Inverse dynamics model (predicting action)
        with tf.variable_scope('inverse_model'):
            self.state_state_pair = tf.concat([self.f_obs, self.f_obs_next], axis=1)
            self.act_preds = Dense(64, activation=tf.nn.relu)(self.state_state_pair)
            # TODO: softmax only works for discrete
            self.act_preds = Dense(self.out_op.shape[1], use_bias=False, activation=tf.nn.softmax)(self.act_preds)

        # Calculating intrinsic reward: scaled squared error of the forward model
        self.obs_pred_diff = self.f_obs_next_pred - self.f_obs_next
        self.r_i = 0.5 * self.eta * tf.reduce_sum(self.obs_pred_diff ** 2, axis=1)
        self.r_ie = self.r_i_coef * self.r_i # + self.r_e_coef * self.reward_holders
        # Built once here so update_func does not add a new reduce_mean op to
        # the graph on every training call.
        self.mean_r_i = tf.reduce_mean(self.r_i)

        # Calculating losses
        self.pre_loss_i = categorical_crossentropy(self.act_masks, self.act_preds)
        self.pre_loss_f = 0.5 * tf.reduce_sum(self.obs_pred_diff ** 2, axis=1)

        self.loss_i = (1 - self.beta) * tf.reduce_mean(self.pre_loss_i)
        self.loss_f = self.beta * tf.reduce_mean(self.pre_loss_f)
        self.loss_p = -self.ro_coef * tf.reduce_mean(self.r_ie)

        # Making update functions; each loss only updates its own variable group
        self.i_train_vars = tf.trainable_variables(scope='feature_encoder') + tf.trainable_variables(scope='inverse_model')
        self.f_train_vars = tf.trainable_variables(scope='forward_model')
        icm_vars = self.i_train_vars + self.f_train_vars
        # Every trainable variable that is not part of the ICM sub-networks
        self.p_train_vars = [var for var in tf.trainable_variables() if var not in icm_vars]

        self.li_update = optimizer.minimize(self.loss_i, var_list=self.i_train_vars)
        self.lf_update = optimizer.minimize(self.loss_f, var_list=self.f_train_vars)
        self.lp_update = optimizer.minimize(self.loss_p, var_list=self.p_train_vars)

        self.icm_updates = [self.li_update, self.lf_update, self.lp_update]
        self.losses = [self.loss_i, self.loss_f, self.loss_p]

    def _create_discrete_trainer(self, optimizer=None):
        """
        Creates the update function for ICM + PPO training with a discrete action space.

        Args:
            optimizer: Optimizer used for the PPO updates. A fresh AdamOptimizer
                is created when None.

        Returns:
            update_func(train_data, train_type=0). With train_type == 0 it runs the
            ICM updates and returns (mean intrinsic reward, [loss_i, loss_f, loss_p]);
            otherwise it runs the PPO update and returns None.
        """
        # First passthrough

        self.act_holders = tf.placeholder(tf.int32, shape=[None])
        self.reward_holders = tf.placeholder(tf.float32, shape=[None])

        self.act_masks = tf.one_hot(self.act_holders, self.out_op.shape[1].value, dtype=tf.float32)
        # Probability the policy assigns to the action that was actually taken
        self.resp_acts = tf.reduce_sum(self.act_masks *  self.out_op, axis=1)

        self.advantages = self.reward_holders - tf.squeeze(self.value_out_op)

        self._create_ICM()

        # Second passthrough

        # NOTE: attribute name keeps its original spelling since it is externally visible
        self.advatange_holders = tf.placeholder(dtype=tf.float32, shape=self.advantages.shape)
        self.old_prob_holders = tf.placeholder(dtype=tf.float32, shape=self.resp_acts.shape)

        self.policy_ratio = self.resp_acts / self.old_prob_holders
        self.clipped_ratio = tf.clip_by_value(self.policy_ratio, 1 - self.clip_val, 1 + self.clip_val)

        self.min_loss = tf.minimum(self.policy_ratio * self.advatange_holders, self.clipped_ratio * self.advatange_holders)

        # Honor a caller-supplied optimizer; previously the argument was ignored
        self.optimizer = optimizer if optimizer is not None else tf.train.AdamOptimizer()

        # Actor update

        self.kl_divergence = tf.reduce_mean(tf.log(self.old_prob_holders) - tf.log(self.resp_acts))
        self.actor_loss = -tf.reduce_mean(self.min_loss)
        self.actor_update = self.optimizer.minimize(self.actor_loss)

        # Value update

        self.value_loss = tf.reduce_mean(tf.square(self.reward_holders - tf.squeeze(self.value_out_op)))
        self.value_update = self.optimizer.minimize(self.value_loss)

        # Combined update

        # sum(p * log(1/p)) is the policy entropy; this attribute stores its negated
        # batch mean, so adding it to the loss with a positive coefficient rewards
        # higher entropy.
        self.entropy = -tf.reduce_mean(tf.reduce_sum(self.out_op * tf.log(1. / tf.clip_by_value(self.out_op, 1e-8, 1.0)), axis=1))
        self.combined_loss = self.actor_loss + self.v_coef * self.value_loss + self.entropy_coef * self.entropy
        self.combined_update = self.optimizer.minimize(self.combined_loss)

        def update_func(train_data, train_type=0):
            # train_data columns: 0 = observation, 1 = action, 2 = reward,
            # 3 = next observation (only read by the ICM update).
            # The action column goes through reshape_train_var in every branch
            # (the old-probabilities pass used to feed it raw).
            base_feed = {self.in_op: reshape_train_var(train_data[:, 0]),
                         self.act_holders: reshape_train_var(train_data[:, 1]),
                         self.reward_holders: train_data[:, 2]}

            if train_type == 0:
                icm_feed = dict(base_feed)
                icm_feed[self.next_obs_holders] = reshape_train_var(train_data[:, 3])
                i_rew, li, lf, lp, _, _, _ = self.sess.run([self.mean_r_i] + self.losses + self.icm_updates,
                                                           feed_dict=icm_feed)
                return i_rew, [li, lf, lp]
            else:
                # Snapshot the pre-update action probabilities and advantages
                self.old_probs, self.old_advantages = self.sess.run([self.resp_acts, self.advantages],
                                                                    feed_dict=base_feed)

                # The feed is identical on every iteration, so build it once
                ppo_feed = dict(base_feed)
                ppo_feed[self.old_prob_holders] = self.old_probs
                ppo_feed[self.advatange_holders] = self.old_advantages

                for _ in range(self.ppo_iters):
                    kl_div, _ = self.sess.run([self.kl_divergence, self.combined_update],
                                              feed_dict=ppo_feed)
                    # Early stop once the policy has moved too far from the snapshot
                    if kl_div > 1.5 * self.target_kl:
                        break

        self.sess.run(tf.global_variables_initializer())

        return update_func

    def _create_continuous_trainer(self):
        """Continuous action spaces are not implemented; returns None."""
        return

In [4]:
# This environment instance is only used to read the size of the action
# space (env.action_space.n below).
env = make_env()
in_shape = [42, 42, 4] # Size of reshaped observations

# Creating a conv net for the policy and value estimator:
# three Conv2D + MaxPool2D stages followed by a shared dense layer.
obs_op = Input(shape=in_shape)
conv1 = Conv2D(32, 3, activation='relu')(obs_op)
max_pool1 = MaxPool2D(2, 2)(conv1)
conv2 = Conv2D(32, 3, activation='relu')(max_pool1)
max_pool2 = MaxPool2D(2, 2)(conv2)
conv3 = Conv2D(32, 3, activation='relu')(max_pool2)
max_pool3 = MaxPool2D(2, 2)(conv3)
flattened = Flatten()(max_pool3)
dense1 = Dense(128, activation='relu')(flattened)
# Two heads branch off dense1: dense2 feeds the action head,
# dense3 feeds the value head.
dense2 = Dense(256, activation='relu')(dense1)
dense3 = Dense(128, activation='relu')(dense1)

# Output probability distribution over possible actions
act_probs_op = Dense(env.action_space.n, activation='softmax')(dense2)

# Output value of observed state
value_op = Dense(1)(dense3)

# Wrap the network in the IMTrainer defined above (ICM updates plus a
# Proximal Policy Optimization update)
network = IMTrainer(obs_op, act_probs_op, value_op, act_type='discrete', ppo_iters=80)

  result = entry_point.load(False)


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [5]:
# Training-run settings
n_episodes = 10_000  # Number of episodes of data to gather in total
max_steps = 2 ** 10  # Cap on frames per game (1024)
batch_size = 4       # Smaller trains faster, larger trains more stably
print_freq = 1       # Progress is printed every `print_freq` training updates

In [6]:
# Per-agent frame history: agent_id -> list of the 4 most recent
# preprocessed frames (the newest frame plus up to 3 previous ones)
agent_hist = {}

def new_obs_transform(obs, agent_id):
    """
    Observation transform that stacks the newest preprocessed frame with the
    agent's three previous frames.

    On the first observation for an agent the new frame is repeated four
    times. The stacked array has its first and last axes swapped and is then
    squeezed (presumably yielding (42, 42, 4) to match the network input --
    confirm against preprocess_atari's output shape).
    """
    frame = preprocess_atari(obs.squeeze(), size=(42, 42))

    history = agent_hist.get(agent_id)
    if history is None:
        # New episode: fill the whole window with the first frame
        history = [frame] * 4
    else:
        # Continued episode: drop the oldest frame, add the newest
        history = history[1:] + [frame]
    agent_hist[agent_id] = history

    stacked = np.array(history)
    return np.swapaxes(stacked, 0, 3).squeeze()

############################################################
############################################################

# Memory buffer that stores the episode data
mtmb = MTMemoryBuffer()

# Wrap the buffer's start_rollout so that each call also clears the frame
# history of the agent involved. The original bound method is captured
# first so the wrapper can delegate to it.
old_start_rollout = mtmb.start_rollout

def new_start_rollout(agent_id):
    """Run the original start_rollout, then forget this agent's frame history."""
    old_start_rollout(agent_id)
    try:
        del agent_hist[agent_id]
    except KeyError:
        # No history stored for this agent; nothing to clear
        pass

mtmb.start_rollout = new_start_rollout

In [7]:
# Create the environment controller for generating game data
ec = EnvController(make_env, n_threads=4, memory_buffer=mtmb)
# Set the preprocessing function for observations
ec.set_obs_transform(new_obs_transform)

In [8]:
update_rewards = []    # Average extrinsic reward recorded after each simulation batch
update_i_rewards = []  # Mean intrinsic reward returned by each ICM update
train_type = 0         # 0 -> ICM update, anything else -> PPO update

n_updates = int(n_episodes / batch_size)

for i in range(n_updates):
    # Simulate the environments to generate a fresh batch of data
    ec.sim_episodes(network, batch_size, max_steps)
    update_rewards.append(ec.get_avg_reward())
    dat = ec.get_data()

    # Progress is reported on every print_freq-th update, never on update 0
    should_print = i != 0 and i % print_freq == 0

    # Disabled heuristic: switch to PPO once the intrinsic reward stops improving
#     if train_type == 0 and len(update_i_rewards) >= 4 and update_i_rewards[-1] <= update_i_rewards[-2] and \
#        update_i_rewards[-1] <= update_i_rewards[-3] and update_i_rewards[-1] <= update_i_rewards[-4]:
#         print('Switching to PPO training')
#         train_type = 1

    if train_type != 0:
        # PPO update; nothing is returned
        network.train(dat, train_type=1)

        if should_print:
            print(f'Update #{i}, Avg Reward (E): {np.mean(update_rewards[-print_freq:])}')
        continue

    # ICM update; yields the mean intrinsic reward and the three ICM losses
    i_rew, losses = network.train(dat, train_type=0)
    update_i_rewards.append(i_rew)

    if should_print:
        avg_e = np.mean(update_rewards[-print_freq:])
        avg_i = np.mean(update_i_rewards[-print_freq:])
        print(f'Update #{i}, Avg Reward (E, I): {avg_e}, {avg_i}')
        print(f'I: {losses[0]}, F: {losses[1]}, P: {losses[2]}')
        print()

    # Disabled: periodically render an episode
#     if i != 0 and i % (print_freq * 5) == 0:
#         ec.render_episodes(network, 1, max_steps)

Update #1, Avg Reward (E, I): 477.25, 4.468448638916016
I: 1.576297402381897, F: 0.893689751625061, P: -2.234224319458008

Update #2, Avg Reward (E, I): 503.0, 4.810794830322266
I: 1.5645735263824463, F: 0.9621589779853821, P: -2.405397415161133

Update #3, Avg Reward (E, I): 563.0, 4.931573867797852
I: 1.5636327266693115, F: 0.9863147735595703, P: -2.465786933898926

Update #4, Avg Reward (E, I): 500.0, 4.566880226135254
I: 1.5561046600341797, F: 0.9133760333061218, P: -2.283440113067627

Update #5, Avg Reward (E, I): 565.75, 4.51606559753418
I: 1.555545687675476, F: 0.9032131433486938, P: -2.25803279876709

Update #6, Avg Reward (E, I): 530.5, 4.299077033996582
I: 1.545089602470398, F: 0.8598154187202454, P: -2.149538516998291

Update #7, Avg Reward (E, I): 562.75, 4.18452262878418
I: 1.530725359916687, F: 0.8369045257568359, P: -2.09226131439209

Update #8, Avg Reward (E, I): 566.75, 4.242562770843506
I: 1.5090140104293823, F: 0.848512589931488, P: -2.121281385421753

Update #9, Avg

Update #66, Avg Reward (E, I): 565.25, 7.85507869720459
I: 1.2684125900268555, F: 1.57101571559906, P: -3.927539348602295

Update #67, Avg Reward (E, I): 564.0, 11.23826789855957
I: 0.9952138066291809, F: 2.2476537227630615, P: -5.619133949279785

Update #68, Avg Reward (E, I): 544.75, 14.776544570922852
I: 0.5438024401664734, F: 2.9553089141845703, P: -7.388272285461426

Update #69, Avg Reward (E, I): 711.5, 17.694551467895508
I: 0.16238872706890106, F: 3.538910388946533, P: -8.847275733947754

Update #70, Avg Reward (E, I): 686.0, 19.301603317260742
I: 0.04399462044239044, F: 3.860320806503296, P: -9.650801658630371

Update #71, Avg Reward (E, I): 687.0, 19.1826114654541
I: 0.007098187692463398, F: 3.836522340774536, P: -9.59130573272705

Update #72, Avg Reward (E, I): 687.0, 17.89019012451172
I: 0.0013404606143012643, F: 3.578037977218628, P: -8.94509506225586

Update #73, Avg Reward (E, I): 687.0, 16.02907371520996
I: 0.00048588216304779053, F: 3.205814838409424, P: -8.014536857604

KeyboardInterrupt: 

In [None]:
ec.render_episodes(network, 5, max_steps) # Render 5 episodes (up to max_steps frames each) to see the result