In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPool2D, Flatten
from tensorflow.keras.backend import categorical_crossentropy
from ludus.policies import BaseTrainer
from ludus.env import EnvController
from ludus.utils import preprocess_atari, reshape_train_var
from ludus.memory import MTMemoryBuffer
import gym
# Super Mario stuff
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

In [2]:
def make_env():
    env = gym_super_mario_bros.make('Breakout-v0')
    return env

In [3]:
class IMTrainer(BaseTrainer):
    def __init__(self, in_op, out_op, value_out_op, act_type='discrete', sess=None, clip_val=0.2, ppo_iters=80,
                 target_kl=0.01, v_coef=1., entropy_coef=0.01):
        self.value_out_op = value_out_op
        self.clip_val = clip_val
        self.ppo_iters = ppo_iters
        self.target_kl = target_kl
        self.v_coef = v_coef
        self.entropy_coef = entropy_coef
        
        # ICM parameters, TODO: make parameters for these
        self.ro_coef = 0.5
        self.beta = 0.2
        self.eta = 1
        self.r_i_coef = 1
        self.r_e_coef = 0.2
        
        super().__init__(in_op, out_op, act_type, sess)
        
    def _create_ICM(self, optimizer=tf.train.AdamOptimizer()):
        feature_dim = 64 # TODO: make a parameter for this
        
        # Create placeholder
        self.next_obs_holders = tf.placeholder(tf.float32, shape=self.in_op.shape)
        
        # Observation feature encoder
        with tf.variable_scope('feature_encoder'):
            enc_layers = [
                Conv2D(16, 4, activation=tf.nn.tanh, name='fe_conv'),
                MaxPool2D(2, name='fe_max_pool'),
                Conv2D(32, 3, activation=tf.nn.tanh, name='fe_conv2'),
                MaxPool2D(2, name='fe_max_pool2'),
                Conv2D(32, 3, activation=tf.nn.tanh, name='fe_conv3'),
                MaxPool2D(2, name='fe_max_pool3'),
                Flatten(name='fe_flattened'),
                Dense(feature_dim, activation=tf.nn.tanh, use_bias=False, name='fe_dense')
            ]
            
            # Encoding state
            self.f_obs = enc_layers[0](self.in_op)
            for i in range(1, len(enc_layers)):
                self.f_obs = enc_layers[i](self.f_obs)
            
            # Encoding the next state
            self.f_obs_next = enc_layers[0](self.next_obs_holders)
            for i in range(1, len(enc_layers)):
                self.f_obs_next = enc_layers[i](self.f_obs_next)
            
        # State predictor forward model
        with tf.variable_scope('forward_model'):
            self.state_act_pair = tf.concat([self.out_op, self.f_obs], axis=1)
            self.sp_dense = Dense(64, activation=tf.nn.tanh)(self.state_act_pair)
            self.f_obs_next_pred = Dense(feature_dim, activation=tf.nn.tanh, use_bias=False)(self.sp_dense)
        
        # Inverse dynamics model (predicting action)
        with tf.variable_scope('inverse_model'):
            self.state_state_pair = tf.concat([self.f_obs, self.f_obs_next], axis=1)
            self.act_preds = Dense(64, activation=tf.nn.relu)(self.state_state_pair)
            # TODO: softmax only works for discrete, make continuous version
            self.act_preds = Dense(self.out_op.shape[1].value, use_bias=False, activation=tf.nn.softmax)(self.act_preds)
        
        # Calculating intrinsic reward
        self.obs_pred_diff = self.f_obs_next_pred - self.f_obs_next
        self.r_i = 0.5 * self.eta * tf.reduce_sum(self.obs_pred_diff ** 2, axis=1) # Fix these squares (Probably okay)
        self.r_ie = self.r_i_coef * self.r_i # + self.r_e_coef * self.reward_holders
        
        # Calculating losses
        self.pre_loss_i = categorical_crossentropy(self.act_masks, self.act_preds) # tf.reduce_sum((self.act_holders - self.act_pred) ** 2, axis=1)
        self.pre_loss_f = 0.5 * tf.reduce_sum(self.obs_pred_diff ** 2, axis=1)
        
        self.loss_i = (1 - self.beta) * tf.reduce_mean(self.pre_loss_i)
        self.loss_f = self.beta * tf.reduce_mean(self.pre_loss_f)
        self.loss_p = -self.ro_coef * tf.reduce_mean(self.r_ie)
        
        # Making update functions
        self.i_train_vars = tf.trainable_variables(scope='feature_encoder') + tf.trainable_variables(scope='inverse_model')
        self.f_train_vars = tf.trainable_variables(scope='forward_model')
        self.p_train_vars = [var for var in tf.trainable_variables() if var not in (self.i_train_vars + self.f_train_vars)]
        
        self.li_update = optimizer.minimize(self.loss_i, var_list=self.i_train_vars)
        self.lf_update = optimizer.minimize(self.loss_f, var_list=self.f_train_vars)
        self.lp_update = optimizer.minimize(self.loss_p, var_list=self.f_train_vars)
        
        self.icm_updates = [self.li_update, self.lf_update, self.lp_update]
        self.losses = [self.loss_i, self.loss_f, self.loss_p]
        
    def _create_discrete_trainer(self, optimizer=tf.train.AdamOptimizer()):
        """
        Creates a function for vanilla policy training with a discrete action space
        """
        # First passthrough
        
        self.act_holders = tf.placeholder(tf.int32, shape=[None])
        self.reward_holders = tf.placeholder(tf.float32, shape=[None])
        
        self.act_masks = tf.one_hot(self.act_holders, self.out_op.shape[1].value, dtype=tf.float32)
        self.resp_acts = tf.reduce_sum(self.act_masks *  self.out_op, axis=1)
        
        self.advantages = self.reward_holders - tf.squeeze(self.value_out_op)
        
        self._create_ICM()
        
        # Second passthrough
        
        self.advatange_holders = tf.placeholder(dtype=tf.float32, shape=self.advantages.shape)
        self.old_prob_holders = tf.placeholder(dtype=tf.float32, shape=self.resp_acts.shape)
 
        self.policy_ratio = self.resp_acts / self.old_prob_holders
        self.clipped_ratio = tf.clip_by_value(self.policy_ratio, 1 - self.clip_val, 1 + self.clip_val)

        self.min_loss = tf.minimum(self.policy_ratio * self.advatange_holders, self.clipped_ratio * self.advatange_holders)
        
        self.optimizer = tf.train.AdamOptimizer()

        # Actor update
        
        self.kl_divergence = tf.reduce_mean(tf.log(self.old_prob_holders) - tf.log(self.resp_acts))
        self.actor_loss = -tf.reduce_mean(self.min_loss)Cycles: Last hour, day, month, year
        self.actor_update = self.optimizer.minimize(self.actor_loss)

        # Value update
        
        self.value_loss = tf.reduce_mean(tf.square(self.reward_holders - tf.squeeze(self.value_out_op)))
        self.value_update = self.optimizer.minimize(self.value_loss)
        
        # Combined update
        
        self.entropy = -tf.reduce_mean(tf.reduce_sum(self.out_op * tf.log(1. / tf.clip_by_value(self.out_op, 1e-8, 1.0)), axis=1))
        self.combined_loss = self.actor_loss + self.v_coef * self.value_loss + self.entropy_coef * self.entropy
        self.combined_update = self.optimizer.minimize(self.combined_loss)
        
        def update_func(train_data, train_type=0):
            if train_type == 0:
                i_rew, li, lf, lp, _, _, _ = self.sess.run([tf.reduce_mean(self.r_i)] + self.losses + self.icm_updates, 
                                       feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                                  self.act_holders: reshape_train_var(train_data[:, 1]),
                                                  self.reward_holders: train_data[:, 2],
                                                  self.next_obs_holders: reshape_train_var(train_data[:, 3])})
                return i_rew, [li, lf, lp]
            else:
                self.old_probs, self.old_advantages = self.sess.run([self.resp_acts, self.advantages], 
                                        feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                                   self.act_holders: train_data[:, 1],
                                                   self.reward_holders: train_data[:, 2]})

                for i in range(self.ppo_iters):
                    kl_div, _ = self.sess.run([self.kl_divergence, self.combined_update], 
                                   feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                        self.act_holders: reshape_train_var(train_data[:, 1]),
                                        self.reward_holders: train_data[:, 2],
                                        self.old_prob_holders: self.old_probs,
                                        self.advatange_holders: self.old_advantages})
                    if kl_div > 1.5 * self.target_kl:
                        break

        self.sess.run(tf.global_variables_initializer())
        
        return update_func
        
    def _create_continuous_trainer(self):
        return

In [4]:
env = make_env() # This instance of the environment is only used
                              # to get action dimensions
in_shape = [42, 42, 4] # Size of reshaped observations

# Creating a conv net for the policy and value estimator
obs_op = Input(shape=in_shape)
conv1 = Conv2D(32, 3, activation='relu')(obs_op)
max_pool1 = MaxPool2D(2, 2)(conv1)
conv2 = Conv2D(32, 3, activation='relu')(max_pool1)
max_pool2 = MaxPool2D(2, 2)(conv2)
conv3 = Conv2D(32, 3, activation='relu')(max_pool2)
max_pool3 = MaxPool2D(2, 2)(conv3)
flattened = Flatten()(max_pool3)
dense1 = Dense(64, activation='relu')(flattened)
dense2 = Dense(128, activation='relu')(dense1)
dense3 = Dense(128, activation='relu')(dense1)

# Output probability distribution over possible actions
act_probs_op = Dense(env.action_space.n, activation='softmax')(dense2)

# Output value of observed state
value_op = Dense(1)(dense3)

# Wrap a Proximal Policy Optimization Trainer on top of the network
network = IMTrainer(obs_op, act_probs_op, value_op, act_type='discrete', ppo_iters=80)

  result = entry_point.load(False)


In [5]:
n_episodes = 10000 # Total episodes of data to collect
max_steps = 1024 # Max number of frames per game
batch_size = 16 # Smaller = faster, larger = stabler
print_freq = 1 # How many training updates between printing progress

In [6]:
agent_hist = {} # Keeps track of up to 3 previous frames for each agent

# Create observation transformation that adds the two last frames on
# as two extra dimensions
def new_obs_transform(obs, agent_id):
    new_frame = preprocess_atari(obs.squeeze(), size=(42, 42)) # First preprocess the new frame
    
    if agent_id in agent_hist: # Case for a continued episode
        agent_hist[agent_id] = agent_hist[agent_id][1:]
        agent_hist[agent_id].append(new_frame)
    else: # Case for a new episode
        agent_hist[agent_id] = [new_frame, new_frame, new_frame, new_frame]
    
    # Format the data
    arr = np.array(agent_hist[agent_id])
    return np.swapaxes(arr, 0, 3).squeeze()

############################################################
############################################################

mtmb = MTMemoryBuffer() # Create a memory buffer to store the episode data

# Edit the memory buffer's start_rollout function so that every time
# an episode ends, it resets the respective agent's history
old_start_rollout = mtmb.start_rollout

def new_start_rollout(agent_id):
    old_start_rollout(agent_id)
    agent_hist.pop(agent_id, None)
    
mtmb.start_rollout = new_start_rollout

In [7]:
# Create the environment controller for generating game data
ec = EnvController(make_env, n_threads=4, memory_buffer=mtmb)
# Set the preprocessing function for observations
ec.set_obs_transform(new_obs_transform)

In [8]:
update_rewards = []
update_i_rewards = []
train_type = 0
render = False

for i in range(int(n_episodes / batch_size)):
    ec.sim_episodes(network, batch_size, max_steps) # Simualate env to generate data
    update_rewards.append(ec.get_avg_reward()) # Append rewards to reward tracker list
    
    print(sum([len(x) for x in ec.mb.rollouts]))
    if sum([len(x) for x in ec.mb.rollouts]) > 10000:
        rollouts = ec.mb.rollouts
        
    dat = ec.get_data() # Get all the data gathered
#     if train_type == 0 and len(update_i_rewards) >= 4 and update_i_rewards[-1] <= update_i_rewards[-2] and \
#        update_i_rewards[-1] <= update_i_rewards[-3] and update_i_rewards[-1] <= update_i_rewards[-4]:
#         print('Switching to PPO training')
#         train_type = 1
    
    if train_type == 0:
        i_rew, losses = network.train(dat, train_type=0) # Train the network with PPO
        update_i_rewards.append(i_rew)

        if i != 0 and i % print_freq == 0:
            print(f'Update #{i}, Avg Reward (E, I): {np.mean(update_rewards[-print_freq:])}, ' + \
                  f'{np.mean(update_i_rewards[-print_freq:])}')
#             print(f'I: {losses[0]}, F: {losses[1]}, P: {losses[2]}')
#             print()
    else:
        network.train(dat, train_type=1) # Train the network with PPO

        if i != 0 and i % print_freq == 0:
            print(f'Update #{i}, Avg Reward (E): {np.mean(update_rewards[-print_freq:])}')
        
    if render and i != 0 and i % (print_freq * 5) == 0:
        ec.render_episodes(network, 1, max_steps)

16
4851
16
4096
Update #1, Avg Reward (E, I): 1.625, 2.021937370300293
16
4189
Update #2, Avg Reward (E, I): 1.5, 1.9337788820266724
16
3912
Update #3, Avg Reward (E, I): 1.4375, 1.820548176765442
16
3633
Update #4, Avg Reward (E, I): 0.9375, 1.879676103591919
16
3905
Update #5, Avg Reward (E, I): 1.3125, 2.065701723098755
16
4047
Update #6, Avg Reward (E, I): 1.3125, 2.206267833709717
16
4315
Update #7, Avg Reward (E, I): 1.75, 2.5103354454040527
16
4272
Update #8, Avg Reward (E, I): 1.6875, 2.8310184478759766
16
4139
Update #9, Avg Reward (E, I): 1.6875, 3.051278591156006
16
4380
Update #10, Avg Reward (E, I): 1.9375, 3.2508013248443604
16
3820
Update #11, Avg Reward (E, I): 1.1875, 3.364258050918579
16
4497
Update #12, Avg Reward (E, I): 1.9375, 3.744131088256836
16
3774
Update #13, Avg Reward (E, I): 1.125, 4.082278728485107
16
3728
Update #14, Avg Reward (E, I): 1.125, 4.393712043762207
16
4298
Update #15, Avg Reward (E, I): 1.75, 4.868260383605957
16
3897
Update #16, Avg Reward (

16
3967
Update #128, Avg Reward (E, I): 1.375, 33.85519027709961
16
3731
Update #129, Avg Reward (E, I): 1.0625, 33.87940979003906
16
3599
Update #130, Avg Reward (E, I): 0.9375, 33.30869674682617
16
3842
Update #131, Avg Reward (E, I): 1.3125, 32.628517150878906
16
4605
Update #132, Avg Reward (E, I): 2.3125, 32.3742561340332
16
4203
Update #133, Avg Reward (E, I): 1.625, 32.61603927612305
16
4048
Update #134, Avg Reward (E, I): 1.4375, 33.08491897583008
16
4038
Update #135, Avg Reward (E, I): 1.4375, 33.689605712890625
16
3660
Update #136, Avg Reward (E, I): 1.0, 34.126953125
16
3932
Update #137, Avg Reward (E, I): 1.25, 34.26844024658203
16
3807
Update #138, Avg Reward (E, I): 1.1875, 34.28370666503906
16
3843
Update #139, Avg Reward (E, I): 1.1875, 34.33322525024414
16
3971
Update #140, Avg Reward (E, I): 1.4375, 34.68077087402344
16
4198
Update #141, Avg Reward (E, I): 1.5625, 34.949859619140625
16
4492
Update #142, Avg Reward (E, I): 1.875, 35.393009185791016
16
4095
Update #143,

16
4432
Update #254, Avg Reward (E, I): 1.875, 35.702964782714844
16
3786
Update #255, Avg Reward (E, I): 1.1875, 35.36470031738281
16
3984
Update #256, Avg Reward (E, I): 1.25, 35.354217529296875
16
3552
Update #257, Avg Reward (E, I): 0.875, 35.389434814453125
16
4263
Update #258, Avg Reward (E, I): 1.75, 35.17220687866211
16
3848
Update #259, Avg Reward (E, I): 1.1875, 35.2606086730957
16
3827
Update #260, Avg Reward (E, I): 1.3125, 35.041194915771484
16
4201
Update #261, Avg Reward (E, I): 1.875, 35.15694046020508
16
3883
Update #262, Avg Reward (E, I): 1.3125, 35.343833923339844
16
4595
Update #263, Avg Reward (E, I): 2.0625, 35.5604248046875
16
4699
Update #264, Avg Reward (E, I): 2.1875, 35.53388214111328
16
4237
Update #265, Avg Reward (E, I): 1.625, 35.47725296020508
16
3997
Update #266, Avg Reward (E, I): 1.375, 35.57422637939453
16
3767
Update #267, Avg Reward (E, I): 1.1875, 35.744754791259766
16
3809
Update #268, Avg Reward (E, I): 1.25, 35.58749771118164
16
3935
Update #2

Update #379, Avg Reward (E, I): 1.375, 35.78618621826172
16
4248
Update #380, Avg Reward (E, I): 1.6875, 35.62858963012695
16
4212
Update #381, Avg Reward (E, I): 1.8125, 35.4911994934082
16
4437
Update #382, Avg Reward (E, I): 1.8125, 35.406288146972656
16
4531
Update #383, Avg Reward (E, I): 2.0625, 35.5987663269043
16
4060
Update #384, Avg Reward (E, I): 1.4375, 35.69038009643555
16
3827
Update #385, Avg Reward (E, I): 1.4375, 35.60451126098633
16
4221
Update #386, Avg Reward (E, I): 1.5625, 35.636810302734375
16
4040
Update #387, Avg Reward (E, I): 1.375, 35.652103424072266
16
3828
Update #388, Avg Reward (E, I): 1.25, 35.33037567138672
16
4589
Update #389, Avg Reward (E, I): 2.125, 35.33203125
16
3775
Update #390, Avg Reward (E, I): 1.125, 35.67604446411133
16
3502
Update #391, Avg Reward (E, I): 0.8125, 35.63094711303711
16
3788
Update #392, Avg Reward (E, I): 1.1875, 35.704742431640625
16
4056
Update #393, Avg Reward (E, I): 1.375, 35.57771301269531
16
4423
Update #394, Avg Rewa

16
4129
Update #505, Avg Reward (E, I): 1.5, 35.446678161621094
16
3622
Update #506, Avg Reward (E, I): 1.0, 35.52969741821289
16
3967
Update #507, Avg Reward (E, I): 1.4375, 35.49864959716797
16
3685
Update #508, Avg Reward (E, I): 1.125, 35.24808120727539
16
4756
Update #509, Avg Reward (E, I): 2.25, 35.181026458740234
16
4395
Update #510, Avg Reward (E, I): 2.0, 35.32768630981445
16
4208
Update #511, Avg Reward (E, I): 1.5, 35.5011100769043
16
4365
Update #512, Avg Reward (E, I): 1.875, 35.57898712158203
16
3900
Update #513, Avg Reward (E, I): 1.375, 35.77070617675781
16
3938
Update #514, Avg Reward (E, I): 1.3125, 35.789005279541016
16
3532
Update #515, Avg Reward (E, I): 0.875, 35.88114929199219
16
4161
Update #516, Avg Reward (E, I): 1.5, 35.97163009643555
16
3807
Update #517, Avg Reward (E, I): 1.25, 35.81123352050781
16
3853
Update #518, Avg Reward (E, I): 1.25, 35.68925094604492
16
3505
Update #519, Avg Reward (E, I): 0.875, 35.54646301269531
16
3968
Update #520, Avg Reward (E

In [9]:
ec.render_episodes(network, 5, max_steps) # Render an episode to see the result

In [10]:
env.action_space

Discrete(4)