In [1]:
import numpy as np
import tensorflow as tf
from ludus.env import EnvController
from tensorflow.keras.layers import Conv2D, Dense, MaxPool2D, Flatten, Input
import cv2
import time
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

In [2]:
def make_env():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    return env

In [3]:
env = make_env()
obs_shape = (42, 42)
obs_buffer_size = 3

with tf.variable_scope('policy'):
    state_in = Input(list(obs_shape) + [obs_buffer_size])
    conv1 = Conv2D(32, 2, strides=(2, 2), activation='elu')(state_in)
    mp1 = MaxPool2D(2)(conv1)
    conv2 = Conv2D(32, 2, activation='elu')(mp1)
    mp2 = MaxPool2D(2)(conv2)
    conv3 = Conv2D(32, 2, activation='elu')(mp2)
    mp3 = MaxPool2D(2)(conv3)
    flat = Flatten()(mp3)
    dense1 = Dense(128, activation='elu')(flat)
    act_oh = Dense(env.action_space.n, activation='softmax', use_bias=False)(dense1)
    act_out = tf.random.categorical(act_oh, 1)

  result = entry_point.load(False)


Instructions for updating:
Colocations handled automatically by placer.


In [4]:
lam = 0.1
beta = 0.2
enc_dim = 64 # Encoded Feature Dimension

# Placeholders
state_ph = state_in
state_p_ph = tf.placeholder(dtype=tf.float32, shape=(None, *list(obs_shape), obs_buffer_size))
act_holder = tf.placeholder(dtype=tf.int32, shape=(None,))

# State Encoder
with tf.variable_scope('encoder'):
    enc_layers = [
        Conv2D(32, 3, strides=(2, 2), activation='elu'),
        Conv2D(32, 3, strides=(2, 2), activation='elu'),
        Conv2D(32, 3, strides=(2, 2), activation='elu'),
        Conv2D(64, 3, strides=(2, 2), activation='elu'),
        Flatten()
    ]
    
    enc_state = enc_layers[0](state_ph)
    for i in range(1, len(enc_layers)):
        enc_state = enc_layers[i](enc_state)
        
    enc_state_p = enc_layers[0](state_p_ph)
    for i in range(1, len(enc_layers)):
        enc_state_p = enc_layers[i](enc_state_p)
        
print(enc_state)
        
# Inverse Dynamics Model
with tf.variable_scope('inverse_model'):
    state_state_pair = tf.concat([enc_state, enc_state_p], axis=1)
    im_dense = Dense(256, activation='relu')(state_state_pair)
    act_pred = Dense(env.action_space.n, activation='softmax', use_bias=False)(im_dense)
    
# Foward Model
with tf.variable_scope('forward_model'):
    state_act_pair = tf.concat([enc_state, act_oh], axis=1)
    fm_dense = Dense(256, activation='relu')(state_act_pair)
    enc_state_p_pred = Dense(enc_dim, activation='relu')(fm_dense)
    # enc_state_p_pred = tf.clip_by_value(enc_state_p_pred, 0., 1.)
    
# Losses

# Inverse Dynamics Loss
act_actual_oh = tf.one_hot(act_holder, env.action_space.n)
loss_i = tf.reduce_mean(tf.keras.losses.categorical_crossentropy(act_actual_oh, act_pred))
loss_i = (1 - beta) * loss_i

# Forward Loss
state_p_diff = tf.square(enc_state_p_pred - enc_state_p)
loss_f = 0.5 * tf.reduce_mean(tf.reduce_sum(state_p_diff, axis=1))
loss_f = beta * loss_f

# Intrinsic Reward
ri = 0.5 * tf.reduce_mean(tf.reduce_sum(state_p_diff, axis=1))
loss_p = -lam * ri

icm_losses = [loss_i, loss_f, loss_p]

# Update Ops
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
update_i = optimizer.minimize(loss_i, var_list=tf.trainable_variables(scope='encoder') + tf.trainable_variables(scope='inverse_model'))
update_f = optimizer.minimize(loss_f, var_list=tf.trainable_variables(scope='forward_model'))
update_p = optimizer.minimize(loss_p, var_list=tf.trainable_variables(scope='policy'))

icm_objective = [update_i, update_f, update_p]

Tensor("encoder/flatten_1/Reshape:0", shape=(?, 64), dtype=float32)
Instructions for updating:
Use tf.cast instead.


In [None]:
def filter_obs(obs):
    obs = cv2.resize(obs, obs_shape, interpolation=cv2.INTER_LINEAR)
    obs = cv2.cvtColor(obs, cv2.COLOR_BGR2GRAY)
    return obs / 255

In [None]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [None]:
n_episodes = 1000000
steps_per_epoch = 2048
print_freq = 8
act_skip = 6
max_steps = int(4096 / act_skip)
render = False

all_rewards = []
ris, lis, lfs, lps = 0, 0, 0, 0
train_iters = 0
train_data = [] # Formatted as [obs_buffer_t, act_t, reward_t, obs_buffer_t+1]
for episode in range(n_episodes):
    obs = env.reset()
    obs = filter_obs(obs)
    obs_buffer = np.rollaxis(np.array([obs] * obs_buffer_size), 0, 3)
    ep_reward = 0

    for step in range(max_steps):
        act = sess.run(act_out, feed_dict={state_in: [obs_buffer]})[0][0]
        
        for i in range(act_skip):
            obs_p, r, d, _ = env.step(act)
            ep_reward += r
            if d:
                break
        
        train_data.append([obs_buffer.copy(), act, r])

        obs_p = filter_obs(obs_p)
        obs_buffer[:,:,:-1] = obs_buffer[:,:,1:]
        obs_buffer[:,:,-1] = obs_p

        train_data[-1].append(obs_buffer.copy())

        if render:
            env.render()
            time.sleep(0.02)

        if len(train_data) >= steps_per_epoch:
            np.random.shuffle(train_data)
            train_obs = np.array([x[0] for x in train_data])
            train_acts = np.array([x[1] for x in train_data])
            train_rewards = np.array([x[2] for x in train_data])
            train_obs_ps = np.array([x[3] for x in train_data])

            nri, li, lf, lp, _, _, _ = sess.run([ri] + icm_losses + icm_objective,
                                    feed_dict={
                                        state_in: train_obs,
                                        act_holder: train_acts,
                                        state_p_ph: train_obs_ps
                                    })
            ris += nri
            lis += li
            lfs += lf
            lps += lp
            train_iters += 1
            
            train_data = []

        if d:
            break

    all_rewards.append(ep_reward)

    if (episode + 1) % print_freq == 0:
        print(f'R_e: {np.mean(all_rewards[-print_freq:])}, R_i: {ris/train_iters}')
        print(f'L_i: {lis/train_iters}, L_f: {lfs/train_iters}, L_p: {lps/train_iters}')
        print()
        
        ris, lis, lfs, lps = 0, 0, 0, 0
        train_iters = 0

R_e: 1708.25, R_i: 0.8701882362365723
L_i: 1.5594013929367065, L_f: 0.1740376502275467, L_p: -0.08701882511377335

R_e: 1763.0, R_i: 1.1722170114517212
L_i: 1.5594733953475952, L_f: 0.23444341123104095, L_p: -0.11722170561552048

R_e: 1684.25, R_i: 1.057920217514038
L_i: 1.5550225973129272, L_f: 0.21158404648303986, L_p: -0.10579202324151993

R_e: 1350.125, R_i: 0.9000678360462189
L_i: 1.5596612691879272, L_f: 0.18001356720924377, L_p: -0.09000678360462189

R_e: 1437.125, R_i: 0.663006067276001
L_i: 1.5545310974121094, L_f: 0.13260121643543243, L_p: -0.06630060821771622

R_e: 1687.375, R_i: 0.5373105406761169
L_i: 1.5546898245811462, L_f: 0.10746210813522339, L_p: -0.053731054067611694

R_e: 1588.0, R_i: 0.47550293803215027
L_i: 1.5484487414360046, L_f: 0.09510058909654617, L_p: -0.04755029454827309

R_e: 1572.75, R_i: 0.6556946039199829
L_i: 1.5457490682601929, L_f: 0.13113892078399658, L_p: -0.06556946039199829

R_e: 1350.375, R_i: 0.9503017067909241
L_i: 1.5165074467658997, L_f: 0.1