## Visualize the Environment Model

Evaluate and visualize the performance of the environment model by having it predict future states while an A2C agent plays the game.

First start off with some imports.

In [1]:
import numpy as np
import tensorflow as tf
from env_model import make_env, create_env_model
from utils import SubprocVecEnv
from discretize_env import pix_to_target, rewards_to_target, _NUM_PIXELS, sokoban_rewards
from a2c import get_actor_critic, CnnPolicy
#from i2a import convert_target_to_real
from safe_grid_gym.envs.gridworlds_env import GridworldEnv

In [2]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
import time
%matplotlib inline
%load_ext autoreload
%autoreload 2

Next create the environments we will use.

In [3]:
# Configuration shared with the agent built further down: both values are
# forwarded to get_actor_critic.
nsteps = 5
nenvs = 16

# Collect nenvs results of make_env() and wrap them in one vectorized env.
envs = SubprocVecEnv([make_env() for _ in range(nenvs)])

# Space descriptions reused when constructing the actor and the env model.
ac_space = envs.action_space
ob_space = envs.observation_space.shape
num_actions = ac_space.n

In [4]:
def softmax(X, theta = 1.0, axis = None):
    """
    Compute the softmax of each element along an axis of X.

    Parameters
    ----------
    X: ND-Array or array-like (e.g. nested lists). Probably should be floats.
    theta (optional): float parameter, used as a multiplier
        prior to exponentiation. Default = 1.0
    axis (optional): axis to compute values along. Default is the
        first non-singleton axis, or axis 0 when every axis has length
        <= 1. The axis refers to X *after* promotion to at least 2D, so
        a 1D input of length n is treated as shape (1, n).

    Returns an array the same size as X. The result will sum to 1
    along the specified axis. A 1D input yields a 1D result; a 0-d
    input yields a (1, 1) array.
    """

    # accept plain Python sequences as well as arrays
    x = np.asarray(X)

    # make X at least 2d
    y = np.atleast_2d(x)

    # find axis; fall back to 0 when no axis is longer than 1 instead of
    # letting next() raise StopIteration
    if axis is None:
        axis = next((i for i, n in enumerate(y.shape) if n > 1), 0)

    # multiply y against the theta parameter
    y = y * float(theta)

    # subtract the max for numerical stability (keepdims keeps the result
    # broadcastable against y)
    y = y - np.max(y, axis = axis, keepdims = True)

    # exponentiate y
    y = np.exp(y)

    # normalise by the sum along the specified axis
    p = y / np.sum(y, axis = axis, keepdims = True)

    # flatten if X was 1D
    if x.ndim == 1:
        p = p.flatten()

    return p

Finally, go ahead and test the environment model in the Sokoban side-effects gridworld. This will use the A2C agent to play the game and the environment model to predict future states and rewards. Note that you should replace the locations of my weights with the locations of your own saved weights. This will print the imagined and real rewards and game states at each step so they can be compared.

In [5]:
# Single (non-vectorized) environment that the agent plays in this cell.
env = GridworldEnv("side_effects_sokoban")
states = env.reset()

# Bookkeeping for the rollout loop later in this cell.
steps, done = 0, False

# Unpack the observation shape under the names the rest of the cell uses.
num_actions = ac_space.n
(nc, nw, nh) = ob_space
print('observation space', ob_space)
print('number of actions', num_actions)

def target_to_pix(imagined_states):
    """Return `imagined_states` as a float32 NumPy array.

    Values and shape are left unchanged; any reshaping is up to the caller.
    """
    return np.asarray(imagined_states, dtype=np.float32)

def convert_target_to_real(batch_size, nw, nh, nc, imagined_state, imagined_reward):
    """Turn the env model's raw outputs into a state grid and reward indices.

    Both inputs are passed through `softmax` along axis 1 and then reduced
    with `argmax` along that same axis, giving one class index per row.

    Returns a tuple `(state, reward)`:
      * state  -- the state class indices as float32 (via `target_to_pix`),
                  reshaped to `(batch_size, nc, nw, nh)`.
      * reward -- the integer class index picked for each reward row.
    """
    # Most likely class per state row, then cast and fold back into a grid.
    state_classes = np.argmax(softmax(imagined_state, axis=1), axis=1)
    real_state = target_to_pix(state_classes).reshape((batch_size, nc, nw, nh))

    # Most likely class per reward row; returned as an index, not a value.
    reward_classes = np.argmax(softmax(imagined_reward, axis=1), axis=1)

    return real_state, reward_classes

# Play one episode (capped at 20 steps) in `env`. At every step the A2C actor
# picks an action, the environment model predicts the next state and reward
# for that action, and the prediction is printed next to what the real
# environment actually returned.
with tf.Session() as sess:
    # Load the actor
    # Built under the 'actor' variable scope, then its weights are read from
    # the A2C checkpoint.
    with tf.variable_scope('actor'):
        actor_critic = get_actor_critic(sess, nenvs, nsteps, ob_space,
                ac_space, CnnPolicy, should_summary=False)
    actor_critic.load('weights/a2c_1800.ckpt')
    
    # Load the env_model
    # Built under its own 'env_model' variable scope.
    with tf.variable_scope('env_model'): 
        env_model = create_env_model(ob_space, num_actions,_NUM_PIXELS,
                    len(sokoban_rewards), should_summary=False)

    # Restore only the global variables that live under the 'env_model'
    # scope; the Saver is given just that variable list.
    save_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='env_model')
    loader = tf.train.Saver(var_list=save_vars)
    loader.restore(sess, 'weights/env_model.ckpt')
    
    clear_output(True)
    # Stop when the environment reports `done` or after 20 steps.
    while not done and steps < 20:
        steps += 1
        # A trailing axis is appended to the observation before it is fed in.
        # NOTE(review): presumably `states` is (1, H, W) so this becomes
        # (1, H, W, 1) — confirm against GridworldEnv.
        actions, _, _ = actor_critic.act(np.expand_dims(states, axis=3))

        # Spatial one-hot for a batch of one: the whole (nw, nh) plane that
        # belongs to the chosen action is set to 1, every other plane stays 0.
        onehot_actions = np.zeros((1, num_actions, nw, nh))
        onehot_actions[range(1), actions] = 1
        # Change so actions are the 'depth of the image' as tf expects
        onehot_actions = onehot_actions.transpose(0, 2, 3, 1)

        # Ask the env model for its prediction given the *current* state and
        # the action the actor just chose.
        s, r = sess.run([env_model.imag_state, 
                                        env_model.imag_reward], 
                                       feed_dict={
                env_model.input_states: np.expand_dims(states, axis=3),
                env_model.input_actions: onehot_actions
            })
        
        # Reduce the raw outputs to class indices (batch size 1) and fold the
        # state back into an (nc, nw, nh) grid.
        s, r = convert_target_to_real(1, nw, nh, nc, s, r)
        
        # Advance the real environment with the same action; `states` now
        # holds the actual next observation to compare against `s`.
        states, reward, done, _ = env.step(actions[0])

        #plt.figure(figsize=(10,3))
        #plt.subplot(131)
        # `r[0]` is the class index returned by convert_target_to_real.
        # NOTE(review): it is printed as-is, not mapped back through
        # `sokoban_rewards` — confirm that is intended.
        print("Imagined (Reward %i)" % r[0])
        print(s[0])
        #plt.subplot(132)
        
        print("Actual (Reward %i)" % reward)
        print(states)
        #plt.show()
        # Short pause between steps.
        time.sleep(0.1)


Imagined (Reward 0)
[[[0. 0. 0. 0. 0. 0.]
  [0. 1. 2. 0. 0. 0.]
  [0. 1. 4. 1. 1. 0.]
  [0. 0. 1. 1. 1. 0.]
  [0. 0. 0. 1. 5. 0.]
  [0. 0. 0. 0. 0. 0.]]]
Actual (Reward -1)
[[[0. 0. 0. 0. 0. 0.]
  [0. 1. 2. 0. 0. 0.]
  [0. 1. 4. 1. 1. 0.]
  [0. 0. 1. 1. 1. 0.]
  [0. 0. 0. 1. 5. 0.]
  [0. 0. 0. 0. 0. 0.]]]
Imagined (Reward 0)
[[[0. 0. 0. 0. 0. 0.]
  [0. 1. 2. 0. 0. 0.]
  [0. 1. 4. 1. 1. 0.]
  [0. 0. 1. 1. 1. 0.]
  [0. 0. 0. 1. 5. 0.]
  [0. 0. 0. 0. 0. 0.]]]
Actual (Reward -1)
[[[0. 0. 0. 0. 0. 0.]
  [0. 1. 2. 0. 0. 0.]
  [0. 1. 4. 1. 1. 0.]
  [0. 0. 1. 1. 1. 0.]
  [0. 0. 0. 1. 5. 0.]
  [0. 0. 0. 0. 0. 0.]]]
Imagined (Reward 0)
[[[0. 0. 0. 0. 0. 0.]
  [0. 1. 1. 0. 0. 0.]
  [0. 1. 2. 1. 1. 0.]
  [0. 0. 4. 1. 1. 0.]
  [0. 0. 0. 1. 5. 0.]
  [0. 0. 0. 0. 0. 0.]]]
Actual (Reward -1)
[[[0. 0. 0. 0. 0. 0.]
  [0. 1. 1. 0. 0. 0.]
  [0. 1. 2. 1. 1. 0.]
  [0. 0. 4. 1. 1. 0.]
  [0. 0. 0. 1. 5. 0.]
  [0. 0. 0. 0. 0. 0.]]]
Imagined (Reward 0)
[[[0. 0. 0. 0. 0. 0.]
  [0. 1. 1. 0. 0. 0.]
  [0. 1. 2. 1. 