## Space invaders

In [15]:
import tensorflow as tf
import numpy as np
import gym

#Plot
%matplotlib nbagg
import matplotlib
import matplotlib.animation as animation
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

### Environment

In [16]:
# Create the Space Invaders environment used by the exploration cells below.
environment = gym.make('SpaceInvaders-v0')

[2017-11-07 07:35:30,497] Making new env: SpaceInvaders-v0


In [17]:
# Start a new episode and keep the initial observation for inspection.
observation = environment.reset()

Observations are 210×160 RGB images, i.e. arrays of shape `(210, 160, 3)`.

In [18]:
observation.shape

(210, 160, 3)

The action space has 6 discrete actions, in index order 0–5: no-op, fire, right, left, right + fire, left + fire.

In [19]:
environment.action_space

Discrete(6)

### Start screen

In [20]:
def plot_environment(environment, figsize=(5,4)):
    """Show the environment's current screen in a fresh matplotlib figure.

    Args:
        environment: object whose ``render(mode="rgb_array")`` returns an image.
        figsize: (width, height) passed to ``plt.figure``.
    """
    # Close the current figure first so figures do not pile up between calls.
    plt.close()
    plt.figure(figsize=figsize)
    plt.imshow(environment.render(mode="rgb_array"))
    plt.axis("off")
    plt.show()

In [21]:
plot_environment(environment)

<IPython.core.display.Javascript object>

### Random game

In [22]:
# Play one episode with random actions, recording every rendered frame.
max_steps = 1000          # upper bound on the number of steps played
change_action_step = 10   # hold each sampled action for this many steps

frames = []
observation = environment.reset()

for step in range(max_steps):
    image = environment.render(mode="rgb_array")
    frames.append(image)
    # Resample on steps 0, 10, 20, ...; step 0 always samples, so `action`
    # is defined before its first use.
    if not step % change_action_step:
        action = environment.action_space.sample()
    observation, reward, done, info = environment.step(action)
    if done:
        break

In [23]:
def update_scene(num, frames, patch):
    """Animation callback: display frame ``num`` on ``patch``.

    Args:
        num: index of the frame to show.
        frames: indexable collection of images.
        patch: object exposing ``set_data`` (here the image returned by ``plt.imshow``).

    Returns:
        A one-element tuple containing ``patch``.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat=False, interval=40):
    """Build a matplotlib animation that plays ``frames`` in order.

    Args:
        frames: non-empty sequence of images; the first one initialises the plot.
        repeat: forwarded to ``FuncAnimation`` (loop when the sequence ends).
        interval: forwarded to ``FuncAnimation`` (delay between frames).

    Returns:
        The ``animation.FuncAnimation`` object.
    """
    plt.close()
    figure = plt.figure()
    image_artist = plt.imshow(frames[0])
    plt.axis('off')
    movie = animation.FuncAnimation(
        figure,
        update_scene,
        fargs=(frames, image_artist),
        frames=len(frames),
        repeat=repeat,
        interval=interval,
    )
    return movie

In [24]:
# Animate the frames recorded during the random game; the animation object is
# kept in `video`.
video = plot_animation(frames)
plt.show()

<IPython.core.display.Javascript object>

### Policy gradients

#### Image preprocessing

In [25]:
# Name of the Gym environment used for the policy-gradient experiments, and a
# new instance of it (rebinds `environment`).
gym_environment = "SpaceInvaders-v0"
environment = gym.make(gym_environment)

[2017-11-07 07:35:43,688] Making new env: SpaceInvaders-v0


In [26]:
# Greyscale value (mean of the RGB channels) of the colour [51, 132, 50],
# presumably the player's ship -- TODO confirm against the rendered frames.
player_ship_color = np.array([51, 132, 50]).mean()

def preprocess_observation(observation, height, width):
    """Crop, downsample, greyscale and normalise a raw frame.

    Args:
        observation: image array indexed as (row, column, channel). Pixel
            values are assumed to lie in 0..255 (the normalisation is built
            around 128) -- TODO confirm.
        height: number of rows of the returned image.
        width: number of columns of the returned image.

    Returns:
        Array of shape ``(height, width, 1)`` with values in [-1, 1) for
        0..255 input. ``height * width`` must match the number of pixels left
        after cropping and downsampling (88 x 80 for a 210 x 160 frame).
    """
    img = observation[1:176:2, ::2] # crop and downsize
    img = img.mean(axis=2) # to greyscale (new array; the input is not modified)
    img[img==player_ship_color] = 0 # Improve contrast
    # Map 0..255 onto [-1, 1). The previous version subtracted an extra 1,
    # which shifted the range to roughly [-2, 0).
    img = (img - 128) / 128
    return img.reshape(height, width, 1)

# Size of the preprocessed frames (rows, columns).
image_height, image_width = 88, 80

# Take the first frame of a fresh episode and preprocess it.
observation = environment.reset()
image = preprocess_observation(observation, image_height, image_width)

In [27]:
# Side-by-side comparison of the raw frame and its preprocessed version.
comparison_fig, (ax_original, ax_processed) = plt.subplots(1, 2, figsize=(11, 7))

ax_original.set_title("Original image (210X160 RGB)")
ax_original.imshow(observation)
ax_original.axis("off")

ax_processed.set_title("Preprocessed image (88×80 greyscale)")
# Drop the trailing channel axis so imshow receives a 2-D greyscale image.
ax_processed.imshow(image.reshape(88, 80), interpolation="nearest", cmap="gray")
ax_processed.axis("off")

plt.show()

<IPython.core.display.Javascript object>

#### Policy network

In [102]:
# Build the policy network graph from scratch (drops any previously built graph).
tf.reset_default_graph()

# Network input: one preprocessed greyscale frame per example.
input_height = image_height
input_width = image_width
input_channels = 1

#first convolution layer
conv1_n_maps = 32
conv1_kernel_sizes = (8,8)
conv1_strides = 4
#second convolution layer
conv2_n_maps = 64
conv2_kernel_sizes = (4,4)
conv2_strides = 2
#third convolution layer
conv3_n_maps = 64
conv3_kernel_sizes = (3,3)
conv3_strides = 1

conv_paddings = "SAME"
conv_activation = tf.nn.relu

# With "SAME" padding and strides 4, 2, 1 an 88x80 input shrinks to
# 22x20 -> 11x10 -> 11x10, hence the flattened size below.
n_hidden_inputs = 64 * 11 * 10  # conv3 has 64 maps of 11x10 each
n_hidden = 512
hidden_activation = tf.nn.relu
n_outputs = environment.action_space.n
learning_rate = 0.01
initializer = tf.contrib.layers.variance_scaling_initializer()

X_state = tf.placeholder(tf.float32, shape=[None, input_height, input_width, input_channels])
conv_layer_1 = tf.layers.conv2d(X_state, filters=conv1_n_maps, kernel_size=conv1_kernel_sizes,
                                strides=conv1_strides, padding=conv_paddings, activation=conv_activation,
                                kernel_initializer=initializer)
conv_layer_2 = tf.layers.conv2d(conv_layer_1, filters=conv2_n_maps, kernel_size=conv2_kernel_sizes,
                                strides=conv2_strides, padding=conv_paddings, activation=conv_activation,
                                kernel_initializer=initializer)
conv_layer_3 = tf.layers.conv2d(conv_layer_2, filters=conv3_n_maps, kernel_size=conv3_kernel_sizes,
                                strides=conv3_strides, padding=conv_paddings, activation=conv_activation,
                                kernel_initializer=initializer)
conv_layer_3_flat = tf.reshape(conv_layer_3, shape=[-1, n_hidden_inputs])
hidden = tf.layers.dense(conv_layer_3_flat, n_hidden, activation=hidden_activation, kernel_initializer=initializer)
logits = tf.layers.dense(hidden, n_outputs)
output = tf.nn.softmax(logits)  # action probabilities, kept for inspection

# Sample one action per example. tf.multinomial takes unnormalized
# log-probabilities, so the logits are passed directly; going through
# tf.log(softmax(logits)) is redundant and can underflow to -inf.
action = tf.multinomial(logits, num_samples=1)

# Treat the sampled action as the target class. `action` has shape [batch, 1];
# flatten it so the one-hot labels have shape [batch, n_outputs] like `logits`.
y = tf.one_hot(indices=tf.reshape(action, [-1]), depth=n_outputs)

cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y)
optimizer = tf.train.AdamOptimizer(learning_rate)
gradients_and_variables = optimizer.compute_gradients(cross_entropy)
gradients = [grad for grad, variable in gradients_and_variables]

# The gradients are evaluated at every step, weighted by the rewards outside
# the graph, and fed back through these placeholders to be applied.
gradient_placeholders = []
gradients_and_variables_feed = []

for grad, variable in gradients_and_variables:
    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    gradients_and_variables_feed.append((gradient_placeholder, variable))

training_optimize = optimizer.apply_gradients(gradients_and_variables_feed)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [83]:
def discount_rewards(rewards, discount_rate):
    """Compute the discounted return at every step of one episode.

    Each entry is the reward at that step plus ``discount_rate`` times the
    discounted return of the following step.

    Args:
        rewards: sequence of per-step rewards.
        discount_rate: multiplier applied to future returns at each step.

    Returns:
        numpy array with the same length as ``rewards``.
    """
    n_steps = len(rewards)
    returns = np.zeros(n_steps)
    running_return = 0
    # Walk backwards so each step can reuse the return of the step after it.
    for idx in range(n_steps - 1, -1, -1):
        running_return = rewards[idx] + running_return * discount_rate
        returns[idx] = running_return
    return returns

def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount the rewards of several episodes and standardise them jointly.

    Args:
        all_rewards: list of per-episode reward sequences.
        discount_rate: forwarded to ``discount_rewards``.

    Returns:
        List with one array per episode: the discounted rewards minus the mean
        and divided by the standard deviation taken over all episodes together.
    """
    discounted_per_game = []
    for game_rewards in all_rewards:
        discounted_per_game.append(discount_rewards(game_rewards, discount_rate))

    pooled = np.concatenate(discounted_per_game)
    reward_mean = pooled.mean()
    reward_std = pooled.std()
    if reward_std == 0:
        reward_std = 1  # check for division by 0

    normalized = []
    for discounted in discounted_per_game:
        normalized.append((discounted - reward_mean) / reward_std)
    return normalized

In [85]:
# Training hyperparameters.
number_of_games_per_update = 2   # episodes played before each parameter update
max_steps = 100                  # step limit per episode
number_of_iterations = 2         # number of parameter updates
save_iterations = 10             # checkpoint every this many iterations
discount_rate = 0.95

with tf.Session() as sess:
    init.run()
    for iteration in range(number_of_iterations):
        print("\rAt iteration: {} (of {})".format(iteration + 1 , number_of_iterations), end="")
        all_rewards = []     # per game: list of step rewards
        all_gradients = []   # per game: list of per-step gradient lists
        for game in range(number_of_games_per_update):
            current_rewards = []
            current_gradients = []
            observation = environment.reset()
            for step in range(max_steps):
                # Sample an action and, in the same run, get the gradients
                # that would make that action more likely.
                action_value, gradients_value = sess.run([action, gradients],
                        feed_dict={X_state: [preprocess_observation(observation, input_height, input_width)]})
                observation, reward, done, info = environment.step(action_value[0][0])
                current_rewards.append(reward)
                current_gradients.append(gradients_value)
                if done:
                    break
            all_rewards.append(current_rewards)
            all_gradients.append(current_gradients)

        all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)

        # For every variable, average its per-step gradients over all games
        # and steps, each weighted by that step's normalized discounted reward.
        feed_dict = {}
        for var_index, gradient_placeholder in enumerate(gradient_placeholders):
            mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]
                                      for game_index, rewards in enumerate(all_rewards)
                                          for step, reward in enumerate(rewards)], axis=0)
            feed_dict[gradient_placeholder] = mean_gradients
        sess.run(training_optimize, feed_dict=feed_dict)

        # Checkpoint periodically, and always after the last iteration so the
        # saved model includes the final update.
        is_last_iteration = iteration == number_of_iterations - 1
        if iteration % save_iterations == 0 or is_last_iteration:
            saver.save(sess, "./policy_net_pg.ckpt")

At iteration: 2 (of 2)

In [None]:
# Close the environment used for training; render_policy_net creates its own.
environment.close()

#### Playing a game

In [99]:
def render_policy_net(model_path, action, X, gym_environment, max_steps = 1000):
    """Play one game with a saved policy and collect the raw frames.

    Relies on the module-level ``saver``, ``preprocess_observation``,
    ``input_height`` and ``input_width``.

    Args:
        model_path: checkpoint path passed to ``saver.restore``.
        action: tensor that samples an action for the fed state.
        X: input placeholder of the network that ``action`` belongs to.
        gym_environment: environment id passed to ``gym.make``.
        max_steps: maximum number of steps to play.

    Returns:
        List of the raw observations seen before each step.
    """
    frames = []
    environment = gym.make(gym_environment)
    try:
        observation = environment.reset()
        with tf.Session() as sess:
            saver.restore(sess, model_path)
            for step in range(max_steps):
                frames.append(observation)
                state = preprocess_observation(observation, input_height, input_width)
                # Feed the placeholder passed in as `X`, not the global X_state.
                action_value = action.eval(feed_dict={X: [state]})
                observation, reward, done, info = environment.step(action_value[0][0])
                if done:
                    break
    finally:
        # Close the environment even if restoring or stepping fails.
        environment.close()
    return frames

In [100]:
# Restore the saved policy, let it play one game (at most 10000 steps) and
# animate the recorded frames.
frames = render_policy_net("./policy_net_pg.ckpt", action, X_state, "SpaceInvaders-v0", max_steps=10000)
video = plot_animation(frames)
plt.show()

[2017-11-07 15:52:13,407] Making new env: SpaceInvaders-v0


INFO:tensorflow:Restoring parameters from ./policy_net_pg.ckpt


[2017-11-07 15:52:13,590] Restoring parameters from ./policy_net_pg.ckpt


<IPython.core.display.Javascript object>

### Q learning