In [None]:
import gym
import matplotlib.pyplot as plt
import numpy as np
import random
import tensorflow as tf

from IPython import display


# Configuration
# Directory handed to tf.summary.FileWriter for TensorBoard event files.
SUMMARY_DIR = './logs/pong'
# Directory queried with tf.train.get_checkpoint_state when looking for saved weights.
CHECKPOINT_DIR = './checkpoints/pong'

gamma = 0.9  # discount factor for reward (passed to discount_rewards during training)
resume = True  # resume from previous checkpoint?
render = False  # when True, frames are drawn inline with matplotlib while playing

# Number of consecutive preprocessed frames stacked as input channels of the network.
stack_size = 5

# Optimizer configuration
optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-2, decay=.99)
# Alternative that was tried:
# optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)

# We use the easiest game, Pong
env = gym.make('Pong-v0')
env.reset()

# Subset of environment action ids the policy may choose from; the network
# outputs one probability per entry of this list.
action_space = [0, 2, 3]  # No-op / Up / Down
n_action_space = len(action_space)


#### Helper functions
def preprocess(img):
    """Turn a raw RGB game frame into a small binary image with values 0/255.

    Pipeline: grayscale -> threshold at the mean intensity -> keep rows
    32..194 -> 2x2 max-pooling downsample -> scale the 0/1 mask to 0/255.

    Args:
        img: RGB frame as returned by ``env.reset()`` / ``env.step()``.

    Returns:
        2-D array containing only the values 0 and 255.
    """
    from skimage.color import rgb2gray
    from skimage.filters import threshold_mean
    from skimage.measure import block_reduce

    gray = rgb2gray(img)

    # Pixels brighter than the mean intensity become 1, everything else 0.
    mask = (gray > threshold_mean(gray)).astype(int)

    # Keep rows 32..194 of the frame.
    cropped = mask[32:195]

    # Halve the resolution; max keeps a block "on" if any of its pixels is on.
    reduced = block_reduce(cropped, block_size=(2, 2), func=np.max)

    # Scale exactly once. The previous code multiplied by 255 twice, which
    # produced pixel values of 65025 instead of 255.
    return reduced * 255


def discount_rewards(r, gamma=0.99):
    """Compute standardized discounted returns for a sequence of rewards.

    Walks the rewards backwards, accumulating ``running = running * gamma + r[t]``.
    Any non-zero reward marks a game boundary (Pong specific: a point was
    scored), so the running sum is reset there before the reward is added.
    See http://karpathy.github.io/2016/05/31/rl/

    Args:
        r: Rewards in time order. Either a 1-D sequence or a column vector of
            shape (N, 1); the output has the same shape.
        gamma: Discount factor applied per time step.

    Returns:
        Float array shaped like ``r`` holding the discounted returns, shifted
        to zero mean and (when the spread is non-zero) scaled to unit standard
        deviation, which helps control the gradient estimator variance.
    """
    r = np.asarray(r)
    # Integer rewards would make the in-place float normalisation below fail.
    out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
    flat_r = r.reshape(-1)
    discounted = np.zeros(flat_r.shape, dtype=out_dtype)
    if flat_r.size == 0:
        return discounted.reshape(r.shape)

    # Compute the discounted reward backwards through time.
    running_add = 0.0
    for t in reversed(range(flat_r.size)):
        if flat_r[t] != 0:
            # Reset the sum, since this was a game boundary (pong specific!).
            # The earlier check `!= 1` reset on every non-winning step, so no
            # reward was ever propagated to preceding time steps.
            running_add = 0.0
        running_add = running_add * gamma + flat_r[t]
        discounted[t] = running_add

    # Standardize; skip the division when all values are equal to avoid NaNs.
    discounted -= discounted.mean()
    std = discounted.std()
    if std > 0:
        discounted /= std
    return discounted.reshape(r.shape)


# Take one random step to obtain a sample frame; its preprocessed size
# determines the input dimensions of the network.
observation, _, _, _ = env.step(random.choice(action_space))
x = preprocess(observation)
height, width = x.shape

# Based on the input size, build the model: three stride-2 5x5 convolutions
# followed by a single fully connected softmax layer over the actions.
with tf.name_scope("cnn"):
    # Input: a batch of `stack_size` consecutive frames stacked as channels.
    X = tf.placeholder(tf.float32, [None, height, width, stack_size], name="input_x")
    x_stacked_image = tf.reshape(X, [-1, height, width, stack_size])

    # Convolution kernels are [row, col, in_channels, out_channels].
    W_conv1 = tf.get_variable("W_conv1", shape=[5, 5, stack_size, 32],
                              initializer=tf.contrib.layers.xavier_initializer())
    b_conv1 = tf.Variable(tf.zeros([32]), name="b_conv1")
    h_conv1 = tf.nn.relu(tf.nn.conv2d(x_stacked_image, W_conv1, strides=[1, 2, 2, 1], padding='VALID') + b_conv1,
                         name="h_conv1")

    W_conv2 = tf.get_variable("W_conv2", shape=[5, 5, 32, 64],
                              initializer=tf.contrib.layers.xavier_initializer())
    b_conv2 = tf.Variable(tf.zeros([64]), name="b_conv2")
    h_conv2 = tf.nn.relu(tf.nn.conv2d(h_conv1, W_conv2, strides=[1, 2, 2, 1], padding='VALID') + b_conv2,
                         name="h_conv2")

    W_conv3 = tf.get_variable("W_conv3", shape=[5, 5, 64, 64],
                              initializer=tf.contrib.layers.xavier_initializer())
    b_conv3 = tf.Variable(tf.zeros([64]), name="b_conv3")
    h_conv3 = tf.nn.relu(tf.nn.conv2d(h_conv2, W_conv3, strides=[1, 2, 2, 1], padding='VALID') + b_conv3,
                         name="h_conv3")

    # Build a fully connected layer with softmax.
    # The flattened size is derived from the static shape of the last
    # convolution instead of being hard-coded, so the graph stays valid if the
    # preprocessed frame size changes.
    flat_dim = int(np.prod(h_conv3.get_shape().as_list()[1:]))
    h_conv3_flat = tf.reshape(h_conv3, [-1, flat_dim], name="h_pool2_flat")
    W_fc1 = tf.get_variable("W_fc1", shape=[flat_dim, n_action_space],
                            initializer=tf.contrib.layers.xavier_initializer())
    b_fc1 = tf.Variable(tf.zeros([n_action_space]), name='b_fc1')
    # One probability per entry of `action_space`.
    action_pred = tf.nn.softmax(tf.matmul(h_conv3_flat, W_fc1) + b_fc1, name="h_fc1")

    tf.summary.histogram("action_pred", action_pred)

    # Why do I use softmax here?
    # http://home.deib.polimi.it/restelli/MyWebSite/pdf/rl7.pdf


# Training inputs for the policy-gradient update:
#   Y          - one-hot encoding of the action that was actually sampled
#   advantages - one scalar weight per step (the standardized discounted return)
Y = tf.placeholder(dtype=tf.float32, shape=[None, n_action_space], name="input_y")
advantages = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="reward_signal")

# Loss: mean over steps of  A_i * -log p(y_i | x_i).
# Probabilities are clipped away from zero before taking the log.
_clipped_pred = tf.clip_by_value(action_pred, 1e-7, 1.0)
_log_pred = tf.log(_clipped_pred)
log_lik = -Y * _log_pred
_weighted_per_step = tf.reduce_sum(log_lik * advantages, axis=1)
loss = tf.reduce_mean(_weighted_per_step)

# Scalar diagnostics written to TensorBoard.
for _tag, _tensor in (
        ("A_pred", tf.reduce_mean(action_pred)),
        ("Y", tf.reduce_mean(Y)),
        ("log_likelihood", tf.reduce_mean(log_lik)),
        ("loss", loss),
):
    tf.summary.scalar(_tag, _tensor)

# Training op: one optimizer step on the loss above.
train = optimizer.minimize(loss)

# The episode's total reward is computed in Python and fed in through this
# placeholder so that it can be logged alongside the graph summaries.
summary_reward = tf.placeholder(dtype=tf.float32, shape=(), name="reward")
tf.summary.scalar("reward", summary_reward)

# Single op that evaluates every summary registered so far.
summary = tf.summary.merge_all()


# Set up the session, initialise all variables and the TensorBoard writer.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
writer = tf.summary.FileWriter(SUMMARY_DIR)
writer.add_graph(sess.graph)

# Saver and restore: optionally resume from the latest checkpoint recorded in
# CHECKPOINT_DIR.
saver = tf.train.Saver()
checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR) if resume else None
if not resume:
    print("Resume disabled, starting from freshly initialised weights")
elif checkpoint and checkpoint.model_checkpoint_path:
    # The path comes from the checkpoint state itself; the previous code used
    # an undefined `save_path`, so the NameError was swallowed and weights were
    # never actually restored.
    try:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    except Exception as err:
        print("Error on loading old network weights:", err)
        # A failed restore may have overwritten some variables; start clean.
        sess.run(tf.global_variables_initializer())
else:
    print("Could not find old network weights")


# run
def perform_train(max_episodes=9998, save_every=10):
    """Train the policy network with REINFORCE, one gradient step per episode.

    For every episode the current policy is sampled until the environment
    reports ``done``. Each time a point is scored (reward of +1 or -1) the
    rewards collected since the previous point are turned into standardized
    discounted returns via ``discount_rewards``. At the end of the episode all
    stacked inputs, one-hot actions and returns are fed to a single training
    step, and a summary is written for TensorBoard.

    Args:
        max_episodes: Number of episodes to play (default matches the former
            hard-coded ``range(1, 9999)``).
        save_every: Save a checkpoint every this many episodes.

    Side effects:
        Updates the network weights in ``sess``, writes summaries through
        ``writer`` and saves checkpoints under ``CHECKPOINT_DIR``.
    """
    import os  # local import keeps this block self-contained

    # Save where the restore logic looks (CHECKPOINT_DIR); the previous
    # "model.ckpt" in the working directory could never be found on resume.
    if not os.path.isdir(CHECKPOINT_DIR):
        os.makedirs(CHECKPOINT_DIR)
    save_path = os.path.join(CHECKPOINT_DIR, "model.ckpt")

    rendered_img = None
    for episode in range(1, max_episodes + 1):
        xs_list = []           # network inputs, one (1, h, w, stack_size) array per step
        ys_list = []           # one-hot encoded sampled actions, one (1, n) array per step
        advantage_chunks = []  # (k, 1) arrays of returns, one chunk per scored point
        ep_rewards_list = []   # raw rewards since the last scored point

        state = preprocess(env.reset())
        stacked_state = [state] * stack_size  # initial input: first frame repeated

        ep_win_count = 0
        ep_lose_count = 0
        ep_reward_sum = 0

        while True:
            # list of (h, w) frames -> (1, h, w, stack_size)
            x = np.stack(stacked_state, axis=-1)[None, :, :, :]

            # Run the neural net and sample an action from its distribution.
            action_prob = np.squeeze(sess.run(action_pred, feed_dict={X: x}))
            action_index = np.random.choice(n_action_space, p=action_prob)
            action = action_space[action_index]

            # Record the observation and the chosen action for learning.
            xs_list.append(x)
            ys_list.append(np.eye(n_action_space)[action_index:action_index + 1])

            state, reward, done, _ = env.step(action)
            state = preprocess(state)
            stacked_state = stacked_state[1:] + [state]  # FIFO
            ep_reward_sum += reward
            ep_rewards_list.append(reward)

            # Discount rewards on every single game (i.e. every scored point).
            if reward == 1 or reward == -1:
                ep_rewards = np.vstack(ep_rewards_list)
                advantage_chunks.append(discount_rewards(ep_rewards, gamma))
                ep_rewards_list = []

                if reward == 1:
                    ep_win_count += 1
                else:
                    ep_lose_count += 1

            if done:
                # Steps after the last scored point carry no outcome; give
                # them zero advantage so inputs and advantages stay aligned.
                if ep_rewards_list:
                    advantage_chunks.append(np.zeros((len(ep_rewards_list), 1)))

                xs = np.vstack(xs_list)
                ys = np.vstack(ys_list)
                rewards = np.vstack(advantage_chunks)

                l, s, _ = sess.run([loss, summary, train],
                                   feed_dict={X: xs,
                                              Y: ys,
                                              advantages: rewards,
                                              summary_reward: ep_reward_sum})
                writer.add_summary(s, episode)
                print("Episode {} Win {} Lose {} Reward {} Loss {}".format(
                    episode, ep_win_count, ep_lose_count, ep_reward_sum, l))
                break

            if render:
                title = 'episode{}'.format(episode)
                plt.title(title)

                # Show the preprocessed frame, reusing one image artist.
                if rendered_img is None:
                    rendered_img = plt.imshow(state)
                rendered_img.set_data(state)
                display.display(plt.gcf())
                display.clear_output(wait=True)

        if episode % save_every == 0:
            print("Saving checkpoint")
            # Reuse the module-level saver; building a new tf.train.Saver()
            # on every save kept adding ops to the graph.
            saver.save(sess, save_path)

def perform_test():
    """Play 50 episodes with uniformly random actions, optionally rendering.

    This does not query the trained network: actions are drawn with
    ``random.choice(action_space)``. When the module-level ``render`` flag is
    set, each preprocessed frame is displayed inline.
    """
    # Must exist before the first render; it was previously read before any
    # assignment, raising UnboundLocalError whenever `render` was True.
    rendered_img = None

    for episode in range(50):
        # Start every episode from a fresh environment state instead of
        # stepping an environment that has already reported `done`.
        env.reset()
        done = False
        while not done:
            action = random.choice(action_space)
            observation, reward, done, _ = env.step(action)
            x = preprocess(observation)

            if render:
                title = 'episode{}'.format(episode)
                plt.title(title)

                # Show the preprocessed frame, reusing one image artist.
                if rendered_img is None:
                    rendered_img = plt.imshow(x)
                rendered_img.set_data(x)
                display.display(plt.gcf())
                display.clear_output(wait=True)


[2017-06-20 15:59:14,289] Making new env: Pong-v0


Could not find old network weights


In [None]:
# Start the training loop; progress is printed once per episode.
perform_train()

Episode 1 Win 0 Lose 21 Reward -21.0 Loss 0.0006195407477207482
Episode 2 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 3 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 4 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 5 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 6 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 7 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 8 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 9 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 10 Win 0 Lose 21 Reward -21.0 Loss 0.0
Saving checkpoint
Episode 11 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 12 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 13 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 14 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 15 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 16 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 17 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 18 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 19 Win 0 Lose 21 Reward -21.0 Loss 0.0
Episode 20 Win 0 Lose 21 Reward -21.0 Loss 0.0
Saving checkpoint
Episode 21 Win 