# Exercise 4: Q-Learning with Monte Carlo Updates

In [None]:
from __future__ import division

import numpy as np
import random
import os
import tensorflow as tf
import tensorflow.contrib.slim as slim

from unityenv import UnityEnvironment

### Hyperparameters

In [None]:
# Discount rate applied to future rewards.
y = 0.99

# Exploration schedule: epsilon starts at `start_e` and is annealed
# down to `end_e` over `annealing_steps` environment steps.
start_e = 1
end_e = 0.1
annealing_steps = 10000

# Total number of episodes to run in the environment.
num_episodes = 1000

# Directory in which training statistics are saved.
summary_path = './summaries/q-mc'

# Learning rate of the agent's optimizer.
learning_rate = 0.01

### Load the Unity environment

In [None]:
# Settings forwarded to the environment through its `config` argument.
# NOTE(review): presumably grid side length, object count and goal count --
# confirm against the FixedGridWorld build.
env_config = {
    '--grid-size': 7,
    '--num-objects': 4,
    '--num-goals': 1,
}

# Start the "FixedGridWorld" environment and print its string representation.
env = UnityEnvironment(
    file_name="FixedGridWorld",
    worker_num=3,
    config=env_config)
env_description = str(env)
print(env_description)

### Examine the state space

In [None]:
# Reset the environment. The call returns a pair; its first element is
# discarded and the second is kept as the initial state, then printed
# for inspection.
_, state = env.reset()
print(state)

The state (s) is an integer that identifies one of the environment's discrete states.

## The Q-Learning Agent

In [None]:
def discount_rewards(r, gamma):
    """Compute the discounted cumulative reward for every step of an episode.

    Adapted from karpathy.github.io/2016/05/31/rl/

    Args:
        r: 1D sequence of per-step rewards (list or NumPy array) in time order.
        gamma: Discount factor applied once per step.

    Returns:
        Floating-point NumPy array of the same length as ``r`` whose element
        ``t`` equals ``r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...``.
        A floating-point input keeps its dtype; any other dtype is promoted
        to float64.
    """
    rewards = np.asarray(r)
    # Accumulate in floating point. Allocating the output with the input's
    # dtype would silently truncate the discounted values for integer rewards.
    if not np.issubdtype(rewards.dtype, np.floating):
        rewards = rewards.astype(np.float64)

    discounted_r = np.zeros_like(rewards)
    running_add = 0.0
    # Walk backwards so each step reuses the return of the step after it.
    for t in reversed(range(rewards.size)):
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

class QAgent(object):
    """Linear Q-value estimator over one-hot encoded discrete states."""

    def __init__(self, num_states, num_actions, lr):
        """Build the prediction and training ops in the default graph.

        Args:
            num_states: Depth of the one-hot state encoding.
            num_actions: Number of outputs of the Q layer (one per action).
            lr: Learning rate handed to the Adam optimizer.
        """
        # Forward pass: state index -> one-hot vector -> bias-free linear
        # layer with zero-initialized weights -> Q(s, a) for every action.
        self.state_input = tf.placeholder(dtype=tf.int32, shape=[None, 1])
        encoded_state = slim.one_hot_encoding(self.state_input, num_states)
        q_layer = slim.fully_connected(
            encoded_state,
            num_actions,
            activation_fn=None,
            weights_initializer=tf.zeros_initializer(),
            biases_initializer=None)
        self.q_out = slim.flatten(q_layer)
        # Index of the highest-valued action for each input state.
        self.predict = tf.argmax(self.q_out, 1)

        # Training: sum of squared differences between the target values fed
        # through `q_next` and the Q value of the action that was taken.
        self.q_next = tf.placeholder(dtype=tf.float32, shape=[None])
        self.action = tf.placeholder(dtype=tf.int32, shape=[None])
        self.action_onehot = slim.one_hot_encoding(self.action, num_actions)
        # The one-hot mask zeroes every Q value except the chosen action's.
        masked_q = self.q_out * self.action_onehot
        self.selected_q = tf.reduce_sum(masked_q, axis=1)
        squared_error = tf.squared_difference(self.q_next, self.selected_q)
        self.loss = tf.reduce_sum(squared_error)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)

### Training the network

In [None]:
# Build a fresh graph holding the agent, then train it with Monte Carlo
# targets: one network update per finished episode, using the discounted
# return observed from each step onwards.
tf.reset_default_graph()
agent = QAgent(env.state_space_size, env.action_space_size, learning_rate)
init = tf.global_variables_initializer()
# First trainable variable of the graph (the agent's weight matrix).
# Looked up once here instead of on every episode.
q_weights = tf.trainable_variables()[0]

if not os.path.exists(summary_path):
    os.makedirs(summary_path)

# Amount subtracted from epsilon after every environment step.
e_drop = (start_e - end_e) / annealing_steps
e = start_e
# One value per state, passed to env.step() on every step.
value_table = np.zeros([env.state_space_size])

# Per-episode statistics: step counts, total rewards and training losses.
episode_list = []
reward_list = []
loss_list = []
with tf.Session() as sess:
    sess.run(init)
    summary_writer = tf.summary.FileWriter(summary_path)
    try:
        for i in range(num_episodes):
            # Reset environment and get first new state
            _, state = env.reset()
            total_reward = 0
            done = False
            steps = 0
            episode_buffer = []
            while not done:
                steps += 1
                # Epsilon-greedy selection: random action with probability e,
                # otherwise the network's greedy action. The network is only
                # evaluated when its prediction is actually used.
                if np.random.rand(1) < e:
                    action = np.random.randint(0, env.action_space_size)
                else:
                    action = sess.run(
                        agent.predict,
                        feed_dict={agent.state_input: [state]})[0]

                # Get new state and reward from environment
                _, state_1, reward, done = env.step(action, value_table.tolist())
                episode_buffer.append([state, action, reward, state_1, done])

                total_reward += reward
                state = state_1
                if e > end_e:
                    e -= e_drop

            # Train the network on the whole episode. The returns are kept in
            # their own float array: writing them back into episode_matrix
            # would truncate them if that matrix had an integer dtype.
            episode_matrix = np.array(episode_buffer)
            returns = discount_rewards(
                episode_matrix[:, 2].astype(np.float64), y)
            _, q_table, v_loss = sess.run(
                [agent.update, q_weights, agent.loss],
                feed_dict={agent.state_input: np.vstack(episode_matrix[:, 0]),
                           agent.q_next: returns,
                           agent.action: episode_matrix[:, 1]})
            loss_list.append(v_loss)
            episode_list.append(steps)
            reward_list.append(total_reward)
            # Mean over axis 1 of the weight matrix: one value per state.
            value_table = np.mean(q_table, axis=1)
            # Every 50 episodes, log averages over the last 50 episodes.
            if i % 50 == 0 and i != 0:
                summary = tf.Summary()
                summary.value.add(tag='Info/Reward', simple_value=float(np.mean(reward_list[-50:])))
                summary.value.add(tag='Info/Value Loss', simple_value=float(np.mean(loss_list[-50:])))
                summary.value.add(tag='Info/Epsilon', simple_value=float(e))
                summary.value.add(tag='Info/Q Estimate', simple_value=float(np.mean(value_table)))
                summary.value.add(tag='Info/Episode Length', simple_value=float(np.mean(episode_list[-50:])))
                summary_writer.add_summary(summary, i)
                summary_writer.flush()
                print ("Mean Reward: {}".format(np.mean(reward_list[-50:])))
    finally:
        # Release the event file even when training is interrupted.
        summary_writer.close()
env.close()

In [None]:
# Close the environment explicitly. Presumably meant for the case where the
# training cell was interrupted before reaching its own env.close().
# NOTE(review): after a successful training run this is a second close() on
# the same environment -- confirm that closing twice is safe.
env.close()