In [1]:
from __future__ import division
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
from env_current import *
from collections import deque
import random


In [2]:
class ReplayBuffer(object):

    def __init__(self, buffer_size, random_seed = 123):
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque()
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        experience = (s, a, r, t, s2)
        if self.count < self.buffer_size: 
            self.buffer.append(experience)
            self.count += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def size(self):
        return self.count

    def sample_batch(self, batch_size):

        batch = []

        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)

        s_batch = np.array([_[0] for _ in batch])
        a_batch = np.array([_[1] for _ in batch])
        r_batch = np.array([_[2] for _ in batch])
        t_batch = np.array([_[3] for _ in batch])
        s2_batch = np.array([_[4] for _ in batch])

        return s_batch, a_batch, r_batch, t_batch, s2_batch

    def clear(self):
        self.buffer.clear()
        self.count = 0
        
def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar("Qmax_Value", episode_ave_max_q)
    exploration_rate = tf.Variable(0.)
    tf.summary.scalar("Exploration", exploration_rate)

    summary_vars = [episode_reward, episode_ave_max_q, exploration_rate]
    summary_ops = tf.summary.merge_all()

    return summary_ops, summary_vars

In [3]:
class QNet(object):
    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, batch_size, save_path):
        self.sess = sess
        self.learning_rate = learning_rate
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau
        self.batch_size = batch_size
        self.save_path = save_path
        
        self.inputs, self.q_values, self.a_predict = self.build_net()
        self.net_params = tf.trainable_variables()
        
        self.target_inputs, self.target_q_values, self.target_a_predict = self.build_net()
        self.target_net_params = tf.trainable_variables()[len(self.net_params):]
        
        self.update_target_net_params = [self.target_net_params[i]
                                         .assign(tf.multiply(self.tau, self.net_params[i])
                                                 + tf.multiply((1.-self.tau), self.target_net_params[i]) ) 
                                         for i in range(len(self.target_net_params))]
        
        self.true_q_value = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        self.action = tf.placeholder(shape=[None, 1], dtype=tf.int32)
        
        gather_indices = tf.range(MINIBATCH_SIZE) * tf.shape(self.q_values)[1] + tf.reshape(self.action, [-1])
        self.action_correlated_q = tf.gather(tf.reshape(self.q_values,[-1]), gather_indices)
        
        self.loss = tf.losses.mean_squared_error(tf.reshape(self.true_q_value, [-1]), self.action_correlated_q)
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
        
        self.saver = tf.train.Saver()
        self.last_num_epi = -1
        
    def build_net(self):
        s_inputs = tf.placeholder(shape = [None, self.state_dim], dtype = tf.float32)
        W1 = tf.Variable(tf.random_uniform([self.state_dim, 400], 0, 0.1))
        B1 = tf.Variable(tf.zeros([400]))
        L1 = tf.add(tf.matmul(s_inputs, W1), B1)
        L1 = tf.layers.batch_normalization(L1)
        L1 = tf.nn.relu(L1)
        W2 = tf.Variable(tf.random_uniform([400, 300], 0, 0.1))
        B2 = tf.Variable(tf.zeros([300]))
        L2 = tf.add(tf.matmul(L1, W2), B2)
        L2 = tf.layers.batch_normalization(L2)
        L2 = tf.nn.relu(L2)
        W3 = tf.Variable(tf.random_uniform([300, self.action_dim], 0, 0.01))
#         B3 = tf.Variable(tf.random_uniform([self.action_dim], -0.003, 0.003))
#         q_values = tf.add(tf.matmul(L2, W3), B3)
        q_values = tf.matmul(L2, W3)  
        a_predict = tf.argmax(q_values,1)
        
        regularizer = tf.contrib.layers.l2_regularizer(0.01)
        tf.contrib.layers.apply_regularization(regularizer,[W1, B1, W2, B2, W3])
        return s_inputs, q_values, a_predict
    
    def train(self, states, action, true_q, num_epi):
        if num_epi%20 == 0 and num_epi!=self.last_num_epi:
            self.saver.save(self.sess, self.save_path)
            print "DDQN Saved"
            self.last_num_epi = num_epi
            
        return self.sess.run([self.q_values, self.optimizer], 
                             feed_dict={self.inputs: states, self.true_q_value: true_q, self.action: action})
    
    def predict_q(self, states):
        return self.sess.run(self.q_values, feed_dict={self.inputs: states})
    
    def predict_a(self, states):
        return self.sess.run(self.a_predict, feed_dict={self.inputs: states})
    
    def predect_target(self, states):
        return self.sess.run(self.target_q_values, feed_dict={self.target_inputs: states})
    
    def update_target(self):
        self.sess.run(self.update_target_net_params)
        
        

In [4]:
def train(sess, env, qnet):
    
    global EXPLORATION_RATE
  
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)
    
    qnet.update_target()
    
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
    
    for num_epi in range(MAX_EPISODES):

        s = env.reset()
        s = [list(np.unravel_index(s, env.shape))]

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(MAX_EPISODE_LEN):

            a = np.argmax(qnet.predict_q(np.reshape(s, (1, qnet.state_dim))))
    
            if np.random.rand(1) < EXPLORATION_RATE:
                s2, r, terminal, info = env.step(np.random.randint(0,qnet.action_dim))
            else:
                s2, r, terminal, info = env.step(a)
            
            s2 = list(np.unravel_index(s2, env.shape))

            replay_buffer.add(np.reshape(s, (qnet.state_dim,)), np.reshape(a, (1,)), r,
                              terminal, np.reshape(s2, (qnet.state_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = qnet.predect_target(s2_batch)

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * np.amax(target_q[k]))

                # Update the critic given the targets
                predicted_q_value, _ = qnet.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)), num_epi)

                ep_ave_max_q += np.amax(predicted_q_value)
                
                # Update target networks
                qnet.update_target()

            s = s2
            ep_reward += r

            if terminal or j == MAX_EPISODE_LEN-1:
                
                if EXPLORATION_RATE > 0.05 and terminal:
                    EXPLORATION_RATE = EXPLORATION_RATE*0.92

                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j),
                    summary_vars[2]: EXPLORATION_RATE
                })

                writer.add_summary(summary_str, num_epi)
                writer.flush()

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f} | Exploration: {:.6f} '.format(int(ep_reward), \
                        num_epi, (ep_ave_max_q / float(j)), EXPLORATION_RATE))
                
                f = open("stats.txt", "ab")
                f.write("| Reward: " + str(int(ep_reward)) 
                        +" | Episode: " + str(num_epi) 
                        + " | Qmax: " + str(ep_ave_max_q / float(j)) 
                        + " | Exploration: " + str(EXPLORATION_RATE) + "\n")
                f.close()
                
                break
                
        if num_epi%1 == 0:
            state_list = []
            action_list = []
            world = np.zeros(env.shape)
            for state in range(env.nS):
                state = np.unravel_index(state, env.shape)
                action = qnet.predict_q(np.reshape(state, (1,state_dim)))
                action = np.argmax(action)
                state_list.append(state)
                action_list.append(action)
                
#             print np.reshape(action_list, env.shape)
                
            f = open("action.txt","ab")
            np.savetxt(f, np.reshape(action_list, env.shape), fmt="%i")
            f.write("---------------------------\n")
            f.close()
    
    

In [5]:
LEARNING_RATE = 0.0015
GAMMA = 0.99
TAU = 0.001
BUFFER_SIZE = 10**6
MINIBATCH_SIZE = 64
RANDOM_SEED = 272
MAX_EPISODES = 50000
MAX_EPISODE_LEN = 1000
SUMMARY_DIR = './results/tf_ddqn'
EXPLORATION_RATE = 0.65

In [6]:
with tf.Session() as sess:
    env = CurrentWorld()
    np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)
    env.seed(RANDOM_SEED)
    
    state_dim = 2
    action_dim = 4
    
    Qnet = QNet(sess, state_dim, action_dim, LEARNING_RATE, TAU, MINIBATCH_SIZE, "./saved_model/ddqn.ckpt")
    
    train(sess, env, Qnet)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


DDQN Saved
| Reward: -1000 | Episode: 0 | Qmax: 52.1282 | Exploration: 0.650000 
| Reward: -1000 | Episode: 1 | Qmax: 63.0277 | Exploration: 0.650000 
| Reward: -1000 | Episode: 2 | Qmax: 64.3090 | Exploration: 0.650000 
| Reward: -1000 | Episode: 3 | Qmax: 63.2443 | Exploration: 0.650000 
| Reward: -1000 | Episode: 4 | Qmax: 61.5773 | Exploration: 0.650000 
| Reward: -1000 | Episode: 5 | Qmax: 59.6755 | Exploration: 0.650000 
| Reward: -1000 | Episode: 6 | Qmax: 57.6404 | Exploration: 0.650000 
| Reward: -1000 | Episode: 7 | Qmax: 55.6209 | Exploration: 0.650000 
| Reward: -1000 | Episode: 8 | Qmax: 53.5161 | Exploration: 0.650000 
| Reward: -1000 | Episode: 9 | Qmax: 51.5566 | Exploration: 0.650000 
| Reward: -1000 | Episode: 10 | Qmax: 49.7000 | Exploration: 0.650000 
| Reward: -1000 | Episode: 11 | Qmax: 47.8952 | Exploration: 0.650000 
| Reward: -1000 | Episode: 12 | Qmax: 46.0578 | Exploration: 0.650000 
| Reward: -1000 | Episode: 13 | Qmax: 44.3536 | Exploration: 0.650000 
| Rew

KeyboardInterrupt: 

world = np.zeros(env.shape)
a_list = []
s_list = []
for s in range(env.nS):
    a_list += [np.argmax(P[s])]
    s_list += [np.unravel_index(s,env.shape)]
for s,a in zip(s_list,a_list):
    world[s] = a
    


%matplotlib auto
plt.imshow(world)

%matplotlib inline
matplotlib.style.use('ggplot')
plotting.plot_episode_stats(stats)

def get_optimal_path(Q,env):
    env.reset()
    start_state = env.start_state
    terminal_state = env.terminal_state
    state = np.ravel_multi_index(start_state,env.shape)
    path = [start_state]
    value = 0
    action = []
    while 1:
        next_action = np.argmax(Q[state])
        next_state,reward,done,_ = env.step(next_action)
        path += [np.unravel_index(next_state,env.shape)]
        value += reward
        action += [next_action]
        if done:
            return path, action, value
            break
        state = next_state

opt_path,action,value = get_optimal_path(Q,env)

%matplotlib auto
world = deepcopy(env.winds)
t = 0
for i in opt_path[:-1]:
    world[i] = 6
#     world[i] += action[t]
    t+=1
plt.imshow(world)
# print value