In [1]:
from __future__ import division
import numpy as np
from graphviz import Source
from qnetwork import *
from utils import *
import matplotlib.pyplot as plt
from env_dynamic_ap import *
# from Plot_Path import *
import tensorflow as tf
import sys
from dra_planning import gen_dra_policy

if sys.platform == "darwin":
    DEVICE = "/device:CPU:0"
else:
    DEVICE = "/device:GPU:0"

LTL = "<>(A && <>(B && <>T))"

LEARNING_RATE = 0.0015
GAMMA = 0.99
# GAMMA = 0.7
TAU = 0.001
BUFFER_SIZE = 10**6
MINIBATCH_SIZE = 64
RANDOM_SEED = 210
MAX_EPISODES = 30000
MAX_EPISODE_LEN = 2000
file_appendix = "GuideLearning_" + time.ctime()[4:16].replace("  ","").replace(" ","_").replace(":","-") + LTL
SUMMARY_DIR = './results/' + file_appendix
SAVE_DIR = "./saved_model/" + file_appendix + "/guide_learning.ckpt"
EXPLORATION_RATE = 0.7
LR_DECAY_TRUNCATION = -200

env = CurrentWorld(LTL)

config=tf.ConfigProto(log_device_placement=True)
config.gpu_options.allow_growth = True


def train(sess, env, qnet, dra_policy):
    
    global EXPLORATION_RATE
  
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)
    
    qnet.update_target()
    
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
    
    for num_epi in range(MAX_EPISODES):

        s = env.reset()
        s = list(np.unravel_index(s, env.shape))

        ep_reward = 0
        ep_ave_max_q = 0
        
        reward_list = []

        for j in range(MAX_EPISODE_LEN):

            a = np.argmax(qnet.predict_q(np.reshape(s, (1, qnet.state_dim))))
    
            if np.random.rand(1) < EXPLORATION_RATE:
                if tuple(s) in dra_policy.keys():
                    s2, r, terminal, info = env.step(dra_policy[tuple(s)])
                else:
                    s2, r, terminal, info = env.step(np.random.randint(env.nA))
            else:
                s2, r, terminal, info = env.step(a)
            
            s2 = list(np.unravel_index(s2, env.shape))

            replay_buffer.add(np.reshape(s, (qnet.state_dim,)), np.reshape(a, (1,)), r,
                              terminal, np.reshape(s2, (qnet.state_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = qnet.predect_target(s2_batch)

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * np.amax(target_q[k]))

                # Update the critic given the targets
                predicted_q_value, _ = qnet.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)), num_epi)

                ep_ave_max_q += np.amax(predicted_q_value)
                
                # Update target networks
                qnet.update_target()

            s = s2
            ep_reward += r

            if terminal or j == MAX_EPISODE_LEN-1:
                
                if EXPLORATION_RATE > 0.02 and terminal:
                    EXPLORATION_RATE = EXPLORATION_RATE*0.999
                    
                reward_list += [ep_reward]
                
                if np.average(reward_list[-10:]) > LR_DECAY_TRUNCATION:
                    qnet.decay_learning_rate(1)

                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j),
                    summary_vars[2]: EXPLORATION_RATE,
                    summary_vars[3]: qnet.get_learning_rate().eval()
                })

                writer.add_summary(summary_str, num_epi)
                writer.flush()

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f} | Exploration: {:.6f} | Step: {:d} | LearningRate: {:.5f} '.format(int(ep_reward), \
                                                                                                    num_epi, (ep_ave_max_q / float(j)), EXPLORATION_RATE, j, qnet.get_learning_rate().eval()))
                
                f = open("./stats/stats" + file_appendix + ".txt", "ab")
                f.write("| Reward: " + str(int(ep_reward)) 
                        +" | Episode: " + str(num_epi) 
                        + " | Qmax: " + str(ep_ave_max_q / float(j)) 
                        + " | Exploration: " + str(EXPLORATION_RATE)
                        + " | Step: " + str(j)
                        + " | LearningRate: " + str(qnet.get_learning_rate().eval())
                        + "\n")
                f.close()
                
                f = open(SUMMARY_DIR + "/reward.txt", "ab")
                f.write(str(int(ep_reward)))
                f.close()

                break
                
#         if num_epi%10 == 0:
#             state_list = []
#             action_list = []
#             world = np.zeros(env.shape)
#             for state in range(env.nS):
#                 state = np.unravel_index(state, env.shape)
#                 action = qnet.predict_q(np.reshape(state, (1,state_dim)))
#                 action = np.argmax(action)
#                 state_list.append(state)
#                 action_list.append(action)
                
# #             print np.reshape(action_list, env.shape)
                
#             f = open("action.txt","ab")
#             act_string = np.array_str(np.reshape(action_list, env.shape))
#             f.write(act_string)
#             f.write("---------------------------\n")
#             f.close()



with tf.Session(config=config) as sess:
    
    np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)
    env.seed(RANDOM_SEED)
    
    state_dim = 3
    action_dim = env.nA

    dra_policy = gen_dra_policy(LTL, env)
    
    Qnet = QNet(sess, state_dim, action_dim, LEARNING_RATE, TAU, MINIBATCH_SIZE, SAVE_DIR, DEVICE)
    
    train(sess, env, Qnet, dra_policy)
    
    
    


  from ._conv import register_converters as _register_converters


0
0
0
0


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


DDQN Saved
| Reward: -372 | Episode: 0 | Qmax: 25.2632 | Exploration: 0.699300 | Step: 472 | LearningRate: 0.00150 
| Reward: 43 | Episode: 1 | Qmax: 31.7826 | Exploration: 0.698601 | Step: 57 | LearningRate: 0.00150 
| Reward: 36 | Episode: 2 | Qmax: 31.8453 | Exploration: 0.697902 | Step: 64 | LearningRate: 0.00150 
| Reward: 23 | Episode: 3 | Qmax: 31.7882 | Exploration: 0.697204 | Step: 77 | LearningRate: 0.00150 
| Reward: -111 | Episode: 4 | Qmax: 35.1514 | Exploration: 0.696507 | Step: 211 | LearningRate: 0.00150 
| Reward: -1099 | Episode: 5 | Qmax: 48.1120 | Exploration: 0.695810 | Step: 1199 | LearningRate: 0.00150 
| Reward: 10 | Episode: 6 | Qmax: 56.0079 | Exploration: 0.695115 | Step: 90 | LearningRate: 0.00150 
| Reward: 53 | Episode: 7 | Qmax: 54.5748 | Exploration: 0.694420 | Step: 47 | LearningRate: 0.00150 
| Reward: 51 | Episode: 8 | Qmax: 57.0191 | Exploration: 0.693725 | Step: 49 | LearningRate: 0.00150 
| Reward: -233 | Episode: 9 | Qmax: 53.6428 | Exploration: 0

| Reward: -353 | Episode: 79 | Qmax: 91.1659 | Exploration: 0.646802 | Step: 453 | LearningRate: 0.00150 
DDQN Saved
| Reward: -418 | Episode: 80 | Qmax: 90.0664 | Exploration: 0.646156 | Step: 518 | LearningRate: 0.00150 
| Reward: 57 | Episode: 81 | Qmax: 92.5658 | Exploration: 0.645509 | Step: 43 | LearningRate: 0.00150 
| Reward: -374 | Episode: 82 | Qmax: 91.6211 | Exploration: 0.644864 | Step: 474 | LearningRate: 0.00150 
| Reward: -1030 | Episode: 83 | Qmax: 90.4084 | Exploration: 0.644219 | Step: 1130 | LearningRate: 0.00150 
| Reward: 34 | Episode: 84 | Qmax: 91.5444 | Exploration: 0.643575 | Step: 66 | LearningRate: 0.00150 
| Reward: 62 | Episode: 85 | Qmax: 90.7383 | Exploration: 0.642931 | Step: 38 | LearningRate: 0.00150 
| Reward: -142 | Episode: 86 | Qmax: 90.9318 | Exploration: 0.642288 | Step: 242 | LearningRate: 0.00150 
| Reward: 55 | Episode: 87 | Qmax: 93.7664 | Exploration: 0.641646 | Step: 45 | LearningRate: 0.00150 
| Reward: 51 | Episode: 88 | Qmax: 94.0562 | 

| Reward: -17 | Episode: 158 | Qmax: 95.0586 | Exploration: 0.597648 | Step: 117 | LearningRate: 0.00150 
| Reward: -65 | Episode: 159 | Qmax: 94.6792 | Exploration: 0.597050 | Step: 165 | LearningRate: 0.00150 
DDQN Saved
| Reward: -123 | Episode: 160 | Qmax: 94.3301 | Exploration: 0.596453 | Step: 223 | LearningRate: 0.00150 
| Reward: 66 | Episode: 161 | Qmax: 96.4763 | Exploration: 0.595856 | Step: 34 | LearningRate: 0.00150 
| Reward: 23 | Episode: 162 | Qmax: 95.3217 | Exploration: 0.595261 | Step: 77 | LearningRate: 0.00150 
| Reward: 56 | Episode: 163 | Qmax: 96.8069 | Exploration: 0.594665 | Step: 44 | LearningRate: 0.00150 
| Reward: 54 | Episode: 164 | Qmax: 97.6933 | Exploration: 0.594071 | Step: 46 | LearningRate: 0.00150 
| Reward: 6 | Episode: 165 | Qmax: 94.9517 | Exploration: 0.593477 | Step: 94 | LearningRate: 0.00150 
| Reward: 60 | Episode: 166 | Qmax: 96.7201 | Exploration: 0.592883 | Step: 40 | LearningRate: 0.00150 
| Reward: 31 | Episode: 167 | Qmax: 96.6316 | E

| Reward: -2 | Episode: 237 | Qmax: 96.0386 | Exploration: 0.552228 | Step: 102 | LearningRate: 0.00150 
| Reward: 45 | Episode: 238 | Qmax: 97.5772 | Exploration: 0.551676 | Step: 55 | LearningRate: 0.00150 
| Reward: -3 | Episode: 239 | Qmax: 96.9371 | Exploration: 0.551125 | Step: 103 | LearningRate: 0.00150 
DDQN Saved
| Reward: -20 | Episode: 240 | Qmax: 95.9793 | Exploration: 0.550573 | Step: 120 | LearningRate: 0.00150 
| Reward: 53 | Episode: 241 | Qmax: 96.3321 | Exploration: 0.550023 | Step: 47 | LearningRate: 0.00150 
| Reward: -17 | Episode: 242 | Qmax: 96.7077 | Exploration: 0.549473 | Step: 117 | LearningRate: 0.00150 
| Reward: 57 | Episode: 243 | Qmax: 99.2273 | Exploration: 0.548923 | Step: 43 | LearningRate: 0.00150 
| Reward: 34 | Episode: 244 | Qmax: 96.7244 | Exploration: 0.548374 | Step: 66 | LearningRate: 0.00150 
| Reward: 59 | Episode: 245 | Qmax: 97.9531 | Exploration: 0.547826 | Step: 41 | LearningRate: 0.00150 
| Reward: 63 | Episode: 246 | Qmax: 98.9821 | E

| Reward: -12 | Episode: 316 | Qmax: 96.9636 | Exploration: 0.510261 | Step: 112 | LearningRate: 0.00150 
| Reward: 35 | Episode: 317 | Qmax: 96.4730 | Exploration: 0.509751 | Step: 65 | LearningRate: 0.00150 
| Reward: 48 | Episode: 318 | Qmax: 97.4877 | Exploration: 0.509241 | Step: 52 | LearningRate: 0.00150 
| Reward: 61 | Episode: 319 | Qmax: 98.1423 | Exploration: 0.508732 | Step: 39 | LearningRate: 0.00150 
DDQN Saved
| Reward: 43 | Episode: 320 | Qmax: 97.2742 | Exploration: 0.508223 | Step: 57 | LearningRate: 0.00150 
| Reward: 45 | Episode: 321 | Qmax: 97.8544 | Exploration: 0.507715 | Step: 55 | LearningRate: 0.00150 
| Reward: 44 | Episode: 322 | Qmax: 96.4940 | Exploration: 0.507207 | Step: 56 | LearningRate: 0.00150 
| Reward: 3 | Episode: 323 | Qmax: 96.9700 | Exploration: 0.506700 | Step: 97 | LearningRate: 0.00150 
| Reward: 36 | Episode: 324 | Qmax: 97.0515 | Exploration: 0.506193 | Step: 64 | LearningRate: 0.00150 
| Reward: 33 | Episode: 325 | Qmax: 97.3520 | Explor

| Reward: 45 | Episode: 395 | Qmax: 96.6010 | Exploration: 0.471483 | Step: 55 | LearningRate: 0.00150 
| Reward: 32 | Episode: 396 | Qmax: 97.1089 | Exploration: 0.471011 | Step: 68 | LearningRate: 0.00150 
| Reward: 55 | Episode: 397 | Qmax: 99.0614 | Exploration: 0.470540 | Step: 45 | LearningRate: 0.00150 
| Reward: 19 | Episode: 398 | Qmax: 96.8103 | Exploration: 0.470070 | Step: 81 | LearningRate: 0.00150 
| Reward: 24 | Episode: 399 | Qmax: 97.7708 | Exploration: 0.469600 | Step: 76 | LearningRate: 0.00150 
DDQN Saved
| Reward: 59 | Episode: 400 | Qmax: 98.4619 | Exploration: 0.469130 | Step: 41 | LearningRate: 0.00150 
| Reward: 38 | Episode: 401 | Qmax: 96.5812 | Exploration: 0.468661 | Step: 62 | LearningRate: 0.00150 
| Reward: 29 | Episode: 402 | Qmax: 97.5338 | Exploration: 0.468192 | Step: 71 | LearningRate: 0.00150 
| Reward: 32 | Episode: 403 | Qmax: 97.7938 | Exploration: 0.467724 | Step: 68 | LearningRate: 0.00150 
| Reward: 32 | Episode: 404 | Qmax: 97.9739 | Explora

| Reward: 13 | Episode: 474 | Qmax: 97.2762 | Exploration: 0.435652 | Step: 87 | LearningRate: 0.00150 
| Reward: 44 | Episode: 475 | Qmax: 98.3133 | Exploration: 0.435216 | Step: 56 | LearningRate: 0.00150 
| Reward: -14 | Episode: 476 | Qmax: 98.1874 | Exploration: 0.434781 | Step: 114 | LearningRate: 0.00150 
| Reward: -26 | Episode: 477 | Qmax: 99.2676 | Exploration: 0.434346 | Step: 126 | LearningRate: 0.00150 
| Reward: 51 | Episode: 478 | Qmax: 101.4826 | Exploration: 0.433912 | Step: 49 | LearningRate: 0.00150 
| Reward: 34 | Episode: 479 | Qmax: 98.8858 | Exploration: 0.433478 | Step: 66 | LearningRate: 0.00150 
DDQN Saved
| Reward: 30 | Episode: 480 | Qmax: 100.2446 | Exploration: 0.433044 | Step: 70 | LearningRate: 0.00150 
| Reward: 26 | Episode: 481 | Qmax: 99.0019 | Exploration: 0.432611 | Step: 74 | LearningRate: 0.00150 
| Reward: -2 | Episode: 482 | Qmax: 97.9620 | Exploration: 0.432179 | Step: 102 | LearningRate: 0.00150 
| Reward: 37 | Episode: 483 | Qmax: 98.5731 | 

KeyboardInterrupt: 