In [1]:
from __future__ import division
from full_prod_DRA import *
from buchi import buchi_from_ltl
import numpy as np
from env_sensing_error import CurrentWorld
import scipy
from plot_path_for_prod import *
from graphviz import Source
from qnetwork import *
from utils import *
import matplotlib.pyplot as plt
from Plot_Path import *
import tensorflow as tf
import sys
from dra_planning import *
import time

  from ._conv import register_converters as _register_converters


In [2]:
def train(sess, env, qnet, prod_planner):
    
    global EXPLORATION_RATE
    global GUIDE_RATE
  
    summary_ops, summary_vars = build_summaries()
    if not RESTORE:
        sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)
    
    qnet.update_target()
    
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) 
    
    for num_epi in range(MAX_EPISODES):

        s = env.reset()
        s = [list(np.unravel_index(s, env.shape))]

        ep_reward = 0
        ep_ave_max_q = 0
        
        reward_list = []
        
        train_time = 0
        batch_time = 0
        gym_time = 0
        guide_time = 0

        for j in range(MAX_EPISODE_LEN):
            
            gym_start = time.time()

            rand_num = np.random.rand(1)
    
            if rand_num <= EXPLORATION_RATE:
                a = np.random.randint(0,qnet.action_dim)
                s2, r, terminal, info = env.step(a)
            elif EXPLORATION_RATE < rand_num <= GUIDE_RATE+EXPLORATION_RATE:
                
                guide_start = time.time()
                
                prod_planner.get_global_opt()
                if len(prod_planner.opt_path) > 0:
                    prod_planner.get_opt_rabin()
                    new_ltl = prod_planner.get_next_ltl(s[-1])
                    guide_path = prod_planner.get_local_opt(s[:-1], new_ltl)
                    if guide_path != None:
                        a = convert_path_to_action(guide_path)
                    else:
                        a = np.random.randint(0,qnet.action_dim)
                else:
                    a = np.random.randint(0,qnet.action_dim)
                s2, r, terminal, info = env.step(a)
                
                guide_time += time.time() - guide_start
                
            else:
                a = np.argmax(qnet.predict_q(np.reshape(s, (1, qnet.state_dim))))
                s2, r, terminal, info = env.step(a)
                
            gym_time += time.time() - gym_start
            
            batch_start = time.time()
            
            s2 = list(np.unravel_index(s2, env.shape))

            replay_buffer.add(np.reshape(s, (qnet.state_dim,)), np.reshape(a, (1,)), r,
                              terminal, np.reshape(s2, (qnet.state_dim,)))
            batch_time += time.time() - batch_start

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                
                batch_start = time.time()
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = qnet.predect_target(s2_batch)

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * np.amax(target_q[k]))
                        
                batch_time += time.time() - batch_start

                # Update the critic given the targets
                train_start = time.time()
                predicted_q_value, _ = qnet.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)), num_epi)

                ep_ave_max_q += np.amax(predicted_q_value)
                
                # Update target networks
                qnet.update_target()

                train_time += time.time() - train_start

            s = s2
            ep_reward += r

            if terminal or j == MAX_EPISODE_LEN-1:
                
                if EXPLORATION_RATE > 0.02 and terminal:
                    EXPLORATION_RATE = EXPLORATION_RATE*0.98
                    
                reward_list += [ep_reward]
                
                if np.average(reward_list[-10:]) > LR_DECAY_TRUNCATION:
                    qnet.decay_learning_rate(0.98)

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f} | Exploration: {:.6f} | Step: {:d} | LR: {:.8f}'.format(int(ep_reward), \
                        num_epi, (ep_ave_max_q / float(j)), EXPLORATION_RATE, j, qnet.get_learning_rate()))
                
                f = open("stats/" + file_appendix + "_stats.txt", "ab")
                f.write("| Reward: " + str(int(ep_reward)) 
                        +" | Episode: " + str(num_epi) 
                        + " | Qmax: " + str(ep_ave_max_q / float(j)) 
                        + " | Exploration: " + str(EXPLORATION_RATE)
                        + " | Step: " + str(j)
                        + " | LR:" + str(qnet.get_learning_rate()) + "\n")
                f.close()
                
                f = open("stats/" + file_appendix + "_stats_time.txt", "ab")
                f.write(" | Episode: " + str(num_epi) 
                        + " | Train: " + str(train_time) 
                        + " | Gym: " + str(gym_time)
                        + " | Batch: " + str(batch_time) 
                        + " | Guide: " + str(guide_time)
                        + "\n")
                f.close()
                
                break
                

In [3]:
# LTL = "<>(A && <>(B && <> T)) && []<>A && []<>B"
# LTL = "[] (p1 -> !(X p1) U (p2 || p3) ) && []<>p1"
# LTL = "T && []<>A && []<>B"
# LTL = "<>(A && <>(B && <> T)) && []<>A && []<>B && []!C && []!D"
# LTL = "<>(A && <>(B && <> T)) && []<>A && []<>B && []!C"
# LTL = "<>(A && <>(B && <> T))"
# LTL = "<>(A && <>B) && <>[]T && []!C"
LTL = "<>(A && <>T) && []!C"
# LTL = "<>(A && <>(B && <>T)) && []<>(A||T) && []<>B && []!C"
# LTL = "<>(A && <>(B && <>T)) && []!C"
# LTL = "<>(A && <>D) && <>(B && <>E) && []<>T && []<>(D || E) && []!C"

LEARNING_RATE = 0.00001
GAMMA = 0.99
# GAMMA = 0.7
TAU = 0.001
BUFFER_SIZE = 10**6
MINIBATCH_SIZE = 1
RANDOM_SEED = 210
MAX_EPISODES = 50000
MAX_EPISODE_LEN = 500
file_appendix = "Guide_Planning_" + time.ctime()[4:16].replace("  ","").replace(" ","_").replace(":","-") + "_large_" + LTL
# file_appendix = "Feb8_03-47"
SUMMARY_DIR = './results/tf_ddqn_' + file_appendix
SAVE_DIR = "./saved_model/" + file_appendix + "/ddqn.ckpt"
EXPLORATION_RATE = 0.2
GUIDE_RATE = 0.4
LR_DECAY_TRUNCATION = -200
RESTORE = 0
if sys.platform == "darwin":
    DEVICE = "/device:CPU:0"
else:
    DEVICE = "/device:GPU:0"

In [4]:
env = CurrentWorld(LTL)
with open("my.dot", "r") as dotfile:
    text = dotfile.read()
Source(text)

prod_planner = Prod_Planning(env, LTL.lower())

In [8]:
config=tf.ConfigProto(log_device_placement=False)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
       
    state_dim = 3
    action_dim = 5
    
    if RESTORE:
        Qnet = QNet(sess, state_dim, action_dim, LEARNING_RATE, TAU, MINIBATCH_SIZE, SAVE_DIR, DEVICE)
        Qnet.saver.restore(sess, RESTORE_PATH)
        train(sess, env, Qnet, prod_planner)
        
    else:
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)
    
        Qnet = QNet(sess, state_dim, action_dim, LEARNING_RATE, TAU, MINIBATCH_SIZE, SAVE_DIR, DEVICE)

        train(sess, env, Qnet, prod_planner)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epi: 0
DDQN Saved
GUIDE
global
rabin


IndexError: index 0 is out of bounds for axis 0 with size 0