In [1]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from actor import ActorNetwork
from rbfActor import RbfActorNetwork
from critic import CriticNetwork
from replay_buffer import ReplayBuffer
from ounoise import OUNoise
import gym, time
from Envs.reaching import ReachingEnv
import matplotlib.pyplot as plt


# ---------------------------------------------------------------------------
# Training schedule
# ---------------------------------------------------------------------------
MAX_EPISODE = 1000  # episodes run for every goal in `goal_pos`
MAX_TIME = 200      # upper bound on the number of steps in one episode

# ---------------------------------------------------------------------------
# Optimisation hyper-parameters
# ---------------------------------------------------------------------------
ACTOR_LEARNING_RATE = 1e-4
CRITIC_LEARNING_RATE = 1e-3
L2_DECAY = 1e-2   # handed to the critic as `l2_alpha`
GAMMA = 0.99      # multiplier on the target critic's value in the training target
TAU = 1e-3        # handed to both networks as `tau`

# ---------------------------------------------------------------------------
# Replay memory
# ---------------------------------------------------------------------------
BUFFER_SIZE = 10 ** 6
BATCH_SIZE = 64

# ---------------------------------------------------------------------------
# Exploration noise (constructor arguments of OUNoise)
# ---------------------------------------------------------------------------
OU_MU = 0.0
OU_THETA = 0.15
OU_SIGMA = 0.20

# Single seed shared by TensorFlow, the networks, the replay buffer and the
# exploration noise.
# NOTE(review): NumPy's global RNG is not seeded in this cell -- confirm
# whether ReachingEnv draws from it.
RANDOM_SEED = 1926

# One row per task: the two goal coordinates later passed to `env.set_goal`
# (with two zeros appended).
goal_pos = np.array([
    [-1.7691047, -1.76426373],
    [-1.81476041, 1.25572033],
    [1.97538345, 1.7239961],
    [0.49885795, 1.82511657],
    [-1.45703216, 1.39941234],
    [0.49100693, 0.12822174],
    [0.14809867, -1.31716354],
    [-1.30413931, 1.79390377],
    [-0.60087002, 1.90940639],
    [1.29273111, 1.85736147],
])

# Environment under study. An earlier variant of this notebook used
# gym.make('Pendulum-v0') here instead.
env = ReachingEnv(include_t=True)
tf.set_random_seed(RANDOM_SEED)

# Dimensions and action limits are read from the environment's spaces.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = env.action_space.high

sess = tf.Session()

In [2]:
# Policy network. The plain ActorNetwork takes the same constructor arguments
# and was used here previously:
# actor = ActorNetwork(sess, state_dim, action_dim, action_bound, hidden_layer_dim = [40,30], \
#                      seed = RANDOM_SEED, tau = TAU, learning_rate = ACTOR_LEARNING_RATE)
actor = RbfActorNetwork(sess, state_dim, action_dim, action_bound, hidden_layer_dim = [40,30], \
                     seed = RANDOM_SEED, tau = TAU, learning_rate = ACTOR_LEARNING_RATE)

# Action-value network, built with an L2 coefficient of L2_DECAY.
critic = CriticNetwork(sess, state_dim, action_dim, hidden_layer_dim = [30],\
                       l2_alpha = L2_DECAY, seed = RANDOM_SEED, tau =TAU, learning_rate = CRITIC_LEARNING_RATE)

# Experience memory and exploration noise, both seeded with RANDOM_SEED.
replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
noise = OUNoise(action_dim, mu = OU_MU, theta = OU_THETA, sigma = OU_SIGMA, seed = RANDOM_SEED)

# Initialise every TensorFlow variable created by the constructors above.
sess.run(tf.global_variables_initializer())

# Per-task learning curves: one row per goal, one column per episode.
# The shape is derived from the configuration instead of being hard-coded
# so that changing `goal_pos` or MAX_EPISODE cannot cause a shape mismatch
# when the training loop writes a full row of per-episode values.
summary_sum_reward_list = np.zeros([len(goal_pos), MAX_EPISODE])
summary_avg_reward_list = np.zeros([len(goal_pos), MAX_EPISODE])

In [3]:
# Multi-goal training loop.
#
# For every goal in `goal_pos` the agent runs MAX_EPISODE episodes of at most
# MAX_TIME steps. On each step it acts with the current actor plus exploration
# noise, stores the transition in the replay buffer, then performs one critic
# update, one actor update and one target-network update from a sampled
# mini-batch.
#
# `actor`, `critic` and `replay_buffer` are created outside this loop and are
# not re-created here, so network weights and stored transitions carry over
# from one goal to the next.
# NOTE(review): confirm that reusing transitions collected under earlier
# goals is intended.
for j in range(len(goal_pos)):
    # The goal handed to the environment is the 2-D goal position with two
    # zeros appended (presumably the remaining goal-state components --
    # verify against ReachingEnv.set_goal).
    env.set_goal(np.append(goal_pos[j], [0,0]))
    # Per-goal statistics; re-initialised for each goal, so after the loop
    # they hold the values of the last goal only.
    all_sum_reward_list = []
    all_avg_reward_list = []
    all_reward_list = []
    all_loss_list = []
    all_t_list = []
    
    for i in range(MAX_EPISODE):

        state = env.reset()
        noise.reset()
        # Per-step reward and critic loss of the current episode.
        reward_list = []
        loss_list = []

        for t in range(MAX_TIME):
            # Act: actor output for the current state as a one-row batch,
            # plus exploration noise, clipped to [-action_bound, action_bound].
            action = actor.predict(np.reshape(state, (-1, state_dim))) 
            action += noise.noise()
            action = np.clip(action, -action_bound, action_bound)
            action = np.reshape(action, action_dim)

            next_state, reward, done, info = env.step(action)
            # Store (state, action, reward, next_state, t). The fifth field is
            # the step index `t`, not the `done` flag.
            replay_buffer.add_sample(np.reshape(state, state_dim), \
                                     np.reshape(action,action_dim),\
                                     reward,\
                                     np.reshape(next_state,state_dim),\
                                     t)

            # The sampling seed changes with the step and episode but does not
            # depend on the goal index `j`.
            mini_batch = replay_buffer.rand_sample(batch_size = BATCH_SIZE, seed = RANDOM_SEED + t + i*MAX_TIME)
            # `t_batch` is unpacked but not used below.
            s_batch, a_batch, r_batch, s2_batch, t_batch = mini_batch

            # Critic target: reward + GAMMA * target-critic value of the
            # target-actor's action in the next state.
            # NOTE(review): `done` is not used here, so the bootstrap term is
            # added for every sampled transition -- confirm this is intended.
            # NOTE(review): assumes r_batch and the critic output have
            # compatible shapes for element-wise addition -- TODO confirm.
            a2_batch = actor.predict(s2_batch, if_target = True)
            training_q = r_batch + GAMMA * critic.predict(s2_batch, a2_batch, if_target = True)

            _, loss = critic.train(s_batch, a_batch, training_q)

            # Actor update: the critic's gradient is evaluated at the actions
            # the current (non-target) actor produces for the sampled states;
            # only element 0 of the returned value is passed to the actor.
            train_action_batch = actor.predict(s_batch)
            critic_grad = critic.compute_critic_gradient(s_batch, train_action_batch)
            actor.train(s_batch, critic_grad[0])

            # Target networks are updated after every training step
            # (presumably a TAU-weighted update -- see the network classes).
            actor.update_target_network()
            critic.update_target_network()

            reward_list.append(reward)
            loss_list.append(loss)
            state = next_state
            # End the episode early when the environment reports completion.
            if done:
                break

    #         print('Episode: %s \t Action: %s, %s \t State: %s,%s,%s,%s' %(i, action[0], action[1], state[0],state[1],state[2],state[3]))

        # Episode bookkeeping: total reward, mean of the most recent (up to)
        # 100 episode totals, raw per-step traces, and the index of the last
        # executed step.
        all_sum_reward_list.append(np.sum(reward_list))
        all_avg_reward_list.append(np.mean(all_sum_reward_list[-100:]))
        all_reward_list.append(reward_list)
        all_loss_list.append(loss_list)
        all_t_list.append(t)

        print('Task : %s \t Episode: %s \t Time_step: %s \t Avg_reward: %s \t Cur_reward: %s'%(j, i, t, all_avg_reward_list[-1], all_sum_reward_list[-1]))
    
    # Copy this goal's curves into row j of the summary arrays. The number of
    # episodes must equal the row length of those arrays.
    summary_sum_reward_list[j] = np.array(all_sum_reward_list)
    summary_avg_reward_list[j] = np.array(all_avg_reward_list)
    


SyntaxError: invalid syntax (<ipython-input-3-21911e35df67>, line 50)

In [None]:
# Learning curves for every goal.
#
# `all_avg_reward_list` is re-initialised for each goal inside the training
# loop, so plotting it shows the last goal only. The per-goal curves are kept
# in `summary_avg_reward_list` (one row per goal), which is what is drawn here.
# matplotlib is already imported as `plt` in the first cell.
fig = plt.figure(1)
ax = fig.add_subplot(1, 1, 1)
for task_idx, avg_curve in enumerate(summary_avg_reward_list):
    ax.plot(avg_curve, label='Task %d' % task_idx)
ax.set_xlabel('Episode')
ax.set_ylabel('Mean episode reward (last 100 episodes)')
ax.set_title('Average reward per task')
ax.legend()
plt.show()