In [1]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from actor import ActorNetwork
from rbfActor import RbfActorNetwork
from critic import CriticNetwork
from replay_buffer import ReplayBuffer
from ounoise import OUNoise
import gym, time
from Envs.reaching import ReachingEnv
from Envs.throwing import ThrowingEnv
import matplotlib.pyplot as plt


# ---------------------------------------------------------------------------
# Experiment configuration. Every tunable value lives here, defined once.
# ---------------------------------------------------------------------------

# Training length: number of episodes per task and step cap per episode.
MAX_EPISODE = 500
MAX_TIME = 200

# Optimisation hyper-parameters passed to the actor / critic constructors.
ACTOR_LEARNING_RATE = 0.0001
CRITIC_LEARNING_RATE = 0.001
L2_DECAY = 0.01
GAMMA = 0.99   # discount factor used when building the critic's training target
TAU = 0.001

# Replay buffer capacity and mini-batch size.
BUFFER_SIZE = 1000000
BATCH_SIZE = 64

# Parameters handed to the OUNoise exploration process.
OU_MU = 0.0
OU_THETA = 0.15
OU_SIGMA = 0.20

RANDOM_SEED = 1926

# One goal per row; the training loop iterates over the rows as separate tasks.
goal_pos = np.load('./Envs/reaching_goal_pos.npy')

# goal_pos = np.array([[-4.11399563, -5.        ],
#        [-0.05680097, -5.        ],
#        [ 3.51188653, -5.        ],
#        [ 2.25174116, -5.        ],
#        [ 4.24114159, -5.        ],
#        [-3.44134834, -5.        ],
#        [-2.44153671, -5.        ],
#        [-2.33641164, -5.        ],
#        [-2.77225586, -5.        ],
#        [-1.0171196 , -5.        ]])

# Seed NumPy's global generator as well as TensorFlow, before the environment
# is built, so any component that draws from np.random is reproducible too.
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)

# Alternative environments kept for quick switching:
# env = gym.make('Pendulum-v0')
env = ReachingEnv(include_t = True)
# env = ThrowingEnv(include_t = True)

# Dimensions and action limits are read from the environment's spaces.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = env.action_space.high
sess = tf.Session()

In [2]:
# Build the actor and critic on the shared session. The hidden layer sizes,
# seed, tau and learning rates all come from the configuration cell.
actor = ActorNetwork(sess, state_dim, action_dim, action_bound, hidden_layer_dim = [40,30], \
                     seed = RANDOM_SEED, tau = TAU, learning_rate = ACTOR_LEARNING_RATE)
# actor = RbfActorNetwork(sess, state_dim, action_dim, action_bound, hidden_layer_dim = [40,30], \
#                      seed = RANDOM_SEED, tau = TAU, learning_rate = ACTOR_LEARNING_RATE)

critic = CriticNetwork(sess, state_dim, action_dim, hidden_layer_dim = [30],\
                       l2_alpha = L2_DECAY, seed = RANDOM_SEED, tau =TAU, learning_rate = CRITIC_LEARNING_RATE)
replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
noise = OUNoise(action_dim, mu = OU_MU, theta = OU_THETA, sigma = OU_SIGMA, seed = RANDOM_SEED)
sess.run(tf.global_variables_initializer())

# One row per task, one column per episode. The row count follows the goal
# file instead of a hard-coded 50, so a different number of goals neither
# overflows the arrays nor leaves zero rows that would skew the plotted means.
num_tasks = len(goal_pos)
summary_sum_reward_list = np.zeros([num_tasks, MAX_EPISODE])
summary_avg_reward_list = np.zeros([num_tasks, MAX_EPISODE])

# Per-task snapshots of the variable values fetched at the end of each task.
all_recorded_actor_paras = []
all_recorded_critic_paras = []

In [3]:
# Train one agent per goal. Each task starts from freshly initialised
# variables and an empty replay buffer, so tasks are independent runs.
for j in range(len(goal_pos)):
    # The goal row is extended with two zeros before being handed to the env.
    # NOTE(review): presumably these are the target velocity components - confirm.
    env.set_goal(np.append(goal_pos[j], [0,0]))
    all_sum_reward_list = []
    all_avg_reward_list = []
    all_reward_list = []
    all_loss_list = []
    all_t_list = []
    sess.run(tf.global_variables_initializer())

    # The networks were just reset, so the experience must be reset too.
    # Otherwise transitions collected for earlier goals would be replayed
    # while learning the current one.
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in range(MAX_EPISODE):

        state = env.reset()
        noise.reset()
        reward_list = []
        loss_list = []

        for t in range(MAX_TIME):
            # Policy action plus exploration noise, clipped to the action bounds.
            action = actor.predict(np.reshape(state, (-1, state_dim)))
            action += noise.noise()
            action = np.clip(action, -action_bound, action_bound)
            action = np.reshape(action, action_dim)

            next_state, reward, done, info = env.step(action)
            replay_buffer.add_sample(np.reshape(state, state_dim), \
                                     np.reshape(action,action_dim),\
                                     reward,\
                                     np.reshape(next_state,state_dim),\
                                     done)

            # The sampling seed changes every step, so each draw differs but
            # the whole run is repeatable.
            mini_batch = replay_buffer.rand_sample(batch_size = BATCH_SIZE, seed = RANDOM_SEED + t + i*MAX_TIME)
            s_batch, a_batch, r_batch, s2_batch, t_batch = mini_batch

            # Critic target: reward plus discounted target-network value of
            # the target actor's next action.
            # NOTE(review): the terminal mask is disabled, so the target also
            # bootstraps from transitions flagged as done. Re-enable it once
            # the shape/dtype of t_batch is confirmed.
            a2_batch = actor.predict(s2_batch, if_target = True)
            training_q = r_batch + GAMMA * critic.predict(s2_batch, a2_batch, if_target = True) #* ~t_batch

            _, loss = critic.train(s_batch, a_batch, training_q)

            # Actor update driven by the critic's gradient for the actor's
            # current actions on the sampled states.
            train_action_batch = actor.predict(s_batch)
            critic_grad = critic.compute_critic_gradient(s_batch, train_action_batch)
            actor.train(s_batch, critic_grad[0])

            actor.update_target_network()
            critic.update_target_network()

            reward_list.append(reward)
            loss_list.append(loss)
            state = next_state
            if done:
                break

    #         print('Episode: %s \t Action: %s, %s \t State: %s,%s,%s,%s' %(i, action[0], action[1], state[0],state[1],state[2],state[3]))

        # Episode bookkeeping: total return, mean return over the last (up to)
        # 100 episodes, raw per-step logs and the index of the final step.
        all_sum_reward_list.append(np.sum(reward_list))
        all_avg_reward_list.append(np.mean(all_sum_reward_list[-100:]))
        all_reward_list.append(reward_list)
        all_loss_list.append(loss_list)
        all_t_list.append(t)

        print('Task : %s \t Episode: %s \t Time_step: %s \t Avg_reward: %s \t Cur_reward: %s'%(j, i, t, all_avg_reward_list[-1], all_sum_reward_list[-1]))

    # Store this task's learning curves and snapshot the values of the
    # trainable variables whose names contain 'actor_target' / 'critic_target'.
    summary_sum_reward_list[j] = np.array(all_sum_reward_list)
    summary_avg_reward_list[j] = np.array(all_avg_reward_list)
    record_actor_paras = [v for v in tf.trainable_variables() if 'actor_target' in v.name]
    record_critic_paras = [v for v in tf.trainable_variables() if 'critic_target' in v.name]
    all_recorded_actor_paras.append(sess.run(record_actor_paras))
    all_recorded_critic_paras.append(sess.run(record_critic_paras))

Task : 0 	 Episode: 0 	 Time_step: 199 	 Avg_reward: -121.874299024 	 Cur_reward: -121.874299024
Task : 0 	 Episode: 1 	 Time_step: 199 	 Avg_reward: -120.239545257 	 Cur_reward: -118.60479149
Task : 0 	 Episode: 2 	 Time_step: 199 	 Avg_reward: -98.752939085 	 Cur_reward: -55.7797267406
Task : 0 	 Episode: 3 	 Time_step: 199 	 Avg_reward: -85.1982523263 	 Cur_reward: -44.5341920503
Task : 0 	 Episode: 4 	 Time_step: 199 	 Avg_reward: -72.7346584753 	 Cur_reward: -22.8802830711
Task : 0 	 Episode: 5 	 Time_step: 199 	 Avg_reward: -66.7675021278 	 Cur_reward: -36.9317203901
Task : 0 	 Episode: 6 	 Time_step: 199 	 Avg_reward: -72.7102823967 	 Cur_reward: -108.366964011
Task : 0 	 Episode: 7 	 Time_step: 199 	 Avg_reward: -73.5748705315 	 Cur_reward: -79.626987475
Task : 0 	 Episode: 8 	 Time_step: 199 	 Avg_reward: -75.1981580439 	 Cur_reward: -88.1844581426
Task : 0 	 Episode: 9 	 Time_step: 199 	 Avg_reward: -74.0816178482 	 Cur_reward: -64.0327560872
Task : 0 	 Episode: 10 	 Time_ste

In [None]:
# Learning curves averaged over tasks (axis 0 of the summary arrays).
plt.figure(1)
mean_sum_reward = np.mean(summary_sum_reward_list, axis = 0)
means = np.mean(summary_avg_reward_list, axis = 0)
errors = np.std(summary_avg_reward_list, axis = 0)

plt.plot(mean_sum_reward, label = 'Episode return (mean over tasks)')
plt.plot(means, label = 'Running average return (mean over tasks)')

# Error bars every 10th episode. The range comes from the data itself rather
# than a hard-coded 500, so it stays valid if the episode count changes.
indexis = np.arange(0, len(means), 10)
plt.errorbar(indexis, means[indexis], errors[indexis], label = 'Std over tasks (running average)')

plt.xlabel('Episode')
plt.ylabel('Reward')
plt.legend()
plt.show()

  if self._edgecolors == 'face':


In [43]:
# Select the trainable variables whose name contains 'actor_target' ...
testing = [
    v
    for v in tf.trainable_variables()
    if 'actor_target' in v.name
]
# ... and evaluate them in the current session to get their values.
test = sess.run(testing)

In [10]:
# Shape of the loaded goal array.
goal_pos.shape

(50, 2)

In [4]:
# Display the per-task parameter snapshots collected by the training loop.
# The name `all_recorded_paras` is not defined anywhere in this notebook, so it
# fails on a fresh kernel; the actor list is the one that is actually built.
all_recorded_actor_paras

[[array([[ 0.77286857, -0.59390765, -0.38024545,  1.52092099, -0.50692403,
           0.9400019 , -0.87018859,  0.70858145, -0.65530795, -0.0048756 ,
           1.24183214,  0.29644755, -0.47211483,  0.51922488,  1.01006424,
          -0.32988319,  0.45615327, -0.02071878,  0.00964731,  0.02333229,
           0.23613161, -0.25129709,  0.6187368 , -0.16070931,  0.60067207,
          -1.75945997,  0.08631326,  1.08919787,  0.16050473, -0.28552952,
          -1.48346567,  0.9640128 , -0.26898971,  0.00345204,  0.81340545,
           1.37896419,  0.36950454,  0.02544495,  0.91181594,  1.18187189],
         [-1.10240269,  1.64776647,  0.89400274, -0.07732943,  0.29941466,
           1.21645999,  0.89444917, -0.09455988, -1.03166366, -0.01493152,
          -1.89812934, -0.93173319, -0.91077238, -0.28684011,  1.12343538,
           0.80951482,  0.47800025, -1.94940877, -1.43326318,  0.61967069,
           0.540546  , -1.77589583,  0.73202205, -1.07304323,  0.7607547 ,
          -0.58795744,  

In [5]:
# Persist the recorded parameters and reward curves in a single .npz archive.
# The arrays are passed positionally, so they are stored in this order under
# NumPy's default keys arr_0 .. arr_3.
np.savez(
    './Data/reaching_ddpg_fc.npz',
    all_recorded_actor_paras,
    all_recorded_critic_paras,
    summary_sum_reward_list,
    summary_avg_reward_list,
)