In [1]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from actor import ActorNetwork
from rbfActor import RbfActorNetwork
from critic import CriticNetwork
from replay_buffer import ReplayBuffer
from ounoise import OUNoise
import gym, time
from Envs.reaching import ReachingEnv
from Envs.throwing import ThrowingEnv
import matplotlib.pyplot as plt


# --- Run length ---------------------------------------------------------
MAX_EPISODE = 500   # episodes run for every goal in the training cell
MAX_TIME = 200      # maximum number of environment steps per episode

# --- Optimisation hyper-parameters --------------------------------------
ACTOR_LEARNING_RATE = 0.0001    # passed as learning_rate to the actor network
CRITIC_LEARNING_RATE = 0.001    # passed as learning_rate to the critic network
L2_DECAY = 0.01                 # passed as l2_alpha to the critic network
# Factor applied to the target-critic prediction when the training target is
# built. Defined exactly once so there is a single source of truth.
GAMMA = 0.99
TAU = 0.001                     # passed as tau to both the actor and the critic

# --- Replay buffer ------------------------------------------------------
BUFFER_SIZE = 1000000   # first argument of ReplayBuffer
BATCH_SIZE = 64         # mini-batch size requested from the buffer each step

# --- Exploration noise (forwarded to OUNoise) ---------------------------
OU_MU = 0.0
OU_THETA = 0.15
OU_SIGMA = 0.20

# Seed shared by TensorFlow, the networks, the replay buffer and the noise.
RANDOM_SEED = 1926

# Goals iterated over by the training cell; one training run per entry.
goal_pos = np.load('./Envs/reaching_goal_pos.npy')

# Hard-coded alternative to the file above (currently disabled):
# goal_pos = np.array([[-4.11399563, -5.        ],
#        [-0.05680097, -5.        ],
#        [ 3.51188653, -5.        ],
#        [ 2.25174116, -5.        ],
#        [ 4.24114159, -5.        ],
#        [-3.44134834, -5.        ],
#        [-2.44153671, -5.        ],
#        [-2.33641164, -5.        ],
#        [-2.77225586, -5.        ],
#        [-1.0171196 , -5.        ]])

# --- Environment --------------------------------------------------------
# Other environments that were tried with this notebook (disabled):
# env = gym.make('Pendulum-v0')
# env = ThrowingEnv(include_t = True)
env = ReachingEnv(include_t = True)

# Seed TensorFlow before any network is built.
tf.set_random_seed(RANDOM_SEED)

# Sizes and bounds are read from the environment's spaces so that the
# networks built in the next cell match whichever environment is active.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = env.action_space.high

# One session shared by the actor and the critic.
sess = tf.Session()

In [2]:
# Policy network. RbfActorNetwork is the variant in use; the plain
# ActorNetwork call below takes the same arguments and is kept as a disabled
# alternative:
# actor = ActorNetwork(sess, state_dim, action_dim, action_bound, hidden_layer_dim = [40,30], \
#                      seed = RANDOM_SEED, tau = TAU, learning_rate = ACTOR_LEARNING_RATE)
actor = RbfActorNetwork(sess, state_dim, action_dim, action_bound,
                        hidden_layer_dim = [40,30],
                        seed = RANDOM_SEED, tau = TAU,
                        learning_rate = ACTOR_LEARNING_RATE)

# Value network used to build training targets and the actor's gradient.
critic = CriticNetwork(sess, state_dim, action_dim,
                       hidden_layer_dim = [30],
                       l2_alpha = L2_DECAY, seed = RANDOM_SEED, tau = TAU,
                       learning_rate = CRITIC_LEARNING_RATE)

# Transition storage and exploration noise, both seeded for repeatability.
replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
noise = OUNoise(action_dim, mu = OU_MU, theta = OU_THETA, sigma = OU_SIGMA,
                seed = RANDOM_SEED)

# Initialise every variable created by the constructors above.
sess.run(tf.global_variables_initializer())

# Per-goal learning curves: row j is filled by the training cell for goal j.
# The row count follows goal_pos (rather than a fixed 50) so that every goal
# has a row and no unused all-zero rows remain.
summary_sum_reward_list = np.zeros([len(goal_pos), MAX_EPISODE])
summary_avg_reward_list = np.zeros([len(goal_pos), MAX_EPISODE])

# Snapshots of the target-network parameters, one entry appended per goal.
all_recorded_actor_paras = []
all_recorded_critic_paras = []

In [None]:
# Train on every goal in goal_pos in turn and record the learning curves and
# the target-network parameters reached for each goal.
# actor, critic, replay_buffer and noise are created once in the setup cell
# and are not re-created inside this loop, so their state carries over from
# one goal to the next.
for j in range(len(goal_pos)):
    # The goal passed to the environment is goal_pos[j] followed by two zeros.
    # NOTE(review): presumably the zeros are a target velocity -- confirm
    # against ReachingEnv.set_goal.
    env.set_goal(np.append(goal_pos[j], [0,0]))
    # Per-goal statistics; rebuilt from scratch for every goal.
    all_sum_reward_list = []
    all_avg_reward_list = []
    all_reward_list = []
    all_loss_list = []
    all_t_list = []
    
    for i in range(MAX_EPISODE):

        # Start a fresh episode and restart the exploration-noise process.
        state = env.reset()
        noise.reset()
        reward_list = []
        loss_list = []

        for t in range(MAX_TIME):
            # Action from the current policy plus exploration noise, clipped
            # to [-action_bound, action_bound] and flattened to action_dim.
            action = actor.predict(np.reshape(state, (-1, state_dim))) 
            action += noise.noise()
            action = np.clip(action, -action_bound, action_bound)
            action = np.reshape(action, action_dim)

            # Step the environment and store the transition.
            next_state, reward, done, info = env.step(action)
            replay_buffer.add_sample(np.reshape(state, state_dim), \
                                     np.reshape(action,action_dim),\
                                     reward,\
                                     np.reshape(next_state,state_dim),\
                                     done)

            # Draw a mini-batch. The seed depends on the episode and step
            # indices but not on j, so the same seed sequence is reused for
            # every goal.
            # NOTE(review): sampling starts on the very first step, before
            # BATCH_SIZE transitions exist -- confirm rand_sample handles this.
            mini_batch = replay_buffer.rand_sample(batch_size = BATCH_SIZE, seed = RANDOM_SEED + t + i*MAX_TIME)
            s_batch, a_batch, r_batch, s2_batch, t_batch = mini_batch

            # Training target: reward plus GAMMA times the target critic's
            # value of the target actor's action in the next state.
            # NOTE(review): the terminal mask is commented out, so t_batch is
            # unused and the target bootstraps on terminal transitions too --
            # confirm this is intended.
            a2_batch = actor.predict(s2_batch, if_target = True)
            training_q = r_batch + GAMMA * critic.predict(s2_batch, a2_batch, if_target = True) #* ~t_batch

            # Critic update; the second returned value is logged as the loss.
            _, loss = critic.train(s_batch, a_batch, training_q)

            # Actor update: evaluate the current policy on the sampled states
            # and pass the first element of the critic's gradient result to
            # the actor's training step.
            # NOTE(review): presumably this is dQ/da -- confirm in critic.py.
            train_action_batch = actor.predict(s_batch)
            critic_grad = critic.compute_critic_gradient(s_batch, train_action_batch)
            actor.train(s_batch, critic_grad[0])

            # Update both target networks after every training step.
            actor.update_target_network()
            critic.update_target_network()

            reward_list.append(reward)
            loss_list.append(loss)
            state = next_state
            # Stop the episode early when the environment reports done.
            if done:
                break

    #         print('Episode: %s \t Action: %s, %s \t State: %s,%s,%s,%s' %(i, action[0], action[1], state[0],state[1],state[2],state[3]))

        # Episode statistics: total reward, mean of the totals over the most
        # recent (up to) 100 episodes, the raw traces, and the index of the
        # last step taken.
        all_sum_reward_list.append(np.sum(reward_list))
        all_avg_reward_list.append(np.mean(all_sum_reward_list[-100:]))
        all_reward_list.append(reward_list)
        all_loss_list.append(loss_list)
        all_t_list.append(t)

        print('Task : %s \t Episode: %s \t Time_step: %s \t Avg_reward: %s \t Cur_reward: %s'%(j, i, t, all_avg_reward_list[-1], all_sum_reward_list[-1]))
    
    # Store this goal's curves in row j of the summary arrays.
    summary_sum_reward_list[j] = np.array(all_sum_reward_list)
    summary_avg_reward_list[j] = np.array(all_avg_reward_list)
    # Snapshot the trainable variables whose names contain 'actor_target' /
    # 'critic_target' and append their current values for this goal.
    record_actor_paras = [v for v in tf.trainable_variables() if 'actor_target' in v.name]
    record_critic_paras = [v for v in tf.trainable_variables() if 'critic_target' in v.name]
    all_recorded_actor_paras.append(sess.run(record_actor_paras))
    all_recorded_critic_paras.append(sess.run(record_critic_paras))

Task : 0 	 Episode: 0 	 Time_step: 199 	 Avg_reward: -191.595636939 	 Cur_reward: -191.595636939
Task : 0 	 Episode: 1 	 Time_step: 199 	 Avg_reward: -191.549742664 	 Cur_reward: -191.50384839
Task : 0 	 Episode: 2 	 Time_step: 199 	 Avg_reward: -191.504435452 	 Cur_reward: -191.413821027
Task : 0 	 Episode: 3 	 Time_step: 199 	 Avg_reward: -191.654364907 	 Cur_reward: -192.10415327
Task : 0 	 Episode: 4 	 Time_step: 199 	 Avg_reward: -191.622791713 	 Cur_reward: -191.49649894
Task : 0 	 Episode: 5 	 Time_step: 199 	 Avg_reward: -191.598513671 	 Cur_reward: -191.47712346
Task : 0 	 Episode: 6 	 Time_step: 199 	 Avg_reward: -191.575945613 	 Cur_reward: -191.440537264
Task : 0 	 Episode: 7 	 Time_step: 199 	 Avg_reward: -191.594739563 	 Cur_reward: -191.72629721
Task : 0 	 Episode: 8 	 Time_step: 199 	 Avg_reward: -191.334274276 	 Cur_reward: -189.250551986
Task : 0 	 Episode: 9 	 Time_step: 199 	 Avg_reward: -191.395727064 	 Cur_reward: -191.948802155
Task : 0 	 Episode: 10 	 Time_step:

In [11]:
# List the names of the target-actor variables selected by the training cell.
# `record_actor_paras` is the list that cell defines; the previously used name
# `record_paras` is not assigned in any cell and fails on a fresh kernel.
test = [v.name for v in record_actor_paras]
for variable_name in test:
    print(variable_name)

actor_target/weights_rbf:0
actor_target/weights_pid:0
actor_target_1/weights_rbf:0
actor_target_1/weights_pid:0


In [4]:
# Learning curves for the goal trained last: the per-goal lists are rebuilt
# for every goal, so they hold only the most recent goal's episodes here.
fig = plt.figure(1)
ax = fig.gca()
# Sum of rewards collected in each episode.
ax.plot(all_sum_reward_list, label='episode reward sum')
# Mean of the episode sums over the most recent (up to) 100 episodes.
ax.plot(all_avg_reward_list, label='mean over last 100 episodes')
ax.set_xlabel('episode')
ax.set_ylabel('sum of rewards')
ax.set_title('Training reward per episode')
ax.legend(loc='best')
plt.show()

In [15]:
# NOTE(review): `test1` is not assigned in any visible cell; this display
# relies on state left in the kernel -- confirm where it comes from.
test1

[array([[ 0.49194562, -0.40710258],
        [-0.25445586,  1.53224158],
        [-0.54333317,  0.91868734],
        [-0.82634121,  0.9508    ],
        [-0.7505812 , -0.0811923 ],
        [ 1.25594985,  0.0549583 ],
        [-0.58448076,  0.40521252],
        [ 1.17066264,  0.01010851],
        [ 0.56132561, -0.22675836],
        [-0.00533446,  0.17792961],
        [ 0.13523023, -0.48673689],
        [ 0.48039314,  0.07201325],
        [ 0.81906778, -1.79299045],
        [ 0.2295589 ,  1.26048887],
        [-0.12102009, -0.10076116],
        [-1.53408384,  0.93438947],
        [-0.35953823,  0.1069089 ],
        [ 0.81876975,  1.52116883],
        [ 0.42825001,  0.07205483],
        [ 1.06748009,  0.91868311],
        [-1.07992125,  1.77906847],
        [ 1.19341791,  0.05580121],
        [ 0.49355066,  1.55945647],
        [ 0.76629263, -0.14363949],
        [-1.05794227, -0.08685627],
        [-1.84896374, -1.04760098],
        [-0.94853312, -0.25303429],
        [ 0.8249833 ,  1.020

In [None]:
# Show the value bound to `test` by the cell that prints the recorded
# variable names.
test