In [1]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from actor import ActorNetwork
from rbfActor import RbfActorNetwork
from critic import CriticNetwork
from replay_buffer import ReplayBuffer
from ounoise import OUNoise
import gym, time
from Envs.reaching import ReachingEnv
from Envs.throwing import ThrowingEnv
import matplotlib.pyplot as plt


MAX_EPISODE = 250
MAX_TIME = 200

ACTOR_LEARNING_RATE = 0.0001
CRITIC_LEARNING_RATE = 0.001
L2_DECAY = 0.01
GAMMA = 0.99
TAU = 0.001

BUFFER_SIZE = 1000000
BATCH_SIZE = 64

OU_MU = 0.0
OU_THETA = 0.15  
OU_SIGMA = 0.20

RANDOM_SEED = 1926

goal_pos = np.load('./Envs/reaching_goal_pos.npy')

# goal_pos = np.array([[-4.11399563, -5.        ],
#        [-0.05680097, -5.        ],
#        [ 3.51188653, -5.        ],
#        [ 2.25174116, -5.        ],
#        [ 4.24114159, -5.        ],
#        [-3.44134834, -5.        ],
#        [-2.44153671, -5.        ],
#        [-2.33641164, -5.        ],
#        [-2.77225586, -5.        ],
#        [-1.0171196 , -5.        ]])
GAMMA = .99
# env = gym.make('Pendulum-v0')
env = ReachingEnv(include_t = True)
# env = ThrowingEnv(include_t = True)
tf.set_random_seed(RANDOM_SEED)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = env.action_space.high
sess = tf.Session()

In [2]:
actor = ActorNetwork(sess, state_dim, action_dim, action_bound, hidden_layer_dim = [40,30], \
                     seed = RANDOM_SEED, tau = TAU, learning_rate = ACTOR_LEARNING_RATE)
# actor = RbfActorNetwork(sess, state_dim, action_dim, action_bound, hidden_layer_dim = [40,30], \
#                      seed = RANDOM_SEED, tau = TAU, learning_rate = ACTOR_LEARNING_RATE)

critic = CriticNetwork(sess, state_dim, action_dim, hidden_layer_dim = [30],\
                       l2_alpha = L2_DECAY, seed = RANDOM_SEED, tau =TAU, learning_rate = CRITIC_LEARNING_RATE)
replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
noise = OUNoise(action_dim, mu = OU_MU, theta = OU_THETA, sigma = OU_SIGMA, seed = RANDOM_SEED)
sess.run(tf.global_variables_initializer())

summary_sum_reward_list = np.zeros([50,MAX_EPISODE])
summary_avg_reward_list = np.zeros([50,MAX_EPISODE])
all_recorded_paras = []

In [3]:
for j in range(len(goal_pos)-48):
    env.set_goal(np.append(goal_pos[j], [0,0]))
    all_sum_reward_list = []
    all_avg_reward_list = []
    all_reward_list = []
    all_loss_list = []
    all_t_list = []
    
    for i in range(MAX_EPISODE):

        state = env.reset()
        noise.reset()
        reward_list = []
        loss_list = []

        for t in range(MAX_TIME):
            action = actor.predict(np.reshape(state, (-1, state_dim))) 
            action += noise.noise()
            action = np.clip(action, -action_bound, action_bound)
            action = np.reshape(action, action_dim)

            next_state, reward, done, info = env.step(action)
            replay_buffer.add_sample(np.reshape(state, state_dim), \
                                     np.reshape(action,action_dim),\
                                     reward,\
                                     np.reshape(next_state,state_dim),\
                                     done)

            mini_batch = replay_buffer.rand_sample(batch_size = BATCH_SIZE, seed = RANDOM_SEED + t + i*MAX_TIME)
            s_batch, a_batch, r_batch, s2_batch, t_batch = mini_batch

            a2_batch = actor.predict(s2_batch, if_target = True)
            training_q = r_batch + GAMMA * critic.predict(s2_batch, a2_batch, if_target = True) #* ~t_batch

            _, loss = critic.train(s_batch, a_batch, training_q)

            train_action_batch = actor.predict(s_batch)
            critic_grad = critic.compute_critic_gradient(s_batch, train_action_batch)
            actor.train(s_batch, critic_grad[0])

            actor.update_target_network()
            critic.update_target_network()

            reward_list.append(reward)
            loss_list.append(loss)
            state = next_state
            if done:
                break

    #         print('Episode: %s \t Action: %s, %s \t State: %s,%s,%s,%s' %(i, action[0], action[1], state[0],state[1],state[2],state[3]))

        all_sum_reward_list.append(np.sum(reward_list))
        all_avg_reward_list.append(np.mean(all_sum_reward_list[-100:]))
        all_reward_list.append(reward_list)
        all_loss_list.append(loss_list)
        all_t_list.append(t)

        print('Task : %s \t Episode: %s \t Time_step: %s \t Avg_reward: %s \t Cur_reward: %s'%(j, i, t, all_avg_reward_list[-1], all_sum_reward_list[-1]))
    
    summary_sum_reward_list[j] = np.array(all_sum_reward_list)
    summary_avg_reward_list[j] = np.array(all_avg_reward_list)
    record_paras = [v for v in tf.trainable_variables() if 'actor_target' in v.name]
    all_recorded_paras.append(record_paras)


Task : 0 	 Episode: 0 	 Time_step: 199 	 Avg_reward: -75.1653294911 	 Cur_reward: -75.1653294911
Task : 0 	 Episode: 1 	 Time_step: 199 	 Avg_reward: -69.9493169292 	 Cur_reward: -64.7333043674
Task : 0 	 Episode: 2 	 Time_step: 199 	 Avg_reward: -75.4527745665 	 Cur_reward: -86.4596898412
Task : 0 	 Episode: 3 	 Time_step: 199 	 Avg_reward: -63.9215298592 	 Cur_reward: -29.3277957371
Task : 0 	 Episode: 4 	 Time_step: 199 	 Avg_reward: -58.2973255487 	 Cur_reward: -35.8005083067
Task : 0 	 Episode: 5 	 Time_step: 199 	 Avg_reward: -55.6001476627 	 Cur_reward: -42.1142582328
Task : 0 	 Episode: 6 	 Time_step: 199 	 Avg_reward: -55.2644756132 	 Cur_reward: -53.250443316
Task : 0 	 Episode: 7 	 Time_step: 199 	 Avg_reward: -50.297467106 	 Cur_reward: -15.5284075555
Task : 0 	 Episode: 8 	 Time_step: 199 	 Avg_reward: -47.7056467876 	 Cur_reward: -26.9710842406
Task : 0 	 Episode: 9 	 Time_step: 199 	 Avg_reward: -44.2156626974 	 Cur_reward: -12.8058058861
Task : 0 	 Episode: 10 	 Time_st

ValueError: could not broadcast input array from shape (250) into shape (1000)

In [None]:
plt.figure(1)
plt.plot(all_avg_reward_list)
plt.show()

In [43]:
testing = [v for v in tf.trainable_variables() if 'actor_target' in v.name]
# test,test2 = sess.run([tf.trainable_variables(), testing])
test = sess.run(testing)

In [45]:
[print(np.shape(v)) for v in test]

(5, 40)
(40,)
(40, 30)
(30,)
(30, 2)


[None, None, None, None, None]

In [31]:
[print(np.shape(v)) for v in test]

(5, 40)
(40,)
(40, 30)
(30,)
(30, 2)
(5, 40)
(40,)
(40, 30)
(30,)
(30, 2)
(5, 30)
(2, 30)
(30, 1)
(30,)
(5, 30)
(2, 30)
(30, 1)
(30,)


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]