In [1]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from actor import ActorNetwork
from rbfActor import RbfActorNetwork
from actorls import ActorLSNetwork
from critic import CriticNetwork
from replay_buffer import ReplayBuffer
from ounoise import OUNoise
import gym, time
from Envs.reaching import ReachingEnv
from Envs.throwing import ThrowingEnv
import matplotlib.pyplot as plt
import tensor_toolbox_yyang as ttool

MAX_EPISODE = 500
MAX_TIME = 200

ACTOR_LEARNING_RATE = 0.0001
CRITIC_LEARNING_RATE = 0.001
L2_DECAY = 0.01
GAMMA = 0.99
TAU = 0.001

BUFFER_SIZE = 1000000
BATCH_SIZE = 64

OU_MU = 0.0
OU_THETA = 0.15  
OU_SIGMA = 0.20

RANDOM_SEED = 1926

goal_pos = np.load('./Envs/reaching_goal_pos.npy')
source_paras = np.load('./Data/reaching_ddpg_rbf_500.npz')

GAMMA = .99
# env = gym.make('Pendulum-v0')
env = ReachingEnv(include_t = True)
# env = ThrowingEnv(include_t = True)
tf.set_random_seed(RANDOM_SEED)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = env.action_space.high
sess = tf.Session()

In [2]:
actor = ActorLSNetwork(sess, state_dim, action_dim, action_bound, L_init = None, S_init = None, \
                         seed = RANDOM_SEED, tau = TAU, learning_rate = ACTOR_LEARNING_RATE)

critic = CriticNetwork(sess, state_dim, action_dim, hidden_layer_dim = [30],\
                       l2_alpha = L2_DECAY, seed = RANDOM_SEED, tau =TAU, learning_rate = CRITIC_LEARNING_RATE)
replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
noise = OUNoise(action_dim, mu = OU_MU, theta = OU_THETA, sigma = OU_SIGMA, seed = RANDOM_SEED)
sess.run(tf.global_variables_initializer())

summary_sum_reward_list = np.zeros([50,MAX_EPISODE])
summary_avg_reward_list = np.zeros([50,MAX_EPISODE])
all_recorded_actor_paras = []
all_recorded_critic_paras = []
all_weights = np.array([v[0] for v in source_paras['arr_0']])
rank = [10,50,2]


(?, 501) (501, 2)
(?, 501) (501, 2)


In [3]:
for j in range(len(goal_pos) - 49):
    j = 9
    indices = range(j)+range(j+1,50)
    U, S = ttool.tucker_dcmp(all_weights[indices], eps_or_k = rank)
    temp = np.tensordot(S, U[1], axes = (1,-1))
    L = np.tensordot(temp,U[2], axes = (1,-1))
    S = U[0][j-1]
    S = np.expand_dims(S, axis = -1)
    S = np.expand_dims(S, axis = -1)
#     print(S)
    # actor = ActorNetwork(sess, state_dim, action_dim, action_bound, hidden_layer_dim = [40,30], \
    #                      seed = RANDOM_SEED, tau = TAU, learning_rate = ACTOR_LEARNING_RATE)
    

    sess.run(tf.global_variables_initializer())
    L_tensor = tf.constant(L, dtype = tf.float32)
    S_tensor = tf.constant(S, dtype = tf.float32)
#     L_paras = [v for v in tf.trainable_variables() if 'actor' in v.name and 'L' in v.name]
#     S_paras = [v for v in tf.trainable_variables() if 'actor' in v.name and 'S' in v.name]
    initializeL = [v.assign(L_tensor) for v in tf.trainable_variables() if 'actor' in v.name and 'L' in v.name]
    initializeS = [v.assign(S_tensor) for v in tf.trainable_variables() if 'actor' in v.name and 'S' in v.name]
    sess.run([initializeL, initializeS])
#     temp_L, temp_S = sess.run([L_paras, S_paras])
#     print(temp_S)
    env.set_goal(np.append(goal_pos[j], [0,0]))
    all_sum_reward_list = []
    all_avg_reward_list = []
    all_reward_list = []
    all_loss_list = []
    all_t_list = []
    
    for i in range(MAX_EPISODE):

        state = env.reset()
        noise.reset()
        reward_list = []
        loss_list = []

        for t in range(MAX_TIME):
            action = actor.predict(np.reshape(state, (-1, state_dim))) 
            action += noise.noise()
            action = np.clip(action, -action_bound, action_bound)
            action = np.reshape(action, action_dim)

            next_state, reward, done, info = env.step(action)
            replay_buffer.add_sample(np.reshape(state, state_dim), \
                                     np.reshape(action,action_dim),\
                                     reward,\
                                     np.reshape(next_state,state_dim),\
                                     done)

            mini_batch = replay_buffer.rand_sample(batch_size = BATCH_SIZE, seed = RANDOM_SEED + t + i*MAX_TIME)
            s_batch, a_batch, r_batch, s2_batch, t_batch = mini_batch

            a2_batch = actor.predict(s2_batch, if_target = True)
            training_q = r_batch + GAMMA * critic.predict(s2_batch, a2_batch, if_target = True) #* ~t_batch

            _, loss = critic.train(s_batch, a_batch, training_q)

            train_action_batch = actor.predict(s_batch)
            critic_grad = critic.compute_critic_gradient(s_batch, train_action_batch)
            actor.train(s_batch, critic_grad[0])

            actor.update_target_network()
            critic.update_target_network()

            reward_list.append(reward)
            loss_list.append(loss)
            state = next_state
            if done:
                break

    #         print('Episode: %s \t Action: %s, %s \t State: %s,%s,%s,%s' %(i, action[0], action[1], state[0],state[1],state[2],state[3]))

        all_sum_reward_list.append(np.sum(reward_list))
        all_avg_reward_list.append(np.mean(all_sum_reward_list[-100:]))
        all_reward_list.append(reward_list)
        all_loss_list.append(loss_list)
        all_t_list.append(t)

        print('Task : %s \t Episode: %s \t Time_step: %s \t Avg_reward: %3.5f \t Cur_reward: %3.5f'%(j, i, t, all_avg_reward_list[-1], all_sum_reward_list[-1]))
#     temp_L, temp_S = sess.run([L_paras, S_paras])
#     print(temp_S)
    summary_sum_reward_list[j] = np.array(all_sum_reward_list)
    summary_avg_reward_list[j] = np.array(all_avg_reward_list)
    record_actor_paras = [v for v in tf.trainable_variables() if 'actor_target' in v.name]
    record_critic_paras = [v for v in tf.trainable_variables() if 'critic_target' in v.name]
    all_recorded_actor_paras.append(sess.run(record_actor_paras))
    all_recorded_critic_paras.append(sess.run(record_critic_paras))

Task : 9 	 Episode: 0 	 Time_step: 199 	 Avg_reward: -131.86109 	 Cur_reward: -131.86109
Task : 9 	 Episode: 1 	 Time_step: 199 	 Avg_reward: -157.39538 	 Cur_reward: -182.92966
Task : 9 	 Episode: 2 	 Time_step: 199 	 Avg_reward: -163.51305 	 Cur_reward: -175.74840
Task : 9 	 Episode: 3 	 Time_step: 199 	 Avg_reward: -169.76370 	 Cur_reward: -188.51564
Task : 9 	 Episode: 4 	 Time_step: 199 	 Avg_reward: -173.66153 	 Cur_reward: -189.25286
Task : 9 	 Episode: 5 	 Time_step: 199 	 Avg_reward: -176.43149 	 Cur_reward: -190.28130
Task : 9 	 Episode: 6 	 Time_step: 199 	 Avg_reward: -178.47173 	 Cur_reward: -190.71316
Task : 9 	 Episode: 7 	 Time_step: 199 	 Avg_reward: -179.82729 	 Cur_reward: -189.31620
Task : 9 	 Episode: 8 	 Time_step: 199 	 Avg_reward: -180.86534 	 Cur_reward: -189.16977
Task : 9 	 Episode: 9 	 Time_step: 199 	 Avg_reward: -181.69159 	 Cur_reward: -189.12782
Task : 9 	 Episode: 10 	 Time_step: 199 	 Avg_reward: -182.33836 	 Cur_reward: -188.80606
Task : 9 	 Episode: 

In [4]:
np.savez('./Data/reaching_ddpg_rbf_transfer_500/9.npz', summary_avg_reward_list, summary_sum_reward_list)

In [None]:
[print(v.shape) for v in U]
print(S.shape)

test = np.tensordot(S, U[1], axes = (1,-1))
print(test.shape)

test2 = np.tensordot(test, U[2], axes = (1,-1))
print(test2.shape)

test3 = np.array([ np.sum(np.array([ U[0][v][i]*test2[i]  for i in range(10)]), axis = 0) for v in range(50)])
# test3 = np.tensordot(test2,U[0], axes = (-1,0))
# print(test3.shape)

In [5]:
plt.figure(1)
plt.plot(all_avg_reward_list)
plt.plot(all_sum_reward_list)
plt.show()

In [17]:
plt.figure(1)
plt.plot(np.mean(summary_sum_reward_list, axis = 0))
plt.plot(np.mean(summary_avg_reward_list, axis = 0))
indexis = np.arange(0,500,10)
errors = np.std(summary_avg_reward_list, axis = 0)
means = np.mean(summary_avg_reward_list, axis = 0)
plt.errorbar(indexis, means[indexis], errors[indexis])
plt.show()

In [4]:
actor_paras = [v for v in tf.trainable_variables() if 'actor' in v.name]
critic_paras = [v for v in tf.trainable_variables() if 'critic' in v.name]
print_actor = [print(v.name) for v in tf.trainable_variables() if 'actor' in v.name]
print_critic = [print(v.name) for v in tf.trainable_variables() if 'critic' in v.name]

actor/L:0
actor/S:0
actor_target/L:0
actor_target/S:0
critic/weights_state:0
critic/weights_action:0
critic/weights_hidden:0
critic/bias:0
critic_target/weights_state:0
critic_target/weights_action:0
critic_target/weights_hidden:0
critic_target/bias:0


In [6]:
actor_p, critic_p = sess.run([actor_paras, critic_paras])
actor_init = [v.initialized_value() for v in actor_paras]
critic_init = [v.initialized_value() for v in critic_paras]
actor_i, critic_i = sess.run([actor_init, critic_init])

In [5]:
np.savez('./Data/reaching_ddpg_rbf_transfer_50/01.npz', all_recorded_actor_paras, all_recorded_critic_paras, summary_avg_reward_list, summary_sum_reward_list)

ValueError: could not broadcast input array from shape (10,51,2) into shape (10)

In [4]:
np.savez('./Data/reaching_ddpg_rbf_transfer_50/04.npz', summary_avg_reward_list, summary_sum_reward_list)

In [14]:
sess.run(tf.global_variables_initializer())

In [13]:
print(all_recorded_actor_paras[0][0].shape)

(10, 51, 2)
