In [1]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from actor import ActorNetwork
from critic import CriticNetwork
from replay_buffer import ReplayBuffer
from ounoise import OUNoise
import gym, time
from Envs.reaching import ReachingEnv

# ---------------------------------------------------------------------------
# Experiment configuration. Everything a reader might tune lives here.
# ---------------------------------------------------------------------------

# Training schedule.
MAX_EPISODE = 1000  # number of episodes the training loop runs
MAX_TIME = 200      # upper bound on environment steps per episode

# Optimisation hyper-parameters.
ACTOR_LEARNING_RATE = 0.0001
CRITIC_LEARNING_RATE = 0.001
L2_DECAY = 0.01     # handed to the critic as `l2_alpha`
GAMMA = 0.99        # discount applied to the target critic's value in the TD target
TAU = 0.001         # handed to both networks as `tau` (used by update_target_network)

# Replay buffer.
BUFFER_SIZE = 1000000
BATCH_SIZE = 64     # mini-batch size drawn from the buffer at every step

# Exploration noise parameters, forwarded to OUNoise.
OU_MU = 0.0
OU_THETA = 0.15
OU_SIGMA = 0.20

RANDOM_SEED = 1926

env = gym.make('Pendulum-v0')
# env = ReachingEnv(include_t = True)

# Seed every source of randomness, not only TensorFlow: the environment draws
# its initial state from its own RNG on reset(), and helper code may fall back
# on NumPy's global RNG. Without these calls two runs with the same
# RANDOM_SEED produce different trajectories.
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
if hasattr(env, 'seed'):
    # Guarded so that a custom environment without seed() still works.
    env.seed(RANDOM_SEED)

# Dimensions and bounds are read from the environment's spaces.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = env.action_space.high  # the loop clips actions to [-action_bound, action_bound]
sess = tf.Session()

In [2]:
# Build the actor and the critic on the shared TensorFlow session. The actor
# is created before the critic, matching the original construction order.
actor = ActorNetwork(
    sess,
    state_dim,
    action_dim,
    action_bound,
    hidden_layer_dim=[40, 30],
    seed=RANDOM_SEED,
    tau=TAU,
    learning_rate=ACTOR_LEARNING_RATE
)
critic = CriticNetwork(
    sess,
    state_dim,
    action_dim,
    hidden_layer_dim=[30],
    l2_alpha=L2_DECAY,
    seed=RANDOM_SEED,
    tau=TAU,
    learning_rate=CRITIC_LEARNING_RATE
)

# Experience storage and exploration noise, both given the experiment seed.
replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
noise = OUNoise(
    action_dim,
    mu=OU_MU,
    theta=OU_THETA,
    sigma=OU_SIGMA,
    seed=RANDOM_SEED
)

# Initialise every TensorFlow variable created by the constructors above.
sess.run(tf.global_variables_initializer())

# Per-episode logs filled in by the training loop: episode return, raw
# per-step rewards, per-step critic losses, and index of the last step.
all_sum_reward_list, all_reward_list, all_loss_list, all_t_list = [], [], [], []

In [3]:
# Training loop. For each episode: roll out the noisy actor in `env`, store
# every transition in the replay buffer and, after every environment step,
# run one critic update, one actor update and one target-network update on a
# sampled mini-batch. Per-episode statistics are appended to the `all_*` lists.
for i in range(MAX_EPISODE):
    state = env.reset()
    # Restart the exploration-noise process at the beginning of each episode.
    noise.reset()
    reward_list = []
    loss_list = []
    
    for t in range(MAX_TIME):
        # Query the actor with the current state as a batch of one, perturb the
        # result with exploration noise (in place), clip it to the symmetric
        # action bounds and flatten it to `action_dim` for the environment.
        action = actor.predict(np.reshape(state, (-1, state_dim))) 
        action += noise.noise()
        action = np.clip(action, -action_bound, action_bound)
        action = np.reshape(action, action_dim)
        
        next_state, reward, done, info = env.step(action)
        # NOTE(review): the fifth stored field is the step index `t`, not
        # `done`, and `t_batch` below is never read, so the TD target
        # bootstraps from every next state including the last one of an
        # episode. Confirm this is intended.
        replay_buffer.add_sample(np.reshape(state, state_dim), \
                                 np.reshape(action,action_dim),\
                                 reward,\
                                 np.reshape(next_state,state_dim),\
                                 t)
        
        # The sampling seed is distinct for every (episode, step) pair because
        # t < MAX_TIME.
        # NOTE(review): sampling starts on the very first step, when the buffer
        # holds a single transition; what comes back then depends on
        # ReplayBuffer.rand_sample. Verify.
        mini_batch = replay_buffer.rand_sample(batch_size = BATCH_SIZE, seed = RANDOM_SEED + t + i*MAX_TIME)
        s_batch, a_batch, r_batch, s2_batch, t_batch = mini_batch

        # TD target: reward plus the discounted target-critic value of the next
        # state under the target actor's action.
        # NOTE(review): if `r_batch` is 1-D and `critic.predict` returns a
        # (batch, 1) array, this sum broadcasts to (batch, batch). Confirm the
        # shapes returned by ReplayBuffer and CriticNetwork.
        a2_batch = actor.predict(s2_batch, if_target = True)
        training_q = r_batch + GAMMA * critic.predict(s2_batch, a2_batch, if_target = True)
        
        # One critic update towards the target; the second returned value is
        # logged as the loss.
        _, loss = critic.train(s_batch, a_batch, training_q)
        
        # Actor update: evaluate the critic's gradient at the actions the
        # current (non-target) actor proposes for the sampled states and pass
        # its first element to the actor (presumably dQ/da; verify in critic.py).
        train_action_batch = actor.predict(s_batch)
        critic_grad = critic.compute_critic_gradient(s_batch, train_action_batch)
        actor.train(s_batch, critic_grad[0])
        
        # Both target networks are updated after every training step.
        actor.update_target_network()
        critic.update_target_network()
        
        reward_list.append(reward)
        loss_list.append(loss)
        state = next_state
        if done:
            break
            
    # Episode bookkeeping: return, raw rewards, losses, and the index of the
    # last executed step (episode length minus one).
    all_sum_reward_list.append(np.sum(reward_list))
    all_reward_list.append(reward_list)
    all_loss_list.append(loss_list)
    all_t_list.append(t)
    
    # Avg_reward is the mean episode return over the most recent (up to) 100
    # episodes; Cur_reward is the return of the episode that just ended.
    print('Episode: %s \t Time_step: %s \t Avg_reward: %s \t Cur_reward: %s'%(i, t, np.mean(all_sum_reward_list[-100:]), all_sum_reward_list[-1]))
    
        

Episode: 0 	 Time_step: 199 	 Avg_reward: -164.038645686 	 Cur_reward: -164.038645686
Episode: 1 	 Time_step: 199 	 Avg_reward: -170.512756062 	 Cur_reward: -176.986866438
Episode: 2 	 Time_step: 199 	 Avg_reward: -171.006995047 	 Cur_reward: -171.995473016
Episode: 3 	 Time_step: 199 	 Avg_reward: -171.432929975 	 Cur_reward: -172.710734761
Episode: 4 	 Time_step: 199 	 Avg_reward: -171.276220601 	 Cur_reward: -170.649383106
Episode: 5 	 Time_step: 199 	 Avg_reward: -171.970159095 	 Cur_reward: -175.439851561
Episode: 6 	 Time_step: 199 	 Avg_reward: -172.862735206 	 Cur_reward: -178.218191873
Episode: 7 	 Time_step: 199 	 Avg_reward: -166.375964728 	 Cur_reward: -120.968571383
Episode: 8 	 Time_step: 199 	 Avg_reward: -164.221459195 	 Cur_reward: -146.985414933
Episode: 9 	 Time_step: 199 	 Avg_reward: -163.494741346 	 Cur_reward: -156.954280705
Episode: 10 	 Time_step: 199 	 Avg_reward: -161.049302544 	 Cur_reward: -136.59491452
Episode: 11 	 Time_step: 199 	 Avg_reward: -152.726024

In [None]:
# Scratch inspection: shows the most recent `action` left in the kernel by the
# training loop. It only works after that cell has run (fully or partially).
action