In [1]:
import tensorflow.compat.v1 as tf
import gym
import numpy as np

In [2]:
def build_graph_Q(state, action):
    """Build the critic network Q(state, action) and return its output tensor.

    Three dense stages are created under the variable scopes ``layer0``,
    ``layer1`` and ``layer2`` (relative to whatever scope is active at the
    call site): two 128-unit ReLU stages followed by one linear unit.

    Args:
        state: tensor fed to the state branch of the first stage.
        action: tensor fed to the bias-free action branch of the first stage.

    Returns:
        Output tensor of the final 1-unit dense layer (no activation).
    """
    hidden_units = 128

    with tf.variable_scope('layer0'):
        # Joint affine transformation of state and action: W_s*s + W_a*a + b.
        # Only the state branch has a bias, so the sum carries a single b.
        state_branch = tf.layers.dense(state, hidden_units)
        action_branch = tf.layers.dense(action, hidden_units, use_bias=False)
        hidden = tf.nn.relu(action_branch + state_branch)

    with tf.variable_scope('layer1'):
        hidden = tf.layers.dense(hidden, hidden_units, activation=tf.nn.relu)

    with tf.variable_scope('layer2'):
        q_value = tf.layers.dense(hidden, 1)

    return q_value

def build_graph_policy(state):
    """Build the actor network and return the action tensor for ``state``.

    Two 128-unit ReLU dense stages (scopes ``layer0`` and ``layer1``) feed a
    single tanh unit (scope ``layer2``) whose output is scaled by 2.

    Args:
        state: input tensor for the first dense layer.

    Returns:
        Tensor with values in [-2, 2] (tanh output multiplied by 2).
    """
    hidden_units = 128

    with tf.variable_scope('layer0'):
        hidden = tf.layers.dense(state, hidden_units, activation=tf.nn.relu)

    with tf.variable_scope('layer1'):
        hidden = tf.layers.dense(hidden, hidden_units, activation=tf.nn.relu)

    with tf.variable_scope('layer2'):
        squashed = tf.layers.dense(hidden, 1, activation=tf.nn.tanh)
        # The action space spans -2..2, so the tanh output (-1..1) is
        # multiplied by 2.
        scaled_action = tf.multiply(squashed, 2)
    return scaled_action

In [3]:
# `tf` is already bound to `tensorflow.compat.v1` by the import cell, so the
# switch to graph mode can be called on it directly. Graph mode is required
# by the placeholders and sessions used in the rest of the notebook.
tf.disable_eager_execution()

In [4]:
# Graph inputs for one batch of transitions. The leading dimension is left as
# None so any batch size can be fed.
# NOTE(review): the 3-wide observations and 1-wide actions presumably match
# the Pendulum environment used further down — confirm if the env changes.
observations_ph = tf.placeholder(
    dtype=tf.float32, shape=(None, 3), name='observation')
next_observations_ph = tf.placeholder(
    dtype=tf.float32, shape=(None, 3), name='next_observation')
rewards_ph = tf.placeholder(
    dtype=tf.float32, shape=(None, 1), name='reward')
actions_ph = tf.placeholder(
    dtype=tf.float32, shape=(None, 1), name='action')
terminals_ph = tf.placeholder(
    dtype=tf.float32, shape=(None, 1), name='terminal')

In [5]:
# --- Actor -----------------------------------------------------------------
# Online policy mu(s), evaluated on the current observation.
with tf.variable_scope('Actor/eval'):
    policy = build_graph_policy(observations_ph)
# Target policy mu'(s'). It is only used to form the TD target
# r + gamma * Q'(s', mu'(s')), so it must be evaluated on the NEXT
# observation. Evaluating it on the current observation would pair the next
# state with an action chosen for a different state.
with tf.variable_scope('Actor/target'):
    target_policy = build_graph_policy(next_observations_ph)

# --- Critic ----------------------------------------------------------------
# Q(s, a) for the action actually stored in the batch (critic regression).
with tf.variable_scope('Critic/eval'):
    q = build_graph_Q(observations_ph, actions_ph)
# Q(s, mu(s)) sharing the same critic weights (reuse=True); this is the
# quantity the actor is trained to maximise.
with tf.variable_scope('Critic/eval', reuse=True):
    q_policy = build_graph_Q(observations_ph, policy)

# Target critic Q'(s', mu'(s')) used inside the TD target.
with tf.variable_scope('Critic/target'):
    target_q = build_graph_Q(next_observations_ph, target_policy)

# Variable lists per network, collected by scope prefix. They are used to
# restrict each optimiser to its own network and to pair eval/target weights.
policy_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
target_policy_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
q_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
target_q_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [6]:
# Hyperparameters.
TAU = 0.01          # soft-update mixing rate for the target networks
GAMMA = 0.9         # discount factor
LR_Critic = 0.001   # critic learning rate
LR_Actor = 0.0001   # actor learning rate

# TD target: reward plus the discounted target-critic value, with the
# bootstrap term masked out on terminal transitions. stop_gradient keeps the
# critic loss from back-propagating into the target.
not_terminal = 1 - terminals_ph
q_target = tf.stop_gradient(rewards_ph + not_terminal * GAMMA * target_q)

# Critic: half mean-squared TD error, optimised over the critic weights only.
td_error = q_target - q
q_error = 0.5 * tf.reduce_mean(td_error ** 2)
critic_optimizer = tf.train.AdamOptimizer(LR_Critic)
q_train_ops = critic_optimizer.minimize(loss=q_error, var_list=q_params)

# Actor: maximise Q(s, mu(s)) by minimising its negative mean, optimised over
# the actor weights only.
policy_loss = - tf.reduce_mean(q_policy)
actor_optimizer = tf.train.AdamOptimizer(LR_Actor)
policy_train_ops = actor_optimizer.minimize(loss=policy_loss, var_list=policy_params)

# Soft updates: target <- (1 - TAU) * target + TAU * eval, one assign op per
# variable pair.
update_policy_ops = [
    tf.assign(target_var, (1-TAU)*target_var + TAU*eval_var)
    for target_var, eval_var in zip(target_policy_params, policy_params)
]
update_q_ops = [
    tf.assign(target_var, (1-TAU)*target_var + TAU*eval_var)
    for target_var, eval_var in zip(target_q_params, q_params)
]

def action_respond(sess, obs):
    """Run the online policy on ``obs`` and return the first resulting action.

    Args:
        sess: session used to evaluate the ``policy`` tensor.
        obs: value fed to ``observations_ph`` (presumably a batch of
            observations; only the first row of the result is returned).

    Returns:
        Element 0 of the evaluated ``policy`` output.
    """
    batch_actions = sess.run(policy, feed_dict={observations_ph: obs})
    return batch_actions[0]

def init_training(sess):
    """Initialise the target networks as exact copies of the eval networks.

    The soft-update ops only move each target weight a fraction TAU toward
    its eval counterpart, so running them once would leave the targets almost
    entirely at their own random initialisation. A hard copy is performed
    instead, so both networks start from identical weights.

    Intended to be called once, after the variables have been initialised:
    each call adds a fresh set of assign ops to the graph.

    Args:
        sess: session in which the copy is executed.
    """
    eval_vars = policy_params + q_params
    target_vars = target_policy_params + target_q_params
    hard_copy_ops = [
        tf.assign(target_var, eval_var)
        for target_var, eval_var in zip(target_vars, eval_vars)
    ]
    sess.run(hard_copy_ops)

def get_feed_dict(batch):
    """Map a batch of transitions onto the graph's input placeholders.

    Args:
        batch: mapping with the keys 'observations', 'actions',
            'next_observations', 'rewards' and 'terminals'.

    Returns:
        Dict from placeholder to the corresponding batch entry, suitable for
        ``sess.run(..., feed_dict=...)``.
    """
    return {
        observations_ph: batch['observations'],
        actions_ph: batch['actions'],
        next_observations_ph: batch['next_observations'],
        rewards_ph: batch['rewards'],
        terminals_ph: batch['terminals'],
    }
    
def do_training(sess, batch):
    """Perform one training step on ``batch`` and soft-update the targets.

    The critic and actor optimiser ops are run together in one session call
    on the same feed, then the target policy and target critic are each moved
    toward their eval networks.

    Args:
        sess: session used to run the ops.
        batch: transition batch accepted by ``get_feed_dict``.
    """
    sess.run([q_train_ops, policy_train_ops], get_feed_dict(batch))
    # Actor target first, then critic target — one session call per network.
    for soft_update_ops in (update_policy_ops, update_q_ops):
        sess.run(soft_update_ops)

In [7]:
env = gym.make('Pendulum-v0') # create the environment
episodes = 10 # run 10 episodes
steps = 1000 # each episode runs for at most 1000 steps
for ep in range(episodes):
  state = env.reset() # reset() returns the first state of the new episode
  for step in range(steps):
    # env.render() # render() draws the animation; usually left off because it can crash on Windows
    action = env.action_space.sample() # pick a random action from the action space
    # Apply the action; step() returns the next state s', the reward, and
    # `done` (whether the episode has ended).
    next_state, reward, done, _ = env.step(action)
    state = next_state
    if done:
      # Stepping a finished episode is not meaningful; start the next one.
      break

In [8]:
env = gym.make('Pendulum-v0')
episodes = 10
steps = 1000 # upper bound on the number of steps per episode

BUFFER_SIZE = 1000000 # maximum number of experiences kept in the buffer
buffer = []
for ep in range(episodes):
    state = env.reset() # reset() returns the first state of the new episode
    for step in range(steps):
        # env.render()
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)

        if len(buffer) >= BUFFER_SIZE:
            buffer.pop(0) # buffer is full: drop the oldest experience
        # Store the experience as [state, action, reward, terminal, next_state];
        # reward and terminal are wrapped in 1-element arrays.
        buffer.append([state, action, np.array([reward]), np.array([done]).astype(int), next_state])
        state = next_state # the next state becomes the current state
        if done:
            # Stop at episode end so no post-termination transitions (all
            # flagged terminal) are pushed into the buffer.
            break