In [1]:
import numpy as np
import tensorflow as tf
import gym
import mujoco_py

# Seed both NumPy and TensorFlow so runs are repeatable.
np.random.seed(2)
tf.set_random_seed(2)  # reproducible

# Hyperparameters
OUTPUT_GRAPH = False  # when True, the training cell writes the TF graph to "logs/"
MAX_EPISODE = 300000  # number of episodes the training cell loops over
DISPLAY_REWARD_THRESHOLD = -10  # the training/eval cells set RENDER = True once the running reward is greater than this threshold
MAX_EP_STEPS = 100 # maximum number of time steps in one episode
RENDER = False # rendering wastes time
GAMMA = 0.99     # reward discount in TD error
ALPHA = 2  # divides the TD error inside the Actor loss; presumably the entropy-regularization coefficient -- TODO confirm
LR_A = 0.00001    # learning rate for actor
LR_C = 0.00001    # learning rate for critic
# env = gym.make('MountainCarContinuous-v0')
# NOTE(review): `.env` takes the inner environment of what gym.make returns, presumably
# to bypass gym's time-limit wrapper; episodes are cut at MAX_EP_STEPS instead -- confirm.
env = gym.make('Reacher-v1').env
env.seed(1)  # reproducible

N_F = env.observation_space.shape[0]  # number of observation features
# Reacher: discretize the 2-D continuous action space into a grid of
# action_res[0] x action_res[1] evenly spaced torque pairs.
action_res = [3, 3]
N_A = action_res[0]*action_res[1]  # number of discrete actions
# action_map[k] is the continuous 2-vector sent to the env for discrete action k.
action_map = np.zeros([np.prod(action_res), 2])
u = np.linspace(env.action_space.low[0], env.action_space.high[0], num=action_res[0])
v = np.linspace(env.action_space.low[1], env.action_space.high[1], num=action_res[1])
for i in range(action_res[0]):
    for j in range(action_res[1]):
        # Row-major flattening of the (i, j) grid cell into a single action index.
        s = action_res[1] * i + j
        action_map[s, :] = [u[i], v[j]]

[2018-06-21 19:09:32,635] Making new env: Reacher-v1


In [2]:
class ExpReplay(object):
    """Fixed-capacity ring buffer of transitions, sampled uniformly with replacement.

    Each transition is ``(state, action, reward, next_state)``. The action is
    stored as a single scalar per transition (one column), and all arrays are
    float32. Once ``memory_size`` transitions have been written, new ones
    overwrite the oldest slots.

    Args:
        memory_size: number of transitions the buffer can hold.
        state_dim: length of one state vector.
        act_dim: kept as an attribute only; it does not size any array here.
        batch_size: number of transitions returned by ``sampling``.
    """

    def __init__(self, memory_size, state_dim, act_dim, batch_size):
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.state_dim = state_dim
        self.act_dim = act_dim
        self.states = np.empty((self.memory_size, self.state_dim), dtype=np.float32)
        self.actions = np.empty((self.memory_size, 1), dtype=np.float32)
        self.rewards = np.empty(self.memory_size, dtype=np.float32)
        self.next_states = np.empty((self.memory_size, self.state_dim), dtype=np.float32)
        # count: total transitions ever written; current: next slot to write.
        self.count = 0
        self.current = 0

    def fifo(self, state, action, reward, next_state):
        """Write one transition into the next slot, wrapping around when full."""
        self.states[self.current] = state
        self.actions[self.current] = action
        self.rewards[self.current] = reward
        self.next_states[self.current] = next_state
        self.current = (self.current + 1) % self.memory_size
        self.count = self.count + 1

    def add_trajectory(self, states, actions, rewards, next_states):
        """Append a whole trajectory, one transition per element of ``states``.

        The four sequences are indexed in parallel; the length of ``states``
        determines how many transitions are written.
        """
        # The original used len(observes), an undefined name (NameError on every call).
        num = len(states)
        for i in range(0, num):
            self.fifo(states[i], actions[i], rewards[i], next_states[i])

    def sampling(self):
        """Draw ``batch_size`` stored transitions uniformly at random, with replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states)`` with shapes
            ``(batch, state_dim)``, ``(batch, 1)``, ``(batch, 1)`` and
            ``(batch, state_dim)``.

        Raises:
            ValueError: if no transition has been stored yet.
        """
        # Only slots that have actually been written are eligible.
        filled = min(self.count, self.memory_size)
        if filled == 0:
            raise ValueError("cannot sample from an empty replay buffer")
        indexes = np.random.randint(filled, size=self.batch_size)
        states = self.states[indexes].reshape((-1, self.state_dim))
        actions = self.actions[indexes].reshape((-1, 1))
        rewards = self.rewards[indexes].reshape((-1, 1))
        next_states = self.next_states[indexes].reshape((-1, self.state_dim))
        return states, actions, rewards, next_states

In [3]:
class Memory(object):
    """Fixed-capacity transition store whose whole content is returned shuffled.

    Each transition is ``(state, action, reward, next_state)``. The action is
    stored as a single scalar per transition (one column), and all arrays are
    float32. Writes wrap around once ``memory_size`` transitions are stored.

    Args:
        memory_size: number of transitions the store can hold.
        state_dim: length of one state vector.
        act_dim: kept as an attribute only; it does not size any array here.
    """

    def __init__(self, memory_size, state_dim, act_dim):
        self.memory_size = memory_size
        self.state_dim = state_dim
        self.act_dim = act_dim
        self._allocate()

    def _allocate(self):
        """Create fresh (uninitialized) storage arrays and reset the write cursor."""
        self.states = np.empty((self.memory_size, self.state_dim), dtype=np.float32)
        self.actions = np.empty((self.memory_size, 1), dtype=np.float32)
        self.rewards = np.empty(self.memory_size, dtype=np.float32)
        self.next_states = np.empty((self.memory_size, self.state_dim), dtype=np.float32)
        # count: total transitions written since the last reset; current: next slot to write.
        self.count = 0
        self.current = 0

    def fifo(self, state, action, reward, next_state):
        """Write one transition into the next slot, wrapping around when full."""
        self.states[self.current] = state
        self.actions[self.current] = action
        self.rewards[self.current] = reward
        self.next_states[self.current] = next_state
        self.current = (self.current + 1) % self.memory_size
        self.count = self.count + 1

    def add_trajectory(self, states, actions, rewards, next_states):
        """Append a whole trajectory, one transition per element of ``states``.

        The four sequences are indexed in parallel; the length of ``states``
        determines how many transitions are written.
        """
        # The original used len(observes), an undefined name (NameError on every call).
        num = len(states)
        for i in range(0, num):
            self.fifo(states[i], actions[i], rewards[i], next_states[i])

    def sampling(self):
        """Return every stored transition exactly once, in random order.

        Returns:
            Tuple ``(states, actions, rewards, next_states)`` with shapes
            ``(n, state_dim)``, ``(n, 1)``, ``(n, 1)`` and ``(n, state_dim)``,
            where ``n = min(count, memory_size)``.
        """
        # count keeps growing after the buffer wraps, so cap it at the capacity;
        # permuting `count` indices would otherwise index past the arrays.
        filled = min(self.count, self.memory_size)
        randpermlist = np.random.permutation(filled)
        states = self.states[randpermlist].reshape((-1, self.state_dim))
        actions = self.actions[randpermlist].reshape((-1, 1))
        rewards = self.rewards[randpermlist].reshape((-1, 1))
        next_states = self.next_states[randpermlist].reshape((-1, self.state_dim))
        return states, actions, rewards, next_states

    def empty_memory(self):
        """Discard all stored transitions by reallocating the storage arrays."""
        self._allocate()

In [4]:
class Actor(object):
    """Discrete-action policy network with a sparsemax output layer.

    The network is state -> dense(64, tanh) -> dense(32, tanh) ->
    dense(n_actions, sparsemax). Actions are drawn from the network output
    blended with a uniform distribution; the blend weight ``eps`` starts at 1
    and is multiplied by ``eps_decay_rate`` on every ``update_policy`` call.

    Args:
        sess: TensorFlow session used for every run call.
        n_features: length of one state vector.
        n_actions: number of discrete actions (width of the output layer).
        batch_size: mini-batch size used by ``learn``.
        train_itr: number of shuffled passes over the data per ``learn`` call.
        eps_decay_rate: multiplicative decay applied to ``eps``.
        lr: Adam learning rate.
    """

    def __init__(self, sess, n_features, n_actions, batch_size, train_itr, eps_decay_rate, lr=0.0005):
        self.sess = sess
        self.batch_size = batch_size
        self.train_itr = train_itr

        # Exploration schedule: eps is the weight of the uniform component.
        self.eps = 1
        self.eps_decay_rate = eps_decay_rate
        self.eps_min = 0

        # Graph inputs. Ops are created in a fixed order on purpose: with a
        # graph-level seed, initial weights depend on op creation order.
        self.s = tf.placeholder(tf.float32, (None, n_features), "state")
        self.a = tf.placeholder(tf.float32, (None, n_actions), "act")  # one-hot action taken
        self.td_error = tf.placeholder(tf.float32, (None, 1), "td_error")

        with tf.variable_scope('Actor'):
            hidden_1 = tf.layers.dense(
                inputs=self.s,
                units=64,
                activation=tf.tanh,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                name='l1'
            )
            hidden_2 = tf.layers.dense(
                inputs=hidden_1,
                units=32,
                activation=tf.tanh,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                name='l2'
            )
            # Sparsemax yields a probability vector that may contain exact zeros.
            self.acts_prob = tf.layers.dense(
                inputs=hidden_2,
                units=n_actions,
                activation=tf.contrib.sparsemax.sparsemax,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                name='acts_prob'
            )

        with tf.variable_scope('exp_v'):
            # error = max(td/ALPHA + 0.5 + 0.5*||pi||^2, 0) - pi(a|s)
            scaled_td = self.td_error/ALPHA
            shifted_td = scaled_td + 0.5
            half_sq_norm = 0.5*tf.reduce_sum(
                tf.multiply(self.acts_prob, self.acts_prob), 1, keep_dims=True)
            clipped_target = tf.maximum(shifted_td + half_sq_norm, 0)
            taken_prob = tf.reduce_sum(
                tf.multiply(self.acts_prob, self.a), 1, keep_dims=True)
            self.error = clipped_target - taken_prob
            self.loss = tf.reduce_sum(tf.square(self.error))

        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def _blend_with_uniform(self, probs):
        """Flatten ``probs``, mix it with a uniform distribution by ``eps``, renormalize."""
        flat = probs.ravel()
        n_entries = flat.shape[0]
        mixed = np.ones(n_entries) * self.eps / n_entries + flat * (1. - self.eps)
        return mixed/np.sum(mixed)

    def learn(self, s, a, td):
        """Run ``train_itr`` shuffled passes of mini-batch updates; always returns 0.

        Args:
            s: states, one row per sample.
            a: one-hot actions, one row per sample.
            td: TD errors, one row per sample.

        Each pass reshuffles the (already shuffled) data and trains on
        consecutive full batches; a trailing partial batch is not used.
        """
        step = self.batch_size
        for _pass in range(0, self.train_itr):
            order = np.random.permutation(s.shape[0])
            s, a, td = s[order], a[order], td[order]
            n_full_batches = s.shape[0] // step
            for b in range(0, n_full_batches):
                window = slice(b*step, (b + 1)*step)
                feed_dict = {
                    self.s: s[window],
                    self.a: a[window],
                    self.td_error: td[window],
                }
                _, _loss = self.sess.run([self.train_op, self.loss], feed_dict)

        return 0

    def choose_action(self, s):
        """Sample one action index for the single state vector ``s``."""
        batch_of_one = s[np.newaxis, :]
        net_probs = self.sess.run(self.acts_prob, {self.s: batch_of_one})
        policy = self._blend_with_uniform(net_probs)
        return np.random.choice(np.arange(policy.shape[0]), p=policy)   # return a int

    def update_policy(self):
        """Decay ``eps`` by ``eps_decay_rate``, never going below ``eps_min``."""
        decayed = self.eps*self.eps_decay_rate
        self.eps = np.max((decayed, self.eps_min))

    def print_prob(self, s):
        """Print and return the eps-blended action probabilities for ``s``."""
        net_probs = self.sess.run(self.acts_prob, {self.s: s})
        policy = self._blend_with_uniform(net_probs)
        print(policy)
        return policy

    def get_probs(self, s, a):
        """Return, per row, the network probability of the one-hot action in ``a``.

        The result has one column; no eps blending is applied here.
        """
        net_probs = self.sess.run(self.acts_prob, {self.s: s})
        return np.sum(np.multiply(net_probs, a), axis=1, keepdims=True)

In [None]:
class Critic(object):
    """State-value network trained by minimizing the squared one-step TD error.

    The network is state -> dense(32, tanh) -> dense(16, tanh) -> dense(1).

    Args:
        sess: TensorFlow session used for every run call.
        n_features: length of one state vector.
        batch_size: mini-batch size used by ``learn``.
        train_itr: number of shuffled passes over the data per ``learn`` call.
        lr: Adam learning rate.
    """

    def __init__(self, sess, n_features, batch_size, train_itr, lr=0.01):
        self.sess = sess
        self.batch_size = batch_size
        self.train_itr = train_itr

        # Graph inputs, created in a fixed order (op order matters under a graph-level seed).
        self.s = tf.placeholder(tf.float32, (None, n_features), "state")
        self.v_ = tf.placeholder(tf.float32, (None, 1), "v_next")  # value of the next state
        self.r = tf.placeholder(tf.float32, (None, 1), 'r')
        # Fed by learn(), but only the disabled loss variant below reads it.
        self.p = tf.placeholder(tf.float32, (None, 1), 'p')

        with tf.variable_scope('Critic'):
            # Author's note: the approximator would have to be linear to guarantee
            # convergence of the actor, but a linear one hardly learns the correct
            # values, so tanh hidden layers are used.
            hidden_1 = tf.layers.dense(
                inputs=self.s,
                units=32,
                activation=tf.tanh,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                name='l1'
            )
            hidden_2 = tf.layers.dense(
                inputs=hidden_1,
                units=16,
                activation=tf.tanh,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                name='l2'
            )
            self.v = tf.layers.dense(
                inputs=hidden_2,
                units=1,
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                name='V'
            )

        with tf.variable_scope('squared_TD_error'):
            # TD_error = (r + gamma * V_next) - V_eval
            discounted_next = GAMMA * self.v_
            bootstrap_target = self.r + discounted_next
            self.td_error = bootstrap_target - self.v
            # Disabled variant:
            # self.loss = tf.reduce_sum(tf.square(self.td_error + ALPHA/2*(1 - self.p)))
            self.loss = tf.reduce_sum(tf.square(self.td_error))

        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, s, r, s_, p):
        """Run ``train_itr`` shuffled passes of mini-batch TD updates; always returns 0.

        Args:
            s: states, one row per sample.
            r: rewards, one row per sample.
            s_: next states, one row per sample.
            p: per-sample probabilities, fed to the ``p`` placeholder.

        For each full batch, the next-state values are evaluated with the
        current weights first, then fed back as a constant target for the
        training step. A trailing partial batch is not used.
        """
        step = self.batch_size
        for _pass in range(0, self.train_itr):
            order = np.random.permutation(s.shape[0])
            s, r, s_, p = s[order], r[order], s_[order], p[order]
            n_full_batches = s.shape[0] // step
            for b in range(0, n_full_batches):
                window = slice(b*step, (b + 1)*step)
                next_values = self.sess.run(self.v, {self.s: s_[window]})
                feed_dict = {
                    self.s: s[window],
                    self.v_: next_values,
                    self.r: r[window],
                    self.p: p[window],
                }
                _td, _ = self.sess.run([self.td_error, self.train_op], feed_dict)
        return 0

    def get_error(self, s, r, s_):
        """Return the TD error ``r + GAMMA * V(s_) - V(s)`` without training."""
        next_values = self.sess.run(self.v, {self.s: s_})
        feed_dict = {self.s: s, self.v_: next_values, self.r: r}
        return self.sess.run(self.td_error, feed_dict)

In [None]:
# Training cell: build the session, actor, critic and transition memory, then
# alternate between collecting 10000 environment steps with the current policy
# and running one actor/critic update round on exactly those steps.
sess = tf.Session()

MEMORY_SIZE = 10500  # capacity of the transition memory
BATCH_SIZE = 5000    # mini-batch size used inside actor.learn / critic.learn
TRAIN_ITR = 2        # shuffled passes per learn() call
EPS_DECAY_RATE = 0.999  # exploration weight decay applied once per update round
actor = Actor(sess, n_features=N_F, n_actions=N_A, batch_size=BATCH_SIZE
              , train_itr=TRAIN_ITR, eps_decay_rate=EPS_DECAY_RATE, lr=LR_A)
critic = Critic(sess, n_features=N_F, 
                batch_size=BATCH_SIZE, train_itr=TRAIN_ITR, lr=LR_C)
memory = Memory(MEMORY_SIZE, N_F, N_A)
# replay = ExpReplay(MEMORY_SIZE, N_F, N_A, BATCH_SIZE)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

# m counts environment steps since the last update round.
# NOTE(review): the 10000 threshold below is hard-coded separately from MEMORY_SIZE (10500).
m = 0
for i_episode in range(0, MAX_EPISODE):
    s = env.reset()
    t = 0  # step counter within the current episode
    track_r = []  # rewards of the current episode
    while True:
        # a is a discrete action index; action_map turns it into the 2-D continuous action.
        a = actor.choose_action(s)
        s_, r, done, info = env.step(action_map[a, :])
        track_r.append(r)
        
        # Store the transition only while fewer than 10000 steps have been collected.
        if m < 10000: memory.fifo(s, a, r, s_)
        s = s_
        t += 1
        m += 1
        if t >= MAX_EP_STEPS: # the episode ends on the step limit only; `done` is ignored
            ep_rs_sum = sum(track_r)
            # running_reward is an exponential moving average of episode returns. It lives
            # in the notebook's global namespace, so it is created on the first episode and
            # carried over if this cell (or a later one) runs again in the same kernel.
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            # NOTE(review): RENDER is set here, but nothing in this cell calls env.render().
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
            if i_episode % 10 ==0: print("episode:", i_episode, "  reward:", int(running_reward))
            break
        
    # Update round: runs once 10000 steps have been collected since the last round.
    if m >= 10000:
        states, actions, rewards, next_states = memory.sampling()
        # One-hot encode the stored action indices for the actor's "act" placeholder.
        mx_actions = np.zeros((actions.shape[0], N_A))
        for i in range(0, states.shape[0]):
            mx_actions[i, int(actions[i, 0])] = 1
        # Both quantities are computed before either network is updated.
        probs = actor.get_probs(states, mx_actions)
        td_errors = critic.get_error(states, rewards, next_states)
        actor.learn(states, mx_actions, td_errors)
        critic.learn(states, rewards, next_states, probs)
        actor.update_policy()  # decay the exploration weight
        # Discard the collected data so the next round uses only fresh transitions.
        memory.empty_memory()
        m = 0     
    # Checkpoint every 5000 episodes (including episode 0).
    # NOTE(review): presumably the ./reacher_model directory must already exist -- confirm.
    if i_episode % 5000 == 0:
        saver.save(sess, "./reacher_model/train.ckpt", global_step=i_episode)

('episode:', 0, '  reward:', -141)
('episode:', 10, '  reward:', -145)
('episode:', 20, '  reward:', -149)
('episode:', 30, '  reward:', -152)
('episode:', 40, '  reward:', -151)
('episode:', 50, '  reward:', -151)
('episode:', 60, '  reward:', -153)
('episode:', 70, '  reward:', -152)
('episode:', 80, '  reward:', -152)
('episode:', 90, '  reward:', -154)
('episode:', 100, '  reward:', -152)
('episode:', 110, '  reward:', -152)
('episode:', 120, '  reward:', -154)
('episode:', 130, '  reward:', -155)
('episode:', 140, '  reward:', -154)
('episode:', 150, '  reward:', -154)
('episode:', 160, '  reward:', -155)
('episode:', 170, '  reward:', -155)
('episode:', 180, '  reward:', -154)
('episode:', 190, '  reward:', -154)
('episode:', 200, '  reward:', -154)
('episode:', 210, '  reward:', -154)
('episode:', 220, '  reward:', -154)
('episode:', 230, '  reward:', -153)
('episode:', 240, '  reward:', -152)
('episode:', 250, '  reward:', -153)
('episode:', 260, '  reward:', -152)
('episode:',

('episode:', 2190, '  reward:', -152)
('episode:', 2200, '  reward:', -152)
('episode:', 2210, '  reward:', -151)
('episode:', 2220, '  reward:', -152)
('episode:', 2230, '  reward:', -152)
('episode:', 2240, '  reward:', -152)
('episode:', 2250, '  reward:', -154)
('episode:', 2260, '  reward:', -153)
('episode:', 2270, '  reward:', -152)
('episode:', 2280, '  reward:', -152)
('episode:', 2290, '  reward:', -151)
('episode:', 2300, '  reward:', -152)
('episode:', 2310, '  reward:', -153)
('episode:', 2320, '  reward:', -154)
('episode:', 2330, '  reward:', -153)
('episode:', 2340, '  reward:', -150)
('episode:', 2350, '  reward:', -152)
('episode:', 2360, '  reward:', -152)
('episode:', 2370, '  reward:', -153)
('episode:', 2380, '  reward:', -152)
('episode:', 2390, '  reward:', -150)
('episode:', 2400, '  reward:', -151)
('episode:', 2410, '  reward:', -152)
('episode:', 2420, '  reward:', -152)
('episode:', 2430, '  reward:', -153)
('episode:', 2440, '  reward:', -152)
('episode:',

('episode:', 4350, '  reward:', -151)
('episode:', 4360, '  reward:', -151)
('episode:', 4370, '  reward:', -150)
('episode:', 4380, '  reward:', -151)
('episode:', 4390, '  reward:', -152)
('episode:', 4400, '  reward:', -152)
('episode:', 4410, '  reward:', -152)
('episode:', 4420, '  reward:', -153)
('episode:', 4430, '  reward:', -154)
('episode:', 4440, '  reward:', -154)
('episode:', 4450, '  reward:', -152)
('episode:', 4460, '  reward:', -152)
('episode:', 4470, '  reward:', -152)
('episode:', 4480, '  reward:', -152)
('episode:', 4490, '  reward:', -153)
('episode:', 4500, '  reward:', -154)
('episode:', 4510, '  reward:', -154)
('episode:', 4520, '  reward:', -154)
('episode:', 4530, '  reward:', -153)
('episode:', 4540, '  reward:', -152)
('episode:', 4550, '  reward:', -152)
('episode:', 4560, '  reward:', -149)
('episode:', 4570, '  reward:', -152)
('episode:', 4580, '  reward:', -152)
('episode:', 4590, '  reward:', -153)
('episode:', 4600, '  reward:', -151)
('episode:',

('episode:', 6510, '  reward:', -150)
('episode:', 6520, '  reward:', -150)
('episode:', 6530, '  reward:', -152)
('episode:', 6540, '  reward:', -152)
('episode:', 6550, '  reward:', -154)
('episode:', 6560, '  reward:', -150)
('episode:', 6570, '  reward:', -152)
('episode:', 6580, '  reward:', -152)
('episode:', 6590, '  reward:', -155)
('episode:', 6600, '  reward:', -153)
('episode:', 6610, '  reward:', -152)
('episode:', 6620, '  reward:', -152)
('episode:', 6630, '  reward:', -151)
('episode:', 6640, '  reward:', -151)
('episode:', 6650, '  reward:', -152)
('episode:', 6660, '  reward:', -151)
('episode:', 6670, '  reward:', -151)
('episode:', 6680, '  reward:', -153)
('episode:', 6690, '  reward:', -152)
('episode:', 6700, '  reward:', -152)
('episode:', 6710, '  reward:', -151)
('episode:', 6720, '  reward:', -152)
('episode:', 6730, '  reward:', -151)
('episode:', 6740, '  reward:', -152)
('episode:', 6750, '  reward:', -152)
('episode:', 6760, '  reward:', -152)
('episode:',

In [None]:
# Disabled cell: the code below is wrapped in a triple-quoted string, so it is
# never executed. It is a copy of the training loop with the episode range
# changed to range(100000, 2*MAX_EPISODE).
'''
m = 0
for i_episode in range(100000, 2*MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        a = actor.choose_action(s)
        s_, r, done, info = env.step(action_map[a, :])
        track_r.append(r)
        
        if m < 10000: memory.fifo(s, a, r, s_)
        s = s_
        t += 1
        m += 1
        if t >= MAX_EP_STEPS: #done or
            ep_rs_sum = sum(track_r)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
            if i_episode % 10 ==0: print("episode:", i_episode, "  reward:", int(running_reward))
            break
        
    if m >= 10000:
        states, actions, rewards, next_states = memory.sampling()
        mx_actions = np.zeros((actions.shape[0], N_A))
        for i in range(0, states.shape[0]):
            mx_actions[i, int(actions[i, 0])] = 1
        probs = actor.get_probs(states, mx_actions)
        td_errors = critic.get_error(states, rewards, next_states)
        actor.learn(states, mx_actions, td_errors)
        critic.learn(states, rewards, next_states, probs)
        actor.update_policy()
        memory.empty_memory()
        m = 0     
    if i_episode % 5000 == 0:
        saver.save(sess, "./reacher_model/train.ckpt", global_step=i_episode)
'''

In [None]:
# Disabled cell: the code below is wrapped in a triple-quoted string, so it is
# never executed. It is a copy of the training loop with the episode range
# changed to range(2*MAX_EPISODE, 3*MAX_EPISODE).
'''
m = 0
for i_episode in range(2*MAX_EPISODE, 3*MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        a = actor.choose_action(s)
        s_, r, done, info = env.step(action_map[a, :])
        track_r.append(r)
        
        if m < 10000: memory.fifo(s, a, r, s_)
        s = s_
        t += 1
        m += 1
        if t >= MAX_EP_STEPS: #done or
            ep_rs_sum = sum(track_r)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
            if i_episode % 10 ==0: print("episode:", i_episode, "  reward:", int(running_reward))
            break
        
    if m >= 10000:
        states, actions, rewards, next_states = memory.sampling()
        mx_actions = np.zeros((actions.shape[0], N_A))
        for i in range(0, states.shape[0]):
            mx_actions[i, int(actions[i, 0])] = 1
        probs = actor.get_probs(states, mx_actions)
        td_errors = critic.get_error(states, rewards, next_states)
        actor.learn(states, mx_actions, td_errors)
        critic.learn(states, rewards, next_states, probs)
        actor.update_policy()
        memory.empty_memory()
        m = 0     
    if i_episode % 5000 == 0:
        saver.save(sess, "./reacher_model/train.ckpt", global_step=i_episode)
'''

In [None]:
# Evaluation cell: roll out 30 rendered episodes with the current actor. No
# transitions are stored and no learning step is run here.
# NOTE(review): choose_action still blends in the actor's current eps exploration
# weight, so these rollouts are not purely greedy/on-network -- confirm this is intended.
for i_episode in range(0, 30):
    s = env.reset()
    t = 0  # step counter within the current episode
    track_r = []  # rewards of the current episode
    while True:
        env.render()
        a = actor.choose_action(s)
        s_, r, done, info = env.step(action_map[a, :])
        track_r.append(r)
        s = s_
        t += 1
        if t >= MAX_EP_STEPS: # the episode ends on the step limit only; `done` is ignored
            ep_rs_sum = sum(track_r)
            # Continues the moving average left in the global namespace by the training
            # cell, if that cell ran in this kernel; otherwise starts a new one.
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))
            break