In [1]:
import tensorflow as tf
import gym
import time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
class ActorLearner():
    """Worker that steps its own environment and trains the shared Q-network.

    Every learner owns a private gym environment and a small buffer of
    transitions.  The TensorFlow session (``sess``), the placeholders, the
    training ops and the two Keras models are module-level names created in
    other cells; they are looked up only when a method runs, so this class
    can be defined before them.
    """

    # Environment steps taken by all learners together; drives target syncs.
    central_counter = 0

    # (target_model, online_model, assign_ops) from the last time the copy ops
    # were built.  Shared by all learners because they share the same networks.
    _target_sync_cache = None

    def __init__(self, epsilon=0.1, central_update_interval=10,
                 target_update_interval=100, env_name='LunarLander-v2'):
        """Create the environment and an empty transition buffer.

        Args:
            epsilon: probability of taking a uniformly random action.
            central_update_interval: number of own steps between gradient
                updates (also the number of steps per ``learning_step``).
            target_update_interval: number of global steps between copies of
                the online weights into the target network.
            env_name: gym environment id handed to ``gym.make``.
        """
        self._epsilon = epsilon
        self._I_central_update = central_update_interval
        self._I_target_update = target_update_interval
        self._env = gym.make(env_name)
        self._state = self._env.reset()
        self._counter = 0
        self._reset()

    def learning_step(self):
        """Take ``_I_central_update`` environment steps, storing and training as it goes."""
        for _ in range(self._I_central_update):
            act, obs, rew, done = self._take_env_step()
            self._save_infos(self._state, act, obs, rew, done)
            self._update(obs, done)

    def _reset(self):
        """Empty the transition buffer."""
        self._obs = []
        self._acts = []
        self._new_obs = []
        self._rews = []
        self._terminal_flags = []

    def _save_infos(self, old_state, action, new_state, reward, done):
        """Append one transition; ``done`` is stored as 0.0 / 1.0."""
        self._obs.append(old_state)
        self._acts.append(action)
        self._new_obs.append(new_state)
        self._rews.append(reward)
        self._terminal_flags.append(float(done))

    def _update(self, obs, done):
        """Advance the counters, run any due network updates, and move to the next state.

        Args:
            obs: observation returned by the step that was just taken.
            done: whether that step ended the episode.
        """
        self._counter += 1
        ActorLearner.central_counter += 1
        if ActorLearner.central_counter % self._I_target_update == 0:
            self._update_target_network()
        # Train on a full buffer, or early when the episode ends.
        if self._counter % self._I_central_update == 0 or done:
            self._update_central_network()
            self._reset()
        # A finished episode restarts the environment; otherwise continue from obs.
        self._state = self._env.reset() if done else obs

    def _update_central_network(self):
        """Run one optimizer step on the buffered transitions.

        Returns:
            The value of ``action_value_loss`` fetched in the same ``sess.run``.
        """
        feed = {
            obs_ph: np.array(self._obs).reshape(-1, obs_dim),
            act_ph: np.array(self._acts),
            rew_ph: np.array(self._rews),
            new_obs_ph: np.array(self._new_obs).reshape(-1, obs_dim),
            terminal_ph: np.array(self._terminal_flags),
        }
        _, loss = sess.run([train_action_value, action_value_loss], feed_dict=feed)
        return loss

    def _update_target_network(self):
        """Copy the online network's trainable weights into the target network.

        The assign ops are created once and reused.  Calling ``assign`` on
        every sync would add new ops to the graph each time, so the graph
        would keep growing for the whole of training.
        """
        cache = ActorLearner._target_sync_cache
        # Rebuild if the model objects were replaced (e.g. the graph cell was re-run).
        if cache is None or cache[0] is not mlp_target or cache[1] is not mlp_action_val:
            assign_ops = [
                v_t.assign(v)
                for v_t, v in zip(mlp_target.trainable_weights,
                                  mlp_action_val.trainable_weights)
            ]
            cache = (mlp_target, mlp_action_val, assign_ops)
            ActorLearner._target_sync_cache = cache
        sess.run(cache[2])

    def _take_env_step(self):
        """Pick an epsilon-greedy action for the current state and step the environment.

        Returns:
            Tuple ``(act, obs, rew, done)``.
        """
        if np.random.rand() < self._epsilon:
            act = np.random.randint(n_acts)
        else:
            act = sess.run(greedy_action, {obs_ph: self._state.reshape(1, -1)})[0]
        obs, rew, done, info = self._env.step(act)
        return act, obs, rew, done

In [3]:
def build_model():
    """Return a fresh Keras MLP: two 30-unit ReLU layers and a linear output.

    The output layer has ``n_acts`` units, read from the module-level global
    of that name.  No input shape is given here, so the weights are created
    when the model is first called on a tensor.
    """
    dense = tf.keras.layers.Dense
    layer_stack = [
        dense(30, activation='relu'),
        dense(30, activation='relu'),
        dense(n_acts, activation=None),
    ]
    return tf.keras.models.Sequential(layer_stack)

In [4]:
# computational graph
env = gym.make('LunarLander-v2')

obs_dim = env.observation_space.shape[0]
n_acts = env.action_space.n
gamma = 0.9  # discount factor applied to the bootstrapped next-state value

# Placeholders for a batch of transitions.  The per-transition scalars use
# shape (None,): note that `(None)` without the comma is just `None`, which
# leaves the rank unspecified and disables shape checking on the feeds.
obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)
act_ph = tf.placeholder(shape=(None,), dtype=tf.int32)
rew_ph = tf.placeholder(shape=(None,), dtype=tf.float32)
new_obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)
terminal_ph = tf.placeholder(shape=(None,), dtype=tf.float32)

# make core of state-action-value function network
mlp_target = build_model()
mlp_action_val = build_model()

# define state action values: the online network scores the current states,
# the target network scores the successor states
old_state_action_values = mlp_action_val(obs_ph)
new_state_action_values = mlp_target(new_obs_ph)

# select action
greedy_action = tf.math.argmax(old_state_action_values, axis=1)

# define loss function
# One-step target; (1 - terminal_ph) removes the bootstrap term on terminal steps.
y = rew_ph + gamma * tf.reduce_max(new_state_action_values, axis=1) * (1 - terminal_ph)
y_no_grad = tf.stop_gradient(y)
# Pick out the value of the action that was actually taken in each transition.
action_masks = tf.one_hot(act_ph, n_acts)
old_selected_action_values = tf.reduce_sum(action_masks * old_state_action_values, axis=1)
action_value_loss = tf.losses.mean_squared_error(y_no_grad, old_selected_action_values)

# define optimizer
optimizer_action_value = tf.train.AdamOptimizer(0.001)
train_action_value = optimizer_action_value.minimize(action_value_loss)

In [5]:
def evaluate(sess, env, iterations):
    """Run greedy episodes and return the average undiscounted return.

    Args:
        sess: session handed through to ``select_action``.
        env: environment exposing ``reset``, ``step`` and ``close``.
        iterations: number of episodes to play.

    Returns:
        Sum of all rewards collected divided by ``iterations``.
    """
    total_reward = 0
    for _episode in range(iterations):
        observation = env.reset()
        while True:
            # eps=0 makes select_action always take the greedy branch
            chosen = select_action(sess, observation, 0)
            observation, step_reward, finished, _info = env.step(chosen)
            total_reward += step_reward
            if finished:
                break
        # NOTE(review): this closes the caller's env after every episode and
        # the env is then reset again -- confirm that is intended.
        env.close()
    return total_reward / iterations


def select_action(sess, state, eps):
    """Choose an action epsilon-greedily.

    Args:
        sess: session used to evaluate ``greedy_action``.
        state: observation array; reshaped to a batch of one before feeding.
        eps: probability of returning a uniformly random action instead.

    Returns:
        A random action index below ``n_acts``, or the first element of the
        evaluated ``greedy_action``.
    """
    explore = np.random.rand() < eps
    if explore:
        return np.random.randint(n_acts)
    batch_of_one = state.reshape(1, -1)
    greedy = sess.run(greedy_action, {obs_ph: batch_of_one})
    return greedy[0]


In [7]:
# Round-robin training: each outer iteration lets one learner run a full
# learning_step; every 1000 iterations the greedy policy is evaluated.
iterations = 100000
n_learners = 8
sess = tf.Session()
sess.run(tf.global_variables_initializer())
learners = [ActorLearner() for i in range(n_learners)]
# The two networks are initialised independently, so copy the online weights
# into the target network once before training; otherwise the first updates
# bootstrap from an unrelated random network until the first periodic sync.
learners[0]._update_target_network()
for i in range(iterations):
    learners[i % n_learners].learning_step()
    if i % 1000 == 0:
        # average return of 3 greedy episodes on the shared evaluation env
        print(evaluate(sess, env, 3))

-136.11123265216136
-281.34334210279667
-120.43917967489678
-461.1236595520545
-698.5347729880156
-257.51441683977083
-147.3468426867187
-352.7590290706882
-208.33739728176758
-234.47530900781058
-120.43504103923031
-205.01082120159
-243.13735176945616
-233.57947547470408
-148.48712614308664
-225.60862442972686


KeyboardInterrupt: 