In [2]:
import tensorflow as tf
import gym
import time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def build_model_policy(n_acts = 2):
    mlp_action_val = tf.keras.models.Sequential()
    mlp_action_val.add(tf.keras.layers.Dense(30, activation=activation))
    mlp_action_val.add(tf.keras.layers.Dense(30, activation=activation))
    mlp_action_val.add(tf.keras.layers.Dense(n_acts, activation='softmax'))
    return mlp_action_val

def build_model_policy():
    mlp_action_val = tf.keras.models.Sequential()
    mlp_action_val.add(tf.keras.layers.Dense(30, activation='relu'))
    mlp_action_val.add(tf.keras.layers.Dense(30, activation='relu'))
    mlp_action_val.add(tf.keras.layers.Dense(1, activation=None))
    return mlp_action_val

def compute_rewards_to_go(rewards, gamma, bootstrap_value):
    rewards_to_go = [rewards[-1] + gamma*bootstrap_value]
    for rew in rewards[:-1][::-1]:
        tmp = rewards_to_go[-1]
        rewards_to_go.append(rew + gamma * tmp)
    return rewards_to_go[::-1]

In [3]:
class ActorLearner():
    
    central_counter = 0
    mlp_policy = build_model(activation= 'tanh')
    mlp_state_value = build_model(activation = 'relu')
    
    def __init__(self):
        self._I_central_update = 10
        self._env = gym.make('CartPole-v0')
        self._state = self._env.reset()
        self._counter = 0
        self._reset()
        self._sess = tf.Session()
        self._sess.run(tf.global_variables_initializer())
        
    def _build_computational_graph(self):
        obs_dim = self._env.observation_space.shape[0]
        n_acts = self._env.action_space.n
        gamma = 0.9

        obs_ph = tf.placeholder(shape=(None,obs_dim), dtype=tf.float32)
        act_ph = tf.placeholder(shape=(None), dtype=tf.int32)
        rew_ph = tf.placeholder(shape=(None), dtype=tf.float32)
        terminal_ph = tf.placeholder(shape=(None), dtype=tf.float32)

        # make core of state-action-value function network
        mlp_target_policy = build_model_policy()
        mlp_target_state_value = build_model_state_value()

        state_values = mlp_target_state_value(obs_ph)
        action_probs = mlp_target_policy(obs_ph)
        greedy_action = tf.math.argmax(action_probs, axis=1)

        # define loss functions
        action_masks = tf.one_hot(act_ph, n_acts)
        selected_action_probs = tf.reduce_sum(action_masks * action_probs, axis=1)
        policy_loss = tf.reduce_sum(tf.math.log(selected_action_probs) * (rew_ph - tf.no_grad(state_values)), axis=1)
        
        state_value_loss = tf.losses.mean_squared_error(rew_ph, state_values)

        # define optimizer
        optimizer_action_value = tf.train.AdamOptimizer(0.0003)
        train_policy = optimizer_action_value.minimize(policy_loss)
        optimizer_action_value = tf.train.AdamOptimizer(0.001)
        train_state_value = optimizer_action_value.minimize(state_value_loss)
        
        self._graph = [train_policy, train_state_value, obs_ph, act_ph, rew_ph, terminal_ph]
        
    def learning_step(self):
        for i in range(self._I_central_update):
            act, obs, rew, done = self._take_env_step()
            self._save_infos(self._state, act, obs, rew, done)
            self._update(obs, done)

    def _reset(self):
        self._obs = []
        self._acts = []
        self._rews = []
    
        
    def _save_infos(self, old_state, action, new_state, reward, done):
        if not done:
            bootstrap_value = self._sess.run(state_values, feed_dict={obs_ph:new_state.reshape(1,-1)})
        else:
            bootstrap_value = 0
        self._obs.append(old_state)
        self._acts.append(action)
        self._rews.append(reward + gamma * bootstrap_value)
        
    def _update(self, obs, done):
        self._counter += 1
        ActorLearner.central_counter += 1
        if ActorLearner.central_counter % self._I_target_update == 0:
            self._update_target_network()
        if self._counter % self._I_central_update == 0 or done:
            self._update_central_network()
            self._reset()
        if done:
            done = False
            self._state = self._env.reset()
        else:
            self._state = obs
    
    def _update_central_network(self):
        _, loss = sess.run([train_action_value, action_value_loss], feed_dict={
                    obs_ph: np.array(self._obs).reshape(-1, obs_dim),
                    act_ph: np.array(self._acts),
                    rew_ph: np.array(self._rews),
                    terminal_ph: np.array(self._terminal_flags)
                 })
        #print(loss)
    
    def _update_target_network(self):
        sess.run([v_t.assign(v) for v_t, v in zip(mlp_target.trainable_weights, mlp_action_val.trainable_weights)])
    
    def _take_env_step(self):
        if np.random.rand() < 0.1:
            act = np.random.randint(n_acts)
        else:
            act = sess.run(greedy_action, {obs_ph: self._state.reshape(1,-1)})[0]
        obs, rew, done, info = self._env.step(act)
        return act, obs, rew, done

In [8]:
# computational graph
env = gym.make('LunarLander-v2')

obs_dim = env.observation_space.shape[0]
n_acts = env.action_space.n
gamma = 0.9

obs_ph = tf.placeholder(shape=(None,obs_dim), dtype=tf.float32)
act_ph = tf.placeholder(shape=(None), dtype=tf.int32)
rew_ph = tf.placeholder(shape=(None), dtype=tf.float32)
new_obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)
terminal_ph = tf.placeholder(shape=(None), dtype=tf.float32)

# make core of state-action-value function network
mlp_target_policy = build_model()
mlp_target_state_value = build_model()

# define state action values
old_state_action_values = mlp_action_val(obs_ph)
new_state_action_values = mlp_target(new_obs_ph)

# select action
greedy_action = tf.math.argmax(old_state_action_values, axis=1)

# define loss function
y = rew_ph + gamma * tf.reduce_max(new_state_action_values, axis=1)*(1-terminal_ph)
y_no_grad = tf.stop_gradient(y) 
action_masks = tf.one_hot(act_ph, n_acts)
old_selected_action_values = tf.reduce_sum(action_masks * old_state_action_values, axis=1)
action_value_loss = tf.losses.mean_squared_error(y_no_grad, old_selected_action_values)

# define optimizer
optimizer_action_value = tf.train.AdamOptimizer(0.001)
train_action_value = optimizer_action_value.minimize(action_value_loss)

AttributeError: 'list' object has no attribute 'dtype'

In [None]:
def evaluate(sess, env, iterations):
    sum_return = 0
    for i in range(iterations):
        done = False
        state = env.reset()
        while not done:
#             if i == 0:
#                 time.sleep(0.03)
#                 env.render()
            action = select_action(sess, state, 0)
            new_state, reward, done, _ = env.step(action)
            state = new_state
            sum_return += reward
        env.close()
    return sum_return / iterations


def select_action(sess, state, eps):
    if np.random.rand() < eps:
        action = np.random.randint(n_acts)
    else:
        action = sess.run(greedy_action, {obs_ph: state.reshape(1,-1)})[0]
    return action


In [None]:
iterations = 100000
n_learners = 1
sess = tf.Session()
sess.run(tf.global_variables_initializer())
learners = [ActorLearner() for i in range(n_learners)]
for i in range(iterations):
    learners[i%n_learners].learning_step()
    if i % 1000 == 0:
        print(evaluate(sess, env, 3))
#         print(sess.run(old_state_action_values, feed_dict={
#                         obs_ph: np.zeros(4).reshape(-1,obs_dim)
#         }))

-54.635651125137606
-371.64285139015266
-413.3254531292724
-86.98586873524516
-156.4931166974486
-237.90129499636484
