In [1]:
import tensorflow as tf
import gym
import time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def build_model_policy(n_acts = 2):
    mlp_policy = tf.keras.models.Sequential()
#     mlp_policy.add(tf.keras.layers.Dense(30, activation='tanh'))
#     mlp_policy.add(tf.keras.layers.Dense(30, activation='tanh'))
    mlp_policy.add(tf.keras.layers.Dense(n_acts,activation='softmax'))
    return mlp_policy

def build_model_state_value_function():
    mlp_action_val = tf.keras.models.Sequential()
#     mlp_action_val.add(tf.keras.layers.Dense(30, activation='relu'))
#     mlp_action_val.add(tf.keras.layers.Dense(30, activation='relu'))
    mlp_action_val.add(tf.keras.layers.Dense(1,activation=None))
    return mlp_action_val

def compute_rewards_to_go(rewards, gamma, bootstrap_value):
    rewards_to_go = [rewards[-1] + gamma*bootstrap_value]
    for rew in rewards[:-1][::-1]:
        tmp = rewards_to_go[-1]
        rewards_to_go.append(rew + gamma * tmp)
    return rewards_to_go[::-1]

In [3]:
def evaluate(sess, env, iterations):
    sum_return = 0
    for i in range(iterations):
        done = False
        state = env.reset()
        while not done:
#             if i == 0:
#                 time.sleep(0.03)
#                 env.render()
            action = select_action(sess, state, 0)
            new_state, reward, done, _ = env.step(action)
            state = new_state
            sum_return += reward
        env.close()
    return sum_return / iterations


def select_action(sess, state, eps):
    if np.random.rand() < eps:
        action = np.random.randint(n_acts)
    else:
        action = sess.run(greedy_action, {obs_ph: state.reshape(1,-1)})[0]
    return action


In [4]:
class ActorLearner():
    
    mlp_policy = build_model_policy()
    mlp_state_value = build_model_state_value_function()
    
    def __init__(self, env_name = 'CartPole-v0', gamma = 0.9):
        self._sess = tf.Session()
        self._I_central_update = 10
        self._env_name = env_name
        self._env = gym.make('CartPole-v0')
        self._obs_dim = self._env.observation_space.shape[0]
        self._n_acts = self._env.action_space.n
        self._gamma = gamma
        self._state = self._env.reset()
        self._counter = 0
        self._reset()
        self._build_computational_graph()
        
    def _build_computational_graph(self):
        obs_ph = tf.placeholder(shape=(None,self._obs_dim), dtype=tf.float32)
        act_ph = tf.placeholder(shape=(None), dtype=tf.int32)
        rew_ph = tf.placeholder(shape=(None), dtype=tf.float32)

        # make core of state-action-value function network
        mlp_target_state_value = build_model_state_value_function()
        mlp_target_policy = build_model_policy()
        
        
        dummy_policy = ActorLearner.mlp_policy(obs_ph)
        dummy_state_value = ActorLearner.mlp_state_value(obs_ph)
        state_values = mlp_target_state_value(obs_ph)
        action_probs = mlp_target_policy(obs_ph)
        greedy_action = tf.math.argmax(action_probs, axis=1)
        
        central_action_probs = ActorLearner.mlp_policy(obs_ph)
        # define loss functions
        action_masks = tf.one_hot(act_ph, self._n_acts)
        selected_action_probs = tf.reduce_sum(action_masks * central_action_probs, axis=1)
        log_action_probs = tf.math.log(selected_action_probs)
        no_grad_state_values = tf.stop_gradient(state_values)
        policy_loss = tf.reduce_sum(log_action_probs * (rew_ph - no_grad_state_values), axis=1)
        
        state_value_loss = tf.losses.mean_squared_error(rew_ph, state_values)

        # define optimizer
        optimizer_policy = tf.train.AdamOptimizer(0.0003)
        train_policy = optimizer_policy.minimize(policy_loss)
        optimizer_state_value = tf.train.AdamOptimizer(0.001)
        train_state_value = optimizer_state_value.minimize(state_value_loss)
        gradients = optimizer_policy.compute_gradients(policy_loss, var_list = ActorLearner.mlp_policy.trainable_variables)
        
        self._sess.run(tf.global_variables_initializer())
        
        # replace gradients
#         weights_mapping = {k:v for k,v in zip(mlp_target_policy.trainable_variables, \
#                                                   ActorLearner.mlp_policy.trainable_variables)}
#         print('Old gradient')
#         print(gradients)
#         print('Replace weights in gradients')
# #         for idx, grad in enumerate(gradients):
# #             gradients[idx] = (grad[0], weights_mapping[grad[1]])
        
#         apply_gradients = optimizer_policy.apply_gradients(gradients)
        
        self._graph = [train_policy, train_state_value, greedy_action, obs_ph, act_ph, rew_ph, \
                      mlp_target_policy, mlp_target_state_value, action_probs, state_values,\
                      apply_gradients, gradients]
#         print('In building computational graph')
#         print(gradients)
        
        
    def learning_step(self):
        for i in range(self._I_central_update):
            act, obs, rew, done = self._take_env_step()
            self._save_infos(self._state, act, obs, rew, done)
            self._update(obs, done)

            
    def _reset(self):
        self._obs = []
        self._acts = []
        self._rews = []
    
        
    def _save_infos(self, old_state, action, new_state, reward, done):
        obs_ph = self._graph[3]
        state_values = self._graph[9]
        if not done:
            bootstrap_value = self._sess.run(state_values, feed_dict={obs_ph:new_state.reshape(1,-1)})
        else:
            bootstrap_value = 0
        self._obs.append(old_state)
        self._acts.append(action)
        self._rews.append(reward + self._gamma * bootstrap_value)
        
        
    def _update(self, obs, done):
        self._counter += 1
        if self._counter % self._I_central_update == 0 or done:
            self._update_central_network()
            self._update_target_network()
            self._reset()
        if done:
            done = False
            self._state = self._env.reset()
        else:
            self._state = obs
    
    
    def _update_central_network(self):
        train_policy = self._graph[0]
#         mlp_target_policy = self._graph[6]
#         train_state_value = self._graph[1]
#         apply_gradients = self._graph[-2]
#         gradients = self._graph[-1]
        obs_ph = self._graph[3]
        act_ph = self._graph[4]
        rew_ph = self._graph[5]
        
#         print('Computing value before gradient step')
# #         print(self._sess.run(mlp_target_policy(obs_ph), feed_dict = {
# #             obs_ph: np.zeros(4).reshape(1,-1)
# #         }))
#         print(gradients)
#         print(ActorLearner.mlp_policy.trainable_variables)
        self._sess.run(train_policy, feed_dict={
                    obs_ph: np.array(self._obs).reshape(-1, self._obs_dim),
                    act_ph: np.array(self._acts),
                    rew_ph: np.array(self._rews),
        })
#         print('Computing value after gradient step')
# #         print(self._sess.run(mlp_target_policy(obs_ph), feed_dict = {
# #             obs_ph: np.zeros(4).reshape(1,-1)
# #         }))
    
    
    def _update_target_network(self):
        self._sess.run([v_t.assign(v) for v_t, v in zip(self._graph[7].trainable_weights, \
                                                  ActorLearner.mlp_state_value.trainable_weights)])
        self._sess.run([v_t.assign(v) for v_t, v in zip(self._graph[6].trainable_weights, \
                                                  ActorLearner.mlp_policy.trainable_weights)])
    
    
    def _take_env_step(self):
        greedy_action = self._graph[2]
        obs_ph = self._graph[3]
        if np.random.rand() < 0.1:
            act = np.random.randint(self._n_acts)
        else:
            act = self._sess.run(greedy_action, {obs_ph: self._state.reshape(1,-1)})[0]
        obs, rew, done, info = self._env.step(act)
        return act, obs, rew, done
    
    
    def evaluate(self, iterations):
        sum_return = 0
        env = gym.make(self._env_name)
        for i in range(iterations):
            done = False
            state = env.reset()
            while not done:
                action = self._select_action(state, 0)
                new_state, reward, done, _ = env.step(action)
                state = new_state
                sum_return += reward
        return sum_return / iterations


    def _select_action(self, state, eps):
        greedy_action = self._graph[2]
        obs_ph = self._graph[3]
        if np.random.rand() < eps:
            action = np.random.randint(self._n_acts)
        else:
            action = self._sess.run(greedy_action, {obs_ph: state.reshape(1,-1)})[0]
        return action

In [5]:
iterations = 100000
n_learners = 1
learners = [ActorLearner() for i in range(n_learners)]
for i in range(iterations):
    learners[i%n_learners].learning_step()
    if i % 1000 == 0:
        print(learners[0].evaluate(5))

Old gradient
[(<tf.Tensor 'gradients_2/sequential_4/MatMul_grad/tuple/control_dependency_1:0' shape=(4, 2) dtype=float32>, <tf.Variable 'sequential/dense/kernel:0' shape=(4, 2) dtype=float32>), (<tf.Tensor 'gradients_2/sequential_4/BiasAdd_grad/tuple/control_dependency_1:0' shape=(2,) dtype=float32>, <tf.Variable 'sequential/dense/bias:0' shape=(2,) dtype=float32>)]
Replace weights in gradients
In building computational graph
[(<tf.Tensor 'gradients_2/sequential_4/MatMul_grad/tuple/control_dependency_1:0' shape=(4, 2) dtype=float32>, <tf.Variable 'sequential/dense/kernel:0' shape=(4, 2) dtype=float32>), (<tf.Tensor 'gradients_2/sequential_4/BiasAdd_grad/tuple/control_dependency_1:0' shape=(2,) dtype=float32>, <tf.Variable 'sequential/dense/bias:0' shape=(2,) dtype=float32>)]
9.6


KeyboardInterrupt: 