## LunarLanderContinuous-v2 by OpenAI Gym
**Importing Modules**

In [None]:
!pip3 install box2d-py
import gym
import pickle
import tensorflow as tf
import tensorflow.compat.v1 as tf1
tf1.disable_v2_behavior()
from tensorflow import keras
from keras import layers, initializers, regularizers
import numpy as np
import threading
from functools import reduce
import time
import os
from collections import deque
import matplotlib.pyplot as plt

**Initial prep work**

In [None]:
# Ask TensorFlow's C++ backend to print errors only.
# NOTE(review): TensorFlow is already imported when this line runs, so the
# setting may arrive too late to silence import-time messages — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Directory in which the trained variables are checkpointed between runs.
checkpoint_path = "./models_checkpoints"
# exist_ok=True keeps the "already there" case silent, but still raises when
# the path exists and is NOT a directory instead of swallowing that error.
os.makedirs(checkpoint_path, exist_ok=True)


class StateTrasitionRecorder:
    """Scratch buffer that accumulates transitions in arrival order."""

    def __init__(self):
        # Start out with an empty buffer.
        self.flush_recorder_memory()

    def save_state_transition(self, transition):
        """Append ``transition`` to the end of the buffer."""
        self.recorder_memory.append(transition)

    def flush_recorder_memory(self):
        """Drop everything recorded so far by binding a brand-new deque."""
        self.recorder_memory = deque()


class RolloutBuffer(StateTrasitionRecorder):
    """Stores finished episodes together with their discounted returns.

    Every stored transition is the 6-tuple
    ``(state, action, reward, next_state, done, Q_sa)``.
    """

    def __init__(self, policy_net_args):
        """Create an empty buffer.

        Args:
            policy_net_args: mapping holding the discount factor under the
                key ``"gamma"``.
        """
        super().__init__()
        self.rollout_memory = deque()
        self.gamma = policy_net_args["gamma"]

    def save_rollout(self, episode):
        """Annotate ``episode`` with returns, store it, and reset the recorder."""
        complete_episode = self.compute_total_rewards(episode, self.gamma)
        self.rollout_memory.append(complete_episode)
        self.flush_recorder_memory()

    def compute_total_rewards(self, episode_transitions, gamma):
        """Attach the discounted return-to-go to every transition.

        Args:
            episode_transitions: iterable of
                ``(state, action, reward, next_state, done)`` records.
            gamma: discount factor applied per step.

        Returns:
            A deque of ``(state, action, reward, next_state, done, Q_sa)``
            tuples, where ``Q_sa[i] = sum_j gamma**(j - i) * reward[j]`` for
            ``j >= i``. An empty input yields an empty deque.
        """
        transitions = list(episode_transitions)
        if not transitions:
            return deque()

        states, actions, rewards, next_states, dones = zip(*transitions)

        # One backward pass: Q[i] = r[i] + gamma * Q[i + 1].
        returns = [0.0] * len(rewards)
        running = 0.0
        for i in range(len(rewards) - 1, -1, -1):
            running = rewards[i] + gamma * running
            returns[i] = running

        return deque(zip(states, actions, rewards,
                         next_states, dones, returns))

    def unroll_state_transitions(self):
        """Flatten all stored episodes into per-field numpy arrays.

        Returns:
            ``(states, actions, next_states, rewards, dones, Q_sa)``; ``dones``
            is cast to ``int`` and ``Q_sa`` is reshaped to a column vector.
            With nothing stored, every array is empty.
        """
        transitions = [step for episode in self.rollout_memory
                       for step in episode]

        if transitions:
            # Unpack in the order compute_total_rewards() stored the fields:
            # reward comes BEFORE next_state.
            states, actions, rewards, next_states, dones, q_sa = zip(
                *transitions)
        else:
            states = actions = rewards = next_states = dones = q_sa = ()

        return (np.asarray(states),
                np.asarray(actions),
                np.asarray(next_states),
                np.asarray(rewards),
                np.asarray(dones, dtype=int),
                np.asarray(q_sa).reshape(-1, 1))

    def flush_rollout_memory(self):
        """Forget every stored episode."""
        self.rollout_memory = deque()


def build_networks(network_name, num_Hlayers, activations_Hlayers, Hlayer_sizes, n_output_units, output_layer_activation, regularization_constant, network_type, input_features,):
    """Stack dense layers on ``input_features`` and add an actor or critic head.

    All layers live under ``tf1.variable_scope(network_type)``, use a Glorot
    normal kernel initializer and an L2 kernel regularizer weighted by
    ``regularization_constant``. Hidden layers are named ``Layer_1``,
    ``Layer_2``, ...

    The trainable variables are looked up under
    ``network_name + "/" + network_type``.
    NOTE(review): that lookup presumably requires the caller to already be
    inside ``tf1.variable_scope(network_name)`` — confirm.

    Returns:
        ``(mu, covariance, params)`` when ``network_type == "Actor"``
        (``covariance`` uses a softplus activation), otherwise
        ``(critic, params)``.
    """
    assert(num_Hlayers == (len(activations_Hlayers)) and num_Hlayers ==
           len(Hlayer_sizes))

    def make_dense(units, activation, layer_name):
        # A new initializer and regularizer object is created for every layer.
        return tf1.layers.Dense(
            units=units,
            activation=activation,
            kernel_initializer=tf.initializers.glorot_normal(),
            kernel_regularizer=tf.keras.regularizers.L2(
                l2=regularization_constant),
            name=layer_name)

    params_scope = network_name + "/" + network_type

    with tf1.variable_scope(network_type):

        # Hidden trunk: the first layer consumes the input features directly.
        hidden = make_dense(
            Hlayer_sizes[0], activations_Hlayers[0], "Layer_1")(input_features)

        for idx in range(1, num_Hlayers):
            hidden = make_dense(
                Hlayer_sizes[idx], activations_Hlayers[idx],
                "Layer_" + str(idx + 1))(hidden)

        if network_type != "Actor":
            # Critic head.
            critic = make_dense(
                n_output_units, output_layer_activation, "V")(hidden)
            params = tf1.get_collection(
                tf1.GraphKeys.TRAINABLE_VARIABLES, params_scope)

            return critic, params

        # Actor heads: one for the mean and one, kept positive by softplus,
        # for the spread of the action distribution.
        mu = make_dense(
            n_output_units, output_layer_activation, "mu")(hidden)
        covariance = make_dense(
            n_output_units, tf.nn.softplus, "covariance")(hidden)
        params = tf1.get_collection(
            tf1.GraphKeys.TRAINABLE_VARIABLES, params_scope)

        return mu, covariance, params


class ComputationGraph:
    """Builds the TF1 actor/critic graph, losses, gradients and summaries.

    The class is a cooperative mixin: ``__init__`` forwards
    ``policy_network_args`` to the next class in the MRO, so it has to be
    combined with a base whose ``__init__`` accepts that argument (as
    ``RLAgent`` does with ``RolloutBuffer``).
    """

    def __init__(self, name, policy_network_args, value_function_network_args):
        """Create all placeholders, networks and training tensors under ``name``.

        Args:
            name: variable scope holding every op and variable of this agent.
            policy_network_args: actor configuration (layer layout, optimizer,
                entropy weight, action bounds, state/action sizes, ...).
            value_function_network_args: critic configuration (layer layout,
                optimizer, regularization constant, ...).
        """
        super().__init__(policy_network_args)

        with tf1.variable_scope(name):
            self.actor_optimizer = policy_network_args["optimizer"]
            self.critic_optimizer = value_function_network_args["optimizer"]

            self.st_placeholder = tf1.placeholder(dtype=tf.float32, shape=[
                None, policy_network_args["state_space_size"]], name="State")

            self.rewards_placeholder = tf1.placeholder(
                tf.float32, shape=[None, 1], name="rewards")
            self.actions_placeholder = tf1.placeholder(
                tf.float32, shape=[None, policy_network_args["action_space_size"]], name="actions")
            self.dones_placeholder = tf1.placeholder(
                tf.float32, shape=[None, 1], name="dones")

            self.Qsa_placeholder = tf1.placeholder(
                dtype=tf.float32, shape=[None, 1], name="Q_sa")

            self.mu, self.covariance, self.actor_params = build_networks(name, policy_network_args["num_Hlayers"], policy_network_args["activations_Hlayers"], policy_network_args[
                "Hlayer_sizes"], policy_network_args["n_output_units"], policy_network_args["output_layer_activation"], policy_network_args["regularization_constant"], "Actor", self.st_placeholder)

            # The critic is regularized with its OWN constant, not the actor's.
            self.critic, self.critic_params = build_networks(name, value_function_network_args["num_Hlayers"], value_function_network_args["activations_Hlayers"], value_function_network_args[
                "Hlayer_sizes"], value_function_network_args["n_output_units"], value_function_network_args["output_layer_activation"], value_function_network_args["regularization_constant"], "Critic", self.st_placeholder)

            with tf1.variable_scope("Train_value_function_estimator"):

                # Reduce explicitly to a scalar: the Keras-style
                # mean_squared_error returns one value per sample, which a
                # scalar summary cannot consume.
                self.value_function_net_cost = tf.reduce_mean(
                    tf.math.squared_difference(self.Qsa_placeholder, self.critic)
                ) + tf1.losses.get_regularization_loss(scope=name + "/" + "Critic")

                tf1.summary.scalar("Critic_Cost", self.value_function_net_cost)

            # Summaries are created through tf1.summary so that
            # tf1.summary.merge_all() below actually collects them.
            if name == "Global_Agent":
                for variable in self.actor_params:
                    var_name = "Actor_" + variable.name.replace("kernel:0", "w").replace("bias:0", "b")
                    tf1.summary.histogram(var_name, variable)

                for variable in self.critic_params:
                    var_name = "Critic_" + variable.name.replace("kernel:0", "w").replace("bias:0", "b")
                    tf1.summary.histogram(var_name, variable)

            with tf1.variable_scope("Train_policy_network"):

                # Advantage = sampled return - critic estimate.
                self.advantage_funtion = tf.math.subtract(
                    self.Qsa_placeholder, self.critic)

                # The "covariance" head is passed as the second (scale)
                # argument of the Normal distribution.
                self.probability_density_func = tf1.distributions.Normal(
                    self.mu, self.covariance)

                self.log_prob_a = self.probability_density_func.log_prob(
                    self.actions_placeholder)

                auxiliary = tf.multiply(
                    self.log_prob_a, self.advantage_funtion)

                entropy = self.probability_density_func.entropy()

                # Entropy bonus weighted by the "Entropy" coefficient.
                self.auxiliary = policy_network_args["Entropy"] * \
                    entropy + auxiliary

                self.policy_net_cost = tf.reduce_sum(-self.auxiliary) + tf1.losses.get_regularization_loss(scope=name + "/" + "Actor")

                self.summary_policy_cost = tf1.summary.scalar("Policy_Cost", self.policy_net_cost)

            with tf.name_scope("choose_a"):

                # Draw one sample and clip it to the action-space bounds.
                self.action = tf1.clip_by_value(self.probability_density_func.sample(
                    1), policy_network_args["action_space_lower_bound"], policy_network_args["action_space_upper_bound"])

            with tf.name_scope("get_grad"):
                self.actor_grads = tf1.gradients(self.policy_net_cost, self.actor_params)
                self.critic_grads = tf1.gradients(self.value_function_net_cost, self.critic_params)

                # zip() stops at the shorter list, so only as many gradient
                # pairs as the smaller network has variables are logged.
                for act_grad, critic_grad in zip(self.actor_grads, self.critic_grads):
                    var_name_actor = "Actor_" + act_grad.name.replace("Addn", "w")
                    var_name_critic = "Critic_" + critic_grad.name.replace("Addn", "w")
                    tf1.summary.histogram(var_name_actor, act_grad)
                    tf1.summary.histogram(var_name_critic, critic_grad)

            # Merges every tf1 summary registered in the graph so far.
            self.summaries = tf1.summary.merge_all()


**Training the Model**

In [None]:
class RLAgent(ComputationGraph, RolloutBuffer):
    """Actor-critic agent owning an environment, a graph and a rollout buffer.

    When ``Global_Agent`` is given, the instance acts as a worker: it computes
    gradients on its own copy of the networks, applies them to the global
    agent's variables and copies the global variables back.
    """

    def __init__(self, name,  policy_network_args, value_function_network_args, session, summary_writer, Global_Agent=None):
        """Build the graph for ``name`` and, for workers, the sync ops.

        Args:
            name: variable scope / identifier of this agent.
            policy_network_args: actor and training-loop configuration.
            value_function_network_args: critic configuration.
            session: TF1 session used for every ``run`` call.
            summary_writer: object whose ``add_summary`` receives the
                serialized summaries.
            Global_Agent: the shared agent to push gradients to and pull
                weights from; ``None`` for the global agent itself.
        """
        super().__init__(name, policy_network_args, value_function_network_args)

        self.current_num_epi = 0
        self.env = gym.make('LunarLanderContinuous-v2')
        self.total_number_episodes = policy_network_args["total_number_episodes"]
        self.num_episodes_before_update = policy_network_args["number_of_episodes_before_update"]
        self.Global_Agent = Global_Agent
        self.ep_rewards = []
        self.frequency_printing_statistics = policy_network_args["frequency_of_printing_statistics"]
        self.episodes_back = policy_network_args["episodes_back"]
        self.rendering_frequency = policy_network_args["frequency_of_rendering_episode"]
        self.max_steps = policy_network_args["max_steps_per_episode"]
        self.summary_writer = summary_writer
        self.name = name

        self.session = session
        if Global_Agent is not None:
            with tf.name_scope(name):

                with tf.name_scope('sync'):
                    with tf.name_scope('pull_from_global'):
                        # Copy global weights into the local networks.
                        self.pull_actor_params_op = [local_params.assign(
                            global_params) for local_params, global_params in zip(self.actor_params, Global_Agent.actor_params)]
                        self.pull_critic_params_op = [local_params.assign(
                            global_params) for local_params, global_params in zip(self.critic_params, Global_Agent.critic_params)]
                    with tf.name_scope("push_to_global"):
                        # Apply LOCAL gradients to the GLOBAL variables.
                        self.push_actor_params_op = self.actor_optimizer.apply_gradients(zip(self.actor_grads, Global_Agent.actor_params))
                        self.push_critic_params_op = self.critic_optimizer.apply_gradients(zip(self.critic_grads, Global_Agent.critic_params))

    def update_Global_Agent(self, feed_dict):
        """Run one optimizer step on the global actor and critic."""
        self.session.run([self.push_actor_params_op,
                          self.push_critic_params_op], feed_dict)

    def save_summary(self, feed_dict):
        """Evaluate the global agent's summaries and hand them to the writer."""
        summary = self.session.run(self.Global_Agent.summaries, feed_dict)
        self.summary_writer.add_summary(summary, self.Global_Agent.current_num_epi)

    def pull_from_global(self):
        """Overwrite the local weights with the global agent's weights."""
        self.session.run([self.pull_actor_params_op,
                          self.pull_critic_params_op])

    def take_action(self, state):
        """Sample one clipped action for a single ``state``.

        The state is fed as a batch of one and the sampled action is returned
        as a flat vector, so no state/action size is hard-coded here.
        """
        state = np.asarray(state).reshape(1, -1)
        action = self.session.run(self.action, feed_dict={
                                  self.st_placeholder: state})
        return action.reshape(-1)

    def collect_rollouts(self, n_rolls, max_steps, render=False):
        """Play ``n_rolls`` episodes and store each one in the rollout buffer.

        An episode ends when the environment reports ``done`` or after the
        step counter has passed ``max_steps``. If it is cut off by the step
        limit, the last stored reward is bootstrapped with the discounted
        critic estimate of the next state. The per-episode total appended to
        ``self.ep_rewards`` contains environment rewards only.
        """
        for _ in range(n_rolls):
            n_steps = 0
            state = self.env.reset()
            done = False
            sum_rewards = 0
            while not done and n_steps <= max_steps:
                if render:
                    self.env.render()

                action = self.take_action(state)
                next_state, reward, done, _info = self.env.step(action)

                # Statistics track what the environment actually paid out.
                sum_rewards += reward

                if not done and n_steps == max_steps:
                    # Truncated episode: stand in for the missing tail with
                    # gamma * V(next_state).
                    state_feed = np.asarray(next_state).reshape(1, -1)
                    value = self.session.run(
                        self.critic, feed_dict={self.st_placeholder: state_feed})
                    reward = reward + self.gamma * value.item()

                self.save_state_transition(
                    [state, action, reward, next_state, done])

                state = next_state
                n_steps += 1

            if self.name == "Global_Agent":
                print(f"Episode Reward: {sum_rewards}")

            self.ep_rewards.append(sum_rewards)
            self.save_rollout(self.recorder_memory)

    def training_loop(self):
        """Worker loop: collect rollouts, push gradients, pull fresh weights.

        Relies on a module-level ``coord`` coordinator and on
        ``self.Global_Agent`` being set.
        NOTE(review): ``Global_Agent.current_num_epi`` is updated by every
        worker without locking — confirm that is acceptable.
        """
        # Start from the shared weights so the very first gradients are not
        # computed against this worker's own random initialization.
        self.pull_from_global()

        while not coord.should_stop() and self.Global_Agent.current_num_epi <= self.total_number_episodes:
            self.collect_rollouts(
                self.num_episodes_before_update, self.max_steps, render=False)

            states, actions, _, _, _, Q_sa = self.unroll_state_transitions()

            feed_dict = {self.st_placeholder: states,
                         self.actions_placeholder: actions,
                         self.Qsa_placeholder: Q_sa}

            self.update_Global_Agent(feed_dict)
            self.Global_Agent.current_num_epi += self.num_episodes_before_update

            # The summaries belong to the global graph, so the same batch is
            # fed through the global agent's placeholders.
            feed_dict_global_summary = {self.Global_Agent.st_placeholder: states,
                                        self.Global_Agent.actions_placeholder: actions,
                                        self.Global_Agent.Qsa_placeholder: Q_sa}

            self.save_summary(feed_dict_global_summary)

            self.flush_rollout_memory()
            self.pull_from_global()

            if self.Global_Agent.current_num_epi % self.frequency_printing_statistics == 0:

                average_reward = self.Global_Agent.compute_average_rewards(self.episodes_back)
                print(
                    f"Global ep number {self.Global_Agent.current_num_epi}: Reward = {average_reward}")

class Global_Agent(RLAgent):
    """Shared agent whose weights the child agents update and copy."""

    def __init__(self, name, policy_network_args, value_function_network_args, session, summary_writer, child_agents=None):
        """Build the global networks and remember the worker agents.

        Args:
            child_agents: optional list of worker agents; a fresh empty list
                is used when omitted, so instances never share one list.
        """
        super().__init__(name, policy_network_args, value_function_network_args, session, summary_writer)
        self.child_agents = [] if child_agents is None else child_agents
        self.num_childs = len(self.child_agents)

    def compute_average_rewards(self, episodes_back):
        """Average the children's recent mean episode rewards.

        For every child agent the mean of its last ``episodes_back`` episode
        rewards is taken over the episodes that actually exist; children
        without any recorded episode are skipped.

        Returns:
            The mean of those per-agent means, or ``0.0`` when no child has
            recorded an episode yet.
        """
        agent_means = []
        for agent in self.child_agents:
            recent = agent.ep_rewards[-episodes_back:]
            if recent:
                agent_means.append(sum(recent) / len(recent))

        if not agent_means:
            return 0.0

        return sum(agent_means) / len(agent_means)

**Testing the model**

In [None]:
if __name__ == "__main__":

    # Throw-away environment, needed only to read the action bounds.
    env = gym.make('LunarLanderContinuous-v2')
    action_space_upper_bound = env.action_space.high
    action_space_lower_bound = env.action_space.low
    env.close()

    # The agents call writer.add_summary(), so the writer handed to them has
    # to be a TF1 FileWriter and must exist before they are constructed.
    subdir = time.strftime("%Y%m%d-%H%M%S", time.localtime())
    logdir = "./summary/" + subdir
    writer = tf1.summary.FileWriter(logdir)
    sess = tf1.Session()

    policy_net_args = {"num_Hlayers": 2,
                       "activations_Hlayers": ["relu", "relu"],
                       "Hlayer_sizes": [100, 100],
                       "n_output_units": 2,
                       "output_layer_activation": tf.nn.tanh,
                       "state_space_size": 8,
                       "action_space_size": 2,
                       "Entropy": 0.01,
                       "action_space_upper_bound": action_space_upper_bound,
                       "action_space_lower_bound": action_space_lower_bound,
                       "optimizer": tf1.train.RMSPropOptimizer(0.0001),
                       "total_number_episodes": 5000,
                       "number_of_episodes_before_update": 1,
                       "frequency_of_printing_statistics": 100,
                       "frequency_of_rendering_episode": 1000,
                       "number_child_agents": 8,
                       "episodes_back": 20,
                       "gamma": 0.99,
                       "regularization_constant": 0.01,
                       "max_steps_per_episode": 2000

                       }

    valuefunction_net_args = {"num_Hlayers": 2,
                              "activations_Hlayers": ["relu", "relu"],
                              "Hlayer_sizes": [100, 64],
                              "n_output_units": 1,
                              "output_layer_activation": "linear",
                              "state_space_size": 8,
                              "action_space_size": 2,
                              "optimizer": tf1.train.RMSPropOptimizer(0.01),
                              "regularization_constant": 0.01}

    # The global agent must be built first: the workers' sync ops refer to
    # its variables.
    global_agent = Global_Agent("Global_Agent", policy_net_args, valuefunction_net_args, sess, writer)

    child_agents = []

    for i in range(policy_net_args["number_child_agents"]):
        i_name = f"ChildAgent_{i}"
        child_agents.append(RLAgent(i_name, policy_net_args, valuefunction_net_args, sess, writer, global_agent))

    global_agent.child_agents = child_agents
    global_agent.num_childs = len(child_agents)

    saver = tf1.train.Saver()

    # Module-level name: RLAgent.training_loop() reads ``coord`` directly.
    coord = tf.train.Coordinator()

    # An empty checkpoint directory means a fresh run; otherwise resume.
    if not os.listdir(checkpoint_path):
        sess.run(tf1.global_variables_initializer())
    else:
        saver.restore(sess, checkpoint_path + "/variables.ckpt")

    # Record the finished graph once every agent has been built.
    writer.add_graph(sess.graph)

    child_agents_threads = []

    for child_agent in child_agents:
        # Pass the bound method itself: a closure over the loop variable
        # would be resolved late and could run another agent's loop.
        t = threading.Thread(target=child_agent.training_loop)
        t.start()
        child_agents_threads.append(t)

    coord.join(child_agents_threads)
    saver.save(sess, checkpoint_path + "/variables.ckpt")
    writer.close()

    # Evaluate the trained global policy: 10 rendered + 90 silent episodes.
    eval_max_steps = policy_net_args["max_steps_per_episode"]
    global_agent.collect_rollouts(10, eval_max_steps, render=True)
    global_agent.collect_rollouts(90, eval_max_steps)

    rewards = global_agent.ep_rewards
    average = sum(rewards)/len(rewards)
    # Horizontal line spanning exactly as many points as there are episodes.
    average = [average] * len(rewards)

    fig, ax = plt.subplots()

    ax.plot(rewards, label="Episode Reward")
    ax.plot(average, label="Average")
    ax.legend(loc="best")
    plt.show()