In [2]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import time
import pandas as pd
from gym import wrappers
from gym import spaces
import math
import random

In [3]:
env = gym.make('Pendulum-v1')

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = env.action_space.high[0]

In [4]:
state_dim

3

In [5]:
action_dim

1

In [6]:
action_bound

2.0

In [2]:
# import wandb
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda

import gym
import argparse
import numpy as np
from threading import Thread
from multiprocessing import cpu_count
tf.keras.backend.set_floatx('float64')
# wandb.init(name='A3C', project="deep-rl-tf2")



args_gamma=0.99
args_update_interval=5
args_actor_lr=0.0005
args_critic_lr=0.001




CUR_EPISODE = 0

class Actor:
    def __init__(self, state_dim, action_dim, action_bound, std_bound):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.std_bound = std_bound
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(args_actor_lr)
        self.entropy_beta = 0.01

    def create_model(self):
        state_input = Input((self.state_dim,))
        dense_1 = Dense(32, activation='relu')(state_input)
        dense_2 = Dense(32, activation='relu')(dense_1)
        out_mu = Dense(self.action_dim, activation='tanh')(dense_2)
        mu_output = Lambda(lambda x: x * self.action_bound)(out_mu)
        std_output = Dense(self.action_dim, activation='softplus')(dense_2)
        return tf.keras.models.Model(state_input, [mu_output, std_output])

    def get_action(self, state):
        state = np.reshape(state, [1, self.state_dim])
        mu, std = self.model.predict(state)
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
        print("state",end=",")
        print(state)
        print("mu",end=",")
        print(mu)
        print("std",end=",")
        print(std)
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
        mu, std = mu[0], std[0]
        
        return np.random.normal(mu, std, size=self.action_dim)

    def log_pdf(self, mu, std, action):
        std = tf.clip_by_value(std, self.std_bound[0], self.std_bound[1])
        var = std ** 2
        log_policy_pdf = -0.5 * (action - mu) ** 2 / \
            var - 0.5 * tf.math.log(var * 2 * np.pi)
        return tf.reduce_sum(log_policy_pdf, 1, keepdims=True)

    def compute_loss(self, mu, std, actions, advantages):
        log_policy_pdf = self.log_pdf(mu, std, actions)
        loss_policy = log_policy_pdf * advantages
        return tf.reduce_sum(-loss_policy)

    def train(self, states, actions, advantages):
        with tf.GradientTape() as tape:
            mu, std = self.model(states, training=True)
            loss = self.compute_loss(mu, std, actions, advantages)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss


class Critic:
    def __init__(self, state_dim):
        self.state_dim = state_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(args_critic_lr)

    def create_model(self):
        return tf.keras.Sequential([
            Input((self.state_dim,)),
            Dense(32, activation='relu'),
            Dense(32, activation='relu'),
            Dense(16, activation='relu'),
            Dense(1, activation='linear')
        ])

    def compute_loss(self, v_pred, td_targets):
        mse = tf.keras.losses.MeanSquaredError()
        return mse(td_targets, v_pred)

    def train(self, states, td_targets):
        with tf.GradientTape() as tape:
            v_pred = self.model(states, training=True)
            assert v_pred.shape == td_targets.shape
            loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss


class Agent:
    def __init__(self, env_name):
        env = gym.make(env_name)
        self.env_name = env_name
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_bound = env.action_space.high[0]
        self.std_bound = [1e-2, 1.0]

        self.global_actor = Actor(
            self.state_dim, self.action_dim, self.action_bound, self.std_bound)
        self.global_critic = Critic(self.state_dim)
        self.num_workers = cpu_count()

    def train(self, max_episodes=1):
        workers = []

        for i in range(self.num_workers):
            env = gym.make(self.env_name)
            workers.append(WorkerAgent(
                env, self.global_actor, self.global_critic, max_episodes))

        for worker in workers:
            worker.start()

        for worker in workers:
            worker.join()


class WorkerAgent(Thread):
    def __init__(self, env, global_actor, global_critic, max_episodes):
        Thread.__init__(self)
        self.env = env
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.action_bound = self.env.action_space.high[0]
        self.std_bound = [1e-2, 1.0]

        self.max_episodes = max_episodes
        self.global_actor = global_actor
        self.global_critic = global_critic
        self.actor = Actor(self.state_dim, self.action_dim,
                           self.action_bound, self.std_bound)
        self.critic = Critic(self.state_dim)

        self.actor.model.set_weights(self.global_actor.model.get_weights())
        self.critic.model.set_weights(self.global_critic.model.get_weights())

    def n_step_td_target(self, rewards, next_v_value, done):
        td_targets = np.zeros_like(rewards)
        cumulative = 0
        if not done:
            cumulative = next_v_value

        for k in reversed(range(0, len(rewards))):
            cumulative = args_gamma * cumulative + rewards[k]
            td_targets[k] = cumulative
        return td_targets

    def advatnage(self, td_targets, baselines):
        return td_targets - baselines

    def list_to_batch(self, list):
        batch = list[0]
        for elem in list[1:]:
            batch = np.append(batch, elem, axis=0)
        return batch

    def train(self):
        global CUR_EPISODE

        while self.max_episodes >= CUR_EPISODE:
            state_batch = []
            action_batch = []
            reward_batch = []
            episode_reward, done = 0, False

            state = self.env.reset()[0]
            

            while not done:
                # self.env.render()
                action = self.actor.get_action(state)
                action = np.clip(action, -self.action_bound, self.action_bound)
                
                #print(self.env.step(action))

                next_state, reward, done, info, _ = self.env.step(action)

                state = np.reshape(state, [1, self.state_dim])
                action = np.reshape(action, [1, 1])
                next_state = np.reshape(next_state, [1, self.state_dim])
                reward = np.reshape(reward, [1, 1])

                state_batch.append(state)
                action_batch.append(action)
                reward_batch.append(reward)
                print("State :",end="")
                print(state)
                print("action :",end="")
                print(action)
                print("reward :",end="")
                print(reward)

                if len(state_batch) >= args_update_interval or done:
                    states = self.list_to_batch(state_batch)
                    actions = self.list_to_batch(action_batch)
                    rewards = self.list_to_batch(reward_batch)

                    next_v_value = self.critic.model.predict(next_state)
                    td_targets = self.n_step_td_target(
                        (rewards+8)/8, next_v_value, done)
                    advantages = td_targets - self.critic.model.predict(states)

                    actor_loss = self.global_actor.train(
                        states, actions, advantages)
                    critic_loss = self.global_critic.train(
                        states, td_targets)

                    self.actor.model.set_weights(
                        self.global_actor.model.get_weights())
                    self.critic.model.set_weights(
                        self.global_critic.model.get_weights())

                    state_batch = []
                    action_batch = []
                    reward_batch = []
                    td_target_batch = []
                    advatnage_batch = []

                episode_reward += reward[0][0]
                state = next_state[0]

            print('EP{} EpisodeReward={}'.format(CUR_EPISODE, episode_reward))
            # wandb.log({'Reward': episode_reward})
            CUR_EPISODE += 1

    def run(self):
        self.train()


def main():
    env_name = 'Pendulum-v1'
    agent = Agent(env_name)
    agent.train()


if __name__ == "__main__":
    main()

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
state,[[ 0.17076476 -0.9853118  -0.62420964]]
mu,[[0.07013499]]
std,[[0.72708039]]
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
state,[[-0.70515394  0.70905423 -0.9201289 ]]
mu,[[0.35308619]]
std,[[0.65529717]]
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
state,[[ 0.49988616 -0.86609113 -0.46538302]]
mu,[[0.10715296]]
std,[[0.7755397]]
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
State :[[-0.70515394  0.70905423 -0.9201289 ]]
action :[[1.0660675]]
reward :[[-5.62446388]]
State :[[ 0.49988616 -0.86609113 -0.46538302]]
action :[[0.88974591]]
reward :[[-1.11934781]]
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
state,[[-0.7252764  -0.6884578   0.42577055]]
mu,[[0.65736669]]
std,[[0.55847427]]
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
State :[[-0.7252764  -0.6884578   0.42577055]]
action :[[0.30916819]]
reward :[[-5.69325364]]
state,[[-0.04872078 -0.99881244  0.17436452]]
mu,[[0.32149343]]
std,[[0.58156494]]
>>>>>>>>>

  if not isinstance(terminated, (bool, np.bool8)):


>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
state,[[ 0.11975323  0.9928037  -0.012955  ]]
mu,[[0.04296362]]
std,[[0.64379773]]
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
State :[[ 0.11975323  0.9928037  -0.012955  ]]
action :[[-1.41392429]]
reward :[[-2.10670607]]
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
state,[[0.23054524 0.9730616  0.8814207 ]]
mu,[[-0.07064095]]
std,[[0.61354692]]
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
State :[[0.23054524 0.9730616  0.8814207 ]]
action :[[-0.56475224]]
reward :[[-1.86867697]]
state,[[ 0.11623646 -0.9932216  -1.1021193 ]]
mu,[[0.04127685]]
std,[[0.73785117]]
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
State :[[ 0.11623646 -0.9932216  -1.1021193 ]]
action :[[-0.6636791]]
reward :[[-2.23688554]]
state,[[-0.72679603 -0.68685335 -0.04419757]]
mu,[[0.6114794]]
std,[[0.61200372]]
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
State :[[-0.72679603 -0.68685335 -0.04419757]]
action :[[-0.1337062]]
reward :[[-5.68577699]]
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
state,[[-0.07484268 -0.99719536 -0.52