In [17]:


class PolicyGradient(object):
    def __init__(self, env, num_iterations=300, batch_size=2000, max_ep_len=200, output_path="../results/"):
        self.output_path = output_path
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        self.env = env
        self.observation_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n
        self.gamma = 0.9
        self.num_iterations = num_iterations
        self.batch_size = batch_size
        self.max_ep_len = max_ep_len
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=3e-2)
        self.policy_net = PolicyNet(input_size=self.observation_dim, output_size=self.action_dim)
        self.baseline_net = BaselineNet(input_size=self.observation_dim, output_size=1)
        
    def play_games(self, env=None, num_episodes = None):
        episode = 0
        episode_rewards = []
        paths = []
        t = 0
        if not env:
            env = self.env

        while (num_episodes or t < self.batch_size):
            state = env.reset()
            states, actions, rewards = [], [], []
            episode_reward = 0

            for step in range(self.max_ep_len):
                states.append(state)
                action = self.policy_net.sampel_action(np.atleast_2d(state))[0]
                state, reward, done, _ = env.step(action)
                actions.append(action)
                rewards.append(reward)
                episode_reward += reward
                t += 1
                env.render()
                if (done or step == self.max_ep_len-1):
                    episode_rewards.append(episode_reward)
                    break
                if (not num_episodes) and t == self.batch_size:
                    break

            path = {"observation": np.array(states),
                    "reward": np.array(rewards),
                    "action": np.array(actions)}
            paths.append(path)
            episode += 1
            if num_episodes and episode >= num_episodes:
                break
        return paths, episode_rewards
    
    def get_advantage(self, returns, observations):
        values = self.baseline_net.forward(observations).numpy()
        advantages = returns - values
        advantages = (advantages-np.mean(advantages)) / np.sqrt(np.sum(advantages**2))
        return advantages
    
    def update_policy(self, observations, actions, advantages):
        observations = tf.convert_to_tensor(observations)
        actions = tf.convert_to_tensor(actions)
        advantages = tf.convert_to_tensor(advantages)
        with tf.GradientTape() as tape:
            log_prob = self.policy_net.action_distribution(observations).log_prob(actions)
            loss = -tf.math.reduce_mean(log_prob * tf.cast(advantages, tf.float32))
        grads = tape.gradient(loss, self.policy_net.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.policy_net.model.trainable_weights))
        
    def train(self):
        all_total_rewards = []
        averaged_total_rewards = []
        for t in range(self.num_iterations):
            paths, total_rewards = self.play_games()
            all_total_rewards.extend(total_rewards)
            observations = np.concatenate([path["observation"] for path in paths])
            actions = np.concatenate([path["action"] for path in paths])
            returns = self.get_returns(paths)
            advantages = self.get_advantage(returns, observations)
            self.baseline_net.update(observations=observations, target=returns)
            self.update_policy(observations, actions, advantages)
            avg_reward = np.mean(total_rewards)
            averaged_total_rewards.append(avg_reward)
            print("Average reward for batch {}: {:04.2f}".format(t,avg_reward))
        print("Training complete")
        np.save(self.output_path+ "rewards.npy", averaged_total_rewards)
        export_plot(averaged_total_rewards, "Reward", "CartPole-v0", self.output_path + "rewards.png")
        
    def eval(self, env, num_episodes=1):
        paths, rewards = self.play_games(env, num_episodes)
        avg_reward = np.mean(rewards)
        print("Average eval reward: {:04.2f}".format(avg_reward))
        return avg_reward

    
    def get_returns(self, paths):
        all_returns = []
        for path in paths:
            rewards = path["reward"]
            returns = []
            reversed_rewards = np.flip(rewards,0)
            g_t = 0
            for r in reversed_rewards:
                g_t = r + self.gamma*g_t
                returns.insert(0, g_t)
            all_returns.append(returns)
        returns = np.concatenate(all_returns)
        return returns

In [6]:
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np

class AC():

    def __init__(self, GAMMA=0.99, UB=1, LB = -1, CLR = 1e-3, ALR = 1e-3):

        self.num_states = 3
        self.num_actions = 1
        self.GAMMA = GAMMA
        self.UPPER_BOUND = UB
        self.LOWER_BOUND = LB
        self.CRITIC_LR = CLR
        self.ACTOR_LR = ALR

    def Actor(self):

        inputs = tf.keras.layers.Input(shape=(self.num_states,))
        out = tf.keras.layers.Dense(64, activation="relu")(inputs)
        outputs = tf.keras.layers.Dense(self.num_actions)(out)

        model = tf.keras.Model(inputs, outputs)
        
        return model
    
    def make_distribution(self, actor_model, observations):
        
        logits = actor_model(observations)

        return tfp.distributions.Categorical(logits=logits)
    
    def sample_action(self, actor_model, observations):
        
        sampled_actions = self.make_distribution(actor_model, observations).sample().numpy()
        
        return sampled_actions
    

    def Critic(self):
        state_input = tf.keras.layers.Input(shape=(self.num_states))
        
        state_out = tf.keras.layers.Dense(16, activation="relu")(state_input)
        state_out = tf.keras.layers.Dense(32, activation="relu")(state_out)

        outputs = tf.keras.layers.Dense(1)(state_out)

        model = tf.keras.Model(state_input, outputs)
        
        return model


    def initialize(self):
        
        actor = self.Actor()
        
        critic = self.Critic()
        
        critic_optimizer = tf.keras.optimizers.Adam(self.CRITIC_LR)
        actor_optimizer = tf.keras.optimizers.Adam(self.ACTOR_LR)
        
        
        return [actor, critic, critic_optimizer, actor_optimizer]
    
class Buffer:
    def __init__(self, agent, batch_size=64):
        # Number of "experiences" to remember
        self.batch_size = batch_size
        self.buffer_counter = 0

        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []

    def record(self, obs_tuple):
        # Set index to zero if buffer_capacity is exceeded,
        # replacing old records
        
        self.states.insert(0, obs_tuple[0])
        self.actions.insert(0, obs_tuple[1])
        self.rewards.insert(0, obs_tuple[2])
        self.next_states.insert(0, obs_tuple[3])
    
    def forget(self):
        
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        

    @tf.function
    def update(self, state_batch, action_batch, reward_batch, next_state_batch):
        
        with tf.GradientTape() as tape:
            
            b = critic_model(state_batch, training=True)
            
            critic_loss = tf.math.reduce_mean(tf.math.square(reward_batch - b))

        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad, critic_model.trainable_variables)
        )

        with tf.GradientTape() as tape:
            
            log_prob = agent.make_distribution(actor_model, state_batch).log_prob(action_batch)
            
            b = critic_model(state_batch, training=True)
            
            actor_loss = -tf.math.reduce_mean(log_prob * tf.cast(reward_batch - b, tf.float32))

        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_optimizer.apply_gradients(
            zip(actor_grad, actor_model.trainable_variables)
        )

        
    def learn(self, agent):
        # Get sampling range
        
        for i in range(len(self.rewards)//self.batch_size):
            state_batch = tf.convert_to_tensor(self.states[i*self.batch_size:((i+1)*self.batch_size)+1])
            action_batch = tf.convert_to_tensor(self.actions[i*self.batch_size:((i+1)*self.batch_size)+1])
            reward_batch = tf.convert_to_tensor(self.rewards[i*self.batch_size:((i+1)*self.batch_size)+1])
            reward_batch = tf.cast(reward_batch, dtype=tf.float32)
            next_state_batch = tf.convert_to_tensor(self.next_states[i*self.batch_size:((i+1)*self.batch_size)+1])
    
            self.update(agent, state_batch, action_batch, reward_batch, next_state_batch)
        

In [7]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_probability as tfp

problem = "CartPole-v1"
env = gym.make(problem)

agent = AC(ALR = 2e-2, CLR = 3e-2)
agent.num_states = 4
agent.num_actions = 2

actor_model, critic_model, critic_optimizer, actor_optimizer = agent.initialize()

MAX_EPISODES = 5000

buffer = Buffer(agent, 4)
Gt = 0

for ep in range(MAX_EPISODES):
    prev_state = env.reset()
    episodic_reward = 0
    while True:
        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
        prob = agent.make_distributions(actor_model, tf_prev_state)
        action = agent.sample_action(actor_model, tf_prev_state)
        state, reward, done, _ = env.step(action[0])
        
        
        Gt += reward
        
        
        buffer.record([prev_state, prob, Gt, state])
        
        
        prev_state = state
        env.render()
        
        if done:
            buffer.learn(agent)
            if ep % 10 == 0:
                print("For episode {} the finale reward-to-go is {}".format(ep, Gt))
            Gt = 0
            buffer.forget()
            break

        if Gt > 200:
            print('Goal is reached! saving results..')
            actor_model.save_weights("cartpole_actor.h5")
            critic_model.save_weights("cartpole_critic.h5")
            break


For episode 0 the finale reward-to-go is 17.0
For episode 10 the finale reward-to-go is 15.0
For episode 20 the finale reward-to-go is 10.0
For episode 30 the finale reward-to-go is 9.0
For episode 40 the finale reward-to-go is 10.0
For episode 50 the finale reward-to-go is 11.0
For episode 60 the finale reward-to-go is 10.0
For episode 70 the finale reward-to-go is 10.0
For episode 80 the finale reward-to-go is 9.0
For episode 90 the finale reward-to-go is 9.0
For episode 100 the finale reward-to-go is 12.0
For episode 110 the finale reward-to-go is 44.0
For episode 120 the finale reward-to-go is 10.0
For episode 130 the finale reward-to-go is 9.0
For episode 140 the finale reward-to-go is 9.0
For episode 150 the finale reward-to-go is 10.0
For episode 160 the finale reward-to-go is 9.0
For episode 170 the finale reward-to-go is 8.0
For episode 180 the finale reward-to-go is 9.0
For episode 190 the finale reward-to-go is 10.0
For episode 200 the finale reward-to-go is 10.0
For episode

For episode 1720 the finale reward-to-go is 10.0
For episode 1730 the finale reward-to-go is 10.0
For episode 1740 the finale reward-to-go is 9.0
For episode 1750 the finale reward-to-go is 9.0
For episode 1760 the finale reward-to-go is 9.0
For episode 1770 the finale reward-to-go is 10.0
For episode 1780 the finale reward-to-go is 9.0
For episode 1790 the finale reward-to-go is 10.0
For episode 1800 the finale reward-to-go is 10.0
For episode 1810 the finale reward-to-go is 9.0
For episode 1820 the finale reward-to-go is 9.0
For episode 1830 the finale reward-to-go is 8.0
For episode 1840 the finale reward-to-go is 8.0
For episode 1850 the finale reward-to-go is 9.0
For episode 1860 the finale reward-to-go is 9.0
For episode 1870 the finale reward-to-go is 8.0
For episode 1880 the finale reward-to-go is 8.0
For episode 1890 the finale reward-to-go is 9.0
For episode 1900 the finale reward-to-go is 8.0
For episode 1910 the finale reward-to-go is 10.0
For episode 1920 the finale reward

For episode 3410 the finale reward-to-go is 10.0
For episode 3420 the finale reward-to-go is 9.0
For episode 3430 the finale reward-to-go is 10.0
For episode 3440 the finale reward-to-go is 8.0
For episode 3450 the finale reward-to-go is 9.0
For episode 3460 the finale reward-to-go is 10.0
For episode 3470 the finale reward-to-go is 8.0
For episode 3480 the finale reward-to-go is 10.0
For episode 3490 the finale reward-to-go is 9.0
For episode 3500 the finale reward-to-go is 9.0
For episode 3510 the finale reward-to-go is 9.0
For episode 3520 the finale reward-to-go is 11.0
For episode 3530 the finale reward-to-go is 9.0
For episode 3540 the finale reward-to-go is 10.0
For episode 3550 the finale reward-to-go is 9.0
For episode 3560 the finale reward-to-go is 10.0
For episode 3570 the finale reward-to-go is 8.0
For episode 3580 the finale reward-to-go is 10.0
For episode 3590 the finale reward-to-go is 9.0
For episode 3600 the finale reward-to-go is 9.0
For episode 3610 the finale rewa