In [1]:
from ppo_agent import PPOAgent
import gymnasium as gym
import torch
from agent_configs import PPOConfig, PPOActorConfig, PPOCriticConfig
from game_configs import CartPoleConfig

# Build the CartPole-v1 environment used by the agent below; render_mode='rgb_array'
# is forwarded to gym.make as-is.
env = gym.make('CartPole-v1', render_mode='rgb_array')
# Shared PPO settings; this dict is handed to PPOConfig further down in the cell.
config_dict = dict(
    clip_param=0.2,
    activation='relu',
    kernel_initializer='orthogonal',
    # NORMALIZATION?
    discount_factor=0.99,
    gae_lambda=0.98,
    critic_dense_layers=[64, 64],
    actor_dense_layers=[64, 64],
    # REWARD CLIPPING
    steps_per_epoch=4800,
    train_policy_iterations=5,
    train_value_iterations=5,
    target_kl=0.02,
    entropy_coefficient=0.01,
    num_minibatches=4,
    loss_function=None,
)

# Optimizer settings for the actor; passed to PPOActorConfig further down in the cell.
actor_config_dict = dict(
    optimizer=torch.optim.Adam,
    learning_rate=5e-4,
    adam_epsilon=1e-7,
    clipnorm=0.5,
    loss_function=None,
)

# Optimizer settings for the critic; passed to PPOCriticConfig further down in the cell.
# Kept as an independent dict so the two networks can be tuned separately.
critic_config_dict = dict(
    optimizer=torch.optim.Adam,
    learning_rate=5e-4,
    adam_epsilon=1e-7,
    clipnorm=0.5,
    loss_function=None,
)

# Judging by the saved cell output, each config constructor prints the settings it
# uses ("Using ..." lines), so every label is printed right before its constructor.
print("Actor Config")
actor_config = PPOActorConfig(actor_config_dict)
print("Critic Config")
critic_config = PPOCriticConfig(critic_config_dict)

print("PPO Config")
# Combine the shared PPO settings with a CartPole game config and the two sub-configs.
config = PPOConfig(config_dict, CartPoleConfig(), actor_config=actor_config, critic_config=critic_config)

# Create the agent on the CartPole environment with the assembled config.
agent = PPOAgent(env, config=config, name='PPOAgent')


Actor Config
Using         adam_epsilon                  : 1e-07
Using         learning_rate                 : 0.0005
Using         clipnorm                      : 0.5
Using         optimizer                     : <class 'torch.optim.adam.Adam'>
Critic Config
Using         adam_epsilon                  : 1e-07
Using         learning_rate                 : 0.0005
Using         clipnorm                      : 0.5
Using         optimizer                     : <class 'torch.optim.adam.Adam'>
PPO Config
Using default save_intermediate_weights     : True
Using default training_steps                : 10000
Using default adam_epsilon                  : 1e-06
Using default learning_rate                 : 0.001
Using default clipnorm                      : 0
Using default optimizer                     : <class 'torch.optim.adam.Adam'>
Using         loss_function                 : None
Using         activation                    : relu
Using         kernel_initializer            : orthogonal
Usin

  logger.warn(


In [2]:
# Print every parameter tensor exposed by the agent's model, one after another.
for param in agent.model.parameters():
    print(param)

Parameter containing:
tensor([[-0.0188,  0.1328, -0.3588, -0.1511],
        [ 0.0210, -0.0698,  0.2872,  0.1027],
        [ 0.3353, -0.1770,  0.0415,  0.3463],
        [-0.3660,  0.2736,  0.1861,  0.1978],
        [-0.4334,  0.0354, -0.4449,  0.0904],
        [-0.1341,  0.2054,  0.2652, -0.3402],
        [ 0.3130, -0.1293, -0.2161, -0.0225],
        [ 0.3094,  0.2433, -0.4701,  0.4768],
        [ 0.1703,  0.4450,  0.3639, -0.0158],
        [ 0.3755, -0.0208,  0.2743,  0.3631],
        [-0.2859,  0.2027,  0.3840, -0.0775],
        [ 0.1772, -0.1554, -0.3849, -0.0368],
        [-0.4292,  0.1008,  0.2758, -0.4441],
        [ 0.0615,  0.0400,  0.1797, -0.1823],
        [ 0.0367,  0.0893, -0.3543,  0.2728],
        [-0.4383,  0.1561, -0.1111, -0.4471],
        [ 0.1336,  0.3687, -0.1922, -0.0667],
        [-0.1665, -0.3475, -0.3938,  0.2104],
        [-0.2057, -0.1326,  0.0188,  0.3214],
        [ 0.2277, -0.4257, -0.0897,  0.2079],
        [ 0.0521, -0.2148, -0.0887, -0.1913],
        [-0.

In [3]:
# Disabled sanity check: reset the environment and query one action from the agent.
# state, _ = env.reset()
# print(state)
# print(agent.select_action(state)[0])
# NOTE(review): the saved output of this cell ends with
# "RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn".
# Presumably a loss inside the agent's training code is built from tensors that are
# detached from the graph -- confirm in ppo_agent, which is not visible here.
agent.train()

Training Step:  0
torch.Size([1, 2])
tensor([[0.1027]])
torch.Size([1, 2])
tensor([[0.1056]])
torch.Size([1, 2])
tensor([[0.1027]])
torch.Size([1, 2])
tensor([[0.1060]])
torch.Size([1, 2])
tensor([[0.1072]])
torch.Size([1, 2])
tensor([[0.1068]])
torch.Size([1, 2])
tensor([[0.1077]])
torch.Size([1, 2])
tensor([[0.1002]])
torch.Size([1, 2])
tensor([[0.1085]])
torch.Size([1, 2])
tensor([[0.1015]])
torch.Size([1, 2])
tensor([[0.1035]])
torch.Size([1, 2])
tensor([[0.1027]])
torch.Size([1, 2])
tensor([[0.1100]])
torch.Size([1, 2])
tensor([[0.1113]])
torch.Size([1, 2])
tensor([[0.1100]])
torch.Size([1, 2])
tensor([[0.1026]])
torch.Size([1, 2])
tensor([[0.1088]])
torch.Size([1, 2])
tensor([[0.1133]])
torch.Size([1, 2])
tensor([[0.1077]])
torch.Size([1, 2])
tensor([[0.1121]])
torch.Size([1, 2])
tensor([[0.1089]])
torch.Size([1, 2])
tensor([[0.1106]])
torch.Size([1, 2])
tensor([[0.1103]])
torch.Size([1, 2])
tensor([[0.1047]])
torch.Size([1, 2])
tensor([[0.0935]])
torch.Size([1, 2])
tensor([[0.10

  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
# Run the agent's test routine (implemented outside this notebook; behavior not visible here).
agent.test()

In [None]:
import numpy as np
# Demo: zero out the probabilities of illegal actions using a 0/1 legality mask.
num_actions = 4
legal_moves = [0, 1, 2]
probabilities = torch.tensor([0.1, 0.1, 0.1, 0.1])

# int8 vector holding 1 at each legal action index and 0 everywhere else.
mask = np.zeros(num_actions, dtype=np.int8)
np.put(mask, legal_moves, 1)
print(mask)

# Overwrite, in place, every probability whose mask entry is 0.
probabilities[torch.from_numpy(mask == 0)] = 0
print(probabilities)


[1 1 1 0]
tensor([0.1000, 0.1000, 0.1000, 0.0000])


In [None]:
# Demo: softmax over the last dimension of a (1, 4) batch of logits.
logits = torch.tensor([[11.0, 10.0, 5.0, 3.0]])
print(logits.size())
print(torch.softmax(logits, dim=-1))

torch.Size([1, 4])
tensor([[7.2956e-01, 2.6839e-01, 1.8084e-03, 2.4474e-04]])


In [None]:
import argparse
import os
import random
import time
from distutils.util import strtobool

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter

class Agent(nn.Module):
    """Actor-critic module with a shared feature extractor.

    The methods read ``self.network``, ``self.actor`` and ``self.critic``.
    None of them is created in this excerpt (there is no ``__init__`` here),
    so they have to be attached to the instance by code that is not shown.
    """

    def get_value(self, x):
        """Return the critic's output for observations ``x``."""
        features = self.network(x)
        return self.critic(features)

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on observations ``x``.

        Args:
            x: Observations fed to ``self.network``.
            action: Optional actions to score. When ``None``, actions are
                sampled from the categorical policy instead.

        Returns:
            A tuple ``(action, log_prob, entropy, value)``: the sampled or
            given actions, their log-probabilities under the current policy,
            the policy entropy, and the critic's output.
        """
        features = self.network(x)
        policy = Categorical(logits=self.actor(features))
        # Only draw from the policy when the caller did not supply actions.
        chosen = policy.sample() if action is None else action
        return chosen, policy.log_prob(chosen), policy.entropy(), self.critic(features)


if __name__ == "__main__":
    # PPO training loop: collect a rollout, compute advantages/returns, then run
    # several epochs of minibatch updates on the combined policy/value/entropy loss.
    # NOTE(review): `args`, `envs` and `device` are not defined in the visible part of
    # this notebook, and `Agent` above has no `__init__` that builds `network`, `actor`
    # or `critic`; all of these must come from code that is not shown here.
    agent = Agent(envs).to(device)
    optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)

    # ALGO Logic: Storage setup
    # Rollout buffers indexed as [step, env]; obs/actions additionally carry the
    # per-environment observation/action space shape.
    obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device)
    actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
    logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device)
    rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
    dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
    values = torch.zeros((args.num_steps, args.num_envs)).to(device)

    # TRY NOT TO MODIFY: start the game
    global_step = 0
    # NOTE: start_time is recorded but not read again in this excerpt.
    start_time = time.time()
    next_obs = torch.Tensor(envs.reset()).to(device)
    next_done = torch.zeros(args.num_envs).to(device)
    # Number of rollout/update cycles (integer division drops any remainder).
    num_updates = args.total_timesteps // args.batch_size

    for update in range(1, num_updates + 1):
        # Annealing the rate if instructed to do so.
        # Linear decay: frac is 1.0 on the first update and 1/num_updates on the last.
        if args.anneal_lr:
            frac = 1.0 - (update - 1.0) / num_updates
            lrnow = frac * args.learning_rate
            optimizer.param_groups[0]["lr"] = lrnow

        # ---- Rollout: step every environment args.num_steps times ----
        for step in range(0, args.num_steps):
            global_step += 1 * args.num_envs
            # Store the observation/done flag the action below is conditioned on.
            obs[step] = next_obs
            dones[step] = next_done

            # ALGO LOGIC: action logic
            # No gradients are needed while acting; only the sampled action, its
            # log-probability and the value estimate are stored.
            with torch.no_grad():
                action, logprob, _, value = agent.get_action_and_value(next_obs)
                values[step] = value.flatten()
            actions[step] = action
            logprobs[step] = logprob

            # TRY NOT TO MODIFY: execute the game and log data.
            # The step result is unpacked as a 4-tuple (obs, reward, done, info).
            next_obs, reward, done, info = envs.step(action.cpu().numpy())
            rewards[step] = torch.tensor(reward).to(device).view(-1)
            next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)

        # bootstrap value if not done
        with torch.no_grad():
            # Value of the observation that follows the last stored step, shaped (1, -1).
            next_value = agent.get_value(next_obs).reshape(1, -1)
            if args.gae:
                # Generalized advantage estimation, computed backwards in time.
                advantages = torch.zeros_like(rewards).to(device)
                lastgaelam = 0
                for t in reversed(range(args.num_steps)):
                    # dones[t + 1] / next_done is the done flag stored for the step
                    # after t, so nextnonterminal is 0 where that flag is set and the
                    # bootstrap term is dropped.
                    if t == args.num_steps - 1:
                        nextnonterminal = 1.0 - next_done
                        nextvalues = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        nextvalues = values[t + 1]
                    # One-step TD residual.
                    delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                    # Discounted (gamma * gae_lambda) running sum of residuals.
                    advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam
                returns = advantages + values
            else:
                # Plain discounted returns, bootstrapped from next_value at the end.
                returns = torch.zeros_like(rewards).to(device)
                for t in reversed(range(args.num_steps)):
                    if t == args.num_steps - 1:
                        nextnonterminal = 1.0 - next_done
                        next_return = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        next_return = returns[t + 1]
                    returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
                advantages = returns - values

        # flatten the batch
        # Merge the [step, env] leading dimensions into a single batch dimension.
        b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
        b_logprobs = logprobs.reshape(-1)
        b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
        b_advantages = advantages.reshape(-1)
        b_returns = returns.reshape(-1)
        b_values = values.reshape(-1)

        # Optimizing the policy and value network
        b_inds = np.arange(args.batch_size)
        # NOTE: clipfracs is filled below but not read again in this excerpt.
        clipfracs = []
        for epoch in range(args.update_epochs):
            # New random minibatch partition of the batch for every epoch.
            np.random.shuffle(b_inds)
            for start in range(0, args.batch_size, args.minibatch_size):
                end = start + args.minibatch_size
                mb_inds = b_inds[start:end]

                # Re-evaluate the stored actions under the current parameters.
                _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions.long()[mb_inds])
                logratio = newlogprob - b_logprobs[mb_inds]
                # Probability ratio between the current policy and the rollout policy.
                ratio = logratio.exp()

                with torch.no_grad():
                    # calculate approx_kl http://joschu.net/blog/kl-approx.html
                    # NOTE: old_approx_kl is computed but not read again in this excerpt.
                    old_approx_kl = (-logratio).mean()
                    approx_kl = ((ratio - 1) - logratio).mean()
                    # Fraction of samples whose ratio left the [1 - clip, 1 + clip] band.
                    clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]

                mb_advantages = b_advantages[mb_inds]
                if args.norm_adv:
                    # Per-minibatch standardization; 1e-8 guards against a zero std.
                    mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

                # Policy loss
                # Clipped surrogate objective: the element-wise max of the two negated
                # terms keeps the larger (more pessimistic) loss.
                pg_loss1 = -mb_advantages * ratio
                pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef)
                pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                # Value loss
                newvalue = newvalue.view(-1)
                if args.clip_vloss:
                    # Limit the value prediction to within clip_coef of the rollout-time
                    # value, then keep the larger of the clipped/unclipped squared errors.
                    v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                    v_clipped = b_values[mb_inds] + torch.clamp(
                        newvalue - b_values[mb_inds],
                        -args.clip_coef,
                        args.clip_coef,
                    )
                    v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                    v_loss = 0.5 * v_loss_max.mean()
                else:
                    v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

                # Entropy enters the total loss with a negative sign, so higher entropy
                # lowers the loss.
                entropy_loss = entropy.mean()
                loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef

                optimizer.zero_grad()
                loss.backward()
                # Clip the global gradient norm before the optimizer step.
                nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                optimizer.step()

            # Early stop of the remaining epochs; approx_kl here is the value from the
            # last minibatch processed in this epoch.
            if args.target_kl is not None:
                if approx_kl > args.target_kl:
                    break

        # Explained variance of the returns by the rollout-time value estimates;
        # NaN when the returns have zero variance. Not used further in this excerpt.
        y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
        var_y = np.var(y_true)
        explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

    envs.close()
