In [6]:
# CONFIG NEEDED:
# - env
# - PPO config (as above), but for the env: input shape = obs shape + ACM output shape
# - ACM network config: input_shape, output_shape, embed shapes, log (bool), optimizer, learning rate, device, num layers
# - Memory buffer config: buffer size
# - Main model config: checkpoint path, training steps, print interval, steps per epoch, checkpoint interval

from ppo.ppo_agent import PPOAgent
import gymnasium as gym
import torch
from agent_configs import PPOConfig, ActorConfig, CriticConfig
from ACMconfig import ACModelconfig, ACMNconfig, ACMAconfig, Buffconfig, MHABconfig
from game_configs import CartPoleConfig
from acm_agent import ACMAgent
from acm_network import MHA
# CartPole-v1 environment, created with render_mode='rgb_array'.
env = gym.make('CartPole-v1', render_mode='rgb_array')

# NOTE(review): `device` is not referenced again in this cell -- the later
# calls pass torch.device('mps') / "mps" as separate literals. Consider
# reusing this single value instead.
device = torch.device('mps')

# Raw settings for the attention block; handed to MHABconfig further down.
MultiHeadAttentionBlockconfig = dict(
    # Optimiser settings.
    optimizer="Adam",
    learning_rate=1e-4,
    adam_epsilon=1e-7,
    clipnorm=0.5,
    # A single single-head attention layer over 1-dimensional embeddings.
    layers=[
        dict(
            type='MultiheadAttention',
            num_heads=1,
            dropout=0.1,
            embed_dim=1,
            bias=False,
            batch_first=True,
            kdim=1,
            vdim=1,
            add_zero_attn=False,
            add_bias_kv=False,
        ),
    ],
    # Presumably one flag per entry in `layers` -- TODO confirm against MHA.
    self_attention=[False],
)

# Raw settings for the ACM attention network; handed to ACMNconfig further down.
# Note that the name `ACMconfig` is rebound to the combined model config at the
# end of this cell.
ACMconfig = dict(
    input_shape=1,
    output_shape=1,
    embed_shapes=[4, 4, 1],
    log=False,
    optimizer="Adam",
    learning_rate=1e-3,
    adam_epsilon=1e-7,
    clipnorm=0.5,
    device=torch.device('mps'),
    num_layers=1,
)

# Raw settings for the combined ACM model; handed to ACMAconfig further down,
# which rebinds this same name to the resulting config object.
MainModelConfig = dict(
    checkpoint_path='checkpoints/ACMCartPole',
    training_steps=100000,
    print_interval=1000,
    steps_per_epoch=512,
    checkpoint_interval=5000,
    env=env,
    log=True,
)

# Size of the memory buffer; handed to Buffconfig further down.
MemoryBufferConfig = dict(buffer_size=1000)

# PPO hyperparameters; handed to PPOConfig further down.
config_dict = dict(
    activation='tanh',
    clip_param=0.2,
    kernel_initializer='orthogonal',
    # NORMALIZATION?
    discount_factor=0.99,
    gae_lambda=0.95,
    critic_dense_layers=[64],
    actor_dense_layers=[64],
    # REWARD CLIPPING
    steps_per_epoch=512,
    train_policy_iterations=4,
    train_value_iterations=4,
    target_kl=0.02,
    entropy_coefficient=0.01,
    num_minibatches=4,
    loss_function=None,
    observation_space=None,
)

# Optimiser settings for the actor network.
actor_config_dict = dict(
    optimizer=torch.optim.Adam,
    learning_rate=2.5e-4,
    adam_epsilon=1e-7,
    clipnorm=0.5,
    loss_function=None,
)

# The critic uses the same values; it gets its own copy so the two dicts can
# be edited independently.
critic_config_dict = dict(actor_config_dict)

# Wrap the raw dictionaries in their config classes and build the agents.
# The statement order is deliberate: several names below are rebound from the
# raw dict to the config object created from it.

# Actor / critic optimiser configs, then the PPO config that combines them.
actor_config = ActorConfig(actor_config_dict)
critic_config = CriticConfig(critic_config_dict)
config = PPOConfig(
    config_dict,
    CartPoleConfig(),
    actor_config=actor_config,
    critic_config=critic_config,
)

# ACM attention network config.
ACMNconfigdict = ACMNconfig(config=ACMconfig)

# Whole-model (attention + PPO) config.
MainModelConfig = ACMAconfig(MainModelConfig)

# Memory buffer config (for the attention network).
MemoryBufferConfig = Buffconfig(MemoryBufferConfig)

# Stand-alone multi-head attention module; used by the network tests below.
MultiHeadAttentionBlockconfig = MHABconfig(config=MultiHeadAttentionBlockconfig)
MultiHeadAgent = MHA(
    config=MultiHeadAttentionBlockconfig,
    device="mps",
    logging=False,
)

# Bundle every sub-config under the key names passed to ACModelconfig.
configurations = dict(
    ACMNconfig=ACMNconfigdict,
    ACMconfig=MainModelConfig,
    Buff=MemoryBufferConfig,
    PPOconfig=config,
    MHABconfig=MultiHeadAttentionBlockconfig,
)

# `ACMconfig` now refers to the combined config object, not the raw dict.
ACMconfig = ACModelconfig(configurations)

ACMAgentI = ACMAgent(configu=ACMconfig, device="mps")



Using         adam_epsilon                  : 1e-07
Using         learning_rate                 : 0.00025
Using         clipnorm                      : 0.5
Using         optimizer                     : <class 'torch.optim.adam.Adam'>
Using         adam_epsilon                  : 1e-07
Using         learning_rate                 : 0.00025
Using         clipnorm                      : 0.5
Using         optimizer                     : <class 'torch.optim.adam.Adam'>
Using default save_intermediate_weights     : False
Using default training_steps                : 10000
Using default adam_epsilon                  : 1e-06
Using default momentum                      : 0.9
Using default learning_rate                 : 0.001
Using default clipnorm                      : 0
Using default optimizer                     : <class 'torch.optim.adam.Adam'>
Using default weight_decay                  : 0.0
Using         loss_function                 : None
Using         activation                    : t

In [2]:
# IPython magic: use matplotlib's inline backend so figures are rendered in
# the notebook output.
%matplotlib inline

In [1]:
### WRITING TESTS ###
### NETWORK TESTS ###

# # Test the network's forward pass
# input = torch.randn(1, ACMAgentI.network.input_dim)
# history = torch.randn(5, ACMAgentI.network.input_dim)
# output = ACMAgentI.network.forward(input, history)
# assert output.shape == (1, ACMAgentI.network.output_dim), "Output shape mismatch"

# TESTING
# GIVEN INPUT VECTOR : 4
# HISTORY : 1, 3, 5, 7
# Target = Gaussian weighted average of 1,3,5,7 centered around 4
# updateplot = 10000
# plot = []
# for i in range(10000):

#     loss = []
#     for o in range(50):
#         input = torch.randn(1,1)
#         history = torch.randn(4,1)
#         weights = torch.exp(-0.5 * ((input - history) ** 2).sum(dim=1))
#         target = torch.sum(weights * history) / torch.sum(weights)
#         output = ACMAgentI.network.forward(input, history)
#         loss.append(torch.nn.HuberLoss()(output[0], target))
        
#     print("Epoch: ", i)
#     loss = torch.mean(torch.stack(loss))
#     plot.append(loss.item())
#     ACMAgentI.network.learn(loss)

# import matplotlib.pyplot as plt
# plt.plot(plot)
# plt.xlabel("Epochs")
# plt.ylabel("Loss")
# plt.title("ACM Network Loss")
# plt.show()

### WRITING TESTS ###
### NETWORK TESTS ###

# Smoke-test the attention module's forward pass on one random sample.
# The tensor is called `query` rather than `input` so the Python builtin
# `input` is not shadowed for the rest of the kernel session; the printed
# labels are unchanged.
query = torch.randn(1, 1)
history = torch.randn(5, 1)
output = MultiHeadAgent(query, history)
print("Input shape: ", query.shape)
print("History shape: ", history.shape)
print("Output shape: ", output.shape)
print("Output: ", output)
print("Input: ", query)
print("History: ", history)


# TESTING
# Each sample draws a random query of shape (1, 1) and a random history of
# shape (5, 1). The regression target is the Gaussian-weighted average of the
# history values, centred on the query.
# Example: query 4 with history 1, 3, 5, 7 -> average weighted towards 3 and 5.
N_EPOCHS = 10000          # number of optimisation steps
SAMPLES_PER_EPOCH = 100   # random (query, history) pairs averaged per step
LOG_EVERY = 1000          # print progress every this many epochs

# HuberLoss holds no state, so build it once instead of once per sample.
huber = torch.nn.HuberLoss()

plot = []  # mean loss per epoch, plotted after the loop
for i in range(N_EPOCHS):

    sample_losses = []
    for _ in range(SAMPLES_PER_EPOCH):
        query = torch.randn(1, 1)
        history = torch.randn(5, 1)
        # keepdim=True keeps `weights` at shape (5, 1) so it multiplies
        # `history` element-wise. Without it, `weights` is (5,) and
        # `weights * history` broadcasts to (5, 5), which makes the target
        # equal to history.sum() regardless of the query.
        weights = torch.exp(-0.5 * ((query - history) ** 2).sum(dim=1, keepdim=True))
        target = torch.sum(weights * history) / torch.sum(weights)
        output = MultiHeadAgent(query, history)
        # NOTE(review): `target` is a 0-dim tensor; this relies on it
        # broadcasting against `output` -- confirm the output shape of MHA.
        sample_losses.append(huber(output, target))

    if i % LOG_EVERY == 0:
        print("Epoch: ", i)
    loss = torch.mean(torch.stack(sample_losses))
    plot.append(loss.item())
    MultiHeadAgent.learn(loss)

import matplotlib.pyplot as plt

# Plot the per-epoch mean loss recorded in `plot` by the training loop.
fig, ax = plt.subplots()
ax.plot(plot)
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.set_title("ACM Network Loss")
plt.show()

NameError: name 'torch' is not defined

In [1]:
from ppo.ppo_agent import PPOAgent
import gymnasium as gym
import torch
from agent_configs import PPOConfig, ActorConfig, CriticConfig
from ACMconfig import ACModelconfig, ACMNconfig, ACMAconfig, Buffconfig, MHABconfig
from game_configs import CartPoleConfig
from acm_agent import ACMAgent
from acm_network import MHA
# CartPole-v1 environment, created with render_mode='rgb_array'.
env = gym.make('CartPole-v1', render_mode='rgb_array')

# NOTE(review): `device` is not referenced again in this cell -- MHA is given
# "mps" and PPOAgent is given "cpu" as separate literals further down.
device = torch.device('mps')

def _attention_layer(embed_dim):
    """Return the settings dict for one single-head attention layer.

    Only ``embed_dim`` differs between the attention layers of this block;
    ``kdim`` and ``vdim`` are 12 for every layer.
    """
    return {
        'type': 'MultiheadAttention',
        'num_heads': 1,
        'dropout': 0.1,
        'embed_dim': embed_dim,
        'bias': False,
        'batch_first': True,
        'kdim': 12,
        'vdim': 12,
        'add_zero_attn': False,
        'add_bias_kv': False,
    }


def _linear_layer(width):
    """Return the settings dict for a square, biased linear layer."""
    return {
        'type': 'Linear',
        'in_features': width,
        'out_features': width,
        'bias': True,
    }


# Raw settings for the attention block; handed to MHABconfig further down.
# Layout: three (attention -> linear) pairs with widths 12, 12 and 4.
MultiHeadAttentionBlockconfig = {
    'optimizer': "Adam",
    'learning_rate': 1e-4,
    'adam_epsilon': 1e-7,
    'clipnorm': 0.5,
    'layers': [
        _attention_layer(12),
        _linear_layer(12),
        _attention_layer(12),
        _linear_layer(12),
        _attention_layer(4),
        _linear_layer(4),
    ],
    # Presumably one flag per entry in `layers` -- TODO confirm against MHA.
    'self_attention': [True, False, True, False, False, False],
}

# Size of the memory buffer; handed to Buffconfig further down.
MemoryBufferConfig = dict(buffer_size=1000)

# PPO hyperparameters; handed to PPOConfig further down.
config_dict = dict(
    activation='tanh',
    clip_param=0.2,
    kernel_initializer='orthogonal',
    # NORMALIZATION?
    discount_factor=0.99,
    gae_lambda=0.95,
    critic_dense_layers=[64],
    actor_dense_layers=[64],
    # REWARD CLIPPING
    steps_per_epoch=512,
    train_policy_iterations=4,
    train_value_iterations=4,
    target_kl=0.02,
    entropy_coefficient=0.01,
    num_minibatches=1,
    loss_function=None,
    observation_space=env.observation_space,
)

# Optimiser settings for the actor network.
actor_config_dict = dict(
    optimizer=torch.optim.Adam,
    learning_rate=2.5e-6,
    adam_epsilon=1e-7,
    clipnorm=0.5,
    loss_function=None,
)

# The critic uses the same values; it gets its own copy so the two dicts can
# be edited independently.
critic_config_dict = dict(actor_config_dict)

# Wrap the raw dictionaries in their config classes and build the agent.
# The statement order is deliberate: two names below are rebound from the raw
# dict to the config object created from it.

# Actor / critic optimiser configs, then the PPO config that combines them.
actor_config = ActorConfig(actor_config_dict)
critic_config = CriticConfig(critic_config_dict)
config = PPOConfig(
    config_dict,
    CartPoleConfig(),
    actor_config=actor_config,
    critic_config=critic_config,
)

# Memory buffer config (for the attention network).
MemoryBufferConfig = Buffconfig(MemoryBufferConfig)

# Attention block config plus a stand-alone attention module built from it.
# Only the config, not `MultiHeadAgent`, is passed to PPOAgent below.
MultiHeadAttentionBlockconfig = MHABconfig(config=MultiHeadAttentionBlockconfig)
MultiHeadAgent = MHA(
    config=MultiHeadAttentionBlockconfig,
    device="mps",
    logging=False,
)

# PPO agent that receives the buffer and attention configs.
Agent = PPOAgent(
    env=env,
    config=config,
    Buffconfig=MemoryBufferConfig,
    MHAconfig=MultiHeadAttentionBlockconfig,
    device="cpu",
    name="PPOWithAttention",
)



Using         adam_epsilon                  : 1e-07
Using         learning_rate                 : 2.5e-06
Using         clipnorm                      : 0.5
Using         optimizer                     : <class 'torch.optim.adam.Adam'>
Using         adam_epsilon                  : 1e-07
Using         learning_rate                 : 2.5e-06
Using         clipnorm                      : 0.5
Using         optimizer                     : <class 'torch.optim.adam.Adam'>
Using default save_intermediate_weights     : False
Using default training_steps                : 10000
Using default adam_epsilon                  : 1e-06
Using default momentum                      : 0.9
Using default learning_rate                 : 0.001
Using default clipnorm                      : 0
Using default optimizer                     : <class 'torch.optim.adam.Adam'>
Using default weight_decay                  : 0.0
Using         loss_function                 : None
Using         activation                    : t

  logger.warn(


In [2]:
# Run training on the PPOAgent built in the previous cell.
# NOTE(review): in the recorded run the trailing columns of the logged INPUTS
# tensor grow from roughly 1e-1 to roughly 1e12 within a few updates, the
# critic loss jumps from ~6e2 to ~2.7e4, the reported KL divergence becomes 0,
# and the run ends with a KeyboardInterrupt. Presumably the attention features
# appended to the observation are unbounded -- investigate before trusting
# these results.
Agent.train()

Resuming training at step 1 / 10000
Training step: 1/10000


  self.MBuff.add(torch.from_numpy(state), torch.tensor(action), torch.tensor(reward), torch.tensor(next_state), torch.tensor(log_probability), torch.tensor(0 if (terminated or truncated) else 1))


INPUTS SHAPE torch.Size([512, 8])
INPUTS tensor([[-0.0038, -0.4163, -0.1118,  ..., -0.2749, -0.1788, -0.5178],
        [-0.6852, -0.5715, -0.1133,  ..., -0.2749, -0.1789, -0.5178],
        [ 0.0731,  0.1835,  0.0431,  ..., -0.2748, -0.1785, -0.5175],
        ...,
        [ 0.2099,  0.0974,  0.1722,  ..., -0.2749, -0.1796, -0.5184],
        [ 0.0178, -0.6204,  0.0024,  ..., -0.2748, -0.1786, -0.5176],
        [ 0.0291,  0.5348, -0.0056,  ..., -0.2749, -0.1787, -0.5177]],
       grad_fn=<CatBackward0>)
Open AI Spinning Up KL Divergence tensor(-2.2624e-06)
37 Implimentation Details KL Divergence tensor(1.9616e-08)
INPUTS SHAPE torch.Size([512, 8])
INPUTS tensor([[-3.7649e-03, -4.1633e-01, -1.1181e-01,  ...,  6.9030e+04,
         -8.4216e+04,  6.0938e+04],
        [-6.8520e-01, -5.7145e-01, -1.1328e-01,  ...,  2.3012e+01,
         -3.3232e+01,  1.7773e+01],
        [ 7.3068e-02,  1.8348e-01,  4.3083e-02,  ...,  7.0530e+04,
         -8.6046e+04,  6.2262e+04],
        ...,
        [ 2.0985e-

                                                   

Moviepy - Done !
Moviepy - video ready checkpoints/PPOWithAttention/step_0/videos/PPOWithAttention/0/PPOWithAttention-episode-4.mp4
score:  10.0




{'score': [{'score': 44.0}, {'score': 25.0}, {'score': 29.0}, {'score': 18.0}, {'score': 109.0}, {'score': 46.0}, {'score': 37.0}, {'score': 129.0}, {'score': 13.0}, {'score': 33.0}, {'score': 29.0}], 'actor_loss': [tensor(1.9616e-08), tensor(7.2323), tensor(7.2323), tensor(6.5699)], 'critic_loss': [tensor(601.9719), tensor(27194.1328), tensor(24109.5820), tensor(6720.5293)], 'test_score': [{'score': 9.0, 'max_score': 10.0, 'min_score': 8.0}]}
{'score': 475.0, 'test_score': 475.0, 'actor_loss': 0.02}
[{'score': 44.0}, {'score': 25.0}, {'score': 29.0}, {'score': 18.0}, {'score': 109.0}, {'score': 46.0}, {'score': 37.0}, {'score': 129.0}, {'score': 13.0}, {'score': 33.0}, {'score': 29.0}]
[{'score': 9.0, 'max_score': 10.0, 'min_score': 8.0}]


  axs[row][col].set_xlim(1, len(values))
  self.MBuff.add(torch.from_numpy(state), torch.tensor(action), torch.tensor(reward), torch.tensor(next_state), torch.tensor(log_probability), torch.tensor(0 if (terminated or truncated) else 1))


INPUTS SHAPE torch.Size([512, 8])
INPUTS tensor([[-2.1443e-02, -5.5325e-01, -7.4756e-03,  ..., -1.3603e+05,
          1.1704e+05, -1.3648e+05],
        [-3.1067e-03, -4.1113e-01,  6.4681e-03,  ..., -1.2543e+05,
          1.0791e+05, -1.2584e+05],
        [-5.9667e-02, -7.8768e-01,  1.7657e-02,  ..., -1.3779e+05,
          1.1856e+05, -1.3825e+05],
        ...,
        [-1.0576e-01, -1.3759e+00,  7.6173e-02,  ..., -1.3603e+05,
          1.1704e+05, -1.3648e+05],
        [-3.9333e-02, -7.5800e-01,  5.2240e-02,  ..., -1.3426e+05,
          1.1552e+05, -1.3470e+05],
        [-7.8813e-02, -9.3678e-01,  9.5210e-02,  ..., -1.3249e+05,
          1.1400e+05, -1.3293e+05]], grad_fn=<CatBackward0>)
Open AI Spinning Up KL Divergence tensor(0.)
37 Implimentation Details KL Divergence tensor(0.)
INPUTS SHAPE torch.Size([512, 8])
INPUTS tensor([[-2.1443e-02, -5.5325e-01, -7.4756e-03,  ..., -2.5193e+05,
          2.0453e+05, -2.5264e+05],
        [-3.1067e-03, -4.1113e-01,  6.4681e-03,  ..., -2.5760e+

  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


INPUTS SHAPE torch.Size([512, 8])
INPUTS tensor([[-2.8105e-02, -1.0213e+00,  8.4636e-02,  ..., -2.4989e+12,
          1.4704e+12, -2.5061e+12],
        [-2.6832e-02, -1.4761e-01,  5.9091e-03,  ..., -2.4863e+12,
          1.4629e+12, -2.4935e+12],
        [ 3.4652e-02, -3.1891e-02, -2.6006e-02,  ..., -2.4737e+12,
          1.4555e+12, -2.4808e+12],
        ...,
        [-1.4271e-01, -1.7061e+00,  1.4822e-01,  ..., -2.5020e+12,
          1.4722e+12, -2.5092e+12],
        [-2.8422e-02, -2.6543e-03,  3.0006e-02,  ..., -2.4800e+12,
          1.4592e+12, -2.4872e+12],
        [-1.8335e-01, -1.7829e+00,  1.7208e-01,  ..., -2.4548e+12,
          1.4444e+12, -2.4619e+12]], grad_fn=<CatBackward0>)
Open AI Spinning Up KL Divergence tensor(0.)
37 Implimentation Details KL Divergence tensor(0.)
INPUTS SHAPE torch.Size([512, 8])
INPUTS tensor([[-2.8105e-02, -1.0213e+00,  8.4636e-02,  ..., -2.8335e+12,
          1.6418e+12, -2.8416e+12],
        [-2.6832e-02, -1.4761e-01,  5.9091e-03,  ..., -2.7775e+

KeyboardInterrupt: 