In [1]:
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(version_base=None, config_path="conf"):
    cfg = compose(config_name="config", overrides=["device=cpu"])
    OmegaConf.resolve(cfg)

## Initialize

In [2]:
from src.common.environments import MujocoMjxEnv
from src.common.agents import PPOAgentTorcRL, PPOAgent
from src.common.util import save_as_onnx
from src.common.experiments import Go2WalkingExperiment
import torch as T

agent = PPOAgent(**cfg.agent).to(device="cpu", dtype=T.float32)
# agent = PPOAgentTorcRL(**cfg.agent).to(device="cpu", dtype=T.float32)
environment = MujocoMjxEnv(**cfg.environment)
experiment = Go2WalkingExperiment(mjx_model = environment.mjx_model, **cfg.experiment)

## Save model as onnx

In [3]:
save_as_onnx(agent.network, "./model_checkpoints/test.onnx", 37) # go2

W1204 14:05:59.495000 388 Lib\site-packages\torch\onnx\_internal\exporter\_registration.py:107] torchvision is not installed. Skipping torchvision::nms


[torch.onnx] Obtain model graph for `BasicPolicyValueNetwork([...]` with `torch.export.export(..., strict=False)`...
[torch.onnx] Obtain model graph for `BasicPolicyValueNetwork([...]` with `torch.export.export(..., strict=False)`... ✅
[torch.onnx] Run decomposition...
[torch.onnx] Run decomposition... ✅
[torch.onnx] Translate the graph into ONNX...
[torch.onnx] Translate the graph into ONNX... ✅


## calculate reward

In [7]:
environment.reset(0)
data = environment.mjx_data
experiment.calculate_reward(data)

tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
      

In [7]:
# Command	1.0 – 2.0
# Energy	0.01 – 0.1
# Contact	0.2 – 0.6 (per foot)
# Height	0.5 – 1.0
# Orientation	0.01 – 0.05
# Smoothness	0.01 – 0.05

In [12]:
import torch as T
from src.common.datastructure import UnrollData
from src.common.agents import PPOAgentTorcRL


ud = UnrollData(
    num_unrolls = 128,
    unroll_length = 500,
    observation_size = 37,
    action_size = 8
)

pg_net = PPOAgentTorcRL(
    observation_size=37,
    action_size=8,
    network_hidden_size=64,
    discounting=0.99,
    lambda_=0.97,
    epsilon=0.25,
    policy_loss_scale=1.0,
    value_loss_scale=0.5,
    entropy_loss_scale=0.01,

    
    num_hidden_layers=1,
    moving_average_window_size=10,
    reward_scaling=1.0,
    train_sequence_length=10
)


# # we now have a batch of data to work with. Let's learn something from it.

# # We'll need an "advantage" signal to make PPO work.
# # We re-compute it at each epoch as its value depends on the value
# # network which is updated in the inner loop.
# advantage_module(tensordict_data)
# data_view = tensordict_data.reshape(-1)
# replay_buffer.extend(data_view.cpu())
# for _ in range(frames_per_batch // sub_batch_size):
#     subdata = replay_buffer.sample(sub_batch_size)
#     loss_vals = loss_module(subdata.to(device))
#     loss_value = (
#         loss_vals["loss_objective"]
#         + loss_vals["loss_critic"]
#         + loss_vals["loss_entropy"]
#     )

#     # Optimization: backward, grad clipping and optimization step
#     loss_value.backward()
#     # this is not strictly mandatory but it's good practice to keep
#     # your gradient norm bounded
#     torch.nn.utils.clip_grad_norm_(loss_module.parameters(), max_grad_norm)
#     optim.step()
#     optim.zero_grad()

# logs["reward"].append(tensordict_data["next", "reward"].mean().item())
# pbar.update(tensordict_data.numel())
# cum_reward_str = (
#     f"average reward={logs['reward'][-1]: 4.4f} (init={logs['reward'][0]: 4.4f})"
# )
# logs["step_count"].append(tensordict_data["step_count"].max().item())
# stepcount_str = f"step count (max): {logs['step_count'][-1]}"
# logs["lr"].append(optim.param_groups[0]["lr"])
# lr_str = f"lr policy: {logs['lr'][-1]: 4.4f}"




# if i % 10 == 0:
#     # We evaluate the policy once every 10 batches of data.
#     # Evaluation is rather simple: execute the policy without exploration
#     # (take the expected value of the action distribution) for a given
#     # number of steps (1000, which is our ``env`` horizon).
#     # The ``rollout`` method of the ``env`` can take a policy as argument:
#     # it will then execute this policy at each step.
#     with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad():
#         # execute a rollout with the trained policy
#         eval_rollout = env.rollout(1000, policy_module)
#         logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
#         logs["eval reward (sum)"].append(
#             eval_rollout["next", "reward"].sum().item()
#         )
#         logs["eval step_count"].append(eval_rollout["step_count"].max().item())
#         eval_str = (
#             f"eval cumulative reward: {logs['eval reward (sum)'][-1]: 4.4f} "
#             f"(init: {logs['eval reward (sum)'][0]: 4.4f}), "
#             f"eval step-count: {logs['eval step_count'][-1]}"
#         )
#         del eval_rollout
# pbar.set_description(", ".join([eval_str, cum_reward_str, stepcount_str, lr_str]))


In [13]:
data = ud.as_tensor_dict()

with T.no_grad():
    data = pg_net.loss_module.actor_network(data)
    data = pg_net.gae(data)


# out = pg_net.loss_module(data)

# loss_value = (
#     out["loss_objective"]
#     + out["loss_critic"]
#     + out["loss_entropy"]
# )

# loss_value
loc, scale, action, log_prob = pg_net.loss_module.actor_network(T.rand(1, 1, 32))
# pg_net.loss_module.actor_network.dist_sample_keys

T.cat([loc,scale], dim=-1)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x32 and 37x37)