In [1]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra config named "config" from the `conf` directory with the
# device overridden to CPU, then resolve OmegaConf interpolations in place.
with initialize(config_path="conf", version_base=None):
    cfg = compose(
        overrides=["device=cpu"],
        config_name="config",
    )
    OmegaConf.resolve(cfg)

In [2]:
# Display the composed config (its interpolations were resolved in the previous cell).
cfg

{'optimizer': {'lr': 0.008}, 'lr_scheduler': {'linear_lr_warmup': {'start_factor': 0.25, 'end_factor': 1.0, 'total_iters': 20}, 'cosine_annealing_warm_restarts': {'T_0': 40, 'T_mult': 1, 'eta_min': 1e-05}}, 'train_epochs': 500, 'num_parallel_environments': 200, 'device': 'cpu', 'random_seed': 0, 'experiment_name': 'standing_up', 'xla_gpu_memory_fraction': 0.6, 'environment': {'num_parallel_environments': 200, 'sim_frames_per_step': 5, 'mujoco_timestep': 0.005, 'model_path': './external/unitree_mj_models/go2/scene.xml', 'initial_noise_scale': 0.1, 'observation_size': 37, 'action_size': 12}, 'experiment': {'body_name': 'base_link', 'body_angle_reward_scale': 1.0, 'body_height_reward_scale': 1.0, 'energy_reward_scale': 1.0, 'distance_from_origin_reward_scale': 1.0, 'joint_limit_reward_scale': 1.0}, 'agent': {'observation_size': 37, 'action_size': 12, 'train_sequence_length': 200, 'network_hidden_size': 16, 'num_hidden_layers': 0, 'lambda_': 0.99, 'epsilon': 0.3, 'discounting': 0.97, 'rewa

In [3]:
from unitree_robot.common.environments import MujocoMjxEnv

# Build the MJX environment from the `environment` section of the resolved config;
# every key of that section is forwarded as a keyword argument.
environment = MujocoMjxEnv(**cfg.environment)
# List the attribute names available on the environment's initial MJX data object.
# NOTE(review): `__dir__()` is called directly, so the list is unsorted and includes
# dunder/private names; this is exploratory output, not something later cells consume.
environment.mjx_data_initial.__dir__()

['time',
 'qpos',
 'qvel',
 'act',
 'qacc_warmstart',
 'plugin_state',
 'ctrl',
 'qfrc_applied',
 'xfrc_applied',
 'eq_active',
 'mocap_pos',
 'mocap_quat',
 'qacc',
 'act_dot',
 'userdata',
 'sensordata',
 'xpos',
 'xquat',
 'xmat',
 'xipos',
 'ximat',
 'xanchor',
 'xaxis',
 'ten_length',
 'geom_xpos',
 'geom_xmat',
 'site_xpos',
 'site_xmat',
 'cam_xpos',
 'cam_xmat',
 'subtree_com',
 'cvel',
 'qfrc_bias',
 'qfrc_gravcomp',
 'qfrc_fluid',
 'qfrc_passive',
 'qfrc_actuator',
 'actuator_force',
 'qfrc_smooth',
 'qacc_smooth',
 'qfrc_constraint',
 'qfrc_inverse',
 '_impl',
 '__module__',
 '__firstlineno__',
 '__annotations__',
 '__doc__',
 'impl',
 '__getattr__',
 '__getitem__',
 '__static_attributes__',
 '__dataclass_params__',
 '__dataclass_fields__',
 '__replace__',
 '__hash__',
 '__init__',
 '__repr__',
 '__eq__',
 '__setattr__',
 '__delattr__',
 '__match_args__',
 'replace',
 'bind',
 '__init_subclass__',
 'fields',
 'tree_replace',
 '__dict__',
 '__weakref__',
 '__new__',
 '__str__

In [138]:
import numpy as np
import string
from unitree_robot.common.experiments import Experiment
from mujoco.mjx import Data as MjxData
from mujoco.mjx import Model as MjxModel


In [139]:

# Scratch notes kept from exploration:
# def food_contact(data: MjxData):
#     return data.
#
# energy_reward(environment.mjx_data_initial)
# environment.mjx_data_initial.xpos

# NOTE(review): `Go2WalkingExperiment` is not imported in any visible cell (only
# `Experiment` is imported from unitree_robot.common.experiments), so this cell
# presumably relies on leftover kernel state — confirm and add the import.
experiment = Go2WalkingExperiment(
    environment.mjx_model,
    torso_name="base_link",
    energy_reward_scale=0.1,
    torso_height_reward_scale=1.0,
    torso_distance_from_origin_reward_scale=1.0,
)

# mapper.parse_mjx_data(environment.mjx_data_initial)

# Evaluate the reward on the environment's initial MJX data; the value is the cell output.
experiment.calculate_reward(environment.mjx_data_initial)

Array(0., dtype=float32)

In [1]:
# Command	1.0 – 2.0
# Energy	0.01 – 0.1
# Contact	0.2 – 0.6 (per foot)
# Height	0.5 – 1.0
# Orientation	0.01 – 0.05
# Smoothness	0.01 – 0.05

In [1]:
import torch as T
from unitree_robot.common.datastructure import UnrollData
from unitree_robot.common.agents import PPOAgentTorcRL


# Dimensions shared by the rollout container and the agent. They must agree:
# a later cell passes `ud.as_tensor_dict()` through the agent's actor network.
OBSERVATION_SIZE = 32
ACTION_SIZE = 8

# Rollout container sized for this smoke test.
# NOTE(review): presumably holds `num_unrolls` sequences of `unroll_length` steps —
# confirm against the UnrollData implementation.
ud = UnrollData(
    num_unrolls=128,
    unroll_length=500,
    observation_size=OBSERVATION_SIZE,
    action_size=ACTION_SIZE,
)

# PPO agent under test. Keyword names and values are unchanged from the original
# cell; only the two shared dimensions now come from the constants above.
pg_net = PPOAgentTorcRL(
    observation_size=OBSERVATION_SIZE,
    action_size=ACTION_SIZE,
    network_hidden_size=64,
    discounting=0.99,
    lambda_=0.97,
    epsilon=0.25,
    policy_loss_scale=1.0,
    value_loss_scale=0.5,
    entropy_loss_scale=0.01,
    num_hidden_layers=1,
    moving_average_window_size=10,
    reward_scaling=1.0,
    train_sequence_length=10,
)




# # we now have a batch of data to work with. Let's learn something from it.

# # We'll need an "advantage" signal to make PPO work.
# # We re-compute it at each epoch as its value depends on the value
# # network which is updated in the inner loop.
# advantage_module(tensordict_data)
# data_view = tensordict_data.reshape(-1)
# replay_buffer.extend(data_view.cpu())
# for _ in range(frames_per_batch // sub_batch_size):
#     subdata = replay_buffer.sample(sub_batch_size)
#     loss_vals = loss_module(subdata.to(device))
#     loss_value = (
#         loss_vals["loss_objective"]
#         + loss_vals["loss_critic"]
#         + loss_vals["loss_entropy"]
#     )

#     # Optimization: backward, grad clipping and optimization step
#     loss_value.backward()
#     # this is not strictly mandatory but it's good practice to keep
#     # your gradient norm bounded
#     torch.nn.utils.clip_grad_norm_(loss_module.parameters(), max_grad_norm)
#     optim.step()
#     optim.zero_grad()

# logs["reward"].append(tensordict_data["next", "reward"].mean().item())
# pbar.update(tensordict_data.numel())
# cum_reward_str = (
#     f"average reward={logs['reward'][-1]: 4.4f} (init={logs['reward'][0]: 4.4f})"
# )
# logs["step_count"].append(tensordict_data["step_count"].max().item())
# stepcount_str = f"step count (max): {logs['step_count'][-1]}"
# logs["lr"].append(optim.param_groups[0]["lr"])
# lr_str = f"lr policy: {logs['lr'][-1]: 4.4f}"




# if i % 10 == 0:
#     # We evaluate the policy once every 10 batches of data.
#     # Evaluation is rather simple: execute the policy without exploration
#     # (take the expected value of the action distribution) for a given
#     # number of steps (1000, which is our ``env`` horizon).
#     # The ``rollout`` method of the ``env`` can take a policy as argument:
#     # it will then execute this policy at each step.
#     with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad():
#         # execute a rollout with the trained policy
#         eval_rollout = env.rollout(1000, policy_module)
#         logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
#         logs["eval reward (sum)"].append(
#             eval_rollout["next", "reward"].sum().item()
#         )
#         logs["eval step_count"].append(eval_rollout["step_count"].max().item())
#         eval_str = (
#             f"eval cumulative reward: {logs['eval reward (sum)'][-1]: 4.4f} "
#             f"(init: {logs['eval reward (sum)'][0]: 4.4f}), "
#             f"eval step-count: {logs['eval step_count'][-1]}"
#         )
#         del eval_rollout
# pbar.set_description(", ".join([eval_str, cum_reward_str, stepcount_str, lr_str]))


In [23]:
# Convert the rollout container into the structure the agent's modules consume.
# NOTE(review): presumably a TensorDict, given the method name — confirm.
data = ud.as_tensor_dict()

# Run the actor and then `pg_net.gae` on the batch with gradient tracking disabled.
# `data` is rebound to each call's return value, so re-running this cell without
# re-running the line above feeds already-processed data back in.
# NOTE(review): `gae` presumably computes generalized advantage estimates — confirm.
with T.no_grad():
    data = pg_net.loss_module.actor_network(data)
    data = pg_net.gae(data)


# out = pg_net.loss_module(data)

# loss_value = (
#     out["loss_objective"]
#     + out["loss_critic"]
#     + out["loss_entropy"]
# )

# loss_value
# Probe the actor with a raw random tensor of shape (1, 1, 32); the trailing 32 matches
# the `observation_size` passed to PPOAgentTorcRL earlier. The call returns four values,
# unpacked here as loc, scale, action and log_prob.
# NOTE(review): no seed is set in the visible cells, so these values differ between runs;
# this call is outside `no_grad`, which is why the displayed tensor carries a grad_fn.
loc, scale, action, log_prob = pg_net.loss_module.actor_network(T.rand(1, 1, 32))
# pg_net.loss_module.actor_network.dist_sample_keys

# Show loc and scale side by side, concatenated along the last dimension.
T.cat([loc,scale], dim=-1)

tensor([[[-0.4419,  0.0302,  0.1637,  0.3082, -0.0841,  0.0662,  0.0974,
          -0.2352,  1.1254,  1.0621,  0.8163,  1.0777,  1.0454,  0.8775,
           1.0421,  0.8418]]], grad_fn=<CatBackward0>)