In [1]:
import numpy as np
import torch as T
import mlflow
import os

from unitree_robot.train.environments import Go2Env, MujocoEnv
from unitree_robot.train.training import Trainer
from unitree_robot.train.experiments import StandUpExperiment



PARAMS = {
    "MJCF_PATH": "./external/unitree_mj_models/go2/scene.xml",
    "SEED": 0,
    "DEVICE": "cuda:0",
    "BODY_ANGLE_REWARD_SCALE": 1.0,
    "BODY_HEIGHT_REWARD_SCALE": 1.0,
    "ENERGY_REWARD_SCALE": 1.0,
    "JOINT_LIMIT_REWARD_SCALE": 1.0,
    "DISTANCE_FROM_ORIGIN_REWARD": 1.0,

    "REWARD_SCALING": 3.0,
    "ENTROPY_LOSS_SCALE": 0.01,
    "VALUE_LOSS_SCALE": 0.1,
    "POLICY_LOSS_SCALE": 1,
    "DISCOUNTING": 0.97,

    "SIM_FRAMES_PER_STEP": 5, # number of frames that are simulated between each 'decision step' by the network
    "TRAIN_EPOCHS": 5000,
    "LEARNING_RATE": 4e-3,
    "MAX_GRADIENT_NORM": 2.0,

    "NETWORK_HIDDEN_SIZE": 16,
    "NETWORK_LAYERS": 1,

    "UNROLL_LENGTH": 256, # number of actions taken in the environment (inbetween these actions there are SIM_FRAMES_PER_STEP steps of simulation)
    "NUM_UNROLLS": 1, # number of full unrolls to collect training samples
    "MINIBATCH_SIZE": 128, # the sequence length that is trained on (UNROLL_LENGTH has to be divisible by this number)
    "TRAIN_BATCH_SIZE": 1, # the number of sequences that are used for a single training step (all batches are used in one epoch)
    
    "EXPERIMENT_NAME": "standing_up"
}

In [2]:
np.random.seed(seed=PARAMS["SEED"])
T.random.manual_seed(seed=PARAMS["SEED"])

experiment=StandUpExperiment(
    body_name = "base_link",
    body_angle_reward_scale = PARAMS["BODY_ANGLE_REWARD_SCALE"],
    body_height_reward_scale = PARAMS["BODY_HEIGHT_REWARD_SCALE"],
    energy_reward_scale = PARAMS["ENERGY_REWARD_SCALE"],
    distance_from_origin_reward_scale = PARAMS["DISTANCE_FROM_ORIGIN_REWARD"],
    # joint_limit_reward_scale = JOINT_LIMIT_REWARD_SCALE
)

env = Go2Env(
    model_path = PARAMS["MJCF_PATH"],
    sim_frames_per_step = PARAMS["SIM_FRAMES_PER_STEP"]
)


trainer = Trainer(
    env=env,
    experiment=experiment,
    device=PARAMS["DEVICE"],
    network_hidden_size=PARAMS["NETWORK_HIDDEN_SIZE"],
    network_layers=PARAMS["NETWORK_LAYERS"],
    learning_rate=PARAMS["LEARNING_RATE"],
    reward_scaling=PARAMS["REWARD_SCALING"],
    discounting=PARAMS["DISCOUNTING"],
    max_gradient_norm=PARAMS["MAX_GRADIENT_NORM"]
)

Trainer: device set to gpu (cuda) !


In [None]:
if not mlflow.get_experiment_by_name(PARAMS["EXPERIMENT_NAME"]):
    mlflow.create_experiment(PARAMS["EXPERIMENT_NAME"])
mlflow.set_experiment(PARAMS["EXPERIMENT_NAME"])


with mlflow.start_run():

    try:
        
        mlflow.log_params(PARAMS)
        
        trainer.train(
            epochs=PARAMS["TRAIN_EPOCHS"],
            unroll_length=PARAMS["UNROLL_LENGTH"],
            num_unrolls=PARAMS["NUM_UNROLLS"],
            minibatch_size=PARAMS["MINIBATCH_SIZE"],
            train_batch_size=PARAMS["TRAIN_BATCH_SIZE"],
            seed=PARAMS["SEED"],
            entropy_loss_scale=PARAMS["ENTROPY_LOSS_SCALE"],
            value_loss_scale=PARAMS["VALUE_LOSS_SCALE"],
            policy_loss_scale=PARAMS["POLICY_LOSS_SCALE"],
        )

    except Exception as e:
        raise e
    finally:
        mlflow.stop_run()

training:   1%|â–Œ                                                          | 48/5000 [01:47<3:08:42,  2.29s/it]

---

---

# Visualization

In [1]:
import numpy as np
import torch as T

from unitree_robot.train.environments import Go2Env
from unitree_robot.train.rewards import DistanceFromCenterReward

env = Go2Env(
    model_path = "./external/unitree_mj_models/go2/scene.xml",
    sim_frames_per_step = 5
)

env.model.camera("main").pos = np.array([1, 1, 3])

try:

    i = 0
    while True:

        if i % 500 == 0:
            env.reset(seed=0)
        i += 1
        
        action = T.Tensor(env.action_space.sample())
        # env.step(action=action)
        env._do_simulation(ctrl=action)

        print(DistanceFromCenterReward()(env.data))

        env.render()
        
except Exception as e:
    raise e
finally:
    env.close()

0.44560439061427787
0.4464568901188164
0.44643275870380494
0.44533545482357256
0.443248116643078
0.44018469748310224
0.4361363945020628
0.4310949579397691
0.4251301280246506
0.41833201777132156
0.4110212723554074
0.40278236813349294
0.39372564845531144
0.38432114178625143
0.374258475454727
0.36350389826166896
0.35202373923217933
0.3397302708464992
0.32669443429209655
0.3126543453242468
0.2979851100417809
0.28267048524048843
0.26703350496573863
0.250856904863683
0.23414381584242036
0.22047244357045692
0.2091929932865492
0.19834870081938977
0.18726880342382984
0.17605013626450933
0.16673800680569456
0.1581109296253772
0.1522891742615536
0.1487168227831975
0.14589567795297184
0.14507890610200175
0.14740453533251452
0.15063107784671614
0.15373402819151852
0.15634720169217353
0.1583970229325583
0.1600895643829118
0.16174481266838717
0.16323894533573616
0.16461464994093902
0.16586310746026278
0.16698084585512568
0.16813970356911734
0.1693181119554199
0.17029461054345987
0.17125360934445083
0

KeyboardInterrupt: 

In [9]:
from unitree_robot.train.environments import lookat


target = env.data.cam("main").xpos + np.array([0, 0, -1])
target

# lookat(env.data.cam("main").xpos, target)


<mujoco._structs.MjData at 0x2b5a63c05b0>

In [2]:
# env.action_space

# len(env.data.qpos)

# env.data.("imu_quat")


# q_rotate = quaternion.as_quat_array(env.data.sensor("imu_quat").data)
# quaternion.rotate_vectors(q_rotate, np.array([0,0,1]))
# (np.array([1, 0, 0]))




# env.get_sensor_state()

# env.data.xquat.shape