## Installation and Imports

In [None]:
# Configure env variables
import os

# DARM_MUJOCO_PATH is presumably read by the darm_gym_env package to find its
# MuJoCo assets (confirm against that package). setdefault keeps a value that
# was already exported before the kernel started, so the notebook can run on
# machines where the checkout lives elsewhere; the original location is only
# used as the fallback.
os.environ.setdefault("DARM_MUJOCO_PATH", "/home/daniel/DARM/darm_mujoco")

In [None]:
import ray
from ray.rllib.algorithms.sac import SACConfig
from ray.tune.registry import register_env
from ray.tune.logger import pretty_print

from ray import air, tune
from ray.air import session
from ray.air.integrations.wandb import setup_wandb
from ray.air.integrations.wandb import WandbLoggerCallback

import gym
from darm_gym_env import DARMSFEnv

## Register Environment with RLlib

In [None]:
def make_env(env_config):
    """Build a time-limited single-finger DARM environment for RLlib.

    Args:
        env_config: Mapping handed over by RLlib (may be empty or None).
            Optional keys, each falling back to the value this notebook
            previously hard-coded:
              - "render_mode" (default None)
              - "reaction_time" (default 0.08)
              - "hand_name" (default "hand1")
              - "max_episode_steps" (default 100)

    Returns:
        A ``gym.wrappers.TimeLimit`` wrapping a ``DARMSFEnv`` instance.
    """
    # Treat a missing/empty config as "use all defaults".
    options = env_config or {}

    base_env = DARMSFEnv(
        render_mode=options.get("render_mode", None),
        reaction_time=options.get("reaction_time", 0.08),
        hand_name=options.get("hand_name", "hand1"),
    )
    # Cap episode length so every rollout terminates after a fixed horizon.
    return gym.wrappers.TimeLimit(
        env=base_env,
        max_episode_steps=options.get("max_episode_steps", 100),
    )


# make_env already has the (env_config) -> env signature that register_env
# expects, so it is used directly; env_creator is kept as an alias.
env_creator = make_env

register_env("darm/DarmSFHand-v0", env_creator)

## Configure and Run

In [None]:
# One learning rate shared by the actor, critic and entropy optimizers.
LEARNING_RATE = 3e-4

# Q-network and policy network use the same MLP layout; each receives its own
# dict object so nothing mutable is shared between the two.
q_network_spec = dict(fcnet_activation="relu", fcnet_hiddens=[256, 256])
policy_network_spec = dict(fcnet_activation="relu", fcnet_hiddens=[256, 256])

optimizer_spec = dict(
    actor_learning_rate=LEARNING_RATE,
    critic_learning_rate=LEARNING_RATE,
    entropy_learning_rate=LEARNING_RATE,
)

sac_training_args = dict(
    q_model_config=q_network_spec,
    policy_model_config=policy_network_spec,
    tau=0.005,  # soft target-network update coefficient
    target_entropy="auto",
    n_step=1,  # N-step return horizon for the Q targets (1 = plain one-step TD)
    train_batch_size=256,
    target_network_update_freq=1,
    replay_buffer_config=dict(type="MultiAgentPrioritizedReplayBuffer"),
    # Collect this many env steps before the first gradient update.
    num_steps_sampled_before_learning_starts=10000,
    optimization_config=optimizer_spec,
    clip_actions=False,
)

config = (
    SACConfig()
    .environment(env="darm/DarmSFHand-v0", normalize_actions=True)
    .training(**sac_training_args)
    .rollouts(num_rollout_workers=3, rollout_fragment_length=1)
    .resources(num_gpus=0)
    # Each iteration samples at least 1000 timesteps (see reporting below);
    # run an evaluation once every 100 training iterations.
    .evaluation(evaluation_interval=100)
    .reporting(
        min_sample_timesteps_per_iteration=1000,
        metrics_num_episodes_for_smoothing=5,
    )
    .framework(framework="torch")
)

In [None]:
# Keyword arguments forwarded to the W&B logger callback. The "config" entry
# is a hand-maintained copy of the main hyperparameters so they show up in the
# W&B run page; keep it in sync with the SACConfig cell when values change.
# Further wandb.init options (e.g. job_type, monitor_gym) are left unset.
wandb_init = {
    "save_code": True,
    "config": {
        "env": "DARMSFHand-v0",
        # Same learning rate for all three SAC optimizers.
        **{
            f"{component}_learning_rate": 0.0003
            for component in ("actor", "critic", "entropy")
        },
        "framework": "torch",
        "num_rollout_workers": 3,
        "num_gpu": 0,
        "metrics_num_episodes_for_smoothing": 5,
    },
    "tags": ["single_finger"],
    "notes": (
        "Fixed the env to use targets that are delta increaments from the starting state. "
        "Removed velocity penalty, and used only effort penalty"
    ),
    "name": "Test_DARMSF_DELTA_TARGET",
}

In [None]:
# The trial ends as soon as either criterion is reached.
stop_criteria = {"training_iteration": 10_000, "episode_reward_mean": 200}

# Write a single checkpoint when the trial finishes.
checkpointing = air.CheckpointConfig(checkpoint_at_end=True)

# Stream metrics (and checkpoints) to the "DARM" W&B project; the remaining
# options come from the wandb_init dict defined in the previous cell.
wandb_logger = WandbLoggerCallback(project="DARM", save_checkpoints=True, **wandb_init)

run_settings = air.RunConfig(
    name="Test_DARMSF_DELTA_TARGET",
    stop=stop_criteria,
    checkpoint_config=checkpointing,
    callbacks=[wandb_logger],
    local_dir="./results",
)

tuner = tune.Tuner("SAC", run_config=run_settings, param_space=config)

# Blocks until the trial stops; this is the long-running cell of the notebook.
results = tuner.fit()

2023-02-09 14:34:37,372	INFO worker.py:1538 -- Started a local Ray instance.
2023-02-09 14:34:38,879	INFO wandb.py:250 -- Already logged into W&B.


0,1
Current time:,2023-02-09 14:40:08
Running for:,00:05:30.07
Memory:,6.4/7.5 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_darm_DarmSFHand-v0_80340_00000,RUNNING,192.168.152.36:27220,14,267.249,14028,-181.857,-167.734,-193.16,100


[2m[36m(SAC pid=27220)[0m 2023-02-09 14:34:42,995	INFO algorithm.py:501 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[34m[1mwandb[0m: Currently logged in as: [33mdanieladejumo[0m. Use [1m`wandb login --relogin`[0m to force relogin
[2m[33m(raylet)[0m [2023-02-09 14:34:47,263 E 26959 27004] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-02-09_14-34-35_158333_26854 is over 95% full, available space: 1199403008; capacity: 31845081088. Object creation will fail if spilling is required.
[2m[36m(RolloutWorker pid=27324)[0m   logger.warn(


[2m[36m(RolloutWorker pid=27324)[0m Loaded XML file successfully


[2m[36m(RolloutWorker pid=27325)[0m   logger.warn(


[2m[36m(RolloutWorker pid=27325)[0m Loaded XML file successfully


[2m[36m(RolloutWorker pid=27327)[0m   logger.warn(


[2m[36m(RolloutWorker pid=27327)[0m Loaded XML file successfully
[2m[36m(SAC pid=27220)[0m Loaded XML file successfully


[2m[36m(SAC pid=27220)[0m   logger.warn(


Trial name,agent_timesteps_total,counters,custom_metrics,date,done,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,episodes_total,experiment_id,hostname,info,iterations_since_restore,node_ip,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_trained,num_env_steps_trained_this_iter,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,pid,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,time_since_restore,time_this_iter_s,time_total_s,timers,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
SAC_darm_DarmSFHand-v0_80340_00000,14028,"{'num_env_steps_sampled': 14028, 'num_env_steps_trained': 343808, 'num_agent_steps_sampled': 14028, 'num_agent_steps_trained': 343808, 'last_target_update_ts': 14028, 'num_target_updates': 1343}",{},2023-02-09_14-39-18,False,100,{},-167.734,-181.857,-193.16,9,142,4d5d6cddaea7444681811f5901f7f828,Daniel,"{'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 8.39486026763916, 'actor_loss': -6.580883979797363, 'critic_loss': 0.4550599455833435, 'alpha_loss': -3.383199691772461, 'alpha_value': 0.66830647, 'log_alpha_value': -0.40300846, 'target_entropy': -5.0, 'policy_t': -0.03436039388179779, 'mean_q': 4.342226505279541, 'max_q': 5.156735420227051, 'min_q': 3.575044631958008}, 'td_error': array([6.2540793e-01, 4.5765638e-01, 6.0351992e-01, 5.3600264e-01,  5.3697062e-01, 3.5429788e-01, 6.3966894e-01, 4.3950689e-01,  3.0898547e-01, 3.5763597e-01, 2.5843692e-01, 2.4424387e+02,  6.2366247e-01, 1.0298094e+00, 3.7921548e-01, 5.8851790e-01,  6.8428922e-01, 3.6911631e-01, 2.4983907e-01, 3.1475449e-01,  4.7771585e-01, 2.0441175e-01, 2.5444245e-01, 4.7385430e-01,  6.3522744e-01, 2.4424387e+02, 2.2937608e-01, 1.9195795e-01,  1.1908977e+00, 2.7746677e-01, 7.4611473e-01, 5.3805947e-01,  2.8580499e-01, 4.8387599e-01, 4.7730875e-01, 3.1153989e-01,  1.8242693e-01, 4.0938568e-01, 4.4973850e-02, 3.5717463e-01,  4.5586491e-01, 5.5774808e-01, 3.1213140e-01, 6.1641049e-01,  7.9763246e-01, 5.5203891e-01, 6.3834667e-02, 3.4150100e-01,  1.7865181e-01, 4.3705249e-01, 9.3548775e-02, 4.8295999e-01,  5.6634879e-01, 3.6766005e-01, 6.7817593e-01, 3.5869682e-01,  7.9080129e-01, 3.2671928e-01, 4.3091679e-01, 2.9990554e-01,  3.8590550e-01, 2.8836060e-01, 5.7863498e-01, 5.6745982e-01,  5.3336382e-02, 5.7227468e+00, 6.2225733e+00, 4.2531133e-01,  3.6855268e-01, 2.5376964e-01, 5.9127808e-01, 3.9476085e-01,  4.9604011e-01, 2.4424387e+02, 3.9734340e-01, 5.8897972e-01,  6.5261531e+00, 4.1279554e-02, 4.1853189e-01, 5.2429509e-01,  
2.2952294e-01, 1.3147068e-01, 1.0730052e-01, 5.5268192e-01,  4.6468949e-01, 4.2544770e-01, 8.0994105e-01, 5.7329106e-01,  4.6431756e-01, 9.7654200e-01, 5.6129479e-01, 5.1981664e-01,  4.6256256e-01, 3.2733679e-01, 3.4979343e-01, 5.5347395e-01,  6.5031314e-01, 5.6426740e-01, 3.4597373e-01, 4.5243812e-01,  4.2402530e-01, 5.8651328e-01, 1.1226618e+00, 1.8286109e-01,  4.9003768e-01, 4.9098778e-01, 5.1805449e-01, 8.1689692e-01,  8.0412102e-01, 6.4753199e-01, 6.1038661e-01, 4.3800688e-01,  2.1849394e-02, 3.0075407e-01, 2.0346689e-01, 4.3718982e-01,  7.8672647e-02, 4.2797709e-01, 2.8174806e-01, 1.0752141e+00,  1.1570907e-01, 4.3866563e-01, 2.7734494e-01, 5.7188749e-01,  6.6870232e+00, 3.7410498e-01, 4.7048974e-01, 2.4428757e+02,  2.1687126e-01, 1.6757846e-01, 1.6909933e-01, 5.7601271e+00,  6.6289234e-01, 3.4381604e-01, 5.7368374e-01, 5.6940422e+00,  6.9504523e-01, 3.7642837e-01, 6.4249420e+00, 4.4458151e-01,  5.2501297e-01, 5.7120371e-01, 5.0119138e-01, 5.8855166e+00,  4.8160148e-01, 1.9192815e-01, 5.1493216e-01, 7.9043508e-01,  1.0468115e+00, 4.7878098e-01, 5.1332283e-01, 5.0460100e-01,  3.3191776e-01, 5.1709008e-01, 4.0797472e-02, 2.0680261e-01,  6.4284921e-01, 3.9638317e-01, 7.5174785e-01, 4.5112944e-01,  5.2390027e-01, 4.3554664e-01, 8.9489746e-01, 4.5726538e-01,  4.8996449e-01, 4.6001816e-01, 3.2921410e-01, 7.2976160e-01,  5.1476574e-01, 4.4994020e-01, 3.4559798e-01, 6.7357540e-01,  3.6850071e-01, 1.3330221e-01, 3.0081534e-01, 4.7243357e-02,  5.4233909e-01, 3.5538983e-01, 6.1634445e-01, 8.9196515e-01,  2.8894258e-01, 5.5968370e+00, 4.0648484e-01, 2.0235538e-01,  1.9109607e-01, 6.5696096e-01, 5.6301966e+00, 3.5394931e-01,  2.6782990e-01, 5.6557250e-01, 9.5883894e-01, 5.1984143e-01,  6.8428326e-01, 7.7551746e-01, 2.1740031e-01, 3.1341791e-01,  2.4398997e+02, 8.0687523e-02, 4.9486244e-01, 4.5796943e-01,  2.4358420e+02, 6.4911675e-01, 5.7567859e-01, 3.2206440e-01,  3.8403773e-01, 1.3797998e-02, 6.5622520e-01, 6.7508049e+00,  2.2412586e-01, 5.6361699e-01, 1.3913512e-01, 
7.0668221e-01,  4.5013642e-01, 3.7972713e-01, 2.4033666e-01, 1.0932443e+00,  1.4943323e+00, 1.9672847e-01, 5.2402592e-01, 2.4330020e-01,  1.0074015e+00, 5.2334332e-01, 5.3800082e-01, 6.7673922e-01,  5.2325344e-01, 5.8589959e-01, 1.3278627e-01, 4.3292975e-01,  5.6907225e-01, 6.6979933e-01, 1.1941693e+00, 2.1429205e-01,  3.3362865e-01, 5.5084324e-01, 5.3002417e-01, 5.6737447e-01,  2.6911235e-01, 2.4358420e+02, 2.8977942e-01, 7.0883727e-01,  7.9896545e-01, 4.2363119e-01, 2.4384717e+02, 1.1563802e-01,  2.0128179e-01, 5.3285384e-01, 7.3034000e-01, 5.0068188e-01,  6.5261531e+00, 2.6539207e-01, 5.2622700e-01, 4.8029709e-01,  5.7244158e-01, 6.1082673e-01, 9.5797169e-01, 3.5520434e-01],  dtype=float32), 'mean_td_error': 8.342748641967773, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 256.0, 'num_grad_updates_lifetime': 1343.0, 'diff_num_grad_updates_vs_sampler_policy': 1342.0}}, 'num_env_steps_sampled': 14028, 'num_env_steps_trained': 343808, 'num_agent_steps_sampled': 14028, 'num_agent_steps_trained': 343808, 'last_target_update_ts': 14028, 'num_target_updates': 1343}",14,192.168.152.36,14028,343808,14028,1002,343808,85504,0,3,0,0,85504,"{'cpu_util_percent': 52.06941176470589, 'ram_util_percent': 87.08588235294118}",27220,{},{},{},"{'mean_raw_obs_processing_ms': 1.2571061467183018, 'mean_inference_ms': 2.3783673479538177, 'mean_action_processing_ms': 0.23117343083424405, 'mean_env_wait_ms': 3.096820011455652, 'mean_env_render_ms': 0.0}","{'episode_reward_max': -167.73383703827858, 'episode_reward_min': -193.160078458488, 'episode_reward_mean': -181.85733484476805, 'episode_len_mean': 100.0, 'episode_media': {}, 'episodes_this_iter': 9, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [-191.58597734570503, -190.74350184202194, -167.73383703827858, -184.81242395937443, -172.80720306932926, -193.160078458488, -171.496223077178, -177.10790572315454, -187.26886308938265], 
'episode_lengths': [100, 100, 100, 100, 100, 100, 100, 100, 100]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 1.2571061467183018, 'mean_inference_ms': 2.3783673479538177, 'mean_action_processing_ms': 0.23117343083424405, 'mean_env_wait_ms': 3.096820011455652, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0}",267.249,61.6602,267.249,"{'training_iteration_time_ms': 158.074, 'load_time_ms': 0.312, 'load_throughput': 821468.766, 'learn_time_ms': 26.655, 'learn_throughput': 9604.062, 'synch_weights_time_ms': 5.892}",1675949958,0,14028,14,80340_00000,8.03539


[2m[33m(raylet)[0m [2023-02-09 14:34:57,270 E 26959 27004] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-02-09_14-34-35_158333_26854 is over 95% full, available space: 1199267840; capacity: 31845081088. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-02-09 14:35:07,276 E 26959 27004] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-02-09_14-34-35_158333_26854 is over 95% full, available space: 1199169536; capacity: 31845081088. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-02-09 14:35:17,287 E 26959 27004] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-02-09_14-34-35_158333_26854 is over 95% full, available space: 1199026176; capacity: 31845081088. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-02-09 14:35:27,295 E 26959 27004] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-02-09_14-34-35_158333_26854 is over 95% full, available space: 119884