## Installation and Imports

In [None]:
# Print the current working directory; the relative `setup.py` invoked by the
# install cell further down is resolved against this directory.
!pwd

In [None]:
# Configure env variables
import os

# DARM_MUJOCO_PATH is presumably read by the darm_gym_env package to locate the
# darm_mujoco checkout (confirm against that package).
# setdefault keeps any value already exported in the environment, so the
# notebook can run on another machine without editing this cell; the path
# below is only the fallback used when nothing was exported.
os.environ.setdefault("DARM_MUJOCO_PATH", "/home/daniel/DARM/darm_mujoco")

In [None]:
# Install the package described by the setup.py in the current working
# directory (presumably the one providing `darm_gym_env` - confirm).
# NOTE(review): `!python` runs whichever interpreter is first on the shell
# PATH, which may differ from the interpreter backing this kernel - verify the
# package ends up importable from the notebook.
!python setup.py install

In [None]:
!pip install ray[rllib] torch

In [None]:
import ray
from ray.rllib.algorithms.sac import SACConfig
from ray.tune.registry import register_env
from ray.tune.logger import pretty_print

from ray import air, tune
from ray.air import session
from ray.air.integrations.wandb import setup_wandb
from ray.air.integrations.wandb import WandbLoggerCallback

import gym
from darm_gym_env import DARMSFEnv

## Register Environment with RLlib

In [None]:
def make_env(env_config):
    """Create the single-finger DARM environment with an episode step limit.

    Args:
        env_config: Mapping handed over by RLlib when it builds an
            environment (may be empty or None). Optional keys, each
            defaulting to the value that used to be hard-coded here:
            ``render_mode`` (None), ``reaction_time`` (0.08),
            ``hand_name`` ("hand1") and ``max_episode_steps`` (100).

    Returns:
        A ``DARMSFEnv`` wrapped in ``gym.wrappers.TimeLimit``.
    """
    settings = dict(env_config) if env_config else {}
    base_env = DARMSFEnv(
        render_mode=settings.get("render_mode"),
        reaction_time=settings.get("reaction_time", 0.08),
        hand_name=settings.get("hand_name", "hand1"),
    )
    return gym.wrappers.TimeLimit(
        env=base_env,
        max_episode_steps=settings.get("max_episode_steps", 100),
    )


# register_env only needs a callable that accepts env_config, so make_env is
# used directly; the env_creator name is kept in case other cells refer to it.
env_creator = make_env

register_env("darm/DarmSFHand-v0", env_creator)

## Configure and Run

In [None]:
# SAC configuration for the environment registered above as
# "darm/DarmSFHand-v0", using the torch framework, 3 rollout workers and no GPU.
config = (
    SACConfig()
    .environment(
        env="darm/DarmSFHand-v0",
        normalize_actions=True
    )
    .training(
        # Q-network and policy network use the same fully connected layout.
        q_model_config={
            "fcnet_activation": "relu",
            "fcnet_hiddens": [256, 256]
        },
        policy_model_config={
            "fcnet_activation": "relu",
            "fcnet_hiddens": [256, 256]
        },
        tau=0.005,
        target_entropy="auto",
        n_step=1,  # n-step return horizon for the Q target (see RLlib SACConfig docs); this is NOT a number of SGD steps
        train_batch_size=256,
        target_network_update_freq=1,
        replay_buffer_config={"type":"MultiAgentPrioritizedReplayBuffer"},
        num_steps_sampled_before_learning_starts=10_000,
        # The three learning rates below are duplicated in the wandb_init
        # cell; keep both places in sync when changing them.
        optimization_config={
          "actor_learning_rate": 0.0003,
          "critic_learning_rate": 0.0003,
          "entropy_learning_rate": 0.0003,
        },
        clip_actions=False
    )
    .rollouts(
        num_rollout_workers=3,
        rollout_fragment_length=1,
    )
    .resources(num_gpus=0)
    .evaluation(evaluation_interval=100) # evaluate once every 100 training iterations (at most 100 evaluations under the 10_000-iteration stop criterion)
    .reporting(
        # Each training iteration samples at least this many env timesteps.
        min_sample_timesteps_per_iteration=1000,
        metrics_num_episodes_for_smoothing=5
    )
    .framework(framework="torch")
)

In [None]:
# Keyword arguments forwarded to the W&B logger callback in the Tuner cell.
# The "config" entry is a hand-maintained copy of the hyperparameters set on
# the SACConfig above, so update both places together.
# Other wandb options such as job_type and monitor_gym are not set here.
wandb_init = {
    "save_code": True,
    "config": {
        "env": "DARMSFHand-v0",

        # Optimiser settings
        "actor_learning_rate": 0.0003,
        "critic_learning_rate": 0.0003,
        "entropy_learning_rate": 0.0003,
        "framework": "torch",

        # Rollout / resource / reporting settings
        "num_rollout_workers": 3,
        "num_gpu": 0,
        "metrics_num_episodes_for_smoothing": 5,
    },
    "tags": ["single_finger"],
    "notes": "Fixed the env to use targets that are delta increaments from the starting state. Removed velocity penalty, and used only effort penalty",
    "name": "Test_DARMSF_DELTA_TARGET",
}

In [None]:
# W&B logger for the "DARM" project with save_checkpoints enabled; every other
# keyword argument comes from the wandb_init mapping defined earlier.
wandb_callback = WandbLoggerCallback(project="DARM", save_checkpoints=True, **wandb_init)

# Experiment-level settings: result directory, stop criteria (iteration cap and
# mean-episode-reward threshold) and a checkpoint at the end of the run.
run_config = air.RunConfig(
    name="Test_DARMSF_DELTA_TARGET",
    stop={"training_iteration": 10_000, "episode_reward_mean": 200},
    checkpoint_config=air.CheckpointConfig(checkpoint_at_end=True),
    callbacks=[wandb_callback],
    local_dir="./results",
)

# Train SAC with the configuration assembled in the previous cells.
tuner = tune.Tuner("SAC", run_config=run_config, param_space=config)

results = tuner.fit()

2023-02-09 14:34:37,372	INFO worker.py:1538 -- Started a local Ray instance.
2023-02-09 14:34:38,879	INFO wandb.py:250 -- Already logged into W&B.


0,1
Current time:,2023-02-09 15:22:27
Running for:,00:47:48.23
Memory:,6.9/7.5 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_darm_DarmSFHand-v0_80340_00000,RUNNING,192.168.152.36:27220,63,2843.06,63126,-172.193,-164.839,-184.317,100


[2m[36m(SAC pid=27220)[0m 2023-02-09 14:34:42,995	INFO algorithm.py:501 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[34m[1mwandb[0m: Currently logged in as: [33mdanieladejumo[0m. Use [1m`wandb login --relogin`[0m to force relogin
[2m[33m(raylet)[0m [2023-02-09 14:34:47,263 E 26959 27004] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-02-09_14-34-35_158333_26854 is over 95% full, available space: 1199403008; capacity: 31845081088. Object creation will fail if spilling is required.
[2m[36m(RolloutWorker pid=27324)[0m   logger.warn(


[2m[36m(RolloutWorker pid=27324)[0m Loaded XML file successfully


[2m[36m(RolloutWorker pid=27325)[0m   logger.warn(


[2m[36m(RolloutWorker pid=27325)[0m Loaded XML file successfully


[2m[36m(RolloutWorker pid=27327)[0m   logger.warn(


[2m[36m(RolloutWorker pid=27327)[0m Loaded XML file successfully
[2m[36m(SAC pid=27220)[0m Loaded XML file successfully


[2m[36m(SAC pid=27220)[0m   logger.warn(


Trial name,agent_timesteps_total,counters,custom_metrics,date,done,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,episodes_total,experiment_id,hostname,info,iterations_since_restore,node_ip,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_trained,num_env_steps_trained_this_iter,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,pid,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,time_since_restore,time_this_iter_s,time_total_s,timers,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
SAC_darm_DarmSFHand-v0_80340_00000,63126,"{'num_env_steps_sampled': 63126, 'num_env_steps_trained': 4533504, 'num_agent_steps_sampled': 63126, 'num_agent_steps_trained': 4533504, 'last_target_update_ts': 63126, 'num_target_updates': 17709}",{},2023-02-09_15-22-17,False,100,{},-164.839,-172.193,-184.317,12,650,4d5d6cddaea7444681811f5901f7f828,Daniel,"{'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 0.05446416512131691, 'actor_loss': 76.25946044921875, 'critic_loss': 0.9490589499473572, 'alpha_loss': -0.23494774103164673, 'alpha_value': 0.013382537, 'log_alpha_value': -4.3138046, 'target_entropy': -5.0, 'policy_t': -0.35983332991600037, 'mean_q': -76.21299743652344, 'max_q': -70.89612579345703, 'min_q': -79.94763946533203}, 'td_error': array([8.70933533e-01, 7.70902863e+01, 3.23394775e-01, 8.54465485e-01,  7.96669006e-01, 4.95552063e-01, 1.53480530e-01, 3.98036957e-01,  4.50363159e-01, 5.58853149e-02, 5.80398560e-01, 5.31684875e-01,  5.50632477e-01, 7.64622040e+01, 7.11090927e+01, 2.78018951e-01,  3.23509216e-01, 3.19911133e+02, 7.41439819e-01, 2.86090851e-01,  3.65581512e-01, 2.23262787e-01, 7.22920532e+01, 3.44985962e-01,  3.23246613e+02, 8.72760773e-01, 3.07350159e-01, 5.89252472e-01,  2.89245605e-01, 4.45289612e-01, 3.86116028e-01, 4.95372772e-01,  1.17622375e-01, 9.02973175e-01, 7.25410156e+01, 1.47972107e-01,  6.08467102e-01, 2.89993286e-01, 3.01383972e-01, 4.63035583e-01,  5.49930573e-01, 1.24683380e-01, 3.99410248e-01, 3.23452942e+02,  6.49154663e-01, 2.43892670e-01, 4.22500610e-01, 4.51553345e-01,  7.61413422e+01, 1.55357361e-01, 7.37118378e+01, 3.53500366e-01,  7.60710297e+01, 1.08942413e+00, 9.30511475e-01, 6.56528473e-01,  1.54560089e-01, 3.71330261e-01, 7.55985260e-01, 3.66878510e-01,  2.27027893e-01, 7.43968735e+01, 3.20534180e+02, 6.25057220e-01,  7.77720642e+01, 7.64764175e+01, 7.48228302e+01, 5.73928833e-01,  1.05478668e+00, 7.39582062e-01, 4.76772308e-01, 2.92900085e-01,  9.57145691e-02, 4.59468842e-01, 
7.44354248e-01, 7.67494736e+01,  6.19934082e-01, 5.33565521e-01, 7.66028137e+01, 7.59688263e+01,  3.37337494e-01, 6.05121613e-01, 7.90679932e-01, 6.38469696e-01,  5.49167633e-01, 3.45630646e-01, 4.72106934e-01, 7.92686462e-01,  3.90197754e-01, 4.97699738e-01, 1.92794800e-01, 7.39824295e+01,  7.30886688e+01, 4.13452148e-01, 3.63849640e-01, 8.74122620e-01,  6.96380615e-01, 5.35755157e-01, 7.00260162e-01, 6.58126831e-01,  7.68320770e+01, 7.62116928e+01, 9.42646027e-01, 8.24729919e-01,  4.65587616e-01, 7.88040161e-02, 5.91468811e-02, 8.01280975e-01,  6.11064911e-01, 7.13958740e-01, 7.75476685e+01, 3.88767242e-01,  9.35134888e-02, 1.77021027e-01, 5.10936737e-01, 4.80121613e-01,  7.46583023e+01, 2.04490662e-01, 4.83104706e-01, 4.81597900e-01,  7.24423370e+01, 1.94416046e-01, 6.70280457e-01, 2.96279907e-01,  6.90635681e-01, 2.48329163e-01, 7.64808655e+01, 3.60755920e-02,  4.58869934e-02, 7.36387024e+01, 7.67126617e+01, 7.41887360e+01,  6.83135986e-02, 4.25193787e-01, 2.39738464e-01, 6.15932465e-01,  3.20888367e+02, 7.31712341e-01, 7.55614471e+01, 3.03653717e-01,  2.27016449e-01, 3.25490265e+02, 1.88999176e-01, 9.98191833e-02,  6.00170135e-01, 2.74959564e-01, 4.04270172e-01, 7.58676605e+01,  2.70957947e-01, 4.20745850e-01, 4.25727844e-01, 6.67224884e-01,  7.13282623e+01, 7.65510025e+01, 1.29165649e-01, 9.25651550e-01,  7.63499985e+01, 3.48808289e-01, 7.69172211e+01, 7.38582916e+01,  3.43040466e-01, 7.69128342e+01, 5.50804138e-01, 5.10574341e-01,  3.06976318e-01, 1.41979218e-01, 3.22810760e+02, 6.73149109e-01,  1.36470795e-01, 2.25135803e-01, 2.33856201e-01, 6.67255402e-01,  6.99861755e+01, 7.64545441e-01, 3.70082855e-01, 7.62906265e+01,  4.55997467e-01, 5.77690125e-01, 4.95250702e-01, 7.43064880e+01,  1.29013062e-01, 7.42013779e+01, 1.01188660e-01, 2.22694397e-01,  6.30340576e-01, 9.43016052e-01, 7.19112091e+01, 8.31756592e-02,  3.55602264e-01, 3.91139984e-01, 5.80112457e-01, 4.19925690e-01,  7.69043579e+01, 4.72667694e-01, 1.24923706e-01, 5.35736084e-01,  7.18591309e+01, 
4.56710815e-01, 7.65410004e+01, 3.83666992e-01,  7.56062927e+01, 1.26029968e-01, 1.34836960e+00, 7.12051392e-01,  5.03646851e-01, 1.01305389e+00, 2.66372681e-01, 6.44523621e-01,  3.83968353e-01, 3.13415527e-01, 4.50347900e-01, 7.17262573e+01,  7.36541748e+01, 9.85507965e-01, 7.52568970e+01, 7.10615845e+01,  4.35791016e-01, 7.67352982e+01, 1.49221802e+00, 7.12772980e+01,  4.33174133e-01, 6.36009216e-01, 1.15173340e-01, 1.32225037e-01,  7.17760391e+01, 1.83715820e-01, 4.18880463e-01, 6.91764832e-01,  5.55019379e-01, 5.31143188e-01, 1.44248962e-01, 5.80833435e-01,  7.54985657e+01, 7.13498688e+01, 1.91131592e-01, 7.71976471e+01,  5.79730988e-01, 7.31511612e+01, 7.53228607e+01, 3.07754517e-01,  8.55087280e-01, 5.27164459e-01, 7.98782349e-01, 6.00166321e-01,  4.12220001e-01, 5.42694092e-01, 7.03353882e-01, 2.60578156e-01,  7.07602539e+01, 4.52239990e-01, 3.82194519e-01, 4.77043152e-01,  3.13735962e-01, 4.05822754e-01, 8.02108765e-01, 1.02853394e+00],  dtype=float32), 'mean_td_error': 25.4951114654541, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 256.0, 'num_grad_updates_lifetime': 17709.0, 'diff_num_grad_updates_vs_sampler_policy': 17708.0}}, 'num_env_steps_sampled': 63126, 'num_env_steps_trained': 4533504, 'num_agent_steps_sampled': 63126, 'num_agent_steps_trained': 4533504, 'last_target_update_ts': 63126, 'num_target_updates': 17709}",63,192.168.152.36,63126,4533504,63126,1002,4533504,85504,0,3,0,0,85504,"{'cpu_util_percent': 37.994202898550725, 'ram_util_percent': 91.45072463768112}",27220,{},{},{},"{'mean_raw_obs_processing_ms': 1.161695348102149, 'mean_inference_ms': 2.3783665558109326, 'mean_action_processing_ms': 0.2257217219056826, 'mean_env_wait_ms': 3.018582295305679, 'mean_env_render_ms': 0.0}","{'episode_reward_max': -164.83878053724766, 'episode_reward_min': -184.31719098985195, 'episode_reward_mean': -172.19335282345614, 'episode_len_mean': 100.0, 'episode_media': {}, 'episodes_this_iter': 12, 'policy_reward_min': {}, 'policy_reward_max': 
{}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [-173.22791515290737, -166.02916425466537, -184.31719098985195, -164.90399967879057, -167.61980755627155, -172.49408177286386, -181.30813418328762, -164.83878053724766, -167.37792050093412, -175.67796140164137, -165.0356966406107, -183.4895812124014], 'episode_lengths': [100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 1.161695348102149, 'mean_inference_ms': 2.3783665558109326, 'mean_action_processing_ms': 0.2257217219056826, 'mean_env_wait_ms': 3.018582295305679, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0}",2843.06,50.7209,2843.06,"{'training_iteration_time_ms': 149.157, 'load_time_ms': 0.275, 'load_throughput': 930774.813, 'learn_time_ms': 24.949, 'learn_throughput': 10260.873, 'synch_weights_time_ms': 5.077}",1675952537,0,63126,63,80340_00000,8.03539


[2m[33m(raylet)[0m [2023-02-09 14:34:57,270 E 26959 27004] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-02-09_14-34-35_158333_26854 is over 95% full, available space: 1199267840; capacity: 31845081088. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-02-09 14:35:07,276 E 26959 27004] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-02-09_14-34-35_158333_26854 is over 95% full, available space: 1199169536; capacity: 31845081088. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-02-09 14:35:17,287 E 26959 27004] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-02-09_14-34-35_158333_26854 is over 95% full, available space: 1199026176; capacity: 31845081088. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-02-09 14:35:27,295 E 26959 27004] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-02-09_14-34-35_158333_26854 is over 95% full, available space: 119884