## Installation and Imports

In [None]:
# Print the current working directory. The install cell further down calls
# `setup.py` by relative path, so the notebook presumably has to be started
# from the repository root — confirm before running on a new machine.
!pwd

In [None]:
# Configure env variables
import os

# DARM_MUJOCO_PATH is presumably read by the DARM environment code to locate
# its MuJoCo files (not visible in this notebook — confirm).
# `setdefault` keeps a value that was already exported before the kernel
# started, so the notebook also works on machines where the checkout does not
# live under the original author's home directory; otherwise the original
# default path is used unchanged.
os.environ.setdefault("DARM_MUJOCO_PATH", "/home/daniel/DARM/darm_mujoco")

In [None]:
!python setup.py install

In [None]:
!pip install ray[rllib] torch

In [None]:
import ray
from ray.rllib.algorithms.sac import SACConfig
from ray.tune.registry import register_env
from ray.tune.logger import pretty_print

from ray import air, tune
from ray.air import session
from ray.air.integrations.wandb import setup_wandb
from ray.air.integrations.wandb import WandbLoggerCallback

import gym
from darm_gym_env import DARMSFEnv

## Register Environment with RLlib

In [None]:
def make_env(env_config):
    """Build the single-finger DARM environment with an episode step limit.

    RLlib calls the registered creator with the ``env_config`` mapping given
    to ``.environment(env_config=...)``. Every key is optional; the defaults
    reproduce the values this notebook previously hard-coded:

    - ``render_mode`` (default ``None``)
    - ``reaction_time`` (default ``0.08``)
    - ``hand_name`` (default ``"hand1"``)
    - ``max_episode_steps`` (default ``100``)

    Args:
        env_config: Mapping of the optional keys above, or ``None``.

    Returns:
        A ``DARMSFEnv`` wrapped in ``gym.wrappers.TimeLimit``.
    """
    settings = env_config or {}  # tolerate ``None`` as "use every default"
    base_env = DARMSFEnv(
        render_mode=settings.get("render_mode", None),
        reaction_time=settings.get("reaction_time", 0.08),
        hand_name=settings.get("hand_name", "hand1"),
    )
    return gym.wrappers.TimeLimit(
        env=base_env,
        max_episode_steps=settings.get("max_episode_steps", 100),
    )


# Kept as its own name in case a later cell refers to `env_creator`.
env_creator = make_env

register_env("darm/DarmSFHand-v0", env_creator)

## Configure and Run

In [None]:
def _relu_mlp():
    """Return a fresh model spec: ReLU MLP with two hidden layers of 256 units."""
    return {"fcnet_activation": "relu", "fcnet_hiddens": [256, 256]}


# One step size shared by the actor, critic and entropy-coefficient optimizers.
_sac_lr = 0.0003

# Each builder call is applied in the same order as before; the result is the
# SAC configuration handed to the Tuner.
config = SACConfig()
config = config.environment(env="darm/DarmSFHand-v0", normalize_actions=True)
config = config.training(
    # The Q-network and the policy network get separate but identical specs.
    q_model_config=_relu_mlp(),
    policy_model_config=_relu_mlp(),
    tau=0.005,
    target_entropy="auto",
    # Per the RLlib SAC docs this is the n-step target length (1 = one-step
    # TD targets); it is not a count of SGD steps per batch.
    n_step=1,
    train_batch_size=256,
    target_network_update_freq=1,
    replay_buffer_config={"type": "MultiAgentPrioritizedReplayBuffer"},
    num_steps_sampled_before_learning_starts=10_000,
    optimization_config={
        "actor_learning_rate": _sac_lr,
        "critic_learning_rate": _sac_lr,
        "entropy_learning_rate": _sac_lr,
    },
    clip_actions=False,
)
config = config.rollouts(num_rollout_workers=3, rollout_fragment_length=1)
config = config.resources(num_gpus=0)
# Evaluate once every 100 training iterations; each iteration samples at
# least 1000 env steps (see the reporting settings just below).
config = config.evaluation(evaluation_interval=100)
config = config.reporting(
    min_sample_timesteps_per_iteration=1000,
    metrics_num_episodes_for_smoothing=5,
)
config = config.framework(framework="torch")

In [None]:
# Keyword arguments that the Tuner cell unpacks into WandbLoggerCallback:
# run metadata plus a hand-picked summary of the hyperparameters.
# Other keys mentioned in the original cell but left unset: job_type, monitor_gym.
wandb_init = {
    "save_code": True,
    "config": {
        "env": "DARMSFHand-v0",
        # Optimizer settings
        "actor_learning_rate": 0.0003,
        "critic_learning_rate": 0.0003,
        "entropy_learning_rate": 0.0003,
        "framework": "torch",
        # Resources and reporting
        "num_rollout_workers": 3,
        "num_gpu": 0,
        "metrics_num_episodes_for_smoothing": 5,
    },
    "tags": ["single_finger"],
    "notes": (
        "Fixed the env to use targets that are delta increaments from the starting state. "
        "Removed velocity penalty, and used only effort penalty"
    ),
    "name": "Test_DARMSF_DELTA_TARGET",
}

In [None]:
# Write a checkpoint when the trial finishes.
checkpoint_policy = air.CheckpointConfig(checkpoint_at_end=True)

# Stream metrics (and checkpoints) to the "DARM" W&B project; the remaining
# init arguments come from `wandb_init`.
wandb_logger = WandbLoggerCallback(project="DARM", save_checkpoints=True, **wandb_init)

# Tune stops a trial as soon as any one of the `stop` criteria is reached.
run_settings = air.RunConfig(
    name="Test_DARMSF_DELTA_TARGET",
    stop={"training_iteration": 10_000, "episode_reward_mean": 200},
    checkpoint_config=checkpoint_policy,
    callbacks=[wandb_logger],
    local_dir="./results",
)

tuner = tune.Tuner("SAC", run_config=run_settings, param_space=config)

# Blocks until the experiment ends; trial results are kept for later cells.
results = tuner.fit()

2023-02-09 14:34:37,372	INFO worker.py:1538 -- Started a local Ray instance.
2023-02-09 14:34:38,879	INFO wandb.py:250 -- Already logged into W&B.


0,1
Current time:,2023-02-09 15:36:32
Running for:,01:01:53.58
Memory:,6.9/7.5 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_darm_DarmSFHand-v0_80340_00000,RUNNING,192.168.152.36:27220,79,3677.43,79158,-119.449,236.462,-167.668,90.8


[2m[36m(SAC pid=27220)[0m 2023-02-09 14:34:42,995	INFO algorithm.py:501 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[34m[1mwandb[0m: Currently logged in as: [33mdanieladejumo[0m. Use [1m`wandb login --relogin`[0m to force relogin
[2m[33m(raylet)[0m [2023-02-09 14:34:47,263 E 26959 27004] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-02-09_14-34-35_158333_26854 is over 95% full, available space: 1199403008; capacity: 31845081088. Object creation will fail if spilling is required.
[2m[36m(RolloutWorker pid=27324)[0m   logger.warn(


[2m[36m(RolloutWorker pid=27324)[0m Loaded XML file successfully


[2m[36m(RolloutWorker pid=27325)[0m   logger.warn(


[2m[36m(RolloutWorker pid=27325)[0m Loaded XML file successfully


[2m[36m(RolloutWorker pid=27327)[0m   logger.warn(


[2m[36m(RolloutWorker pid=27327)[0m Loaded XML file successfully
[2m[36m(SAC pid=27220)[0m Loaded XML file successfully


[2m[36m(SAC pid=27220)[0m   logger.warn(


Trial name,agent_timesteps_total,counters,custom_metrics,date,done,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,episodes_total,experiment_id,hostname,info,iterations_since_restore,node_ip,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_trained,num_env_steps_trained_this_iter,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,pid,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,time_since_restore,time_this_iter_s,time_total_s,timers,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
SAC_darm_DarmSFHand-v0_80340_00000,79158,"{'num_env_steps_sampled': 79158, 'num_env_steps_trained': 5901568, 'num_agent_steps_sampled': 79158, 'num_agent_steps_trained': 5901568, 'last_target_update_ts': 79158, 'num_target_updates': 23053}",{},2023-02-09_15-36-12,False,90.8,{},236.462,-119.449,-167.668,10,817,4d5d6cddaea7444681811f5901f7f828,Daniel,"{'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 0.44777923822402954, 'actor_loss': 98.47936248779297, 'critic_loss': 1.2056889533996582, 'alpha_loss': 1.6964234113693237, 'alpha_value': 0.02262892, 'log_alpha_value': -3.7885265, 'target_entropy': -5.0, 'policy_t': -0.37933048605918884, 'mean_q': -98.42659759521484, 'max_q': -91.68278503417969, 'min_q': -102.14861297607422}, 'td_error': array([9.65326080e+01, 1.88430786e-01, 4.74941254e-01, 3.63899231e-01,  6.96640015e-01, 4.83299255e-01, 8.44535828e-02, 2.92938232e-01,  3.45318359e+02, 9.56597137e+01, 6.09916687e-01, 2.46566772e-01,  6.50405884e-02, 3.74794006e-02, 3.66432190e-01, 7.24552155e-01,  9.04037476e-01, 9.65690460e+01, 3.77117157e-01, 4.12132263e-01,  5.44467926e-01, 2.93872833e-01, 8.30268860e-01, 9.39289627e+01,  1.46934509e-01, 1.49459839e-01, 3.05706024e-01, 9.75575867e+01,  9.93884277e+01, 5.15876770e-01, 5.30914307e-01, 9.70716095e+01,  2.20588684e-01, 4.95990753e-01, 4.57740784e-01, 1.16844177e-01,  1.41380310e-01, 9.89918976e+01, 6.71958923e-02, 1.00233917e+02,  1.66282654e-01, 9.14423370e+01, 4.64057922e-01, 9.84658279e+01,  5.82759857e-01, 7.28111267e-02, 2.10105896e-01, 1.21845245e-01,  2.88631439e-01, 3.44143066e+02, 9.40399170e-02, 5.20477295e-02,  5.90412140e-01, 2.85041809e-01, 2.34840393e-01, 3.88366699e-01,  5.23185730e-01, 9.90606003e+01, 9.92819061e+01, 5.64163208e-01,  4.34150696e-01, 4.09690857e-01, 9.98555450e+01, 5.17166138e-01,  1.83506012e-01, 9.58137512e-02, 9.91972427e+01, 9.98406219e+01,  1.55624390e-01, 4.64088440e-01, 9.70878601e-02, 1.45580292e-01,  1.40396118e-01, 1.64722443e-01, 
9.68006439e+01, 9.46453552e+01,  3.70513916e-01, 2.23228455e-01, 4.32266235e-01, 1.30847931e-01,  6.36116028e-01, 7.83992767e-01, 9.58167725e+01, 7.08263397e-01,  3.44152710e+02, 9.94132080e+01, 1.60076141e-01, 1.49600983e-01,  1.74148560e-01, 3.55842590e-01, 4.46060181e-01, 5.64731598e-01,  2.86720276e-01, 2.01625824e-01, 3.66615295e-01, 9.64736938e+01,  4.69024658e-01, 9.97734756e+01, 3.68164062e-01, 9.58973770e+01,  2.31739044e-01, 3.25889587e-01, 5.60173035e-01, 1.16310120e-01,  3.47429535e+02, 9.70573425e-02, 9.71788330e+01, 4.38884735e-01,  5.50682068e-01, 5.38806915e-01, 6.16466522e-01, 2.69485474e-01,  5.56953430e-01, 9.42717133e+01, 1.89476013e-01, 6.96327209e-01,  5.89076996e-01, 1.75117493e-01, 2.65472412e-01, 8.63304138e-01,  9.06639557e+01, 4.19006348e-02, 2.37651825e-01, 9.36075745e+01,  9.55759506e+01, 7.83084869e-01, 9.93789520e+01, 5.83946228e-01,  9.21190033e+01, 2.09747314e-01, 1.73587799e-01, 9.52705612e+01,  1.07780457e-01, 5.59684753e-01, 8.12759399e-01, 2.09861755e-01,  9.92427673e+01, 9.78794250e+01, 9.21563797e+01, 4.95391846e-01,  9.08598938e+01, 1.01684570e-01, 5.92018127e-01, 3.44033264e+02,  9.66585083e+01, 2.41230011e-01, 9.58226089e+01, 4.19387817e-01,  4.08351898e-01, 1.43970490e-01, 9.65978012e+01, 9.47104416e+01,  8.90121460e-02, 8.33156586e-01, 5.50231934e-01, 3.47919159e+02,  2.72514343e-01, 9.79634628e+01, 9.60691910e+01, 6.25965118e-01,  5.64109802e-01, 9.25840607e+01, 1.75895691e-01, 4.24644470e-01,  3.36208344e-01, 9.90863647e+01, 8.48587036e-01, 3.00994873e-01,  3.43139648e-01, 1.99569702e-01, 3.57833862e-01, 3.86852264e-01,  2.30205536e-01, 5.81569672e-01, 1.16115570e-01, 1.23687744e-01,  3.67759705e-01, 4.68719482e-01, 5.64117432e-02, 1.00241348e+02,  6.69418335e-01, 2.20020294e-01, 9.79550018e+01, 3.26538086e-01,  9.47155609e+01, 7.62527466e-01, 3.44556763e+02, 4.77714539e-01,  2.98240662e-01, 2.94868469e-01, 2.64904022e-01, 3.77658844e-01,  4.51728821e-01, 3.73344421e-01, 8.94512177e-01, 9.69173965e+01,  6.41647339e-01, 
8.66630554e-01, 9.56040802e+01, 9.88605042e+01,  5.55278778e-01, 9.93618469e+01, 1.30482101e+00, 6.67148590e-01,  4.75990295e-01, 4.16549683e-01, 1.01001740e+00, 9.85688324e+01,  2.71095276e-01, 9.64921265e+01, 3.44448090e-01, 2.94029236e-01,  1.68186188e-01, 3.68556976e-01, 2.79441833e-01, 5.03505707e-01,  1.45736694e-01, 3.38127136e-01, 5.68584442e-01, 2.39879608e-01,  4.58442688e-01, 5.93364716e-01, 4.90901947e-01, 4.05441284e-01,  1.00446320e+00, 1.94156647e-01, 2.42805481e-01, 9.67271271e+01,  7.40745544e-01, 9.83971710e+01, 3.29704285e-01, 9.94171448e+01,  4.31541443e-01, 1.00281807e+02, 9.39289627e+01, 4.39437866e-01,  9.80926361e+01, 3.44326782e+02, 7.41397858e-01, 5.29464722e-01,  9.90519867e+01, 2.79018402e-01, 6.83475494e-01, 5.51383972e-01,  3.73771667e-01, 2.30266571e-01, 9.08376083e+01, 4.97486115e-01,  4.26132202e-01, 3.42708588e-01, 6.47731781e-01, 2.94609070e-01,  5.69805145e-01, 4.21024323e-01, 1.18968964e-01, 1.50733948e-01],  dtype=float32), 'mean_td_error': 33.75336837768555, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 256.0, 'num_grad_updates_lifetime': 23053.0, 'diff_num_grad_updates_vs_sampler_policy': 23052.0}}, 'num_env_steps_sampled': 79158, 'num_env_steps_trained': 5901568, 'num_agent_steps_sampled': 79158, 'num_agent_steps_trained': 5901568, 'last_target_update_ts': 79158, 'num_target_updates': 23053}",79,192.168.152.36,79158,5901568,79158,1002,5901568,85504,0,3,0,0,85504,"{'cpu_util_percent': 38.80704225352113, 'ram_util_percent': 91.643661971831}",27220,{},{},{},"{'mean_raw_obs_processing_ms': 1.1416198350112192, 'mean_inference_ms': 2.3564032897142537, 'mean_action_processing_ms': 0.22313536539423637, 'mean_env_wait_ms': 2.986136513552914, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 236.46240736544132, 'episode_reward_min': -167.6678908020258, 'episode_reward_mean': -119.44905663728714, 'episode_len_mean': 90.8, 'episode_media': {}, 'episodes_this_iter': 10, 'policy_reward_min': {}, 'policy_reward_max': {}, 
'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [-155.22694990038872, -148.03948517143726, -167.6678908020258, -155.15678357332945, -165.17036618292332, -160.64602085202932, -150.88469261676073, -160.58221770077944, 236.46240736544132, -167.5785669386387], 'episode_lengths': [100, 100, 100, 100, 100, 100, 100, 100, 8, 100]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 1.1416198350112192, 'mean_inference_ms': 2.3564032897142537, 'mean_action_processing_ms': 0.22313536539423637, 'mean_env_wait_ms': 2.986136513552914, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0}",3677.43,51.4841,3677.43,"{'training_iteration_time_ms': 169.556, 'load_time_ms': 0.284, 'load_throughput': 901697.87, 'learn_time_ms': 27.936, 'learn_throughput': 9163.895, 'synch_weights_time_ms': 5.953}",1675953372,0,79158,79,80340_00000,8.03539


[2m[33m(raylet)[0m [2023-02-09 14:34:57,270 E 26959 27004] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-02-09_14-34-35_158333_26854 is over 95% full, available space: 1199267840; capacity: 31845081088. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-02-09 14:35:07,276 E 26959 27004] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-02-09_14-34-35_158333_26854 is over 95% full, available space: 1199169536; capacity: 31845081088. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-02-09 14:35:17,287 E 26959 27004] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-02-09_14-34-35_158333_26854 is over 95% full, available space: 1199026176; capacity: 31845081088. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-02-09 14:35:27,295 E 26959 27004] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-02-09_14-34-35_158333_26854 is over 95% full, available space: 119884