In [1]:
# Print the current working directory of the notebook's shell.
!pwd

In [2]:
# Configure env variables

# DARM_MUJOCO_PATH is the base directory under which training results are later
# written (see the `local_dir` of the Tune run config).
# A value already exported in the environment is respected; the path below is
# only the fallback default.
# TODO: change the default path if the project lives elsewhere on this machine.
import os
os.environ.setdefault("DARM_MUJOCO_PATH", "/workspace/darm-mujoco")
# Echo the effective value as the cell output.
os.getenv('DARM_MUJOCO_PATH')

'/workspace/darm-mujoco'

In [3]:
# Check if GCC is installed: prints the compiler version when `gcc` is found
# on the PATH.
!gcc --version

In [4]:
# Install GCC if absent.
# First refresh the apt package index, then install the `build-essential`
# package; `-y` answers the confirmation prompt automatically.
!sudo apt update
!sudo apt install build-essential -y

In [5]:
# Setup Mujoco for gym - If needed
# System packages for the MuJoCo/gym setup: Mesa OpenGL, GLEW and OSMesa
# development libraries, plus software-properties-common.
!apt-get install -y \
    libgl1-mesa-dev \
    libgl1-mesa-glx \
    libglew-dev \
    libosmesa6-dev \
    software-properties-common

!apt-get install -y patchelf

# Use the %pip magic rather than `!pip` so the packages are installed into the
# environment of the running kernel (the one the imports below resolve against).
%pip install gym

%pip install free-mujoco-py

# Smoke test: both packages must import after installation.
import mujoco_py
import gym

In [6]:
# Install the training dependencies into the running kernel's environment
# (%pip targets the kernel's interpreter; a bare `!pip` may hit another one).
# The extras specifier is quoted so the shell cannot interpret `[rllib]` as a
# glob pattern.
%pip install "ray[rllib]" torch
%pip install wandb
%pip install tensorflow_probability

In [7]:
# Check if mujoco import is successful
import mujoco

In [8]:
import ray
from ray.rllib.algorithms.es import ESConfig
from ray.tune.registry import register_env
from ray.tune.logger import pretty_print

from ray import air, tune
from ray.air import session
from ray.air.integrations.wandb import setup_wandb
from ray.air.integrations.wandb import WandbLoggerCallback

import gym
from darm_gym_env import DARMSFEnv

In [9]:
# env_creator = lambda env_config: gym.make("darm/DarmSFHand-v0", render_mode=None, hand_name="hand1") # DARMSFEnv(render_mode=None, reaction_time=0.08, hand_name="hand1") # 

def make_env(env_config):
    """Create the single-finger DARM environment capped at 100 steps per episode.

    Args:
        env_config: Accepted so the function matches the env-creator call
            signature; it is not used when building the environment.

    Returns:
        A ``gym.wrappers.TimeLimit`` wrapping a ``DARMSFEnv`` built with
        ``render_mode=None``, ``reaction_time=0.08`` and ``hand_name="hand1"``.
    """
    base_env = DARMSFEnv(render_mode=None, reaction_time=0.08, hand_name="hand1")
    return gym.wrappers.TimeLimit(env=base_env, max_episode_steps=100)
def env_creator(env_config):
    """Env-creator callback: forwards ``env_config`` to ``make_env``."""
    return make_env(env_config)


# Make the environment available to RLlib under the name used in the config.
register_env("darm/DarmSFHand-v0", env_creator)

In [10]:
# TODO:
# change: rollout_workers
# change: gpu

# Build the ES algorithm config one section at a time.
config = ESConfig()

# Train on the environment registered under this name.
config = config.environment(env="darm/DarmSFHand-v0")

# Rollout settings, including the worker / sub-environment failure-recovery
# flags. `rollout_fragment_length` is left at its default.
config = config.rollouts(
    num_rollout_workers=121,
    num_envs_per_worker=64,
    recreate_failed_workers=True,
    num_consecutive_worker_failures_tolerance=10,
    restart_failed_sub_environments=True,
)

config = config.resources(num_gpus=1)

# Periodic evaluation is not configured; a previously considered setting was
# `.evaluation(evaluation_interval=100)` (for 1000-timestep iterations; 100 evals).
config = config.framework(framework="torch")
# config.to_dict()

In [11]:
# TODO:
# change: rollout_workers
# change: gpu
# change: tags
# change: name

# Extra keyword arguments unpacked into the W&B logger callback when the run
# is set up. The nested "config" mirrors the hyperparameters chosen for the
# ES config so they are recorded with the run.
wandb_init = {
    "save_code": True,
    "resume": True,
    "config": {
        "env": "DARMSFHand-v0",
        "num_rollout_workers": 121,
        "num_envs_per_worker": 64,
        "recreate_failed_workers": True,
        "num_consecutive_worker_failures_tolerance": 10,
        "restart_failed_sub_environments": True,
        "num_gpus": 1,
        "framework": "torch",
    },
    "tags": [
        "single_finger",
        "es",
        "delta_target",
        "vast_ai",
        "no_vel_penalty",
        "effort_penalty",
    ],
    "notes": "Updated reward function. Fixed the env to use targets that are delta increaments from the starting state. Removed velocity penalty, and used only effort penalty",
    "name": "SF_rllib_es_vast_ai_rew3",
    # Not set: job_type, monitor_gym
}

In [12]:
# TODO:
# change: name
# change: checkpoint_freq

sync_config = tune.SyncConfig()

# Results are written below DARM_MUJOCO_PATH. Index os.environ directly so a
# missing variable raises KeyError instead of silently producing a path that
# starts with the literal string "None".
results_dir = os.path.join(os.environ["DARM_MUJOCO_PATH"], "darm_training", "results")

tuner = tune.Tuner(
    "ES",
    run_config=air.RunConfig(
        name="SF_rllib_es_vast_ai_rew3",
        local_dir=results_dir,
        sync_config=sync_config,
        # Stopping criteria for the run: iteration budget and target mean reward.
        stop={"training_iteration": 10_000, "episode_reward_mean": 20},
        checkpoint_config=air.CheckpointConfig(
            checkpoint_at_end=True,
            checkpoint_score_attribute="episode_reward_mean",  # or leave to save last chkpts
            checkpoint_score_order="max",
            checkpoint_frequency=50,
            num_to_keep=3,
        ),
        callbacks=[
            WandbLoggerCallback(
                project="DARM",
                # SECURITY: never embed the W&B API key in the notebook. It is
                # read from the WANDB_API_KEY environment variable; if unset,
                # None is passed and the callback is left to resolve credentials
                # itself (NOTE(review): confirm for the installed Ray version).
                # Any key that was previously committed here must be revoked.
                api_key=os.environ.get("WANDB_API_KEY"),
                save_checkpoints=True,
                **wandb_init,
            )
        ],
    ),
    param_space=config.to_dict(),
)

# Launch training; blocks until the stopping criteria are met.
results = tuner.fit()