In [1]:
!pwd

/home/daniel/DARM/darm_mujoco/darm_training


In [2]:
import os
os.environ["DARM_MUJOCO_PATH"] = "/home/daniel/DARM/darm_mujoco"
os.getenv('DARM_MUJOCO_PATH')

'/home/daniel/DARM/darm_mujoco'

In [3]:
# Check if GCC is installed
!gcc --version

gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



In [None]:
# Install GCC if absent
!sudo apt update
!sudo apt install build-essential -y

In [None]:
!python setup.py install

In [None]:
# Check if mujoco import is successful
import mujoco

In [None]:
# If mujoco import fails, update pandas and restart runtime
!pip install pandas -U

In [None]:
# If GLFW is missing
%%bash
sudo apt-get install libglfw3 -y
sudo apt-get install libglfw3-dev -y
pip install --user glfw

In [None]:
!pip install stable-baselines3[extra]
!pip install wandb

In [4]:
import gym
from darm_gym_env import DARMSFEnv
from stable_baselines3 import SAC
from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv
from stable_baselines3.common.vec_env.vec_monitor import VecMonitor
from stable_baselines3.common.vec_env.vec_normalize import VecNormalize
from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env

import wandb
from wandb.integration.sb3 import WandbCallback
from stable_baselines3.common.callbacks import CallbackList, EvalCallback, StopTrainingOnRewardThreshold, StopTrainingOnNoModelImprovement


from datetime import datetime

In [35]:
config = {
    "env_id": "darm/DarmSFHand-v0",
    "algo": "SAC",
    "rl_lib": "SB3",
    
    "seed": 0,
    "mean_reward_thresh": 1_300,
    "total_timesteps": 10_000_000,
    "pi_net_arch": [32, 256, 256, 64],
    "qf_net_arch": [32, 256, 256, 64],
    "learning_starts": 40_000,
    "num_cpu": 6,
    
    "eval_freq": 2_000, # 5_000
    "max_no_improvement_evals": 10,
    "no_improvement_min_evals": 20,
    
    "log_interval": 20, # episodes
    "wandb_model_save_freq": 2_000 #5_000 timesteps?
}

In [5]:
run_name = "test1_SF_SB3_SAC_1"

notes = """
- The environment was updated such that the target is within a range from the start point
- Velocity penalty was removed and only effort penalty was used
- The reward function was updated according to the reach task reward used in facebookresearch/myosuite [https://github.com/facebookresearch/myosuite/blob/main/myosuite/envs/myo/reach_v0.py]
- The done signal is trigerred only when the fingertip goes beyond a threshold. The episode continues to the maximum timestep otherwise.
- The friction and damping coefficient of the environment is updated. Values are inspired from Deepmind's Mujoco Menagerie [https://github.com/deepmind/mujoco_menagerie/blob/main/shadow_hand/right_hand.xml]
- The range of action from the model was changed to [-1, 1]. This action is mapped to the actual action sent to mujoco e.g [0, 2]]. This change is inspired from values used in OpenAI's Gym Mujoco environments.
- max_episode_steps was updated to 200.
- Velocity vector (size [3,]) was added to observation. Observation size is now (9,)
- Action range was increased to [0, 5]

- This run was trained on vast_ai using SB3's SAC algo.
"""

tags = ["single_finger", "sac", "sb3", "vast_ai"]

run = wandb.init(
    project="DARM",
    name=run_name,
    tags=tags,
    notes=notes,
    config=config,
    sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
    # monitor_gym=True,  # auto-upload the videos of agents playing the game
    save_code=True,  # optional
)

[34m[1mwandb[0m: Currently logged in as: [33mdanieladejumo[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
NUM_CPU = config["num_cpu"]

env = make_vec_env(config["env_id"], n_envs=NUM_CPU, seed=config["seed"])
# env = VecNormalize(env)   #FIXME: Remember to save norm params if using VecNorm env
# env = VecMonitor(env)

policy_kwargs = dict(net_arch=dict(pi=config["pi_net_arch"], qf=config["qf_net_arch"]))

model = SAC("MlpPolicy", env, verbose=1,
            learning_starts=config["learning_starts"],
            gradient_steps=NUM_CPU, # num of envs
            policy_kwargs=policy_kwargs,
            tensorboard_log="./results/darm_sf_hand")

Loaded XML file successfully


  logger.warn(


Loaded XML file successfully
Loaded XML file successfully
Loaded XML file successfully
Using cpu device


In [7]:
eval_env = make_vec_env(config["env_id"], n_envs=1, seed=config["seed"])

# Stop training when the model reaches the reward threshold
reward_thresh_callback = StopTrainingOnRewardThreshold(reward_threshold=config["mean_reward_thresh"], verbose=1)

# Stop training if there is no improvement after more than N evaluations
stop_train_callback = StopTrainingOnNoModelImprovement(
    max_no_improvement_evals=config["max_no_improvement_evals"], 
    min_evals=config["no_improvement_min_evals"], 
    verbose=1)

eval_callback = EvalCallback(eval_env, 
                             best_model_save_path=f"./results/darm_sf_hand/{run_name}/models/best",
                             log_path=f"./results/darm_sf_hand/{run_name}/models/best/logs", 
                             eval_freq=config["eval_freq"],
                             callback_on_new_best=reward_thresh_callback,
                             # callback_after_eval=stop_train_callback,
                             deterministic=True, render=False, verbose=1)

wandb_callback=WandbCallback(model_save_path=f"./results/darm_sf_hand/{run_name}/models",
                             model_save_freq=config["wandb_model_save_freq"],
                             verbose=2)

# Create the callback list
callback = CallbackList([wandb_callback, eval_callback])
callback

Loaded XML file successfully


<stable_baselines3.common.callbacks.CallbackList at 0x7f4920e50fa0>

In [8]:
try:
    model.learn(total_timesteps=config["total_timesteps"], 
                log_interval=config["log_interval"], 
                tb_log_name=run_name,
                callback=callback)
except Exception as e:
    print("Exception caught:")
    print(e)
finally:
    # timestamp = f"{datetime.now().date()}__{datetime.now().time()}"
    print("Saving last checkpoint")
    model_name = f"./results/darm_sf_hand/{run_name}/models/last_model"
    env_norm_name = f"./results/darm_sf_hand/{run_name}/env_norm"
    model.save(model_name)
    print(f"Last checkpoint saved in: {model_name}")
    # env.save(env_norm_name) # FIXME: Remember to save norm params if using VecNorm env
    

Logging to ./results/darm_sf_hand/test4_SF_SB3_SAC_1


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 91.5     |
|    ep_rew_mean     | -32.1    |
| time/              |          |
|    episodes        | 20       |
|    fps             | 500      |
|    time_elapsed    | 3        |
|    total_timesteps | 1968     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 92.2     |
|    ep_rew_mean     | -27.9    |
| time/              |          |
|    episodes        | 40       |
|    fps             | 502      |
|    time_elapsed    | 8        |
|    total_timesteps | 4020     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 85.2     |
|    ep_rew_mean     | -30.4    |
| time/              |          |
|    episodes        | 60       |
|    fps             | 509      |
|    time_elapsed    | 10       |
|    total_timesteps | 5384     |
--------------

KeyboardInterrupt: 

In [9]:
# Finish the run if it's final
run.finish()
print(f"Finished run {run_name}")

0,1
eval/mean_ep_length,▁▅▃▃▄█
eval/mean_reward,▁▄▂▃█▅
global_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇██
rollout/ep_len_mean,▂▃▁▄▄▃▃▄▂▂▂▂▃▂▂▅▅▃▄▄▃▂▃▄▇█
rollout/ep_rew_mean,▃▄▃▃▃▂▁▁▁▁▁▂▂▂▂▂▁▁▁▁▂▃▃▆█▇
time/fps,████████▇▇▇▇▇▇▇▇▇▇▇▇▆▄▃▂▁▁
train/actor_loss,█▄▂▁▃▃▄
train/critic_loss,▅██▁▃▂▁
train/ent_coef,█▅▃▂▁▁▁
train/ent_coef_loss,█▆▄▃▁▃▇

0,1
eval/mean_ep_length,200.0
eval/mean_reward,-37.05579
global_step,51840.0
rollout/ep_len_mean,116.95
rollout/ep_rew_mean,-9.54759
time/fps,128.0
train/actor_loss,-17.60356
train/critic_loss,2.88043
train/ent_coef,0.04804
train/ent_coef_loss,-3.41416


Finished run test4_SF_SB3_SAC


In [None]:
# MORE TRAINING

# LOAD TRAINED MODEL

try:
    model.learn(total_timesteps=10_000_000, log_interval=8, tb_log_name="PlainDarmEnv",
                    callback=WandbCallback(model_save_path=f"checkpoints/wandb/{run.id}",
                                           model_save_freq=10, verbose=2)
               )
    # Add calbacks
except Exception as e:
    print("Exception caught:")
    print(e)
finally:
    timestamp = f"{datetime.now().date()}__{datetime.now().time()}"
    print(f"Saving checkpoint {timestamp}")
    model_name = f"./checkpoints/darm_sf_hand_{timestamp}"
    env_norm_name = f"./checkpoints/darm_sf_hand_env_norm_{timestamp}"
    model.save(model_name)
    # env.save(env_norm_name) # FIXME: Remember to save norm params if using VecNorm env
    

In [None]:
env.close()

### DONE TRAINING

In [144]:
!pwd

/home/daniel/DARM/darm_mujoco/darm_training


In [154]:
model_name = "./results/darm_sf_hand/model"
# env_norm_name = "./checkpoints/darm_sf_hand_env_norm_2022-12-28__10:10:05.637581"

eval_env = make_vec_env(config["env_id"], n_envs=1, seed=config["seed"])
# eval_env = DummyVecEnv([lambda: gym.make("darm/DarmSFHand-v0", render_mode="human", hand_name="hand1")])
# eval_env = gym.make("darm/DarmSFHand-v0", render_mode="human", hand_name="hand1")

eval_model = SAC.load(model_name, env=eval_env)
eval_model

Loaded XML file successfully


<stable_baselines3.sac.sac.SAC at 0x7f22c9bab0a0>

In [155]:
from stable_baselines3.common.evaluation import evaluate_policy

# Evaluate the model with 10 evaluation episodes and deterministic=True
mean_reward, std_reward = evaluate_policy(eval_model, env=eval_model.get_env(), 
                                          n_eval_episodes=10, deterministic=True)

# Print the results
print(mean_reward, std_reward)

1385.6010603 555.9812113953661


In [141]:
# env = DummyVecEnv([lambda: gym.make("darm/DarmSFHand-v0", render_mode="human", hand_name="hand1")])

# env = VecNormalize.load(env_norm_name, env)
# env.training = False
# print("Zero Norm: ", env.unnormalize_reward(-0.47959065))

In [153]:
import pprint

env = make_vec_env(config["env_id"], n_envs=1, seed=config["seed"], env_kwargs={"render_mode": "human"})
# env = DummyVecEnv([lambda: gym.make("darm/DarmSFHand-v0", render_mode="human", hand_name="hand1")])
# env = gym.make("darm/DarmSFHand-v0", render_mode="human", hand_name="hand1")

obs = env.reset()
episode_return = 0
episode_length = 0
N_EPISODES = 1

for i in range(N_EPISODES):
    done = False
    while not done:
        # env.render()
        action, _states = eval_model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        episode_return += reward[0]
        episode_length += 1
        done = done[0]
        
    print(f"Episode Return: {episode_return} Episode Length: {episode_length}")
    info[0]["model_action"] = action
    pprint.pprint(info[0])
    # info["model_action"] = action
    # pprint.pprint(info)
    
    done = False
    episode_return = 0
    episode_length = 0

env.close()

Loaded XML file successfully
Episode Return: 0.19471649080514908 Episode Length: 200
{'TimeLimit.truncated': True,
 'action': array([0.1990543 , 0.96680355, 1.668639  , 0.48699266, 0.06523365],
      dtype=float32),
 'episode': {'l': 200, 'r': 0.194717, 't': 4.237415},
 'model_action': array([[-0.8009457 , -0.03319645,  0.66863894, -0.51300734, -0.93476635]],
      dtype=float32),
 'reward': {'act_reg': -1.0000131130218506,
            'bonus': array([0.]),
            'dense': array([-0.11761407]),
            'done': array([False]),
            'penalty': array([-0.]),
            'reach': array([-0.00880638]),
            'solved': array([False]),
            'sparse': array([-0.00880638])},
 'sim_time': 16.154000000001925,
 'terminal_observation': array([-0.01216627, -0.02088929,  0.07975191, -0.02060792, -0.02012741,
        0.08214155])}


In [9]:
import numpy as np
def norm_to_target(obs):
    """
    Returns the norm of each fingertip to the target position
    obs: an observation from the observation space [...fingertip_pos, ...target_pos]
    """
    obs = obs.reshape((-1, 3))
    n_fingertips = len(obs)//2

    fingertip_poses = obs[0:n_fingertips]
    target_poses = obs[n_fingertips:]

    return np.linalg.norm(fingertip_poses-target_poses, ord=2, axis=-1)

In [10]:
obs = env.reset()
episode_return = 0
N_EPISODES = 10

for i in range(N_EPISODES):
  obs = env.reset()
  done = False
  episode_steps = 0
  episode_return = 0
  episode_return_norm = 0

  
  while not done:
    # print("Observation: ", env.unnormalize_obs(obs))
    old_norm = norm_to_target(env.unnormalize_obs(obs))

    action, _states = model.predict(obs, deterministic=True)
    # print("Action: ", action)

    obs, reward, done, info = env.step(action)
    episode_steps += 1
    new_norm = norm_to_target(env.unnormalize_obs(obs))

    # Get actual reward
    unnormalized_reward = env.unnormalize_reward(reward)
    episode_return += unnormalized_reward
    episode_return_norm += reward
    # print(f"Reward: {unnormalized_reward}; Normalized: {reward}")

    # print(f"Next Observation: {env.unnormalize_obs(obs)}")
    # print(f"Change in Norm: {new_norm - old_norm}")
    # print("-----------------------------------------------------")

    # render
    env.render()
  
  print(f"Num Steps: {episode_steps}")
  print(f"Episode Return: {episode_return}")
  print(f"Episode Return Norm: {episode_return_norm}")
  if episode_return > -70: 
    print("Goal Reached!")
  print("\n")

env.close()

Num Steps: 100
Episode Return: [-206.27367]
Episode Return Norm: [-6.1579084]


Num Steps: 7
Episode Return: [238.47386]
Episode Return Norm: [7.1191816]
Goal Reached!


Num Steps: 100
Episode Return: [-112.209465]
Episode Return Norm: [-3.3497994]


Num Steps: 29
Episode Return: [194.17151]
Episode Return Norm: [5.7966194]
Goal Reached!


Num Steps: 100
Episode Return: [-154.97745]
Episode Return Norm: [-4.6265574]


Num Steps: 4
Episode Return: [243.22809]
Episode Return Norm: [7.2611094]
Goal Reached!


Num Steps: 100
Episode Return: [-204.50894]
Episode Return Norm: [-6.1052227]


Num Steps: 100
Episode Return: [-108.443665]
Episode Return Norm: [-3.2373753]


Num Steps: 100
Episode Return: [-206.46591]
Episode Return Norm: [-6.163656]


Num Steps: 100
Episode Return: [-206.59006]
Episode Return Norm: [-6.167353]


