In [1]:
import sys
import os
import math
import json
import subprocess
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from td3 import TD3
# from osim.env import ProstheticsEnv
from prosthetics_env_with_history import ProstheticsEnvWithHistory
from observations import prepare_model_observation
from actions import prepare_env_action, reset_frameskip
from rewards import env_obs_to_custom_reward
from database import persist_timesteps


In [2]:
# Parse the distributed-rollout settings from the JSON file next to this
# notebook, then echo them (pretty-printed) so the run records its own config.
CONFIG = json.loads(Path('config_distributed.json').read_text())
print(json.dumps(CONFIG, indent=4))

{
    "env": {
        "integrator_accuracy": 0.002
    },
    "model": {
        "architecture": "TD3"
    },
    "rollout": {
        "#": "Frameskip will be applied for random durations between 0 and `frameskip` timesteps.",
        "max_episode_steps": 600,
        "expl_noise": 0.25,
        "frameskip": 5
    },
    "distributed": {
        "policy_weights_dir_s3": "s3://colllin-nips-2018-prosthetics/checkpoints/",
        "policy_weights_dir_local": "./checkpoints/",
        "policy_weights_basename": "checkpoint_TD3"
    },
    "training": {
        "#": "Frequency of delayed policy updates",
        "eval_freq": 5000.0,
        "batch_size": 100,
        "discount": 0.99,
        "tau": 0.005,
        "policy_noise": 0.2,
        "noise_clip": 0.5,
        "policy_freq": 2
    }
}


In [3]:
# OUTPUT_DIR = Path('.')
# LOGS_DIR = OUTPUT_DIR/'logs'
# CHECKPOINTS_DIR = OUTPUT_DIR/'checkpoints'
# os.makedirs(OUTPUT_DIR, exist_ok=True)
# os.makedirs(LOGS_DIR, exist_ok=True)
# 


### Create simulation env

In [4]:
# Keyword arguments that the later cells pass to `env.reset(...)` and `env.step(...)`.
env_step_kwargs = dict(project=False)

# Simulation environment with visualization disabled; the integrator accuracy
# comes from the `env` section of the config.
env = ProstheticsEnvWithHistory(
    visualize=False,
    integrator_accuracy=CONFIG['env']['integrator_accuracy'],
)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


### Create Policy, Download & load latest weights

In [5]:
# Reset the env, then size the model input from the *prepared* observation
# (rather than from `env.observation_space.shape[0]`) and the output from the
# env's action space.
env.reset(**env_step_kwargs)
state_dim, action_dim = (
    prepare_model_observation(env).shape[0],
    env.action_space.shape[0],
)
# Upper action bound, taken from the first action component and truncated to int.
max_action = int(env.action_space.high[0])
(state_dim, action_dim, max_action)


(1260, 19, 1)

In [6]:
# Build the TD3 policy object from the state/action dimensions and the action
# bound computed in the previous cell.
policy = TD3(state_dim, action_dim, max_action)

In [7]:
# Copy every checkpoint whose filename starts with the configured basename from
# the S3 checkpoint location into the local weights directory, then list the
# directory so the cell output shows which files are available.
weights_dir_local = CONFIG['distributed']['policy_weights_dir_local']
os.makedirs(weights_dir_local, exist_ok=True)

# The command is given as an argument list (no shell involved), so config values
# that contain spaces or shell metacharacters reach the AWS CLI verbatim instead
# of being re-parsed by a shell. `--exclude *` followed by `--include <basename>*`
# restricts the recursive copy to the matching checkpoint files.
subprocess.check_output([
    'aws', 's3', 'cp',
    CONFIG['distributed']['policy_weights_dir_s3'],
    weights_dir_local,
    '--recursive',
    '--exclude', '*',
    '--include', CONFIG['distributed']['policy_weights_basename'] + '*',
    '--no-sign-request',
])

# Pure-Python replacement for the `!ls` shell magic: one non-hidden entry per
# line, sorted by name.
for entry_name in sorted(os.listdir(weights_dir_local)):
    if not entry_name.startswith('.'):
        print(entry_name)

checkpoint_TD3_actor.pth
checkpoint_TD3_critic.pth
checkpoint_TD3_episode1116_eval-563.7_actor.pth
checkpoint_TD3_episode1116_eval-563.7_critic.pth
checkpoint_TD3_episode319_eval-355.8_actor.pth
checkpoint_TD3_episode319_eval-355.8_critic.pth
checkpoint_TD3_episode373_eval-80.6_actor.pth
checkpoint_TD3_episode373_eval-80.6_critic.pth
checkpoint_TD3_episode425_eval-80.5_actor.pth
checkpoint_TD3_episode425_eval-80.5_critic.pth
checkpoint_TD3_episode479_eval-80.6_actor.pth
checkpoint_TD3_episode479_eval-80.6_critic.pth
checkpoint_TD3_episode532_eval-80.6_actor.pth
checkpoint_TD3_episode532_eval-80.6_critic.pth
checkpoint_TD3_episode597_eval-631.3_actor.pth
checkpoint_TD3_episode597_eval-631.3_critic.pth


In [8]:
# Announce where the checkpoints are read from, then load them into the policy.
weights_dir = CONFIG['distributed']['policy_weights_dir_local']
weights_basename = CONFIG['distributed']['policy_weights_basename']
print(f"Loading policy checkpoints from {weights_dir}{weights_basename}*")
policy.load(weights_dir, weights_basename)


Loading policy checkpoints from ./checkpoints/checkpoint_TD3*


### Episode Hacking (Custom "done" criteria)


In [9]:
def should_abort_episode(env_obs, custom_rewards=None, verbose=False):
    """Decide whether the current episode should be cut short.

    Three custom termination criteria are checked in order; the first one that
    matches ends the episode:

    1. component 0 of the head position is more than 0.2 below component 0 of
       the pelvis position (reported as "head behind the pelvis"),
    2. the absolute value of component 2 of the head position exceeds 0.5
       (reported as "away from centerline"),
    3. the sum of all custom reward terms is below -10.

    Args:
        env_obs: Nested mapping providing ``env_obs['body_pos']['head']`` and
            ``env_obs['body_pos']['pelvis']`` as indexable coordinate sequences.
        custom_rewards: Optional mapping of reward name -> value. When ``None``
            it is computed with ``env_obs_to_custom_reward(env_obs)``.
        verbose: When true, print the reason whenever an abort is triggered.

    Returns:
        ``True`` if the episode should be aborted, ``False`` otherwise.
    """
    # Identity test instead of `!= None`: the inequality operator dispatches to
    # the argument's own `__ne__`, which may be overloaded and give a wrong or
    # non-boolean answer. An empty mapping still counts as "provided".
    rewards = custom_rewards if custom_rewards is not None else env_obs_to_custom_reward(env_obs)

    head_pos = env_obs['body_pos']['head']
    pelvis_pos = env_obs['body_pos']['pelvis']

    # Criterion 1: head trailing the pelvis along component 0.
    head_lead = head_pos[0] - pelvis_pos[0]
    if head_lead < -.2:
        if verbose:
            print(f'Aborting episode due to head being > .2m behind the pelvis ({head_lead}).')
        return True

    # Criterion 2: head too far from zero along component 2, in either direction.
    if np.fabs(head_pos[2]) > 0.5:
        if verbose:
            print(f'Aborting episode due to head being > 0.5m away from centerline ({head_pos[2]}).')
        return True

    # Criterion 3: accumulated custom reward is strongly negative.
    total_reward = sum(rewards.values())
    if total_reward < -10:
        if verbose:
            print(f'Aborting episode due to custom reward < -10 ({total_reward}):')
            # List only the penalising terms so the cause is easy to spot.
            for k, v in rewards.items():
                if v < 0:
                    print(f'  reward `{k}` = {v}')
        return True

    return False
    

# Policy rollout (Record & Persist Simulations)

In [10]:
# Rollout bookkeeping. `done` starts out True so that the rollout loop takes its
# episode-reset branch on the very first iteration.
total_timesteps = episode_num = episode_timesteps = 0
done = True
(total_timesteps, episode_num, episode_timesteps)


(0, 0, 0)

In [11]:
# Endless rollout loop: act with the loaded policy (plus optional Gaussian
# exploration noise), step the simulator, and hand the env's recorded history to
# `persist_timesteps` each time an episode has ended. There is no exit condition
# in the loop itself.
while True:
    if done:
        # Push whatever the env has recorded to the central database, then
        # clear the recording before the next episode begins.
        persist_timesteps(env.history())
        env.reset_history()

        # Begin a fresh episode and zero the per-episode counters.
        obs = env.reset(**env_step_kwargs)
        reset_frameskip(CONFIG['rollout']['frameskip'])
        done, episode_reward, episode_timesteps = False, 0, 0
        episode_num += 1

    # Ask the policy for an action; unless the noise scale is 0, add zero-mean
    # Gaussian noise and clip the result back into the action-space bounds.
    action = policy.select_action(prepare_model_observation(env))
    noise_scale = CONFIG['rollout']['expl_noise']
    if noise_scale != 0:
        noise = np.random.normal(0, noise_scale, size=env.action_space.shape[0])
        action = (action + noise).clip(env.action_space.low, env.action_space.high)

    # Convert the action for the env and advance the simulation by one step.
    action = prepare_env_action(action)
    obs, reward, done, _ = env.step(action, **env_step_kwargs)

    # If the env itself did not end the episode, apply the custom abort criteria.
    done = done or should_abort_episode(env.get_state_desc(), verbose=True)

    # Terminal flag that is forced to 0 on the step that reaches
    # `max_episode_steps`, and is `float(done)` otherwise.
    # NOTE(review): nothing in this cell reads `done_bool` afterwards - confirm
    # whether it is still needed.
    if episode_timesteps + 1 == CONFIG['rollout']['max_episode_steps']:
        done_bool = 0
    else:
        done_bool = float(done)

    # Only the env's own reward is accumulated here.
    episode_reward += reward
    episode_timesteps += 1
    total_timesteps += 1

    if done:
        # Flush right away so the summary line appears as soon as the episode ends.
        print(f"Total T: {total_timesteps} Episode Num: {episode_num} Episode T: {episode_timesteps} Reward: {episode_reward}", flush=True)


Aborting episode due to head being > 0.5m away from centerline (0.5071070039671132).
Total T: 91 Episode Num: 1 Episode T: 91 Reward: -133.44861162800157
Aborting episode due to head being > 0.5m away from centerline (0.5089799849513539).
Total T: 188 Episode Num: 2 Episode T: 97 Reward: -142.8999702970768
Aborting episode due to head being > 0.5m away from centerline (0.5027802162827082).
Total T: 287 Episode Num: 3 Episode T: 99 Reward: -123.5708525772812
Aborting episode due to head being > 0.5m away from centerline (0.5049091180228467).
Total T: 389 Episode Num: 4 Episode T: 102 Reward: -132.48319817340297


SystemError: <built-in function Manager_integrate> returned a result with an error set