In [1]:
import sys
import os
import math
import json
import subprocess
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from models.td3 import TD3
# from osim.env import ProstheticsEnv
from environment.prosthetics_env_with_history import ProstheticsEnvWithHistory
from environment.observations import prepare_model_observation
from environment.actions import prepare_env_action, reset_frameskip
from environment.rewards import env_obs_to_custom_reward
from distributed.database import persist_timesteps, persist_event
from distributed.s3_checkpoints import load_s3_model_checkpoint


In [2]:
with open('config_distributed.json', 'r') as f:
    CONFIG = json.load(f)
print(json.dumps(CONFIG, indent=4))


{
    "env": {
        "integrator_accuracy": 0.002
    },
    "model": {
        "architecture": "TD3"
    },
    "rollout": {
        "#": "Frameskip will be applied for random durations between 0 and `frameskip` timesteps.",
        "max_episode_steps": 600,
        "expl_noise": 0.25,
        "frameskip": 5
    },
    "distributed": {
        "policy_weights_dir_s3": "s3://colllin-nips-2018-prosthetics/checkpoints/",
        "policy_weights_basename": "checkpoint_TD3",
        "#": "How often (episodes) we download model weights during rollout.",
        "rollout_refresh_model_freq": 5
    },
    "training": {
        "#": "Frequency of delayed policy updates",
        "eval_freq": 2000,
        "batch_size": 100,
        "discount": 0.99,
        "tau": 0.005,
        "policy_noise": 0.2,
        "noise_clip": 0.5,
        "policy_freq": 2
    }
}


### Create simulation env

In [3]:
env = ProstheticsEnvWithHistory(visualize=False, integrator_accuracy=CONFIG['env']['integrator_accuracy'])
env_step_kwargs = {'project': False}


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


### Create Policy, Download & load latest weights

In [4]:
# state_dim = env.observation_space.shape[0]
env.reset(**env_step_kwargs)
state_dim = prepare_model_observation(env).shape[0]
action_dim = env.action_space.shape[0]
max_action = int(env.action_space.high[0])
state_dim, action_dim, max_action


(1260, 19, 1)

In [5]:
policy = TD3(state_dim, action_dim, max_action)

In [6]:
print(f"Loading policy checkpoints from {CONFIG['distributed']['policy_weights_dir_s3']}{CONFIG['distributed']['policy_weights_basename']}*")
load_s3_model_checkpoint(
    policy, 
    s3_dir=CONFIG['distributed']['policy_weights_dir_s3'],
    basename=CONFIG['distributed']['policy_weights_basename'],
    map_location='cpu'
)


Loading policy checkpoints from s3://colllin-nips-2018-prosthetics/checkpoints/checkpoint_TD3*


### Episode Hacking (Custom "done" criteria)


In [7]:
# def should_abort_episode(env_obs, custom_rewards=None, verbose=False):
# #     print((np.array(env_obs['body_pos_rot']['torso'])*180/math.pi > 60).any())
# #     if env_obs['body_pos_rot']['torso'][2] < -0.2:
# #         return True
#     rewards = custom_rewards if custom_rewards != None else env_obs_to_custom_reward(env_obs)
#     # print(f'Custom reward: {sum(rewards.values())}')
#     if (env_obs['body_pos']['head'][0] - env_obs['body_pos']['pelvis'][0]) < -.2:
#         if verbose: print(f'Aborting episode due to head being > .2m behind the pelvis ({env_obs["body_pos"]["head"][0] - env_obs["body_pos"]["pelvis"][0]}).')
#         return True
#     if np.fabs(env_obs['body_pos']['head'][2]) > 0.5:
#         if verbose: print(f'Aborting episode due to head being > 0.5m away from centerline ({env_obs["body_pos"]["head"][2]}).')
#         return True
#     if sum(rewards.values()) < -10:
#         if verbose:
#             print(f'Aborting episode due to custom reward < -10 ({sum(rewards.values())}):')
#             for k,v in rewards.items():
#                 if v < 0:
#                     print(f'  reward `{k}` = {v}')
#         return True
#     return False
    

# Policy rollout (Record & Persist Simulations)

In [8]:
total_timesteps = 0
episode_num = 0
done = True
episode_timesteps = 0
total_timesteps, episode_num, episode_timesteps


(0, 0, 0)

In [9]:
while True:
    if done: 
        if (episode_num % CONFIG['distributed']['rollout_refresh_model_freq']) == 0:
            print(f"\nLoading policy checkpoint from {CONFIG['distributed']['policy_weights_dir_s3']}{CONFIG['distributed']['policy_weights_basename']}\n")
            load_s3_model_checkpoint(
                policy, 
                s3_dir=CONFIG['distributed']['policy_weights_dir_s3'],
                basename=CONFIG['distributed']['policy_weights_basename'],
                map_location='cpu'
            )
            timesteps_since_model_update = 0
            persist_event('rollout_model_refreshed', {
                'episode_num': episode_num,
            })

        # Reset environment
        obs = env.reset(**env_step_kwargs)
        reset_frameskip(CONFIG['rollout']['frameskip'])
        done = False
        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1 

    # # Select action randomly or according to policy
    # if total_timesteps < CONFIG['training']['start_timesteps']:
    #     action = env.action_space.sample()
    # else:
    action = policy.select_action(prepare_model_observation(env))
    if CONFIG['rollout']['expl_noise'] != 0: 
        action += np.random.normal(0, CONFIG['rollout']['expl_noise'], size=action.shape)

    # Perform action
    action = prepare_env_action(action)
    obs, reward, done, _ = env.step(action, **env_step_kwargs)

    # if not done:
    #     done = should_abort_episode(env.get_state_desc(), verbose=True)
    done_bool = 0 if episode_timesteps + 1 == CONFIG['rollout']['max_episode_steps'] else float(done)

    # custom_rewards = compute_rewards(new_obs_dict)
    episode_reward += reward #+ sum(custom_rewards.values())

    episode_timesteps += 1
    total_timesteps += 1

    if done:
        # Persist timesteps to central database
        persist_timesteps(env.history())
        env.reset_history()

        # Log episode
        persist_event('rollout_episode_completed', {
            'episode_num': episode_num,
            'episode_timesteps': episode_timesteps,
            'episode_reward': episode_reward,
        })
        print(f"Total T: {total_timesteps} Episode Num: {episode_num} Episode T: {episode_timesteps} Reward: {episode_reward}")
        sys.stdout.flush()




Loading policy checkpoint from s3://colllin-nips-2018-prosthetics/checkpoints/checkpoint_TD3

Aborting episode due to head being > 0.5m away from centerline (0.5080333301569389).
Total T: 86 Episode Num: 1 Episode T: 86 Reward: -138.58982267204854
Aborting episode due to head being > 0.5m away from centerline (0.5002184046665511).
Total T: 179 Episode Num: 2 Episode T: 93 Reward: -137.41753031874325
Aborting episode due to head being > 0.5m away from centerline (0.5037454832288987).
Total T: 270 Episode Num: 3 Episode T: 91 Reward: -141.31275441024255
Aborting episode due to head being > 0.5m away from centerline (0.5044327557843173).
Total T: 362 Episode Num: 4 Episode T: 92 Reward: -131.50359554868925
Aborting episode due to head being > 0.5m away from centerline (0.5075393278284324).
Total T: 478 Episode Num: 5 Episode T: 116 Reward: -132.5689455022213

Loading policy checkpoint from s3://colllin-nips-2018-prosthetics/checkpoints/checkpoint_TD3

Aborting episode due to head being >

SystemError: <built-in function Manager_integrate> returned a result with an error set