In [1]:
import sys
import os
import math
import json
import subprocess
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from models.td3 import TD3
# from osim.env import ProstheticsEnv
from environment.prosthetics_env_with_history import ProstheticsEnvWithHistory
from environment.observations import prepare_model_observation
from environment.actions import prepare_env_action, reset_frameskip
# from environment.rewards import env_obs_to_custom_reward
from distributed.database import persist_timesteps, persist_event
import multiprocessing


In [2]:
with open('config_distributed.json', 'r') as f:
    CONFIG = json.load(f)
print(json.dumps(CONFIG, indent=4))


{
    "env": {
        "integrator_accuracy": 0.002
    },
    "model": {
        "architecture": "TD3"
    },
    "rollout": {
        "#": "Frameskip will be applied for random durations between 0 and `frameskip` timesteps.",
        "max_episode_steps": 600,
        "expl_noise": 0.25,
        "frameskip": 5
    },
    "distributed": {
        "policy_weights_dir_s3": "s3://colllin-nips-2018-prosthetics/checkpoints/",
        "policy_weights_basename": "checkpoint_TD3",
        "#": "How often (episodes) we download model weights during rollout.",
        "rollout_refresh_model_freq": 5
    },
    "training": {
        "#": "Frequency of delayed policy updates",
        "eval_freq": 2500,
        "batch_size": 100,
        "discount": 0.99,
        "tau": 0.005,
        "policy_noise": 0.2,
        "noise_clip": 0.5,
        "policy_freq": 2
    }
}


### Create simulation env

In [3]:
# env = ProstheticsEnvWithHistory(visualize=False, integrator_accuracy=CONFIG['env']['integrator_accuracy'])
env_step_kwargs = {'project': False}


### Episode Hacking (Custom "done" criteria)


In [4]:
# def should_abort_episode(env_obs, custom_rewards=None, verbose=False):
# #     print((np.array(env_obs['body_pos_rot']['torso'])*180/math.pi > 60).any())
# #     if env_obs['body_pos_rot']['torso'][2] < -0.2:
# #         return True
#     rewards = custom_rewards if custom_rewards != None else env_obs_to_custom_reward(env_obs)
#     # print(f'Custom reward: {sum(rewards.values())}')
#     if (env_obs['body_pos']['head'][0] - env_obs['body_pos']['pelvis'][0]) < -.2:
#         if verbose: print(f'Aborting episode due to head being > .2m behind the pelvis ({env_obs["body_pos"]["head"][0] - env_obs["body_pos"]["pelvis"][0]}).')
#         return True
#     if np.fabs(env_obs['body_pos']['head'][2]) > 0.5:
#         if verbose: print(f'Aborting episode due to head being > 0.5m away from centerline ({env_obs["body_pos"]["head"][2]}).')
#         return True
#     if sum(rewards.values()) < -10:
#         if verbose:
#             print(f'Aborting episode due to custom reward < -10 ({sum(rewards.values())}):')
#             for k,v in rewards.items():
#                 if v < 0:
#                     print(f'  reward `{k}` = {v}')
#         return True
#     return False
    

# Random rollout (Record & Persist Simulations)

In [5]:
envs_by_thread_id = {}

def get_env_for_current_thread():
    thread_id = id(multiprocessing.current_process())
    print(thread_id)
    if not thread_id in envs_by_thread_id:
        envs_by_thread_id[thread_id] = ProstheticsEnvWithHistory(visualize=False, integrator_accuracy=CONFIG['env']['integrator_accuracy'])
    return envs_by_thread_id[thread_id]
    

In [6]:
def rollout_random_episode(episode_num):
    env = get_env_for_current_thread()
    # Reset environment
    obs = env.reset(**env_step_kwargs)
    reset_frameskip(CONFIG['rollout']['frameskip'])
    done = False
    episode_reward = 0
    episode_timesteps = 0

    while not done:
        # Select action randomly or according to policy
        action = env.action_space.sample()

        # Perform action
        action = prepare_env_action(action)
        obs, reward, done, _ = env.step(action, **env_step_kwargs)

        # if not done:
        #     done = should_abort_episode(env.get_state_desc(), verbose=True)

        # custom_rewards = compute_rewards(new_obs_dict)
        episode_reward += reward #+ sum(custom_rewards.values())

        episode_timesteps += 1

    # Persist timesteps to central database
    persist_timesteps(env.history())
    env.reset_history()

    # Log episode
    persist_event('rollout_episode_completed', {
        'episode_num': episode_num,
        'episode_timesteps': episode_timesteps,
        'episode_reward': episode_reward,
    })
    print(f"Episode Num: {episode_num} Episode T: {episode_timesteps} Reward: {episode_reward}")
    sys.stdout.flush()



In [7]:
# Unfortunately, we can't use multiprocessing with opensim.  
# Even if we init separate environments for each thread, it 
# seems that they might share an opensim instance.... :facepalm:

# eps = [[i] for i in range(500)]
# pool = multiprocessing.Pool() # Defaults to os.cpu_count() for number of threads in the pool.
# rval = pool.starmap_async(rollout_random_episode, eps)#, callback=callback)
# pool.close()
# pool.join()

for i in range(500):
    rollout_random_episode(i)


139696887843584
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode Num: 0 Episode T: 86 Reward: -335.37909307949167
139696887843584
Episode Num: 1 Episode T: 105 Reward: -532.6242029128832
139696887843584
Episode Num: 2 Episode T: 59 Reward: -41.40560493443378
139696887843584
Episode Num: 3 Episode T: 91 Reward: -506.8561854425691
139696887843584
Episode Num: 4 Episode T: 100 Reward: -546.7786659979884
139696887843584
Episode Num: 5 Episode T: 89 Reward: -361.7370415300716
139696887843584
Episode Num: 6 Episode T: 88 Reward: -320.73696562512
139696887843584
Episode Num: 7 Episode T: 95 Reward: -562.4698248097595
139696887843584
Episode Num: 8 Episode T: 80 Reward: -21.851426105068168
139696887843584
Episode Num: 9 Episode T: 94 Reward: -510.8689696292805
139696887843584
Episode Num: 10 Episode T: 56 Reward: -18.0980

139696887843584
Episode Num: 109 Episode T: 95 Reward: -486.5481670447246
139696887843584
Episode Num: 110 Episode T: 63 Reward: 0.5272288347895913
139696887843584
Episode Num: 111 Episode T: 102 Reward: -419.8231893465308
139696887843584
Episode Num: 112 Episode T: 94 Reward: -513.8868664812738
139696887843584
Episode Num: 113 Episode T: 94 Reward: -283.2614310294514
139696887843584
Episode Num: 114 Episode T: 95 Reward: -344.93823454720916
139696887843584
Episode Num: 115 Episode T: 112 Reward: -647.352067528304
139696887843584
Episode Num: 116 Episode T: 81 Reward: -266.8906903562229
139696887843584
Episode Num: 117 Episode T: 109 Reward: -434.9146034573778
139696887843584
Episode Num: 118 Episode T: 127 Reward: -703.0957249109576
139696887843584
Episode Num: 119 Episode T: 97 Reward: -584.6468195040234
139696887843584
Episode Num: 120 Episode T: 94 Reward: -436.42765817946207
139696887843584
Episode Num: 121 Episode T: 117 Reward: -641.8382231416864
139696887843584
Episode Num: 122

139696887843584
Episode Num: 219 Episode T: 93 Reward: -458.28818640716577
139696887843584
Episode Num: 220 Episode T: 95 Reward: -534.6693551089006
139696887843584
Episode Num: 221 Episode T: 101 Reward: -501.4252968945872
139696887843584
Episode Num: 222 Episode T: 105 Reward: -400.46371709197496
139696887843584
Episode Num: 223 Episode T: 94 Reward: -451.28436141506506
139696887843584


SystemError: <built-in function Manager_integrate> returned a result with an error set