In [1]:
import sys
import os
import math
import json
import subprocess
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from models.td3 import TD3
# from osim.env import ProstheticsEnv
from environment.prosthetics_env_with_history import ProstheticsEnvWithHistory
from environment.observations import prepare_model_observation
from environment.actions import prepare_env_action, reset_frameskip
from environment.rewards import env_obs_to_custom_reward
from distributed.s3_checkpoints import load_s3_model_checkpoint


In [2]:
# Parse the shared distributed-training settings into CONFIG and echo them
# (pretty-printed) so the values used for this run are visible in the notebook.
CONFIG = json.loads(Path('config_distributed.json').read_text())
print(json.dumps(CONFIG, indent=4))


{
    "env": {
        "integrator_accuracy": 0.002
    },
    "model": {
        "architecture": "TD3"
    },
    "rollout": {
        "#": "Frameskip will be applied for random durations between 0 and `frameskip` timesteps.",
        "max_episode_steps": 600,
        "expl_noise": 0.25,
        "frameskip": 5
    },
    "distributed": {
        "policy_weights_dir_s3": "s3://colllin-nips-2018-prosthetics/checkpoints/",
        "policy_weights_basename": "checkpoint_TD3",
        "#": "How often (episodes) we download model weights during rollout.",
        "rollout_refresh_model_freq": 5
    },
    "training": {
        "#": "Frequency of delayed policy updates",
        "eval_freq": 2500,
        "batch_size": 100,
        "discount": 0.99,
        "tau": 0.005,
        "policy_noise": 0.2,
        "noise_clip": 0.5,
        "policy_freq": 2
    }
}


### Create simulation env

In [3]:
# Keyword arguments forwarded to every env.reset() / env.step() call in this notebook.
env_step_kwargs = dict(project=False)

# Simulation environment with visualization enabled; the integrator accuracy
# is taken from the "env" section of CONFIG.
env = ProstheticsEnvWithHistory(
    integrator_accuracy=CONFIG['env']['integrator_accuracy'],
    visualize=True,
)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


### Create policy and load checkpoint weights (from a local download)

In [4]:
# Reset the env, then size the policy input from the *prepared* model
# observation (not from env.observation_space.shape[0]).
env.reset(**env_step_kwargs)
state_dim = prepare_model_observation(env).shape[0]

# Action dimensionality and bound are read from the env's action space;
# the bound is the first entry of `high`, truncated to int.
action_space = env.action_space
action_dim, max_action = action_space.shape[0], int(action_space.high[0])

# Display the resulting dimensions as the cell output.
(state_dim, action_dim, max_action)


(1260, 19, 1)

In [5]:
# Instantiate the TD3 policy from the state/action dimensions and the action
# bound computed in the previous cell. Weights are loaded separately below.
policy = TD3(state_dim, action_dim, max_action)

In [6]:
# Directory containing the downloaded checkpoint files. It can be overridden
# with the PROSTHETICS_CHECKPOINTS_DIR environment variable so the notebook is
# not tied to one machine; when the variable is unset the original location is
# used. NOTE(review): keep the trailing slash on any override in case
# policy.load builds file paths by plain string concatenation — confirm.
checkpoints_dir = os.environ.get('PROSTHETICS_CHECKPOINTS_DIR', '/Users/colllin/Downloads/')

# Checkpoint to visualize. Alternatives that have been tried previously:
# checkpoints_basename = 'checkpoint_TD3_T1577028_2018-10-21T00-25-00.739509'
# checkpoints_basename = 'checkpoint_TD3_T1577028_2018-10-20T22-33-13.970317'
checkpoints_basename = 'checkpoint_TD3_T1001791_2018-10-20T10-33-37.953576'

# map_location='cpu' is forwarded to policy.load — presumably so weights saved
# on another device can be loaded on a CPU-only machine; verify in TD3.load.
policy.load(checkpoints_dir, checkpoints_basename, map_location='cpu')


### Rollout Episode

In [7]:
# Begin a fresh episode: reset the env, call reset_frameskip with the
# configured frameskip, and zero the running totals.
obs = env.reset(**env_step_kwargs)
reset_frameskip(CONFIG['rollout']['frameskip'])
episode_reward, episode_timesteps = 0, 0
done = False

# Roll the policy forward until the env reports the episode as finished.
# Disabled in this notebook (kept only as a reminder of the training-time
# options): random-action warm-up before `start_timesteps`, early abort via
# should_abort_episode(...), and adding custom shaped rewards to the total.
while not done:
    # Ask the policy for an action on the current prepared observation.
    action = policy.select_action(prepare_model_observation(env))

    # Optionally perturb the action in place with zero-mean Gaussian noise
    # whose scale is the configured exploration noise.
    noise_scale = CONFIG['rollout']['expl_noise']
    if noise_scale != 0:
        action += np.random.normal(0, noise_scale, size=action.shape)

    # Convert to the env's action format and advance the simulation one step.
    action = prepare_env_action(action)
    obs, reward, done, _ = env.step(action, **env_step_kwargs)

    # Accumulate only the env's own reward.
    episode_reward += reward
    episode_timesteps += 1

print(f"Episode T: {episode_timesteps} Reward: {episode_reward}")
sys.stdout.flush()



Episode T: 56 Reward: 27.883903757931353
