In [4]:
import warnings
warnings.filterwarnings('ignore')
import os
import bz2
import shutil
import pickle
from IPython.display import clear_output
from osim.env.osimMod36d import L2RunEnvMod
import numpy as np
import matplotlib.pyplot as plt
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=DeprecationWarning)
    import tensorflow as tf

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines.common import set_global_seeds
from stable_baselines import PPO2
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines.common.callbacks import BaseCallback, EveryNTimesteps

############################################################################## Set up Directories
# Root folder for monitor files, saved models and the parameter dump.
# Keep the trailing slash: other cells build paths with plain string concatenation.
log_dir = 'log/'
os.makedirs(log_dir, exist_ok=True)
# TensorBoard event files go to a sub-folder of the log directory.
tensorboard_log_dir = log_dir + "tensorboard/"

ModuleNotFoundError: No module named 'stable_baselines'

## Define functions

In [None]:
def make_env(env_in, rank, time_limit, log_dir, seed=0, stepsize=0.01, **kwargs):
    """
    Build the factory function for one sub-process environment.

    :param env_in: (class) environment class; instantiated as ``env_in(**kwargs)``
    :param rank: (int) index of the subprocess; selects the ``env_<rank>`` log
        sub-folder and offsets the environment seed
    :param time_limit: (int) value assigned to the ``time_limit`` class attribute of ``env_in``
    :param log_dir: (str) root log folder; monitor output goes to ``<log_dir>/env_<rank>``
    :param seed: (int) the initial seed for RNG
    :param stepsize: (float) value assigned to ``env.osim_model.stepsize``
    :param kwargs: forwarded unchanged to the ``env_in`` constructor
    :return: (callable) zero-argument function that creates the monitored environment
    :raises Exception: if a monitor file already exists for ``env_0`` or for this rank
    """
    # env_0 is always checked (as before); the rank's own folder is checked as well,
    # so stale files are detected even when env_0 has been removed.
    # Presumably this guards the results of an earlier run against being overwritten.
    for checked_rank in sorted({0, rank}):
        stale_monitor = os.path.join(log_dir, 'env_{}'.format(checked_rank), 'monitor.csv')
        if os.path.exists(stale_monitor):
            raise Exception("existing monitor files found!!!")

    log_sub_dir = os.path.join(log_dir, 'env_{}'.format(rank))

    def _init():
        # time_limit is set on the class itself, before the instance is created
        env_in.time_limit = time_limit
        env = env_in(**kwargs)
        env.osim_model.stepsize = stepsize
        os.makedirs(log_sub_dir, exist_ok=True)
        env = Monitor(env, log_sub_dir, allow_early_resets=True)
        # every sub-process gets its own seed
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init

def learning_rate(frac):
    """
    Exponential learning-rate schedule: ``lr_a1 * exp(lr_a2 * (frac - 1))``.

    ``lr_a1`` and ``lr_a2`` are read from the global ``params`` dict at call time.
    Earlier setting: 3.0e-4*(np.exp(n*(frac-1))), used n=2 for 10m steps.

    :param frac: (float) schedule argument; presumably the remaining training
        progress handed in by PPO2 (TODO confirm)
    :return: (float) learning rate for ``frac``
    """
    scale = params['lr_a1']
    rate = params['lr_a2']
    return scale*(np.exp(rate*(frac-1)))

def own_policy(obs):
    """
    Placeholder policy: ignores ``obs`` and returns a zero vector of length 18.

    :param obs: observation (unused)
    :return: (numpy array) 18 zeros
    """
    return np.zeros(18)

## Define callbacks

In [None]:
def extract_xy(log_dir, num_rollout):
    """
    Average the most recent episode values over all monitored environments.

    Every ``env_*`` entry inside ``log_dir`` is loaded with ``load_results`` and
    the last ``num_rollout`` y-values returned by ``ts2xy`` are pooled.

    :param log_dir: (str) folder containing the ``env_<rank>`` monitor sub-folders
        (with or without a trailing path separator)
    :param num_rollout: (int) number of latest entries taken from each environment
    :return: (float) mean of the pooled values, or ``-np.inf`` if nothing has been logged yet
    """
    recent = []
    for folder in os.listdir(log_dir):
        if not folder.startswith('env_'):
            continue
        # os.path.join keeps this working when log_dir has no trailing slash
        _, y_tmp = ts2xy(load_results(os.path.join(log_dir, folder)), 'timesteps')
        if len(y_tmp) > 0:
            recent.extend(list(y_tmp[-num_rollout:]))
    if len(recent) == 0:
        return -np.inf
    return sum(recent)/len(recent)

class LogCallback(BaseCallback):
    """
    Callback that reports the recent mean reward and checkpoints the model.

    Each time it is triggered (and at least one episode has been logged) it prints
    the progress and saves the model as ``latest_model``; whenever the mean reward
    improves, the model is also saved as ``best_model``.

    :param log_dir: (str) folder holding the ``env_<rank>`` monitor sub-folders;
        the models are saved into this folder as well
    :param verbose: (int) Verbosity level 0: not output 1: info 2: debug
    :param num_rollout: (int) number of latest episodes per environment used for the mean reward
    """
    def __init__(self, log_dir, verbose=0, num_rollout=5):
        self.log_dir = log_dir
        self.best_mean_reward = -np.inf
        self.num_rollout = num_rollout
        super(LogCallback, self).__init__(verbose)

    def _on_step(self) -> bool:
        """
        Print the training progress and save the model.

        :return: (bool) always True
        """
        mean_reward = extract_xy(self.log_dir, self.num_rollout)
        if mean_reward == -np.inf:
            # nothing has been logged yet: nothing to report or save
            return True

        clear_output(wait=True)
        print(self.num_timesteps, 'timesteps')
        # the best value printed here is the one from before this report
        print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(
            self.best_mean_reward, mean_reward))

        if mean_reward > self.best_mean_reward:
            self.best_mean_reward = mean_reward
            print("Saving new best model")
            # os.path.join keeps the target inside log_dir even without a trailing slash
            self.model.save(os.path.join(self.log_dir, 'best_model'))
        else:
            print("Saving latest model")
        # the latest model is written on every report, improved or not
        self.model.save(os.path.join(self.log_dir, 'latest_model'))
        return True
    
# Reporting/checkpoint callback, triggered through EveryNTimesteps with n_steps=2000.
log_callback = LogCallback(log_dir=log_dir, num_rollout=5)
event_callback = EveryNTimesteps(callback=log_callback, n_steps=2000)

## Define Parameters

In [2]:
# Environment and training settings; the whole dict is dumped to the log folder before training.
params = {
    # order: ['forward', 'survival', 'torso', 'joint', 'stability', 'act', 'footstep', 'jerk', 'slide']
    'reward_weight': [6.0, 1.0, 1.0, 0.4, 0.0, 1.0, 1.0, 0.0, 0.5],
    'action_limit': [1]*18,
    'time_limit': 1000,
    'stepsize': 0.01,
    'integrator_accuracy': 5e-3,
    'seed': 0,
    # number of sub-process environments
    'num_cpu': 12,
    # learning-rate schedule: lr_a1 * exp(lr_a2 * (frac - 1))
    'lr_a1': 1.0e-4,
    'lr_a2': 2,
    'target_speed_range': [0.8, 1.2],
    'total_timesteps': 10000000,
}

## Set Up Environment and Model

In [3]:
# Create the vectorized environment: one make_env factory per sub-process.
# make_env arguments: env_in, rank, time_limit, log_dir, seed=0, stepsize=0.01, env_related arguments
env = SubprocVecEnv([
    make_env(
        L2RunEnvMod, rank, params['time_limit'], log_dir,
        seed=params['seed'],
        stepsize=params['stepsize'],
        # the remaining keywords are forwarded to the L2RunEnvMod constructor
        reward_weight=params['reward_weight'],
        action_limit=params['action_limit'],
        visualize=False,
        integrator_accuracy=params['integrator_accuracy'],
        target_speed_range=params['target_speed_range'],
        own_policy=own_policy,
    )
    for rank in range(params['num_cpu'])
])

# define policy network: tanh activation, separate layer lists for value (vf) and policy (pi)
policy_kwargs = {
    'act_fun': tf.nn.tanh,
    'net_arch': [{'vf': [256, 256, 256], 'pi': [256, 256, 256, 12]}],
}

# batch_size=n_steps*num_cpu
model = PPO2(
    MlpPolicy, env,
    verbose=0,
    policy_kwargs=policy_kwargs,
    noptepochs=4,
    n_steps=128,
    learning_rate=learning_rate,
    tensorboard_log=tensorboard_log_dir,
)

NameError: name 'SubprocVecEnv' is not defined

## Train

In [6]:
# Optional: run this cell only if you want to continue from a pretrained model.
# It loads the parameters stored under log_dir + 'latest_model' into the existing `model`.
model.load_parameters(log_dir+'latest_model')

In [None]:
# Log the environment and training parameters before training starts.
params_path = log_dir+'params.pbz2'
if os.path.exists(params_path):
    # refuse to overwrite the parameter dump that is already in the log folder
    raise Exception("existing env_learning_params file found!")
with bz2.BZ2File(params_path, 'w') as f:
    pickle.dump(params, f)

# Back up the custom osim environment source to the log folder before training starts.
shutil.copy2('osim/env/osimMod36d.py', log_dir+'osimMod36d.py')

model.learn(callback=event_callback, total_timesteps=params['total_timesteps'])

965928 timesteps
Best mean reward: 4.52 - Last mean reward per episode: -0.38
Saving latest model


## Plot Reward Progress

In [None]:
def moving_average(values, window):
    """
    Smooth a series with an unweighted moving average.

    Uses a 'valid' convolution, i.e. only positions where the window and the
    series overlap completely are returned.

    :param values: (numpy array) series to smooth
    :param window: (int) number of consecutive samples averaged together
    :return: (numpy array) smoothed series
    """
    kernel = np.ones(window) / window
    return np.convolve(values, kernel, mode='valid')


def plot_results(log_folder, title='Learning Curve', instances=1, same_plot=False):
    """
    Plot the smoothed learning curve stored in the monitor files of a run.

    The y-values of the first ``instances`` environments (``env_0``, ``env_1``, ...)
    are averaged entry by entry, smoothed with a moving average and plotted
    against the x-values (timesteps) of ``env_0``.

    :param log_folder: (str) the save location of the results to plot
    :param title: (str) the title of the task to plot
    :param instances: (int) the number of instances to average
    :param same_plot: (bool) if True, ``plt.show()`` is not called, so further
        curves can be added to the same figure
    """
    x, y = ts2xy(load_results(os.path.join(log_folder, 'env_0')), 'timesteps')

    if instances > 1:
        for i in range(1, instances):
            _, y_tmp = ts2xy(load_results(os.path.join(log_folder, 'env_'+str(i))), 'timesteps')
            # only entries that exist in every instance can be averaged
            common = min(len(y), len(y_tmp))
            y = y[:common] + y_tmp[:common]
        y = y/instances
        # y now holds the FIRST len(y) entries, so x has to be cut from the front too;
        # otherwise the curve would be drawn against the last timesteps of env_0
        x = x[:len(y)]

    # change the window value to change the level of smoothness; it is clamped so
    # that runs with fewer entries than the window can still be plotted
    window = max(1, min(5, len(y)))
    y = moving_average(y, window=window)
    # Truncate x: every smoothed value is attributed to the last entry of its window
    x = x[len(x) - len(y):]
    plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Smoothed")
    if same_plot is False:
        plt.show()

In [None]:
# Plot the learning curve averaged over env_0 ... env_5 (instances=6).
plot_results(log_dir, instances=6)

## Some Test Code

In [5]:
# ############################################################################# Some Test Code
def own_policy(obs):
    """
    Fixed test policy: ignores ``obs`` and returns a constant 0/1 pattern of length 18.

    NOTE: this re-defines ``own_policy`` from the "Define functions" section; once
    this cell has run, the all-zero version is shadowed.

    :param obs: observation (unused)
    :return: (numpy array) the pattern 0,1,0,1,0,1,0,1,0 repeated twice
    """
    half = [0, 1, 0, 1, 0, 1, 0, 1, 0]
    return np.array(half + half)

# Stand-alone, visualised environment for manual checks.
# NOTE: this re-binds `env`, which refers to the SubprocVecEnv in the training cells.
env = L2RunEnvMod(
    visualize=True,
    integrator_accuracy=5e-3,
    own_policy=own_policy,
    muscle_synergy=None,
)
env.load_model()
observation = env.reset()
# show the action/observation spaces and the time limit of the environment
print(env.action_space)
print(env.observation_space)
print(env.time_limit)

# for i in range(200):
#     # observation, reward, done, info = env.step(env.action_space.sample())
#     observation, reward, done, info = env.step(np.zeros(18))
#     clear_output(wait=True)
#     print(reward)
    
# del env

############################################################################## Visualize learning rate schedule
# x = 1-np.arange(0,1,0.0001)
# lr_exp = 1e-2*(np.exp(2*(x-1)))
# plt.plot(np.arange(0,1,0.0001),lr_exp,label='exponential decay')
# plt.title('learning rate scheduler comparison')
# plt.legend()

Box(4,)
Box(36,)
1000


In [5]:
# PPO2 model without custom policy_kwargs.
# Needs `learning_rate` and `tensorboard_log_dir` to be defined by the cells above.
model = PPO2(
    MlpPolicy, env,
    verbose=0,
    noptepochs=4,
    n_steps=128,
    learning_rate=learning_rate,
    tensorboard_log=tensorboard_log_dir,
)

NameError: name 'learning_rate' is not defined

In [9]:
# Show the TensorFlow variables (weights and biases) of the current model.
model.get_parameter_list()

[<tf.Variable 'model/pi_fc0/w:0' shape=(36, 64) dtype=float32_ref>,
 <tf.Variable 'model/pi_fc0/b:0' shape=(64,) dtype=float32_ref>,
 <tf.Variable 'model/vf_fc0/w:0' shape=(36, 64) dtype=float32_ref>,
 <tf.Variable 'model/vf_fc0/b:0' shape=(64,) dtype=float32_ref>,
 <tf.Variable 'model/pi_fc1/w:0' shape=(64, 64) dtype=float32_ref>,
 <tf.Variable 'model/pi_fc1/b:0' shape=(64,) dtype=float32_ref>,
 <tf.Variable 'model/vf_fc1/w:0' shape=(64, 64) dtype=float32_ref>,
 <tf.Variable 'model/vf_fc1/b:0' shape=(64,) dtype=float32_ref>,
 <tf.Variable 'model/vf/w:0' shape=(64, 1) dtype=float32_ref>,
 <tf.Variable 'model/vf/b:0' shape=(1,) dtype=float32_ref>,
 <tf.Variable 'model/pi/w:0' shape=(64, 18) dtype=float32_ref>,
 <tf.Variable 'model/pi/b:0' shape=(18,) dtype=float32_ref>,
 <tf.Variable 'model/pi/logstd:0' shape=(1, 18) dtype=float32_ref>,
 <tf.Variable 'model/q/w:0' shape=(64, 18) dtype=float32_ref>,
 <tf.Variable 'model/q/b:0' shape=(18,) dtype=float32_ref>]