In [1]:
# Keep TensorFlow quiet: set the TF1-compat logger threshold to ERROR so
# deprecation warnings do not flood the notebook output.
import tensorflow as tf

_tf_logging = tf.compat.v1.logging
_tf_logging.set_verbosity(_tf_logging.ERROR)

# One-leg training using PPO2 from Stable Baselines

In [None]:
import os

import gym
from gym.wrappers import TimeLimit
from stable_baselines import PPO2
from stable_baselines.common import set_global_seeds
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv


def make_env(rank, seed=0, env_id="gym_kraby:OneLegBulletEnv-v0", timestep_limit=128):
    """
    Build a factory that creates one seeded, time-limited environment.

    The environment is not created here: the returned zero-argument callable
    creates it when invoked.

    :param rank: (int) index of the subprocess, added to `seed` so that every
        worker environment receives a different seed
    :param seed: (int) the initial seed for RNG
    :param env_id: (str) environment id handed to `gym.make`
    :param timestep_limit: (int) step limit handed to the `TimeLimit` wrapper
    :return: (callable) function without arguments that returns the wrapped,
        seeded environment
    """
    def _init():
        env = TimeLimit(gym.make(env_id), timestep_limit)
        env.seed(seed + rank)
        return env

    # Runs in the process that builds the factory, not inside `_init`.
    set_global_seeds(seed)
    return _init

# Train 10 independent runs (PPO2_1 .. PPO2_10), each with its own seed.
num_cpu = 32  # number of worker environments run in parallel

# Create the output directory up front so that `model.save` cannot fail on a
# missing folder after a full training run.
os.makedirs("trained_models", exist_ok=True)

for n in range(1, 11):  # PPO2_n
    print("Starting training", n)

    # Worker `i` is seeded with `run_seed + i` (see `make_env`), so a stride of
    # `num_cpu` between runs keeps the seed ranges of different runs disjoint.
    run_seed = n * num_cpu
    env = SubprocVecEnv([make_env(i, run_seed) for i in range(num_cpu)])

    try:
        # Use `tensorboard --logdir notebooks/stablebaselines/tensorboard_log/one_leg_doc1` to inspect learning
        model = PPO2(
            policy=MlpPolicy,
            env=env,
            gamma=0.99,  # Discount factor
            n_steps=512,  # batchsize = n_steps * n_envs
            ent_coef=0.01,  # Entropy coefficient for the loss calculation
            learning_rate=2.5e-4,
            lam=0.95,  # Factor for trade-off of bias vs variance for Generalized Advantage Estimator
            nminibatches=64,  # Number of training minibatches per update.
                              # For recurrent policies, the nb of env run in parallel should be a multiple of it.
            noptepochs=30,  # Number of epoch when optimizing the surrogate
            cliprange=0.2,  # Clipping parameter, this clipping depends on the reward scaling
            verbose=0,  # Silent; progress is inspected through TensorBoard
            tensorboard_log="./tensorboard_log/one_leg_doc1/",

            seed=run_seed,  # Fixed seed
            n_cpu_tf_sess=1,  # force deterministic results
        )
        model.learn(total_timesteps=int(1e6))

        # Saving model
        model.save("trained_models/one_leg_doc1_" + str(n))
        del model
    finally:
        # Always shut the worker processes down, even if training failed.
        env.close()
        del env

Starting training 1
Starting training 2
Starting training 3
Starting training 4
Starting training 5
Starting training 6
Starting training 7
Starting training 8
Starting training 9
Starting training 10


## Training with PPO2


doc1: learning to reach a random target (TODO)

doc2: without torque

doc3: cos/sin