# One Leg training using PPO2 from StableBaselines

## Training with PPO2

These hyperparameters originate from the StableBaselinesZoo hyperparameters for the Reacher environment.

In [4]:
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common import make_vec_env
from stable_baselines import PPO2

# TODO: multiprocessing issue
ENV_ID = 'gym_kraby:OneLegBulletEnv-v0'
N_ENVS = 8
TOTAL_TIMESTEPS = int(1e6)

# PPO2 hyperparameters, gathered in one mapping so they can be reviewed at a glance.
ppo2_kwargs = dict(
    gamma=0.99,  # Discount factor
    n_steps=2048,  # batchsize = n_steps * n_envs
    ent_coef=0.0,  # Entropy coefficient for the loss calculation
    learning_rate=2.5e-4,
    lam=0.95,  # Bias vs variance trade-off factor for the Generalized Advantage Estimator
    nminibatches=32,  # Number of training minibatches per update.
                      # For recurrent policies, the number of envs run in parallel should be a multiple of it.
    noptepochs=10,  # Number of epochs when optimizing the surrogate
    cliprange=0.2,  # Clipping parameter; this clipping depends on the reward scaling
    verbose=False,
    tensorboard_log="./tensorboard_log/one_leg/",

    # For tests
    seed=0,  # Fixed seed
    n_cpu_tf_sess=1,  # Force deterministic results
)

# Vectorized environment wrapping N_ENVS copies of the one-leg task.
env = make_vec_env(ENV_ID, n_envs=N_ENVS)

# Use `tensorboard --logdir notebooks/stablebaselines/tensorboard_log/one_leg` to inspect learning
model = PPO2(policy=MlpPolicy, env=env, **ppo2_kwargs)

# Kept as the last expression so the cell still displays the trained model's repr.
model.learn(total_timesteps=TOTAL_TIMESTEPS)

<stable_baselines.ppo2.ppo2.PPO2 at 0x7f0a64d9f470>

## Rendering

In [None]:
# Imported here because this cell uses both names and no other visible cell
# imports them; without these lines a fresh-kernel run raises NameError.
import imageio
import numpy as np

N_FRAMES = 350  # number of environment steps to record

# Roll out the trained policy on the model's own (vectorized) environment and
# capture one RGB frame per step. Each frame is rendered *before* the step, so
# frame 0 shows the state right after reset.
frames = []
obs = model.env.reset()
for _step in range(N_FRAMES):
    frames.append(model.env.render(mode='rgb_array'))
    action, _states = model.predict(obs)
    obs, _rewards, _dones, _infos = model.env.step(action)

# Keep every second frame to halve the GIF size.
imageio.mimsave('one_leg.gif', [np.array(frame) for frame in frames[::2]], fps=100)