## Install dependencies

In [None]:
!pip install gym[box2d] pyglet==1.3.2

In [1]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from gym.wrappers import TimeLimit
import os

## Environment

In [13]:
environment_name = "BipedalWalker-v3"
env = gym.make(environment_name, render_mode="human")
env = TimeLimit(env, max_episode_steps=300)
env.reset()
env.render()

In [3]:
print("action space:{}".format(env.action_space))
print("observation space:{}".format(env.observation_space))

action space:Box(-1.0, 1.0, (4,), float32)
observation space:Box([-3.1415927 -5.        -5.        -5.        -3.1415927 -5.
 -3.1415927 -5.        -0.        -3.1415927 -5.        -3.1415927
 -5.        -0.        -1.        -1.        -1.        -1.
 -1.        -1.        -1.        -1.        -1.        -1.       ], [3.1415927 5.        5.        5.        3.1415927 5.        3.1415927
 5.        5.        3.1415927 5.        3.1415927 5.        5.
 1.        1.        1.        1.        1.        1.        1.
 1.        1.        1.       ], (24,), float32)


In [4]:
#env??

In [5]:
#env = gym.make(environment_name, render_mode="rgb_array")

## Test

In [None]:
episodes = 100
for episode in range(1, episodes + 1):
    
    state = env.reset()
    done = False
    truncated = False
    
    step = 0
    score = 0 

    while not done and not truncated :
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, truncated, info = env.step(action)
        score += reward
        step += 1
            
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

In [None]:
env.action_space.sample()

In [None]:
env.observation_space.sample()

## Train Model

In [16]:
log_path = os.path.join('Training', 'Logs')

In [17]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [18]:
model.learn(total_timesteps=50000)

Logging to Training/Logs/PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 164      |
|    ep_rew_mean     | -74      |
| time/              |          |
|    fps             | 48       |
|    iterations      | 1        |
|    time_elapsed    | 42       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 161         |
|    ep_rew_mean          | -74.2       |
| time/                   |             |
|    fps                  | 47          |
|    iterations           | 2           |
|    time_elapsed         | 86          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008818049 |
|    clip_fraction        | 0.0702      |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.69       |
|    explained_variance   | 0.00931     |

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 218         |
|    ep_rew_mean          | -51.8       |
| time/                   |             |
|    fps                  | 46          |
|    iterations           | 11          |
|    time_elapsed         | 483         |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.008052049 |
|    clip_fraction        | 0.0786      |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.55       |
|    explained_variance   | 0.576       |
|    learning_rate        | 0.0003      |
|    loss                 | 20          |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.00616    |
|    std                  | 0.973       |
|    value_loss           | 29.3        |
-----------------------------------------
-----------------------------------------
| rollout/                |       

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 272         |
|    ep_rew_mean          | -30.4       |
| time/                   |             |
|    fps                  | 46          |
|    iterations           | 20          |
|    time_elapsed         | 880         |
|    total_timesteps      | 40960       |
| train/                  |             |
|    approx_kl            | 0.011709152 |
|    clip_fraction        | 0.134       |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.24       |
|    explained_variance   | 0.559       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0747      |
|    n_updates            | 190         |
|    policy_gradient_loss | -0.0124     |
|    std                  | 0.894       |
|    value_loss           | 0.33        |
-----------------------------------------
-----------------------------------------
| rollout/                |       

<stable_baselines3.ppo.ppo.PPO at 0x7ff365298e50>

## Save Model

In [19]:
ppo_path = os.path.join('Training', 'Saved Models', 'PPO_Walking_model')

In [20]:
model.save(ppo_path)

## Evaluate & Test

In [21]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)

KeyboardInterrupt: 

In [22]:
env.close()

In [23]:
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

error: display Surface quit

In [24]:
env.close()