In [4]:
import gym

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

from huggingface_sb3 import package_to_hub

In [10]:
import wandb
from wandb.integration.sb3 import WandbCallback

# Weights & Biases destination for this experiment.
WANDB_PROJECT = "hf-deep-rl-class"
WANDB_ENTITY = "deutschmann"

# Start the W&B run. The handle is kept in `run` because later cells read
# `run.id` (TensorBoard log directory) and call `run.finish()`.
run = wandb.init(
    entity=WANDB_ENTITY,
    project=WANDB_PROJECT,
    monitor_gym=True,       # let W&B pick up Gym video recordings
    sync_tensorboard=True,  # mirror the TensorBoard logs written by SB3
)

In [6]:
def get_env(env_name):
    """Create a Monitor-wrapped Gym environment and reset it once.

    Args:
        env_name: Gym environment id passed to ``gym.make``.

    Returns:
        The environment wrapped in SB3's ``Monitor``, after one ``reset()`` call.
    """
    monitored_env = Monitor(gym.make(env_name))
    monitored_env.reset()
    return monitored_env

# Environment id shared by the training cell and the hub-upload cell.
env_name = "LunarLander-v2"

# Training environment (Monitor-wrapped by get_env).
env = get_env(env_name=env_name)

In [7]:
# Training budget in environment steps (alternative value noted earlier: int(2e5)).
train_steps = 1_000_000

# Fixed seed so that repeated runs of this cell start from the same random state.
seed = 0

# PPO agent with an MLP policy. TensorBoard logs go under runs/<wandb run id>,
# which is what wandb.init(sync_tensorboard=True) mirrors to W&B.
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    seed=seed,
    tensorboard_log=f"runs/{run.id}",
)

# Train and stream metrics/gradients to W&B through the SB3 callback.
# The trailing semicolon stops the notebook from echoing the returned model repr.
model.learn(
    total_timesteps=train_steps,
    callback=WandbCallback(
        gradient_save_freq=100,
        verbose=2,
    ),
);

Using cpu device
Wrapping the env in a DummyVecEnv.
Logging to runs/2cn87197/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 87.2     |
|    ep_rew_mean     | -166     |
| time/              |          |
|    fps             | 3460     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 87.4         |
|    ep_rew_mean          | -169         |
| time/                   |              |
|    fps                  | 2626         |
|    iterations           | 2            |
|    time_elapsed         | 1            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0049147257 |
|    clip_fraction        | 0.0109       |
|    clip_range           | 0.2          |
|    entropy_loss 

<stable_baselines3.ppo.ppo.PPO at 0x2824b8940>

wandb: Network error (ReadTimeout), entering retry loop.


In [8]:
# Close the W&B run opened by wandb.init(); the cell output shows the final
# upload progress and the run history/summary tables.
run.finish()

VBox(children=(Label(value='0.284 MB of 0.284 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▂▃▅▇███▇▅▄▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃
rollout/ep_rew_mean,▁▁▂▂▃▄▅▅▅▆▇▇███████████████████▇██▇██▇██
time/fps,█▆▃▂▂▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃
train/approx_kl,▅▄▃▇▆█▆▁▅▅▂▃▃▃▃▅▃▆▂▃▅▂▂▃▂▂▂▅▃▄▄▄▂▁▃▂▃▄▃▂
train/clip_fraction,▄▄▃█▅▃▆▁▄▅▂▃▃▂▃▆▃▆▁▃▃▃▃▃▂▂▂▅▃▄▃▃▂▁▃▂▃▄▄▄
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▃▃▄▄▅▅▅▅▅▆▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇█▇█████▇▇▇
train/explained_variance,▁▄▆▄▇▅█▅▇▆▇▇▆▇▇███▇▇▇▇█▃▅▆▇▇▇█▇▆▆▆▅▇█▇█▇
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,1001472.0
rollout/ep_len_mean,314.26001
rollout/ep_rew_mean,239.78058
time/fps,1356.0
train/approx_kl,0.00521
train/clip_fraction,0.09385
train/clip_range,0.2
train/entropy_loss,-0.51855
train/explained_variance,0.85302
train/learning_rate,0.0003


In [13]:
# Define the name of the environment
env_id = env_name

# Create the evaluation env. It is wrapped in Monitor, like the training env,
# so that evaluate_policy reads true episode rewards/lengths from the Monitor
# wrapper instead of warning and falling back to raw env.step() accounting.
eval_env = DummyVecEnv([lambda: Monitor(gym.make(env_id))])

# Define the model architecture we used
model_architecture = "PPO"

# Define a repo_id: the id of the model repository on the Hugging Face Hub,
# in the form {organization}/{repo_name} (for instance ThomasSimonini/ppo-LunarLander-v2).
repo_id = "deutschmann/ppo-LunarLander-v2"

# Define the commit message
commit_message = "Longer training"

# Name under which the trained model is saved inside the repo
model_name = "PPO-MLP"

# package_to_hub saves and evaluates the agent, generates a model card, records
# a replay video, and then pushes the repo to the hub.
package_to_hub(
    model=model,                            # our trained model
    model_name=model_name,                  # the name of our trained model
    model_architecture=model_architecture,  # the architecture we used: PPO
    env_id=env_id,                          # name of the environment
    eval_env=eval_env,                      # evaluation environment
    repo_id=repo_id,                        # {organization}/{repo_name} on the Hub
    commit_message=commit_message,
)


[38;5;4mℹ This function will save, evaluate, generate a video of your agent,
create a model card and push everything to the hub. It might take up to 1min.
This is a work in progress: If you encounter a bug, please open an issue and use
push_to_hub instead.[0m


/Users/patrick/Projects/deep-rl-class/unit1/hub/ppo-LunarLander-v2 is already a clone of https://huggingface.co/deutschmann/ppo-LunarLander-v2. Make sure you pull the latest changes with `repo.git_pull()`.


Saving video to /Users/patrick/Projects/deep-rl-class/unit1/-step-0-to-step-1000.mp4


ffmpeg version 5.0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with Apple clang version 13.1.6 (clang-1316.0.21.2)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/5.0.1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags= --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libbluray --enable-libdav1d --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enable-libzimg --disable-libjack --disable-indev=jack --enable-videotoolbox

[38;5;4mℹ Pushing repo ppo-LunarLander-v2 to the Hugging Face Hub[0m


Upload file replay.mp4:   8%|8         | 32.0k/377k [00:00<?, ?B/s]

Upload file PPO-MLP/policy.optimizer.pth:  39%|###8      | 32.0k/82.7k [00:00<?, ?B/s]

Upload file PPO-MLP.zip:  23%|##2       | 32.0k/140k [00:00<?, ?B/s]

Upload file PPO-MLP/policy.pth:  76%|#######6  | 32.0k/42.1k [00:00<?, ?B/s]

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/deutschmann/ppo-LunarLander-v2
   007b0e4..5ef6c5c  main -> main



[38;5;4mℹ Your model is pushed to the hub. You can view your model here:
https://huggingface.co/deutschmann/ppo-LunarLander-v2[0m


'https://huggingface.co/deutschmann/ppo-LunarLander-v2'