In [3]:
# Xvfb lives in the XQuartz bin directory, which has to be on PATH for
# pyvirtualdisplay to discover it.
#
# The path is appended through os.environ rather than with
# `%env PATH=$PATH:/opt/X11/bin`: that magic does not expand the shell
# variable here (it echoes back the literal "$PATH:/opt/X11/bin"), which
# replaces PATH wholesale and hides every other executable from the kernel.
import os

from pyvirtualdisplay.display import Display

XQUARTZ_BIN = '/opt/X11/bin'

# Only append when missing so that re-running the cell does not keep growing
# PATH with duplicate entries.
if XQUARTZ_BIN not in os.environ.get('PATH', '').split(os.pathsep):
    os.environ['PATH'] = os.pathsep.join(
        filter(None, [os.environ.get('PATH'), XQUARTZ_BIN]))

# Headless 1400x900 display; the started Display is the cell's output.
virtual_display = Display(visible=False, size=(1400, 900))
virtual_display.start()

env: PATH=$PATH:/opt/X11/bin


<pyvirtualdisplay.display.Display at 0x115286110>

In [4]:
import gymnasium

from huggingface_sb3 import load_from_hub, package_to_hub
from huggingface_hub import notebook_login
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [5]:
# Walk through the agent/environment loop with a random policy: draw an
# action, apply it, and begin a new episode as soon as the current one ends.
environment = gymnasium.make('LunarLander-v3')
observation, info = environment.reset()

for step in range(20):
    action = environment.action_space.sample()
    print(f'action taken: {action}')
    observation, reward, terminated, truncated, info = environment.step(action)

    # An episode is over either way; both flags require a reset.
    episode_over = terminated or truncated
    if not episode_over:
        continue
    print('environment is reset')
    observation, info = environment.reset()

environment.close()

action taken: 3
action taken: 1
action taken: 2
action taken: 0
action taken: 1
action taken: 0
action taken: 0
action taken: 2
action taken: 2
action taken: 1
action taken: 2
action taken: 0
action taken: 3
action taken: 0
action taken: 0
action taken: 0
action taken: 3
action taken: 2
action taken: 1
action taken: 2


In [6]:
# Report the shape of each space followed by one randomly sampled element.
# The observation space is sampled before the action space, as before.
observation_space = environment.observation_space
action_space = environment.action_space

print('Observation space:', observation_space.shape, sep='\n')
print(f'sample observation: {observation_space.sample()}')
print('Action space', action_space.shape, sep='\n')
print(f'sample action space: {action_space.sample()}')

Observation space:
(8,)
sample observation: [-1.6106519  -0.08835039  3.262047   -0.1892082   5.601258    7.6346636
  0.9470163   0.16108984]
Action space
()
sample action space: 2


In [7]:
# Rebind `environment` to a vectorised LunarLander-v3 made of 16 environments;
# the training cells below pass this object to PPO.
environment = make_vec_env(env_id='LunarLander-v3', n_envs=16)

In [8]:
# Baseline run: PPO with every hyperparameter left at its default, trained
# for 200k timesteps. The returned model is the cell's output.
model = PPO(policy='MlpPolicy', env=environment, verbose=1)
model.learn(total_timesteps=200_000)

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 88.9     |
|    ep_rew_mean     | -196     |
| time/              |          |
|    fps             | 11790    |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 32768    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 97          |
|    ep_rew_mean          | -158        |
| time/                   |             |
|    fps                  | 6129        |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 65536       |
| train/                  |             |
|    approx_kl            | 0.011826916 |
|    clip_fraction        | 0.112       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | -0.00313    |
|    learning

<stable_baselines3.ppo.ppo.PPO at 0x16fad48d0>

In [9]:
# Tuned run: replaces the baseline `model` with a PPO agent that uses explicit
# hyperparameters and a longer, one-million-timestep training budget.
ppo_hyperparameters = dict(
    n_steps=1024,     # with 16 envs the log reports 16384 timesteps/iteration
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
)
model = PPO('MlpPolicy', environment, verbose=1, **ppo_hyperparameters)
model.learn(total_timesteps=1_000_000)

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 89.5     |
|    ep_rew_mean     | -180     |
| time/              |          |
|    fps             | 13571    |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 16384    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 91.2        |
|    ep_rew_mean          | -142        |
| time/                   |             |
|    fps                  | 8745        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.008011678 |
|    clip_fraction        | 0.0569      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | -2.71e-05   |
|    learning

<stable_baselines3.ppo.ppo.PPO at 0x32e70ad50>

In [12]:
# Persist the trained agent locally. `model_name` is reused by the upload
# cell further down as the name of the packaged model.
model_name = 'ppo-LunarLander-v3'
model.save(path=model_name)

In [13]:
# Score the trained agent over 10 deterministic episodes on a separate,
# Monitor-wrapped environment (not the vectorised training environment).
eval_environment = Monitor(gymnasium.make(
    'LunarLander-v3', render_mode='rgb_array'))
try:
    mean_reward, std_reward = evaluate_policy(
        model, eval_environment, n_eval_episodes=10, deterministic=True)
finally:
    # Close the environment even if evaluation raises, mirroring the explicit
    # close() used for the sample environment earlier in the notebook.
    eval_environment.close()
print(f'mean_reward={mean_reward:.2f} +/- {std_reward}')

mean_reward=274.17 +/- 22.13096038029531


In [None]:
# Authenticate this notebook session with the Hugging Face Hub before the
# upload cell runs.
# NOTE(review): presumably requires a token with write access — confirm.
notebook_login()

In [14]:
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env

from huggingface_sb3 import package_to_hub

# What is being published and where it goes on the Hub.
env_id = 'LunarLander-v3'
model_arch = 'PPO'
repo_id = f'rodmosc/ppo-{env_id}'


def _make_eval_env():
    """Build one Monitor-wrapped `env_id` environment in rgb_array render mode."""
    return Monitor(gymnasium.make(env_id, render_mode='rgb_array'))


# DummyVecEnv takes a list of environment factories; a single one is enough.
eval_env = DummyVecEnv([_make_eval_env])

upload_arguments = dict(
    model=model,
    model_name=model_name,
    model_architecture=model_arch,
    env_id=env_id,
    eval_env=eval_env,
    repo_id=repo_id,
    commit_message='Upload PPO LunarLander-v3 trained agent.',
)
# Kept as the last expression so the returned commit info is displayed.
package_to_hub(**upload_arguments)

ℹ This function will save, evaluate, generate a video of your agent,
create a model card and push everything to the hub. It might take up to 1min.
This is a work in progress: if you encounter a bug, please open an issue.
Saving video to /var/folders/zd/6xd2w7tx4dz9dc2hp_9jpsww0000gn/T/tmphqystngj/-step-0-to-step-1000.mp4
MoviePy - Building video /var/folders/zd/6xd2w7tx4dz9dc2hp_9jpsww0000gn/T/tmphqystngj/-step-0-to-step-1000.mp4.
MoviePy - Writing video /var/folders/zd/6xd2w7tx4dz9dc2hp_9jpsww0000gn/T/tmphqystngj/-step-0-to-step-1000.mp4



sh: ffmpeg: command not found                                              


MoviePy - Done !
MoviePy - video ready /var/folders/zd/6xd2w7tx4dz9dc2hp_9jpsww0000gn/T/tmphqystngj/-step-0-to-step-1000.mp4
ℹ Pushing repo rodmosc/ppo-LunarLander-v3 to the Hugging Face Hub


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

ℹ Your model is pushed to the Hub. You can view your model here:
https://huggingface.co/rodmosc/ppo-LunarLander-v3/tree/main/


CommitInfo(commit_url='https://huggingface.co/rodmosc/ppo-LunarLander-v3/commit/db8c1e7d7631973f7a38c090c3ca0520c9bfe877', commit_message='Upload PPO LunarLander-v3 trained agent.', commit_description='', oid='db8c1e7d7631973f7a38c090c3ca0520c9bfe877', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rodmosc/ppo-LunarLander-v3', endpoint='https://huggingface.co', repo_type='model', repo_id='rodmosc/ppo-LunarLander-v3'), pr_revision=None, pr_num=None)

In [15]:
from huggingface_sb3 import load_from_hub

def _always_zero(_unused):
    """Ignore the single argument and return 0.0."""
    return 0.0


# Fetch the published checkpoint from the Hub.
repo_id = 'rodmosc/ppo-LunarLander-v3'
filename = 'ppo-LunarLander-v3.zip'
checkpoint = load_from_hub(repo_id, filename)

# Entries handed to PPO.load as replacements for the stored objects of the
# same name: a zero learning rate and constant-zero callables.
# NOTE(review): presumably avoids deserialising the saved schedules, which is
# fine for evaluation-only use — confirm.
custom_objects = dict(
    learning_rate=0.0,
    lr_schedule=_always_zero,
    clip_range=_always_zero,
)
model = PPO.load(
    checkpoint,
    custom_objects=custom_objects,
    print_system_info=True,
)

ppo-LunarLander-v3.zip:   0%|          | 0.00/150k [00:00<?, ?B/s]

== CURRENT SYSTEM INFO ==
- OS: macOS-15.6.1-arm64-arm-64bit Darwin Kernel Version 24.6.0: Mon Jul 14 11:30:40 PDT 2025; root:xnu-11417.140.69~1/RELEASE_ARM64_T6041
- Python: 3.11.13
- Stable-Baselines3: 2.7.0
- PyTorch: 2.9.0
- GPU Enabled: False
- Numpy: 2.2.6
- Cloudpickle: 3.1.2
- Gymnasium: 1.2.2

== SAVED MODEL SYSTEM INFO ==
- OS: macOS-15.6.1-arm64-arm-64bit Darwin Kernel Version 24.6.0: Mon Jul 14 11:30:40 PDT 2025; root:xnu-11417.140.69~1/RELEASE_ARM64_T6041
- Python: 3.11.13
- Stable-Baselines3: 2.7.0
- PyTorch: 2.9.0
- GPU Enabled: False
- Numpy: 2.2.6
- Cloudpickle: 3.1.2
- Gymnasium: 1.2.2



In [16]:
# Re-evaluate the model downloaded from the Hub over 10 deterministic
# episodes on a new Monitor-wrapped environment.
eval_env = Monitor(gymnasium.make('LunarLander-v3'))
try:
    mean_reward, std_reward = evaluate_policy(
        model, eval_env, n_eval_episodes=10, deterministic=True)
finally:
    # Close the environment even if evaluation raises, mirroring the explicit
    # close() used for the sample environment earlier in the notebook.
    eval_env.close()
print(f'mean_reward={mean_reward:.2f} +/- {std_reward}')

mean_reward=258.89 +/- 20.811088434062675
