## Install Libraries

In [1]:
%%capture
!pip install pybullet
!pip install stable-baselines3[extra]
!pip install huggingface_sb3
!pip install huggingface_hub

## Imports

In [2]:
import gym
# Not referenced by name below; kept so the Bullet environment ids are available to gym.make.
import pybullet_envs

from huggingface_hub import notebook_login
from huggingface_sb3 import package_to_hub, load_from_hub
from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.env_util import make_vec_env

## Create and Inspect Environment

In [3]:
ENV_ID = "AntBulletEnv-v0"

# Build a single, un-vectorized instance purely to look at its spaces.
env = gym.make(ENV_ID)

# Shapes of the observation (state) vector and of the action vector.
s_size, a_size = env.observation_space.shape, env.action_space.shape

print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
# A random point drawn from the observation space; the env itself is not stepped.
print("Sample observation", env.observation_space.sample())

print("\n _____ACTION SPACE_____ \n")
print("The Action Space is: ", a_size)
# A random action drawn from the action space; it is not applied to the env.
print("Action Space Sample", env.action_space.sample())

_____OBSERVATION SPACE_____ 

The State Space is:  (28,)
Sample observation [-0.297862   -0.7018604  -0.21544594 -0.39768916 -0.8538046   0.6491695
  0.6883185  -0.25325322 -1.2663126   0.71987957  0.53554916  0.62293637
  0.03228103 -0.32710662  1.1441112   1.1145478  -1.5885653  -0.28742996
 -0.45121768 -2.3514953   0.0139843  -0.9986272   0.6611533  -0.02038103
  1.4783595   0.84787905  0.16484033 -1.0185411 ]

 _____ACTION SPACE_____ 

The Action Space is:  (8,)
Action Space Sample [ 0.36242607  0.11882684  0.31736115  0.5157141  -0.21532527 -0.13148649
  0.8665574  -0.19245204]


In [4]:
# Four copies of the environment behind one vectorized interface, wrapped so
# that observations are normalized (clip_obs=10.0) while rewards are left raw.
env = VecNormalize(
    make_vec_env(env_id=ENV_ID, n_envs=4),
    norm_obs=True,
    norm_reward=False,
    clip_obs=10.0,
)

# Random draw from the wrapped env's observation space, shown as the cell output.
env.observation_space.sample()

array([-1.2491915 ,  1.3458656 , -0.12651971, -0.25719798,  1.289487  ,
       -1.0812659 , -1.1702935 , -2.6536837 ,  0.9434926 ,  0.07291584,
       -0.6294151 ,  1.2758362 , -0.30000466, -0.5301382 ,  1.009003  ,
       -0.35771188,  0.14494345,  1.3640046 , -0.19291496,  0.33478504,
        0.9608421 , -0.45231986, -1.456666  , -0.8541294 ,  0.7365714 ,
        1.0894555 ,  0.76035357,  1.8444569 ], dtype=float32)

## Create and Train the Model

In [5]:
# A2C agent with an MLP policy, trained on the normalized vectorized env.
# Keyword arguments are grouped by theme; their order has no effect on the call.
model = A2C(
    policy="MlpPolicy",
    env=env,
    # --- optimisation ---
    learning_rate=0.00096,
    use_rms_prop=True,
    max_grad_norm=0.5,
    # --- rollout length and return estimation ---
    # n_steps=8 with the 4 parallel envs gives 32 timesteps per iteration,
    # which matches the iterations/total_timesteps ratio in the training log.
    n_steps=8,
    gamma=0.99,
    gae_lambda=0.9,
    normalize_advantage=False,
    # --- loss weighting (entropy bonus disabled) ---
    ent_coef=0.0,
    vf_coef=0.4,
    # --- exploration and policy-network options ---
    use_sde=True,
    policy_kwargs={"log_std_init": -2, "ortho_init": False},
    # --- reproducibility, hardware and logging ---
    seed=42,
    device="auto",
    verbose=1,
    tensorboard_log="./tensorboard",
)

Using cuda device


In [6]:
# Train for 2,000,000 environment timesteps in total (progress tables are
# printed because the model was created with verbose=1). The call is the last
# expression of the cell, so its return value is echoed as the cell output.
model.learn(total_timesteps=2_000_000)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| time/                 |          |
|    fps                | 612      |
|    iterations         | 34800    |
|    time_elapsed       | 1818     |
|    total_timesteps    | 1113600  |
| train/                |          |
|    entropy_loss       | -3.82    |
|    explained_variance | 0.978    |
|    learning_rate      | 0.00096  |
|    n_updates          | 34799    |
|    policy_loss        | 3.39     |
|    std                | 0.0638   |
|    value_loss         | 1.73     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1e+03    |
|    ep_rew_mean        | 875      |
| time/                 |          |
|    fps                | 612      |
|    iterations         | 34900    |
|    time_elapsed       | 1824     |
|    total_timesteps    | 1116800  |
| train/                |          |
|    entropy_loss       | -3.85    |
|    expla

<stable_baselines3.a2c.a2c.A2C at 0x7f0a438a59d0>

## Save the Model

In [7]:
# Persist the VecNormalize statistics; the evaluation cell reloads this file.
env.save("vec_normalize.pkl")

# Persist the trained agent under a name derived from the environment id.
model.save(f"a2c-{ENV_ID}")

## Evaluate the Agent

In [17]:
# Build the evaluation env with the same helper as the training env. Unlike a
# bare `gym.make`, `make_vec_env` wraps the environment in a Monitor before
# vectorizing it, so `evaluate_policy` can read episode returns and lengths
# from the Monitor instead of warning about an unmonitored environment.
eval_env = make_vec_env(env_id=ENV_ID, n_envs=1)

# Re-apply the normalization statistics saved after training.
# The file is a pickle written by this notebook; do not load untrusted copies.
eval_env = VecNormalize.load("vec_normalize.pkl", eval_env)

# Do not update VecNormalize statistics during evaluation
eval_env.training = False

# Do not normalize reward during evaluation, so the reported score is the raw return
eval_env.norm_reward = False

# Load the model that was saved to disk after training
model = A2C.load(f"a2c-{ENV_ID}")

# Evaluate the agent. The episode count and deterministic actions are the
# library defaults, spelled out so the protocol is visible in the notebook.
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)
print(f"Mean Reward: {mean_reward:.2f} +/- {std_reward:.2f}")

Mean Reward: 1566.05 +/- 71.03


## Publish the trained model on Hugging Face Hub

In [10]:
# Authenticate this session with the Hugging Face Hub before pushing the model.
# `notebook_login` is imported in the imports cell at the top of the notebook,
# so this cell contains only the login call.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [11]:
# Publish the agent. Per the function's own banner, this saves and evaluates the
# model, records a video, creates a model card and pushes everything to the Hub.
# `package_to_hub` is imported in the imports cell at the top of the notebook.
package_to_hub(
    model=model,
    model_name=f"a2c-{ENV_ID}",
    model_architecture="A2C",
    env_id=ENV_ID,
    eval_env=eval_env,
    repo_id=f"danieladejumo/a2c-{ENV_ID}",
    commit_message="Initial Commit",
)

[38;5;4mℹ This function will save, evaluate, generate a video of your agent,
create a model card and push everything to the hub. It might take up to 1min.
This is a work in progress: if you encounter a bug, please open an issue.[0m




Saving video to /tmp/tmphatw_l2r/-step-0-to-step-1000.mp4
[38;5;4mℹ Pushing repo danieladejumo/a2c-AntBulletEnv-v0 to the Hugging Face
Hub[0m
[38;5;4mℹ Your model is pushed to the Hub. You can view your model here:
https://huggingface.co/danieladejumo/a2c-AntBulletEnv-v0/tree/main/[0m


'https://huggingface.co/danieladejumo/a2c-AntBulletEnv-v0/tree/main/'

## Load from Hub

In [19]:
# Download the published checkpoint. The repo id is derived from ENV_ID exactly
# like the filename (and like the repo id used when publishing), so the two can
# no longer drift apart if ENV_ID is changed.
checkpoint = load_from_hub(
    repo_id=f"danieladejumo/a2c-{ENV_ID}",
    filename=f"a2c-{ENV_ID}.zip",
)

# Load the model from the downloaded checkpoint and echo it as the cell output
model = A2C.load(checkpoint)
model

<stable_baselines3.a2c.a2c.A2C at 0x7f097c88c650>

In [20]:
# Score the checkpoint downloaded from the Hub on the existing evaluation env
# (which still uses the normalization statistics loaded from the local file).
mean_reward, std_reward = evaluate_policy(model=model, env=eval_env)
print(f"Mean Reward: {mean_reward:.2f} +/- {std_reward:.2f}")

Mean Reward: 1547.64 +/- 78.30
