In [None]:
# https://github.com/huggingface/deep-rl-class/blob/main/unit1/unit1.ipynb

In [None]:
from datetime import datetime as dt
import numpy as np

In [None]:
from IPython.display import Audio
# Audio clip played by the `Audio(sound_file, autoplay=True)` cell further down.
# NOTE(review): relative path — presumably resolved against the notebook's working directory; verify.
sound_file = 'telephone-ring-02.wav'

```
!pip3 install pyvirtualdisplay
!pip install gym[box2d]
!pip install stable-baselines3[extra]
!pip install huggingface_sb3
!pip install pyglet
!pip install ale-py==0.7.4 # To overcome an issue with gym (https://github.com/DLR-RM/stable-baselines3/issues/875)
```

In [None]:
# Virtual display
from pyvirtualdisplay import Display

In [None]:
# Create and start a hidden (visible=0) 1400x900 virtual display.
# Presumably required so the environment can be rendered on a machine without a screen — confirm for your setup.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

In [None]:
import gym
from huggingface_sb3 import load_from_hub, package_to_hub, push_to_hub
from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub.
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

In [None]:
from stable_baselines3 import DQN

In [None]:
from stable_baselines import PPO2

In [None]:
import stable_baselines3

### Step 5: Create the Model

In [None]:
from stable_baselines3.common.env_util import make_vec_env

In [None]:
# Scratch check: 2**11 == 2048, the value passed as `n_steps` to the PPO constructors below.
2**11

In [None]:
# Create a single LunarLander-v2 training environment; it is passed as `env` to every model below.
env = gym.make('LunarLander-v2')
# Vectorized alternative, left disabled by the author with the note "no tensorboard support":
#env = make_vec_env('LunarLander-v2', n_envs=2048) # no tensorboard support

In [None]:
# Display the torch device Stable-Baselines3 resolves when none is specified
# (presumably CUDA when available, otherwise CPU — check the displayed value).
stable_baselines3.common.utils.get_device()

In [None]:
# conda install -c anaconda tensorflow-gpu
import tensorflow as tf
#import datetime

In [None]:
# Load the TensorBoard notebook extension
# %load_ext tensorboard

# Train the Model

In [None]:
# First training run: PPO with every hyperparameter spelled out explicitly for easy tuning.
# `create_eval_env` is deliberately not passed: it was deprecated and later removed from
# Stable-Baselines3, and leaving it out keeps the old default (no eval env) on older releases.
model = PPO(
    policy='MlpPolicy',
    env=env,
    learning_rate=0.00025,
    n_steps=2048,
    batch_size=1024,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    clip_range_vf=None,
    normalize_advantage=True,
    ent_coef=0.0,
    vf_coef=0.5,
    max_grad_norm=0.5,
    use_sde=False,
    sde_sample_freq=-1,
    target_kl=None,
    tensorboard_log=None,
    policy_kwargs=None,
    verbose=0,
    seed=None,
    device='cuda',
    _init_setup_model=True,
)

# Print wall-clock timestamps before and after training so the duration can be read off.
print(dt.now())
# Train for 25,000,000 timesteps (a long run; lower the budget for a quick experiment).
model.learn(total_timesteps=25000000)
print(dt.now())

# Save the trained model under this name.
model_name = "ppo-LunarLander-v2"
model.save(model_name)

# Evaluate the trained agent on a fresh environment over 10 deterministic episodes.
eval_env = gym.make("LunarLander-v2")
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

### vukpetar parameters

In [None]:
# Scratch check: 2**20 == 1_048_576.
2**20

In [None]:
# Scratch check: 2**21 == 2_097_152.
2**21

In [None]:
# Scratch value: 2_097_152, i.e. 2**21.
2097152

In [None]:
def get_seed():
    """Return a fresh, non-reproducible seed in the range [0, 2**32).

    The value is drawn from a private generator seeded from OS entropy, so the
    global ``np.random`` state is left untouched (re-seeding the global generator
    here would silently change every later ``np.random`` call in the notebook).
    Requesting an explicit 64-bit dtype also avoids the "high is out of bounds
    for int32" error raised on platforms where NumPy's default integer is 32-bit.

    Returns:
        int: a plain Python integer, suitable as the ``seed`` argument of the models.
    """
    rng = np.random.default_rng()
    # `high` is exclusive, so the result is at most 2**32 - 1.
    return int(rng.integers(0, 2**32, dtype=np.uint64))

In [None]:
# Draw the seed once; the same value is passed as `seed=` to the models constructed below.
seed = get_seed()
# Display it so the run can be reproduced later.
seed

In [None]:
# Directory passed to the models below as `tensorboard_log`.
# To clear logs from previous runs, remove it manually, e.g.: rm -rf ./logs/
# Disabled alternative that would give every run its own timestamped directory:
#log_dir = "./logs/" + dt.now().strftime("%Y%m%d-%H%M%S")+'/'
log_dir = "./logs/"
# Leftover Keras TensorBoard callback, kept disabled:
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
from stable_baselines3.common.callbacks import BaseCallback

In [None]:
class TensorboardCallback(BaseCallback):
    """Callback that records one extra scalar through the training logger on every step."""

    def __init__(self, verbose=0):
        # Forward the verbosity level to the base callback unchanged.
        super().__init__(verbose)

    def _on_step(self) -> bool:
        # Record a uniform random sample under the 'random_value' key
        # (a placeholder metric), then return True.
        self.logger.record('random_value', np.random.random())
        return True

In [None]:
# PPO configured with the hyperparameters from the "vukpetar parameters" section,
# seeded with `seed` and logging to TensorBoard under `log_dir`.
# `create_eval_env` is deliberately not passed: it was deprecated and later removed from
# Stable-Baselines3, and leaving it out keeps the old default (no eval env) on older releases.
model = PPO(
    policy='MlpPolicy',
    env=env,
    learning_rate=0.00029,
    n_steps=2048,
    # Full-batch updates. The rollout buffer holds n_steps * n_envs = 2048 transitions for the
    # single environment in `env`, so the previous value (2**22) could never be filled:
    # SB3 warns about a truncated mini-batch and each gradient step uses the 2048 samples anyway.
    # Raise this to n_steps * n_envs if a vectorized environment is used instead.
    batch_size=2048,
    n_epochs=8,
    gamma=0.999,
    gae_lambda=0.98,
    clip_range=0.2,
    clip_range_vf=None,
    normalize_advantage=True,
    ent_coef=0.1,
    vf_coef=0.01,
    max_grad_norm=0.5,
    use_sde=False,
    sde_sample_freq=-1,
    target_kl=None,
    tensorboard_log=log_dir,
    policy_kwargs=None,
    verbose=2,
    seed=seed,
    device='cuda',
    _init_setup_model=True,
)

In [None]:
# Minimal DQN setup: MLP policy on `env`, seeded with `seed`, logging to `log_dir`;
# every other hyperparameter is left at the library default.
model = DQN(
    'MlpPolicy',
    env,
    learning_rate=0.00035,
    tensorboard_log=log_dir,
    seed=seed,
    device='cuda',
    verbose=0,
)

In [None]:
# DQN with every hyperparameter spelled out explicitly for easy tuning.
# The policy is given as the string 'MlpPolicy', like the other constructors in this notebook;
# the bare name `policy` used here before is not defined in the visible cells and raised
# NameError on a fresh kernel.
# `create_eval_env` is deliberately not passed: it was deprecated and later removed from
# Stable-Baselines3, and leaving it out keeps the old default (no eval env) on older releases.
model = DQN(
    'MlpPolicy',
    env,
    learning_rate=0.00011,
    buffer_size=1000000,
    learning_starts=100000,
    batch_size=32,
    tau=1.0,
    gamma=0.99,
    train_freq=4,
    gradient_steps=1,
    replay_buffer_class=None,
    replay_buffer_kwargs=None,
    optimize_memory_usage=False,
    target_update_interval=10000,
    exploration_fraction=0.1,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    max_grad_norm=10,
    tensorboard_log=log_dir,
    policy_kwargs=None,
    verbose=0,
    seed=seed,
    device='cuda',
    _init_setup_model=True,
)

In [None]:
# Print wall-clock timestamps around training so the duration can be read off.
print(dt.now())
# Train the current `model` for 1,000,000 timesteps, logging under the "first_run" TensorBoard name.
# Earlier, disabled variants with other budgets:
#model.learn(total_timesteps=50000000)
#model.learn(total_timesteps=500000)
model.learn(total_timesteps=1000000, tb_log_name="first_run")
print(dt.now())
# Save the model
# NOTE(review): the file name says "ppo" even when `model` was last built by one of the DQN cells — confirm.
model_name = "ppo-LunarLander-v2"
model.save(model_name)


# Evaluate over 10 deterministic episodes on a fresh environment and print mean/std reward.
eval_env = gym.make("LunarLander-v2")
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

In [None]:
# Auto-play the clip in `sound_file` (presumably to signal that the long training cell above has finished).
Audio(sound_file, autoplay=True)

### Step 8: Publish our trained model on the Hub

In [None]:
# Log in to the Hugging Face account so the trained model can be uploaded to the Hub.
notebook_login()

In [None]:
# Configure git (globally) to use the `store` credential helper.
# NOTE(review): this helper keeps credentials in plain text on disk — fine on a personal machine, reconsider on shared ones.
!git config --global credential.helper store

In [None]:
import gym

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env

from huggingface_sb3 import package_to_hub

In [None]:
# Settings consumed by the `package_to_hub(...)` call below.
# Name of the environment the agent was trained on
env_id = "LunarLander-v2"

# Architecture label passed to package_to_hub.
# NOTE(review): cells above also assign DQN models to `model`; confirm this label matches the model being uploaded.
model_architecture = "PPO"

## Define a repo_id
## repo_id is the id of the model repository on the Hugging Face Hub, in the form
## {organization}/{repo_name}, for instance ThomasSimonini/ppo-LunarLander-v2
## CHANGE THIS TO YOUR OWN REPO ID
repo_id = "format37/ppo-LunarLander-v2"

## Define the commit message
commit_message = "Upload PPO LunarLander-v2 trained agent"

# Create the evaluation env: a DummyVecEnv wrapping one freshly made `env_id` environment
eval_env = DummyVecEnv([lambda: gym.make(env_id)])

In [None]:
# conda install x264=='1!152.20180717' ffmpeg=4.0.2 -c conda-forge
# https://stackovergo.com/ru/q/2654785/unknown-encoder-libx264

In [None]:
# Package the trained model and push it to the Hugging Face Hub using the settings defined above.
package_to_hub(model=model, # Our trained model
               model_name=model_name, # The name of our trained model
               model_architecture=model_architecture, # The model architecture label (set to "PPO" above)
               env_id=env_id, # Name of the environment
               eval_env=eval_env, # Evaluation Environment
               repo_id=repo_id, # id of the model repository on the Hugging Face Hub: {organization}/{repo_name}
               commit_message=commit_message)