In [13]:
"""
Experiment with RL algos like PPO on the env
"""

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
import wandb
from wandb.integration.sb3 import WandbCallback
from cookie_jars.env import CookieJarsEnv

import os
import random
import numpy as np
import torch as th
from stable_baselines3.common.env_checker import check_env


In [14]:
# Pin every global RNG this notebook touches so runs are repeatable.
seed = 42
random.seed(seed)         # Python stdlib RNG
np.random.seed(seed)      # NumPy legacy global RNG
# Hash randomisation of the running interpreter was fixed at startup, so this
# export only reaches child processes started after this point.
os.environ.update(PYTHONHASHSEED=str(seed))
th.manual_seed(seed)      # PyTorch RNG; kept last so the cell output is unchanged

<torch._C.Generator at 0x112d46150>

In [12]:
# Validate the custom environment against the checks stable-baselines3 runs
# before training on it. The keyword values are check_env's defaults, written
# out so the reader can see what is (and is not) being checked.
env = CookieJarsEnv("train")
check_env(env, warn=True, skip_render_check=True)



In [10]:
# Train PPO on the 'train' environment, periodically evaluating on the 'val'
# environment, with metrics and checkpoints tracked in Weights & Biases.
env = CookieJarsEnv('train')
eval_env = CookieJarsEnv('val')

# Every tunable value lives in `config` so that wandb records it with the run.
config = {
    "policy_type": "MlpPolicy",
    "total_timesteps": 5000000,
    "seed": seed,            # defined in the seeding cell above
    "eval_freq": 25000,      # training steps between evaluations
    # NOTE(review): a single evaluation episode is why the log reports
    # "+/- 0.00"; raise this if the validation env is stochastic.
    "n_eval_episodes": 1,
}
run = wandb.init(
    project="cookiejars-scratch",
    config=config,
    sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
    # monitor_gym=True,  # auto-upload the videos of agents playing the game
    # save_code=True,  # optional
)

try:
    # Passing the seed to the model makes training repeatable regardless of
    # how much global RNG state earlier cells (e.g. check_env) have consumed.
    model = PPO(
        config["policy_type"],
        env,
        verbose=1,
        tensorboard_log=f"runs/{run.id}",
        seed=config["seed"],
    )
    wandb_callback = WandbCallback(
        model_save_path=f"models/{run.id}",
        verbose=2,
    )
    # NOTE(review): unlike runs/ and models/, './logs/' is not namespaced by
    # run.id, so a later run presumably overwrites the saved best model here.
    # Confirm whether that is intended.
    eval_callback = EvalCallback(
        eval_env,
        best_model_save_path='./logs/',
        log_path='./logs/',
        n_eval_episodes=config["n_eval_episodes"],
        eval_freq=config["eval_freq"],
        deterministic=True,
        render=False,
    )
    model.learn(
        total_timesteps=config["total_timesteps"],
        callback=[eval_callback, wandb_callback],
    )
finally:
    # Close the wandb run even when training raises or is interrupted, so the
    # run is not left open in the kernel.
    run.finish()



Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to runs/bzdlbvms/PPO_1
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1.51e+03  |
|    ep_rew_mean     | -1.79e+04 |
| time/              |           |
|    fps             | 1064      |
|    iterations      | 1         |
|    time_elapsed    | 1         |
|    total_timesteps | 2048      |
----------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.51e+03    |
|    ep_rew_mean          | -1.78e+04   |
| time/                   |             |
|    fps                  | 942         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.020609234 |
|    clip_fraction        | 0.289       |
|    clip_range      



Eval num_timesteps=25000, episode_reward=1120.77 +/- 0.00
Episode length: 502.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 502         |
|    mean_reward          | 1.12e+03    |
| time/                   |             |
|    total_timesteps      | 25000       |
| train/                  |             |
|    approx_kl            | 0.023064734 |
|    clip_fraction        | 0.3         |
|    clip_range           | 0.2         |
|    entropy_loss         | -39.9       |
|    explained_variance   | -1.19e-07   |
|    learning_rate        | 0.0003      |
|    loss                 | 5.9e+03     |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.0132     |
|    std                  | 0.911       |
|    value_loss           | 1.24e+04    |
-----------------------------------------
New best mean reward!
----------------------------------
| rollout/           |           |
|    ep_len_mean    

VBox(children=(Label(value='1.850 MB of 1.850 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/mean_ep_length,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/mean_reward,▅▅▂▅▆▁▂█▁▂▁▁▁▁▁▁▁▁▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,▁▃▅▅▅▆▇█▆▆▅▅▆▆▆▆▅▆▆▇▇▇▆▅▆▅▅▅▅▄▅▅▅▅▆█▆▅▅▅
time/fps,████████████████████▃▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▃▃
train/approx_kl,▁▁▂▃▂█▂▂▂▂▂▂▃▂▂▂▂▂▄▁▁▄▁▁▁▁▁▂▁▁▁▁▁▁▃▂▁▃▂▂
train/clip_fraction,▃▅▆▇▆▅▅▆▆▆▅▇█▅▇▆▆▆▅▅▄▇▁▁▅▆▆▆▁▅▅▂▅▅▇▅▂▇▆▅
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▃▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▆▆▇▆▇▆▇▇▇███▇▇▇▆▆▆▆▆

0,1
eval/mean_ep_length,502.0
eval/mean_reward,-178.31148
global_step,5001216.0
rollout/ep_len_mean,1509.0
rollout/ep_rew_mean,2709.07666
time/fps,521.0
train/approx_kl,0.07243
train/clip_fraction,0.273
train/clip_range,0.2
train/entropy_loss,-28.98191


In [8]:
# Smoke-test one transition: reset the training environment, apply a single
# all-zeros action, and display the observation that comes back.
env = CookieJarsEnv('train')
env.reset()
# Size the action from the env's declared action space instead of hard-coding
# its length, so this cell keeps working if the action dimension changes.
# NOTE(review): assumes a Box-like action space exposing `.shape` — confirm.
zero_action = np.zeros(env.action_space.shape)
obs, rew, done, info = env.step(zero_action)
obs

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 8.28000031e+01, 4.04424896e+01,
       4.08300018e+01, 2.87000008e+01, 1.57399998e+01, 5.62500000e+01,
       5.85499992e+01, 7.92200012e+01, 2.46000004e+01, 2.84249992e+01,
       4.12748222e+01, 6.91900024e+01, 1.18923080e+02, 2.36784744e+01,
       2.87299995e+01, 2.09400005e+01, 1.25889099e+02, 6.45800018e+01,
       4.27900009e+01, 6.26599998e+01, 3.55916023e+01, 3.08500004e+01,
       1.79506645e+01, 6.11300011e+01, 4.96300011e+01, 3.17299995e+01,
      