# Imports

In [None]:
# Gym
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete

# helpers
import numpy as np
import random
import os

# Stable-Baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

from typing import Any

# Types of Spaces

In [2]:
# Draw one random element from a Discrete space of size 3.
Discrete(3).sample()

np.int64(1)

In [3]:
# Draw a random 3x3 array from a Box whose bounds are 0 and 1.
Box(0, 1, shape=(3,3)).sample()

array([[0.0691027 , 0.94688255, 0.10282163],
       [0.07845438, 0.49824223, 0.9747431 ],
       [0.59796757, 0.04060211, 0.19825177]], dtype=float32)

In [4]:
# A Tuple space combines several spaces; the sample is a tuple with one
# entry per member space (here a Discrete value and a 3x3 Box array).
Tuple((Discrete(3), Box(0, 1, shape=(3,3)))).sample()

(np.int64(1),
 array([[0.88045573, 0.4239583 , 0.76068455],
        [0.27484947, 0.5558458 , 0.06089921],
        [0.28199726, 0.20176025, 0.9926835 ]], dtype=float32))

In [5]:
# A Dict space maps names to sub-spaces (which may themselves be composite);
# the sample is a dict with the same keys.
Dict(
    {
        'height': Discrete(3),
        'place': Box(0, 1, shape=(1, 2)),
        "tuple": Tuple(
            (
                Discrete(3),
                Box(0, 1, shape=(3, 3)),
            )
        ),
    }
).sample()

{'height': np.int64(1),
 'place': array([[0.83603555, 0.37574884]], dtype=float32),
 'tuple': (np.int64(1),
  array([[0.85583925, 0.05233889, 0.5180816 ],
         [0.34671175, 0.08657797, 0.7111978 ],
         [0.687406  , 0.73382556, 0.2242061 ]], dtype=float32))}

In [6]:
# Sample a 4x2 array of binary (0/1) values.
MultiBinary((4,2)).sample()

array([[1, 1],
       [0, 1],
       [0, 0],
       [0, 1]], dtype=int8)

In [7]:
# Sample one integer per component of a MultiDiscrete space built from [2, 3, 5].
MultiDiscrete([2,3,5]).sample()

array([1, 1, 2])

# Building Environment

In [8]:
# Bounds given as 1-element arrays: the sample is a 1-element float32 array.
Box(low=np.array([0]), high=np.array([100])).sample()

array([59.98953], dtype=float32)

In [9]:
# Scalar bounds with no explicit shape also yield a 1-element float32 sample
# (see the output below); this is the form ShowerEnv uses for its observations.
Box(low=0, high=100).sample()

array([44.446964], dtype=float32)

In [None]:
class ShowerEnv(Env):
    """Minimal custom Gymnasium environment with a single scalar state.

    Each step adds the chosen action value to the state (presumably a water
    temperature) and rewards the agent with the negative absolute distance
    between the state and ``TARGET``.  An episode is cut off after
    ``EPISODE_LENGTH`` steps.

    Observations are 1-element float32 arrays, matching the shape that
    ``Box(low=0, high=100)`` samples.
    """

    # Value the agent should steer the state towards.
    TARGET = 38
    # Number of steps before an episode is cut off.
    EPISODE_LENGTH = 60
    # The initial state is TARGET plus a random integer in [-START_SPREAD, START_SPREAD].
    START_SPREAD = 3

    def __init__(self):
        # Three actions; with start=-1 the action values are -1, 0 and +1.
        # NOTE(review): Stable-Baselines3's env checker reports Discrete spaces
        # with a non-zero `start` as unsupported -- confirm that the actions
        # reaching step() during training really are -1/0/+1 and not 0/1/2.
        self.action_space = Discrete(3, start=-1)
        self.observation_space = Box(low=0, high=100)
        self.state = self._initial_state()
        self.shower_length = self.EPISODE_LENGTH

    def _initial_state(self) -> float:
        """Draw a starting state from the environment's own seeded RNG."""
        # Generator.integers excludes the upper bound, hence the +1.
        offset = self.np_random.integers(-self.START_SPREAD, self.START_SPREAD + 1)
        return float(self.TARGET + offset)

    def _observation(self):
        """Return the current state as a 1-element float32 array."""
        return np.array([self.state], dtype=np.float32)

    def step(self, action):
        """Apply ``action`` to the state and advance the episode by one step.

        Args:
            action: Element of ``action_space``; its value is added to the state.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where the
            reward is ``-abs(state - TARGET)`` and ``info`` is an empty dict.
        """
        self.state += action
        self.shower_length -= 1

        reward = -abs(self.state - self.TARGET)

        # The only way an episode ends is the step budget running out.  That is
        # a time limit rather than a terminal state, so it is reported as
        # truncation only instead of setting both flags.
        terminated = False
        truncated = self.shower_length <= 0

        info = {}

        return self._observation(), reward, terminated, truncated, info

    def render(self):
        """Rendering is not implemented for this environment."""
        pass

    def reset(
        self,
        *,
        seed: int | None = None,
        options: dict[str, Any] | None = None,
    ):
        """Start a new episode.

        Args:
            seed: Optional seed for the environment RNG; passing one makes the
                sequence of starting states reproducible.
            options: Accepted for API compatibility; unused.

        Returns:
            ``(observation, info)`` with the observation in the same
            1-element float32 array form that ``step`` returns.
        """
        # Let the base class (re)seed self.np_random when a seed is supplied.
        super().reset(seed=seed)

        self.state = self._initial_state()
        self.shower_length = self.EPISODE_LENGTH
        info = {}

        return self._observation(), info

In [11]:
# Instantiate the custom environment; the cells below sample from, roll out, and train on it.
env = ShowerEnv()

In [12]:
# Sanity check: draw a random element from the declared observation space.
env.observation_space.sample()

array([12.478146], dtype=float32)

In [13]:
# Sanity check: draw a random element from the declared action space.
env.action_space.sample()

np.int64(0)

# Test Environment

In [None]:
# Baseline: play a few episodes with uniformly random actions and report
# the summed reward of each one.
episodes = 10
for episode in range(1, episodes+1):
    env.reset()
    score = 0

    # Keep stepping until the environment reports the episode is over.
    while True:
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        score += reward
        if terminated or truncated:
            break

    print(f"Episode: {episode}  Score {score}")
env.close()

Episode: 1  Score -90.0
Episode: 2  Score -217.0
Episode: 3  Score -147.0
Episode: 4  Score -323.0
Episode: 5  Score -124.0
Episode: 6  Score -147.0
Episode: 7  Score -68.0
Episode: 8  Score -204.0
Episode: 9  Score -128.0
Episode: 10  Score -320.0


# Train

In [15]:
# TensorBoard logs are written under training/logs.
log_path = os.path.join('training', 'logs')

# PPO with an MLP policy on the custom environment.
model = PPO(
    policy='MlpPolicy',
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [18]:
# reset_num_timesteps=False keeps the model's existing timestep counter, so
# re-running this cell continues from it (the log below starts above 200k steps).
model.learn(total_timesteps=800_000, reset_num_timesteps=False)

Logging to training/logs/PPO_17
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | -101     |
| time/              |          |
|    fps             | 3147     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 202752   |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 60            |
|    ep_rew_mean          | -92.4         |
| time/                   |               |
|    fps                  | 2448          |
|    iterations           | 2             |
|    time_elapsed         | 1             |
|    total_timesteps      | 204800        |
| train/                  |               |
|    approx_kl            | 4.4018787e-05 |
|    clip_fraction        | 0.000635      |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.00751      |
|    explain

<stable_baselines3.ppo.ppo.PPO at 0x12098b430>

# Save

In [21]:
# Path the trained model is saved to and later reloaded from.
shower_path = os.path.join('training','model','Shower_Model_PPO')

In [22]:
# Persist the trained model to shower_path.
model.save(shower_path)



In [23]:
# Drop the in-memory model so the next cell has to restore it from disk.
del model

In [26]:
# Restore the saved model and attach it to the environment again.
model = PPO.load(shower_path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [33]:
# Evaluate the reloaded model over 10 episodes; the output below is a pair of floats.
evaluate_policy(model, env, n_eval_episodes=10)

(np.float64(-58.2), np.float64(1.0770329614269007))