### The goal is to train an agent to optimize the shower experience.
### The agent will interact with an environment where the temperature varies randomly.
### The optimal temperature range is between 37 and 39 degrees Celsius.


In [1]:
import os
from gymnasium import Env
from gymnasium.spaces import Discrete, Box,Dict,Tuple,MultiBinary,MultiDiscrete
import numpy as np
import random
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from gymnasium.utils import seeding

In [2]:
# Discrete(3): draw one random integer from {0, 1, 2}.
Discrete(3).sample()

1

In [3]:
# Box(0, 1, shape=(3, 3)): draw a 3x3 float array with every entry between 0 and 1.
Box(0,1,shape=(3,3)).sample()

array([[0.84813964, 0.6344467 , 0.08007466],
       [0.19407818, 0.2618616 , 0.29083106],
       [0.8613568 , 0.46262518, 0.5417706 ]], dtype=float32)

In [4]:
# Tuple: sample every component space and return the results as a tuple, in order.
Tuple((Discrete(3),Box(0,1,shape=(3,3)))).sample()

(1,
 array([[0.9533279 , 0.43381345, 0.23202573],
        [0.15796058, 0.3391159 , 0.826808  ],
        [0.008001  , 0.31303412, 0.701337  ]], dtype=float32))

In [5]:
# Dict: sample every named sub-space and return the results keyed by the same names.
Dict({'height':Discrete(2),'speed':Box(0,1,shape=(1,))}).sample()

{'height': 1, 'speed': array([0.59849685], dtype=float32)}

In [6]:
# MultiBinary(4): draw four independent 0/1 flags.
MultiBinary(4).sample()

array([0, 1, 1, 1], dtype=int8)

In [7]:
# MultiDiscrete([5, 2, 12]): draw one integer per slot, each smaller than that slot's size.
MultiDiscrete([5,2,12]).sample()

array([3, 1, 6], dtype=int64)

In [None]:
class ShowerEnv(Env):
    """Toy shower-temperature environment following the Gymnasium ``Env`` API.

    Observation
        One-element ``float32`` array holding the current water temperature,
        kept inside the ``observation_space`` bounds [0, 100].
    Actions
        ``Discrete(3)``: 0 lowers the temperature by one degree, 1 leaves it
        unchanged, 2 raises it by one degree.
    Reward
        +1 for every step that ends with the temperature inside
        [``TARGET_LOW``, ``TARGET_HIGH``], otherwise -1.
    Episode end
        ``terminated`` becomes True once ``EPISODE_LENGTH`` steps have been
        taken; ``truncated`` is always False.
    """

    # Inclusive temperature band that earns a positive reward.
    TARGET_LOW = 37
    TARGET_HIGH = 39
    # The start temperature is START_TEMP plus an integer offset drawn
    # uniformly from [-START_SPREAD, +START_SPREAD].
    START_TEMP = 38
    START_SPREAD = 3
    # Number of steps in one episode.
    EPISODE_LENGTH = 60

    def __init__(self):
        super().__init__()

        # Action space: 3 possible actions (decrease temp, no change, increase temp)
        self.action_space = Discrete(3)
        self.observation_space = Box(low=0.0, high=100.0, shape=(1,), dtype=np.float32)

        self.state = self._sample_initial_state()
        self.shower_length = self.EPISODE_LENGTH

    def _sample_initial_state(self):
        """Draw a fresh start temperature from the environment's own RNG."""
        # `integers` excludes the upper bound, hence the +1.
        offset = self.np_random.integers(-self.START_SPREAD, self.START_SPREAD + 1)
        return np.array([self.START_TEMP + offset], dtype=np.float32)

    def step(self, action):
        """Apply one action and advance the episode by one step.

        Args:
            action: 0, 1 or 2 (see the class docstring).

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
        """
        # Build a new array instead of mutating in place, and keep the value
        # inside the declared observation bounds.
        shifted = self.state + (action - 1)
        self.state = np.clip(
            shifted, self.observation_space.low, self.observation_space.high
        ).astype(np.float32)
        self.shower_length -= 1

        in_band = self.TARGET_LOW <= self.state[0] <= self.TARGET_HIGH
        reward = 1 if in_band else -1
        terminated = self.shower_length <= 0
        truncated = False  # No early stopping condition

        info = {}

        # Hand out a copy so later steps cannot alter an observation the
        # caller is still holding.
        return self.state.copy(), reward, terminated, truncated, info

    def reset(self, seed=None, options=None, **kwargs):
        """Start a new episode.

        Args:
            seed: Optional seed for the environment's ``np_random`` generator.
            options: Accepted for Gymnasium API compatibility; unused.
            **kwargs: Ignored; kept so existing callers keep working.

        Returns:
            ``(observation, info)``.
        """
        # Seeds self.np_random when a seed is given, so reproducibility does
        # not depend on (or disturb) the global `random` / `np.random` state.
        super().reset(seed=seed)

        self.state = self._sample_initial_state()
        self.shower_length = self.EPISODE_LENGTH

        info = {}

        return self.state.copy(), info

    def render(self):
        """No visualization is implemented; calling this does nothing."""
        pass

In [9]:
# Instantiate the custom environment; the cells below inspect it, roll it out and train on it.
env = ShowerEnv()

In [10]:
# Random point from the observation space (anywhere in [0, 100]) -- not the env's current state.
env.observation_space.sample()

array([82.129005], dtype=float32)

In [11]:
# Random action index out of the three available actions.
env.action_space.sample()

1

In [12]:
# Start a new episode; the return value is the pair (observation, info).
env.reset()

(array([37.], dtype=float32), {})

In [13]:
# Baseline: play a few episodes with uniformly random actions and print each
# episode's total reward.
epds = 5
for ep in range(1, epds + 1):
    # reset() returns (observation, info); unpack it so `obs` holds only the
    # observation array rather than the whole tuple.
    obs, info = env.reset()
    done = False
    truncated = False
    score = 0
    while not (done or truncated):
        env.render()
        action = env.action_space.sample()
        obs, reward, done, truncated, info = env.step(action)
        score += reward
    print(f"ep {ep} Score = {score}")
env.close()

ep 1 Score = -14
ep 2 Score = 12
ep 3 Score = 46
ep 4 Score = 8
ep 5 Score = -56


In [14]:
# TensorBoard event files are written under Training/Logs; create the folder up front.
log_path = os.path.join('Training', 'Logs')
os.makedirs(log_path, exist_ok=True)

# PPO agent with the MLP policy, trained directly on the ShowerEnv instance.
model = PPO(
    policy='MlpPolicy',
    env=env,
    tensorboard_log=log_path,
    verbose=1,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [17]:
# Train with a small budget of 5000 timesteps; progress is logged under log_path.
model.learn(total_timesteps=5000)


Logging to Training\Logs\PPO_7
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | -35.8    |
| time/              |          |
|    fps             | 444      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 60          |
|    ep_rew_mean          | -30         |
| time/                   |             |
|    fps                  | 391         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009993315 |
|    clip_fraction        | 0.0843      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 0.000104    |

<stable_baselines3.ppo.ppo.PPO at 0x297225b28f0>

In [18]:
# Evaluate the trained policy over 10 episodes; the displayed pair is
# (mean episode reward, standard deviation).
# render=True has no visible effect here because ShowerEnv.render() is a no-op.
evaluate_policy(model,env,n_eval_episodes=10,render=True)

(-24.0, 54.99090833947008)