# 1. Dependencies#

In [160]:
# gym stuff
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete

# helper
import os
import numpy as np
import random

# stable baselines3 stuff
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# 2. Types of Space

In [161]:
# Discrete(): Invoking this create th ACTION_SPACE 
# from 0 to 2 descision
actionSpaces = Discrete(3)

In [162]:
#Creating box
box = Box(0,1,shape=(3,3))

In [163]:
box.sample()

array([[0.8084121 , 0.34665006, 0.0068737 ],
       [0.9799128 , 0.71945274, 0.49349838],
       [0.783046  , 0.8934588 , 0.3021645 ]], dtype=float32)

In [164]:
# Tuple
tuple = Tuple((actionSpaces, box))
tuple.sample()

(2,
 array([[0.57283753, 0.822394  , 0.38000485],
        [0.7932067 , 0.01089187, 0.23661691],
        [0.53170264, 0.96461344, 0.31651884]], dtype=float32))

In [165]:
Tuple((actionSpaces, box, Box(0,3,shape=(1,)))).sample()

(2,
 array([[0.25513625, 0.12523969, 0.79276216],
        [0.52013123, 0.7806507 , 0.4976875 ],
        [0.36425987, 0.34302422, 0.50235224]], dtype=float32),
 array([2.537261], dtype=float32))

In [166]:
#Dict
dict = Dict({'braking':Discrete(2), 'speed':Box(0,100, shape=(1,))})
dict.sample()

OrderedDict([('braking', 1), ('speed', array([48.109997], dtype=float32))])

In [167]:
Dict({'braking':Discrete(2), 'speed':Box(0,100, shape=(1,)), 'accelerate':Discrete(2)}).sample()

OrderedDict([('accelerate', 1),
             ('braking', 0),
             ('speed', array([69.28961], dtype=float32))])

In [168]:
#MultiBinary
MultiBinary(4).sample()

array([1, 1, 1, 0], dtype=int8)

In [169]:
MultiBinary(2).sample()

array([1, 0], dtype=int8)

In [170]:
#MultiDiscrete 
MultiDiscrete([3, 2, 1]).sample() # see the range randomization

array([2, 0, 0])

In [171]:
MultiDiscrete([4,3,1,4]).sample()

array([0, 2, 0, 3])

In [172]:
Box(low=0, high=100, shape=(1,)).sample()

array([71.50397], dtype=float32)

# 3. Building an environment

    - eg: Building agent that gives the favorable water for showering
    - Temp Flucation : agent act with change in temperature and maintain to 35 to 39

        About Env and agent:
        
         action_space: Discrete(3) => 0: hold, 1: down, 2: up
         
         observation_space: range from 0 to 100 centigrade
         
         state: first initial state then working/tracking state 

         episdoe_len : how long it will end first bathing or episode

         reward: if temp: (35 to 39): +1 else -1

         done/terminated: bool True if episode_len<=0 else keep working

         info: {} give info if system need
        
         

In [227]:
class ShowerEnv(Env):
    def __init__(self):
        self.action_space = Discrete(3)
        self.observation_space = Box(low=0, high=100, shape=(1,))
        self.state = 38 + random.randint(-5,5)
        self.shower_length = 60 # shower duration = 60sec
        
 
        
    def step(self, action):
        #Adjust state of temp
        self.state += action-1

        #Showertime
        self.shower_length -= 1

        #Calculate Reward
        if self.state >=37 and self.state <=39:
            reward = 1 
        else:
            reward = -1
            

        #Termination
        if self.shower_length <=0:
            done = True
        else:
            done = False

        #info
        info = {}
        return self.state, reward, done, info
    
    def render(self):
        #nothing to show to human except text
        pass
        
    def reset(self):
        self.showerlength = 60
        self.state = np.array([38+random.randint(-5,5)]).astype(np.float32)
        return self.state
    

# 4. Test Environment

In [228]:
env = ShowerEnv() #creating env

In [229]:
# env.reset()

In [230]:
# from stable_baselines3.common.env_checker import check_env

In [231]:
# check_env(env, warn=True)

In [235]:
#testing env
episodes = 5
for episode in range(1, episodes+1):
    er = env.reset()
    print(er)
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        er, reward, done, info = env.step(action)
        print(er)
        score +=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()


[40.]
[40.]
Episode:1 Score:-1
[37.]
[37.]
Episode:2 Score:1
[39.]
[40.]
Episode:3 Score:-1
[36.]
[36.]
Episode:4 Score:-1
[38.]
[39.]
Episode:5 Score:1


 # 5. Train Environment

In [238]:
log_path = os.path.join('Training', 'Logs')
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=10000, progress_bar=True)

SyntaxError: invalid syntax (3401137256.py, line 3)

# 6. Save Model