In [3]:
import gymnasium
# Path: modelTimetable/DRL/myEnv.ipynb
# Implementing the environment
# Reproduction of the cartpole environment
# 
# Description:
# Create a car on a two-dimensional plane with a width of 20 (coordinates run from -10 to 10
# on each axis). The center point (0,0) of the plane is the destination the car has to reach.
#
# State:
# The state of the car is represented by the coordinates of the center point of the car.(x,y)
# Action:
# The action of the car is one of five discrete moves: 0:stop, 1:up, 2:down, 3:left, 4:right.
# Reward:
# The reward is the negative distance between the car and the destination.
# Termination:
# The car reaches the destination.(0,0)
# Truncation:
# The car is out of the screen (x or y outside [-10, 10]).

In [36]:
from gymnasium import spaces
import numpy as np

'''
gymnasium.Env is the base class that we subclass to create our environment.

A gymnasium.Env subclass has the following methods:
__init__(): This method is used to initialize the environment. It takes the following parameters:

step(): This method is used to take an action and return the next state, reward, and whether the episode is over. 
Physical engine
- input: action
- output: observation, reward,terminated,truncated,info

reset(): This method is used to reset the environment to its initial state.
- input: seed (optional)
- output: observation, info

render(): This method is used to render the environment:
Image engine
- input: mode (default='human'; one of 'human', 'rgb_array', 'ansi', 'rgb_array_list')
- output: None
eg:gymnasium.make('CartPole-v0',render_mode='human')

close(): This method is used to close the environment.
'''

class MyCar(gymnasium.Env):
    """A point "car" on a square grid that has to drive to a target cell.

    Observation:
        ``np.float32`` array ``[x, y]`` with the car position. The observation
        space is the box ``[-size, size]`` on both axes.
    Actions (``Discrete(5)``):
        0: stop, 1: up (y+1), 2: down (y-1), 3: left (x-1), 4: right (x+1).
    Reward:
        Negative Euclidean distance between the car and the target.
    Termination:
        The car is exactly on the target ``(target_x, target_y)``.
    Truncation:
        The car has left the ``[-size, size]`` square. The observation
        returned on that step lies outside ``observation_space``.
    """

    metadata = {
        # Keys used by the Gymnasium API.
        'render_modes': ['human', 'rgb_array'],
        'render_fps': 2,
        # Legacy gym-style keys, kept so code reading the old names still works.
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 2
        }

    # Displacement (dx, dy) applied by each discrete action.
    _MOVES = {
        0: (0, 0),   # stop
        1: (0, 1),   # up
        2: (0, -1),  # down
        3: (-1, 0),  # left
        4: (1, 0),   # right
    }

    def __init__(self):
        self.target_x = 0
        self.target_y = 0

        self.size = 10
        self.action_space = spaces.Discrete(5) # 0:stop, 1:up, 2:down, 3:left, 4:right
        self.observation_space = spaces.Box(
            low=-self.size, high=self.size, shape=(2,), dtype=np.float32
        )

        self.state = None
        self.info = {}

    def step(self, action):
        """Move the car by one cell according to ``action``.

        Args:
            action: An element of ``action_space`` (integer 0-4).

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``observation`` is the new ``np.float32`` position, ``reward`` is a
            Python ``float`` and both flags are Python ``bool``.

        Raises:
            AssertionError: If ``action`` is not in ``action_space`` or
                ``reset()`` has not been called yet.
        """
        assert self.action_space.contains(action), f"{action!r} ({type(action)}) invalid"
        assert self.state is not None, "reset() must be called before step()"

        # int() accepts Python ints, numpy integers and 0-d integer arrays alike.
        dx, dy = self._MOVES[int(action)]
        x, y = self.state
        self.state = np.array([x + dx, y + dy], dtype=np.float32)

        reward = self._get_reward()
        terminated = self._get_terminated()
        truncated = self._get_truncated()
        info = {}
        return self.state, reward, terminated, truncated, info

    def reset(self, seed=None, options=None):
        """Start a new episode from a random integer grid position.

        Args:
            seed: Optional seed forwarded to ``gymnasium.Env.reset`` so that the
                sampled start position is reproducible.
            options: Accepted for Gymnasium API compatibility; not used.

        Returns:
            ``(observation, info)`` with the initial ``np.float32`` position
            and an empty info dict.
        """
        # Seeds self.np_random when a seed is given.
        super().reset(seed=seed)
        # Uniform over every integer cell in [-size, size] on both axes.
        start = self.np_random.integers(-self.size, self.size + 1, size=2)
        self.state = start.astype(np.float32)
        # Not read anywhere in this class; kept because reset() has always set it.
        self.counts = 0
        self.info = {}
        return self.state, self.info

    def render(self, mode='human'):
        """Print the current position; ``mode`` is accepted but not used."""
        print(self.state)

    def close(self):
        """Delegate clean-up to the base class."""
        return super().close()

    def _get_reward(self):
        """Return minus the Euclidean distance to the target as a Python float."""
        x, y = self.state
        dist = np.hypot(float(x) - self.target_x, float(y) - self.target_y)
        return -float(dist)

    def _get_terminated(self):
        """Return True when the car sits exactly on the target."""
        x, y = self.state
        return bool(x == self.target_x and y == self.target_y)

    def _get_truncated(self):
        """Return True when the car is outside the ``[-size, size]`` square."""
        x, y = self.state
        return bool(abs(x) > self.size or abs(y) > self.size)

In [37]:
from stable_baselines3.common.env_checker import check_env
# Build a fresh environment and let stable-baselines3 validate it against the
# Gym API; warn=True additionally prints non-fatal recommendations.
env = MyCar()
check_env(
    env,
    warn=True,
)

In [5]:
# Sanity check: drive the car with uniformly random actions until it reaches
# the target, starting a new episode whenever it leaves the grid.
MAX_RANDOM_STEPS = 10000  # safety cap: a random walk may take very long to hit the target

env = MyCar()
env.reset()
terminated = False
log = 0  # number of environment steps taken so far
while not terminated and log < MAX_RANDOM_STEPS:
    env.render()
    state,reward,terminated,truncated,info = env.step(env.action_space.sample())
    if truncated:
        # The car left the grid: begin a new episode from a fresh random position.
        env.reset()
    log += 1

[-7.  9.]
[-6.  9.]
[-6.  9.]
[-7.  9.]
[-7. 10.]
[ 9. -1.]
[ 9. -1.]
[9. 0.]
[9. 0.]
[9. 1.]
[9. 2.]
[9. 2.]
[8. 2.]
[7. 2.]
[8. 2.]
[8. 3.]
[7. 3.]
[8. 3.]
[8. 3.]
[8. 2.]
[8. 3.]
[8. 4.]
[8. 5.]
[8. 5.]
[7. 5.]
[7. 5.]
[6. 5.]
[5. 5.]
[5. 4.]
[5. 4.]
[5. 4.]
[4. 4.]
[4. 4.]
[4. 4.]
[3. 4.]
[4. 4.]
[4. 4.]
[4. 5.]
[4. 4.]
[4. 3.]
[4. 3.]
[4. 4.]
[4. 5.]
[4. 5.]
[5. 5.]
[6. 5.]
[6. 6.]
[6. 6.]
[5. 6.]
[5. 6.]
[5. 6.]
[5. 7.]
[6. 7.]
[6. 8.]
[6. 9.]
[6. 8.]
[6. 7.]
[7. 7.]
[6. 7.]
[6. 8.]
[6. 8.]
[7. 8.]
[6. 8.]
[6. 9.]
[ 6. 10.]
[ 6. 10.]
[6. 9.]
[7. 9.]
[6. 9.]
[6. 9.]
[5. 9.]
[6. 9.]
[5. 9.]
[5. 8.]
[5. 7.]
[5. 7.]
[5. 8.]
[6. 8.]
[6. 9.]
[7. 9.]
[7. 8.]
[6. 8.]
[5. 8.]
[5. 7.]
[4. 7.]
[4. 8.]
[4. 7.]
[3. 7.]
[4. 7.]
[4. 6.]
[4. 5.]
[5. 5.]
[4. 5.]
[4. 4.]
[4. 4.]
[4. 4.]
[4. 4.]
[5. 4.]
[6. 4.]
[7. 4.]
[6. 4.]
[7. 4.]
[7. 4.]
[6. 4.]
[6. 5.]
[6. 5.]
[5. 5.]
[5. 5.]
[6. 5.]
[7. 5.]
[8. 5.]
[7. 5.]
[7. 5.]
[7. 5.]
[8. 5.]
[7. 5.]
[8. 5.]
[7. 5.]
[7. 5.]
[8. 5.]
[8. 4.]
[8. 3.]
[7. 3.

AssertionError: `reset()` must return a tuple (obs, info)

In [6]:
from stable_baselines3 import DQN
from stable_baselines3.common import logger
# Train the agent by the stable_baselines3
import os
# Output locations. The agent trained below is DQN, so the model directory is
# named after it (it was previously labelled PPO).
models_dir = './models/DQN'
logdir = './logs'
for directory in (models_dir, logdir):
    # exist_ok=True makes this cell safe to re-run.
    os.makedirs(directory, exist_ok=True)

env = MyCar()
agent = DQN('MlpPolicy', env, verbose=1,tensorboard_log=logdir)
agent.learn(total_timesteps=100000, log_interval=100,tb_log_name='DQN')
# Saved under the name "DQN_MyCar" relative to the current working directory.
agent.save("DQN_MyCar")

  ROMS = resolve_roms()


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./logs/DQN_1
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 72.3     |
|    ep_rew_mean      | -503     |
|    exploration_rate | 0.313    |
| time/               |          |
|    episodes         | 100      |
|    fps              | 9241     |
|    time_elapsed     | 0        |
|    total_timesteps  | 7231     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 55.7     |
|    ep_rew_mean      | -380     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 200      |
|    fps              | 9288     |
|    time_elapsed     | 1        |
|    total_timesteps  | 12797    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 91.6     |
|    e

In [8]:
# Roll out the trained agent for one episode and print every visited position.
env = MyCar()
# reset() returns (observation, info); only the observation goes to predict().
obs, info = env.reset()
# Load the model written by the training cell (agent.save("DQN_MyCar")).
agent = DQN.load('DQN_MyCar',env=env)
terminated = False
truncated = False
# Stop on either flag: without the truncated check the loop never ends once
# the car has driven off the grid.
while not (terminated or truncated):
    action,_state = agent.predict(obs)
    obs,rew,terminated,truncated,info = env.step(int(action))
    print(env.state)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
[6. 5.]
[5. 5.]
[5. 4.]
[4. 4.]
[4. 3.]
[3. 3.]
[2. 3.]
[2. 2.]
[2. 3.]
[2. 2.]
[2. 1.]
[1. 1.]
[1. 0.]
[0. 0.]
