In [13]:
import numpy as np
import time
import os
import matplotlib.pyplot as plt
%matplotlib inline

import gymnasium as gym
import stable_baselines3 as sb

from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.env_checker import check_env

In [14]:
# Renders the model in the enviroment to see it learning progress
def show_progress(model, time_steps = 1000, deterministic = True):
    env = model.get_env()
    obs = env.reset()
    for i in range(time_steps):
        action, _states = model.predict(obs, deterministic = deterministic)
        obs, rewards, dones, info = env.step(action)
        env.render("human")
    return

# Gym Env
Ejemplo

In [15]:
from gymnasium import spaces

class GoLeftEnv(gym.Env):
  """
  Custom Environment that follows gym interface.
  This is a simple env where the agent must learn to go always left. 
  """
  # Because of google colab, we cannot implement the GUI ('human' render mode)
  metadata = {'render.modes': ['console']}
  # Define constants for clearer code
  LEFT = 0
  RIGHT = 1

  def __init__(self, grid_size=10):
    super(GoLeftEnv, self).__init__()

    # Size of the 1D-grid
    self.grid_size = grid_size
    # Initialize the agent at the right of the grid
    self.agent_pos = grid_size - 1

    # Define action and observation space
    # They must be gym.spaces objects
    # Example when using discrete actions, we have two: left and right
    n_actions = 2
    self.action_space = spaces.Discrete(n_actions)
    # The observation will be the coordinate of the agent
    # this can be described both by Discrete and Box space
    self.observation_space = spaces.Box(low=0, high=self.grid_size,
                                        shape=(1,), dtype=np.float32)

  def reset(self, seed = 0):
    """
    Important: the observation must be a numpy array
    :return: (np.array) 
    """
    # Initialize the agent at the right of the grid
    self.agent_pos = self.grid_size - 1
    # here we convert to float32 to make it more general (in case we want to use continuous actions)
    return np.array([self.agent_pos]).astype(np.float32), {}

  def step(self, action):
    if action == self.LEFT:
      self.agent_pos -= 1
    elif action == self.RIGHT:
      self.agent_pos += 1
    else:
      raise ValueError("Received invalid action={} which is not part of the action space".format(action))

    # Account for the boundaries of the grid
    self.agent_pos = np.clip(self.agent_pos, 0, self.grid_size)

    # Are we at the left of the grid?
    terminated = bool(self.agent_pos == 0)
    truncated = False
    
    # Null reward everywhere except when reaching the goal (left of the grid)
    reward = 1 if self.agent_pos == 0 else 0

    # Optionally we can pass additional info, we are not using that for now
    info = {}

    return np.array([self.agent_pos]).astype(np.float32),\
            reward, terminated, truncated, info

  def render(self, mode='console'):
    if mode != 'console':
      raise NotImplementedError()
    # agent is represented as a cross, rest as a dot
    print("." * self.agent_pos, end="")
    print("x", end="")
    print("." * (self.grid_size - self.agent_pos))

  def close(self):
    pass

Env check and test with a coded agent

In [16]:
env = GoLeftEnv()
# If the environment don't follow the interface, an error will be thrown
check_env(env, warn=True)

obs = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)

GO_LEFT = 0
# Hardcoded best agent: always go left!
n_steps = 20
for step in range(n_steps):
  print("Step {}".format(step + 1))
  obs, reward, terminated, truncated, info = env.step(GO_LEFT)
  print('obs=', obs, 'reward=', reward, 'done=', terminated)
  env.render()
  if terminated:
    print("Goal reached!", "reward=", reward)
    break

.........x.
Box(0.0, 10.0, (1,), float32)
Discrete(2)
Step 1
obs= [8.] reward= 0 done= False
........x..
Step 2
obs= [7.] reward= 0 done= False
.......x...
Step 3
obs= [6.] reward= 0 done= False
......x....
Step 4
obs= [5.] reward= 0 done= False
.....x.....
Step 5
obs= [4.] reward= 0 done= False
....x......
Step 6
obs= [3.] reward= 0 done= False
...x.......
Step 7
obs= [2.] reward= 0 done= False
..x........
Step 8
obs= [1.] reward= 0 done= False
.x.........
Step 9
obs= [0.] reward= 1 done= True
x..........
Goal reached! reward= 1


Ahora con un modelo

In [17]:
env = GoLeftEnv(grid_size=10)
# wrap it
env = make_vec_env(lambda: env, n_envs=1)

model = sb.A2C('MlpPolicy', env, verbose=1).learn(5000)

Using cpu device
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23.4     |
|    ep_rew_mean        | 1        |
| time/                 |          |
|    fps                | 717      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.344   |
|    explained_variance | 0.439    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.0148   |
|    value_loss         | 0.0101   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 13.8     |
|    ep_rew_mean        | 1        |
| time/                 |          |
|    fps                | 790      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/             

In [35]:
def RK4(fun, x, dt, t = 0, a = 0):

    k1 = fun(t,      x,         a)
    k2 = fun(t+dt/2, x+dt*k1/2, a)
    k3 = fun(t+dt/2, x+dt*k2/2, a)
    k4 = fun(t+dt,   x+dt*k3,   a)

    y = x + dt/6*(k1+2*k2+2*k3+k4)
    return y

# Todo esto deberia ir dentro del Env eventualmente

gamma     = .1
gammath   = .1
L     = .2
G     = .98

def cart_evol(t, x, a = 0):
    vDot = a - gamma * x[1]
    xDot = x[1]
    return np.array([xDot, vDot])

def pend_evol(t, x, a = 0):
    thDotDot = (a * np.cos(x[0]) - G * np.sin(x[0]))/L - gammath * x[1]
    thDot    = x[1]
    return np.array([thDot, thDotDot])

def get_pos_pend(ang, x, L):
    return x - L*np.sin(ang), L*np.cos(ang)

def observations(x,th,xDot,thDot):
    return np.array([x, np.cos(th), np.sin(th), xDot/2, thDot/20]).astype(np.float32)

In [None]:
from gymnasium import spaces

class Pendulo(gym.Env):
    """
    Custom Environment that follows gym interface.
    This is a simple env where the agent must learn to go always left. 
    """
    # Because of google colab, we cannot implement the GUI ('human' render mode)
    metadata = {'render.modes': ['console']}
    # Define constants for clearer code
    # Distancias en mm?

    def __init__(self, fps = 120, target_th = 1, max_steps = 2000):
        super(Pendulo, self).__init__()

        # Size of the 1D-grid
        self.rail_lengh = 1

        # (x,th), (xDot,thDot)
        self.agent_vars = np.array(((0.,0.),(0.,0.)))

        # variables de tiempo
        self.n_step = 0
        self.max_steps = max_steps
        self.dt      = 1000/fps # timestep en microseg

        # threshold del reward
        self.targetH = np.cos(target_th)

        # Define action and observation space
        # They must be gym.spaces objects
        # Example when using discrete actions, we have 3: left, right and still
        self.n_actions = 3
        self.action_space = spaces.Discrete(self.n_actions)
        # The observation will be the coordinate of the agent
        # (x, v, cos th, sin th, thDot)
        # elijo pasarle seno y coseno enves del angulo xq quedan en [-1,1]
        # y no tienen el problema de discontinuidad de th en [-pi,pi]
        self.observation_space = spaces.Box(low = -1, high = 1,
                                            shape=(5,), dtype=np.float32)

    def reset(self, seed = None,):
        """
        Important: the observation must be a numpy array
        :return: (np.array) 
        """
        super().reset(seed=seed)
        # Initialize the agent at the right of the grid
        self.agent_vars = np.array(((0.,0.),(0.,0.)))
        # here we convert to float32 to make it more general (in case we want to use continuous actions)
        return observations(*np.ravel(self.agent_vars)), {}

    def step(self, action):

        x,v = self.agent_vars[0]
        th,thDot = self.agent_vars[1]

        if action > self.n_actions:
            raise ValueError("Received invalid action={} which is not part of the action space".format(action))
        elif abs(x) > 1:
            x = np.clip(x, -1, 1)
            a = -v/self.dt * .2
            thDotDot = -v/self.dt * 1.2
            v = 0
        else:
            thDotDot = a = action - 1

        x, v = RK4(cart_evol, [x, v], self.dt, a = a)
        th, thDot = RK4(pend_evol, [th, thDot], self.dt, a = thDotDot)

        self.agent_vars = np.array(((x,th),(v,thDot)))

        terminated = False
        if self.n_step >= 1500:
            truncated = True
        else: truncated = False
        # Null reward everywhere except when reaching the goal (left of the grid)
        reward = 1 if np.cos(th) < self.targetH else 0
        # Optionally we can pass additional info, we are not using that for now
        info = {}

        return observations(*np.ravel(self.agent_vars)),\
                reward, terminated, truncated, info

    def render(self, mode='console'):
        # if mode != 'console':
        #     # agent is represented as a cross, rest as a dot
        #     print("." * self.agent_pos, end="")
        #     print("x", end="")
        #     print("." * (self.grid_size - self.agent_pos))
        # else:
            raise NotImplementedError()

    def close(self):
        pass

In [41]:
env = Pendulo()
# If the environment don't follow the interface, an error will be thrown
check_env(env, warn=True)

AssertionError: The observation returned by the `step()` method does not match the bounds of the given observation space Box(-1.0, 1.0, (5,), float32). 
3 invalid indices: 
Expected: -1.0 <= obs[0] <= 1.0, actual value: 27.0865478515625 
Expected: -1.0 <= obs[3] <= 1.0, actual value: 2.8123393058776855 
Expected: -1.0 <= obs[4] <= 1.0, actual value: 1.2056987285614014 
