<a href="https://colab.research.google.com/github/emil-freme/rflearnign-autobot/blob/main/Reinforcement_Learning_OOP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gymnasium



In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

# Definição do ambiente
> https://gymnasium.farama.org/tutorials/gymnasium_basics/environment_creation/#sphx-glr-tutorials-gymnasium-basics-environment-creation-py


In [None]:
class AutoBot(gym.Env):
    """Deterministic grid world with one agent, targets and obstacles.

    The agent moves one cell per step on a ``size`` x ``size`` grid (moves
    that would leave the grid are clipped at the border).

    Rewards:
        *  1     when the agent stands on a target (this ends the episode)
        * -1     when the agent stands on an obstacle
        * -0.01  otherwise (living penalty)
    """

    def __init__(self, render_mode=None, size=5, obstacles_n=1, targets_n=1):
        """Configure the grid; positions are only drawn in :meth:`reset`.

        Args:
            render_mode: Stored on the instance; :meth:`render` always prints
                a text grid regardless of its value.
            size: Side length of the square grid.
            obstacles_n: Number of obstacle cells placed on every reset.
            targets_n: Number of target cells placed on every reset.

        Raises:
            ValueError: If a count is negative, ``size`` is below 1, or the
                grid has too few cells to give the agent, every target and
                every obstacle a distinct cell.
        """
        if size < 1:
            raise ValueError(f"size must be at least 1, got {size}")
        if obstacles_n < 0 or targets_n < 0:
            raise ValueError("obstacles_n and targets_n must be non-negative")
        if 1 + targets_n + obstacles_n > size * size:
            raise ValueError(
                f"a {size}x{size} grid cannot hold the agent, {targets_n} "
                f"target(s) and {obstacles_n} obstacle(s) on distinct cells"
            )

        self.render_mode = render_mode
        self.size = size
        self.window_size = 400
        self.obstacles_n = obstacles_n
        self.obstacles = []
        self.targets_n = targets_n
        self.targets = []
        self.agent_position = None

        # We have 4 actions, corresponding to "right", "up", "left", "down"
        self.action_space = spaces.Discrete(4)

        # Maps abstract actions from `self.action_space` to the direction we
        # will walk in if that action is taken.
        # I.e. 0 corresponds to "right", 1 to "up" etc.
        self._action_to_direction = {
            0: np.array([1, 0]),
            1: np.array([0, 1]),
            2: np.array([-1, 0]),
            3: np.array([0, -1]),
        }

    def _get_obs(self):
        """Return the observation dict: agent, target and obstacle positions."""
        # Hand out fresh lists so callers cannot mutate the environment's own
        # bookkeeping through a returned observation.
        return {
            "agent": self.agent_position,
            "targets": list(self.targets),
            "obstacles": list(self.obstacles),
        }

    def _get_info(self):
        """Return the L1 (Manhattan) distance from the agent to each target."""
        return {
            "distances": [
                np.linalg.norm(self.agent_position - target, ord=1)
                for target in self.targets
            ]
        }

    def reset(self, seed=None, options=None):
        """Start a new episode with freshly drawn, non-overlapping positions.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset``; it seeds
                ``self.np_random``, which is the only randomness used here,
                so equal seeds give equal layouts.
            options: Accepted for API compatibility; unused.

        Returns:
            An ``(observation, info)`` tuple.
        """
        super().reset(seed=seed)

        # Draw every occupied cell in one go, without replacement, so the
        # agent, the targets and the obstacles can never share a cell.
        entity_count = 1 + self.targets_n + self.obstacles_n
        flat_cells = self.np_random.choice(
            self.size * self.size, size=entity_count, replace=False
        )
        rows, cols = np.unravel_index(flat_cells, (self.size, self.size))
        cells = np.column_stack((rows, cols))

        first_obstacle = 1 + self.targets_n
        self.agent_position = cells[0].copy()
        # Rebind (rather than clear) the lists so observations returned in
        # earlier episodes keep their contents.
        self.targets = [cell.copy() for cell in cells[1:first_obstacle]]
        self.obstacles = [cell.copy() for cell in cells[first_obstacle:]]

        observation = self._get_obs()
        info = self._get_info()

        return observation, info

    def step(self, action):
        """Move the agent one cell and score the resulting position.

        Args:
            action: Integer in ``[0, 3]`` selecting a direction from
                ``self._action_to_direction``.

        Returns:
            ``(observation, reward, terminated, truncated, info)``;
            ``truncated`` is always ``False``.
        """
        # In this deterministic environment the transition function is simply
        # moving to the next state (clipped so the agent stays on the grid).
        direction = self._action_to_direction[int(action)]
        self.agent_position = np.clip(
            self.agent_position + direction, 0, self.size - 1)

        # Check against every obstacle/target; `any` also copes with a
        # configuration that has none of them.
        obs_hit = any(
            np.array_equal(self.agent_position, obstacle)
            for obstacle in self.obstacles
        )
        terminated = any(
            np.array_equal(self.agent_position, target)
            for target in self.targets
        )

        # Rewards:
        #  1    for reaching a target
        # -1    for hitting an obstacle
        # -0.01 living penalty
        reward = 1 if terminated else -1 if obs_hit else -0.01

        observation = self._get_obs()
        info = self._get_info()

        return observation, reward, terminated, False, info

    def render(self, mode='human'):
        """Print the grid: ``-`` empty, ``#`` obstacle, ``@`` target, ``X`` agent.

        ``mode`` is accepted for compatibility and not consulted.
        """
        # A plain single-byte string array prints exactly like the legacy
        # `np.chararray` it replaces.
        grid = np.full((self.size, self.size), b"-", dtype="S1")
        for obs in self.obstacles:
            grid[obs[0], obs[1]] = b"#"  # mark obstacles
        for target in self.targets:
            grid[target[0], target[1]] = b"@"  # mark delivery points
        # The agent is drawn last so it stays visible on any shared cell.
        grid[self.agent_position[0], self.agent_position[1]] = b"X"
        print(grid)




In [None]:
# Build the environment with the constructor defaults
# (size=5, obstacles_n=1, targets_n=1).
env = AutoBot()

In [None]:
# Roll out a single episode with uniformly random actions, printing every
# transition and the grid, and stopping at termination or after 1000 steps.
env.reset(seed=42)
steps_left = 1000
while steps_left > 0:
    steps_left -= 1
    action = env.action_space.sample()
    # `step` is the (observation, reward, terminated, truncated, info) tuple.
    step = env.step(action)
    print(f"""
    Ação: {action}
    Obs: {step[0]},
    Reward: {step[1]},
    Terminated: {step[2]},
    Truncated: {step[3]},
    Info: {step[4]}
     """)
    env.render()
    terminated = step[2]
    if terminated:
        break


    Ação: 0
    Obs: {'agent': array([3, 4]), 'targets': [array([0, 0])], 'obstacles': [array([3, 0])]},
    Reward: -0.01,
    Terminated: False,
    Truncated: False, 
    Info: {'distances': [7.0]}
     
[[b'@' b'-' b'-' b'-' b'-']
 [b'-' b'-' b'-' b'-' b'-']
 [b'-' b'-' b'-' b'-' b'-']
 [b'#' b'-' b'-' b'-' b'X']
 [b'-' b'-' b'-' b'-' b'-']]

    Ação: 0
    Obs: {'agent': array([4, 4]), 'targets': [array([0, 0])], 'obstacles': [array([3, 0])]},
    Reward: -0.01,
    Terminated: False,
    Truncated: False, 
    Info: {'distances': [8.0]}
     
[[b'@' b'-' b'-' b'-' b'-']
 [b'-' b'-' b'-' b'-' b'-']
 [b'-' b'-' b'-' b'-' b'-']
 [b'#' b'-' b'-' b'-' b'-']
 [b'-' b'-' b'-' b'-' b'X']]

    Ação: 1
    Obs: {'agent': array([4, 4]), 'targets': [array([0, 0])], 'obstacles': [array([3, 0])]},
    Reward: -0.01,
    Terminated: False,
    Truncated: False, 
    Info: {'distances': [8.0]}
     
[[b'@' b'-' b'-' b'-' b'-']
 [b'-' b'-' b'-' b'-' b'-']
 [b'-' b'-' b'-' b'-' b'-']
 [b'#' b'-