In [3]:
import gymnasium as gym
from stable_baselines3 import PPO

import gymnasium as gym 
import numpy as np
import matplotlib.pyplot as plt
import random
import itertools
import torch
from agents.dqn import DoubleDQNAgent # for typing only
import pygame
from gymnasium.spaces import Dict, Discrete, Box
from collections import OrderedDict

ImportError: /leonardo/home/userexternal/mnunzian/RoboSurgery/.rob/lib/python3.10/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12

In [2]:

class GYMGridEnvDeform(gym.Env):
    
    def __init__(self, maze, l0,h0,l1,h1,render_mode = None):

        self.original_maze = maze
        self.original_maze_shape = maze.shape
 
        self.maze = maze
        self.maze_shape = maze.shape

        # list of possible actions
        self.actions = [0,1,2,3]
        # list of possible orientations
        self.orientations = [0,1,2,3]
        # list of possible observations
        self.obs = list(itertools.product([0,1], repeat=5))
        # list of possible deformations
        self.deformations = [(i,j) for i in range(l0,h0) for j in range(l1,h1)]

        
        # space in which every maze lives (is a 2d matrix)
        self.max_shape = self.original_maze.shape * np.array([h1-1,h0-1]) + np.array([2,2])
        # list of states
        self.states = [((x,y,phi),(i,j)) for x in range(1,self.max_shape[0]-1) for y in range(1,self.max_shape[1]-1) for phi in range(4) for i in range(l0,h0) for j in range(l1,h1)] 
        self.state_dict = {state : i for i, state in enumerate(self.states)}
        
        self.l0 = l0
        self.h0 = h0
        self.l1 = l1
        self.h1 = h1

        self.goal_pos = self.original_maze.shape - np.array([2,2])
        
        self.frames = []
        self.reset()

        self.render_mode = render_mode
        if self.render_mode == "human":
            self.set_rendering()

        # gym attributes
        self.action_space = gym.spaces.Discrete(4)
        self.observation_space =  Dict({
                                    "x": Discrete(self.max_shape[0]),              # Values from 0 to 10
                                    "y": Discrete(self.max_shape[1]),              # Values from 0 to 10
                                    "phi": Discrete(5),             # Values from 0 to 4
                                    "belief": Box(low=0.0, high=1.0, shape=(len(self.deformations),), dtype=float)  # Probability vector
                                })
    
    def step(self, a):

        """take action a from state s (if given) or from actual state of the maze 
        
        return the next state, the reward, if the episode is terminated, if the episode is truncated, info"""
        
        x, y = self.agent_pos
        phi = self.agent_orientation
        x_, y_, phi_ = x, y, phi

        actual_action = (a + phi) % 4
        
        if actual_action == 0:  # Move up
            new_pos = [x - 1, y]
        elif actual_action == 2:  # Move down
            new_pos = [x + 1, y]
        elif actual_action == 3:  # Move left
            new_pos = [x, y - 1]
        elif actual_action == 1:  # Move right
            new_pos = [x, y + 1]
        else:
            raise ValueError("Invalid Action")
        
        # Check if the new position is valid (inside the maze and not a wall)
        if 0 < new_pos[0] < self.max_shape[0]-1 and 0 < new_pos[1] < self.max_shape[1]-1:
            x_, y_ = new_pos
            self.agent_pos = new_pos

        phi_ = (phi + a) % 4
        
        self.agent_orientation = phi_
        
        terminated = np.all((x_,y_) == self.goal_pos)

        if np.all((x_,y_) == self.goal_pos):
            # if the agent is in the goal position
            reward =  1            
        elif np.all((x_,y_) == (x,y)):
            # if the agent has not moved (only at the boundary of the maze)
            reward =  -2 # -50/(self.max_shape[0]*self.max_shape[1])
        elif self.maze[x_, y_] == 1:
            # if the agent has entered a wall
            reward =  -2 # -50/(self.max_shape[0]*self.max_shape[1])
        elif self.maze[x_, y_] == 0:
            # if the agent has moved to a free cell
            reward =  -0.5 # -1/(self.max_shape[0]*self.max_shape[1])

        info = {}
        truncated = False 
                
        self.timestep += 1

        if self.render_mode == "human":
            self.render()

        new_beleif = self.update_belief()

        self.belief = new_beleif

        obs = OrderedDict({
                            "x": np.int64(x_),              # Values from 0 to 10
                            "y": np.int64(y_),              # Values from 0 to 10
                            "phi": np.int64(phi_),             # Values from 0 to 4
                            "belief": self.belief , # Probability vector
                        })

        
        return obs, reward, terminated, truncated, info
    
    def set_render_mode(self, mode):
        self.render_mode = mode
        if self.render_mode == "human":
            self.set_rendering()
        
    def get_observation(self, s=None):

        if s is None:
            agent_pos = self.agent_pos
            agent_orientation = self.agent_orientation
        else:
            prior_state = self.get_state()
            self.set_deformed_maze(s[1])
            agent_pos = s[0][:2]
            agent_orientation = s[0][2]

        ind = [agent_pos + a for a in [np.array([0,-1]),
                                            np.array([-1,-1]),
                                            np.array([-1,0]),
                                            np.array([-1,+1]),
                                            np.array([0,+1]),
                                            np.array([+1,+1]),
                                            np.array([+1,0]),
                                            np.array([+1,-1])]]

        agent_obs = np.array([self.maze[tuple(ind[i%8])] 
                                                for i in range(2*agent_orientation, 2*agent_orientation+5)])
        
        if s is not None:
            self.set_state(prior_state)

        
        return agent_obs

    def set_state(self, s):
        theta0, theta1 = s[1][0], s[1][1]
        self.theta = (theta0, theta1)
        self.agent_pos = np.array(s[0][:2]) 
        self.agent_orientation = s[0][2]
        self.set_deformed_maze(s[1])

    def get_state(self):
        return (self.agent_pos[0],self.agent_pos[1], self.agent_orientation), self.theta

    def set_deformed_maze(self,thetas: tuple):
        self.theta = thetas
        self.maze = self.stretch_maze(thetas)
        # self.goal_pos = self.maze.shape - np.array([thetas[1],thetas[0]])
        self.goal_pos = self.original_maze.shape * np.array([thetas[1],thetas[0]])

        canva1 = np.ones(self.max_shape, dtype=int)  # Start with walls
        # Place the original maze in the canvas
        canva1[1:self.maze.shape[0] + 1, 1:self.maze.shape[1] + 1] = self.maze

        self.maze = canva1
   
    def stretch_maze(self, thetas):
        scale_x, scale_y = thetas
        maze = self.original_maze

        original_height, original_width = maze.shape
        # Calculate new dimensions
        new_height = original_height * scale_y
        new_width = original_width * scale_x
        
        # Create a new maze with stretched dimensions
        stretched_maze = np.ones((new_height, new_width), dtype=int)

        # Fill the new maze with values from the original maze
        for i in range(original_height):
            for j in range(original_width):
                if maze[i, j] == 0:  # Path cell
                    # Fill the corresponding region in the stretched maze
                    stretched_maze[i*scale_y:(i+1)*scale_y, j*scale_x:(j+1)*scale_x] = 0

        return stretched_maze
    
    def update_belief(self):
        """"
        perform update over theta
        
        $$b'_{x,a,o}(theta) = \eta \cdot p(o|x,theta) \cdot b(theta)$$
        
        """

        new_belief = np.zeros_like(self.belief)
        observation = self.get_observation()
        pos = (self.agent_pos[0],self.agent_pos[1],self.agent_orientation)

        for t, theta in enumerate(self.deformations):
            P_o_s_theta = np.all(self.get_observation(s = (pos,theta)) == observation) # 0 or 1 

            new_belief[t] = P_o_s_theta * self.belief[t]
        
        new_belief = new_belief / (np.sum(new_belief) + 1e-10)

        return new_belief

    def set_rendering(self):
        self.screen_width = 800
        self.screen_height = 600
        pygame.init()  # Initialize all pygame modules
        self.screen = pygame.display.set_mode((self.screen_width, self.screen_height))
        pygame.display.set_caption("Maze Environment")
        
        # Handle key events
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                return
            elif event.type == pygame.KEYDOWN:
                # Press 'r' to reset environment
                if event.key == pygame.K_r:
                    self.reset()
                # Press 'q' to quit
                elif event.key == pygame.K_q:
                    pygame.quit()
                    return
                # Press 's' to save current state
                elif event.key == pygame.K_s:
                    self.save_state()
                # Press space to pause/resume
                elif event.key == pygame.K_SPACE:
                    self.pause()
                # Press arrow keys for manual control
                elif event.key == pygame.K_LEFT:
                    self.step(3)  # Left action
                elif event.key == pygame.K_RIGHT:
                    self.step(1)  # Right action
                elif event.key == pygame.K_UP:
                    self.step(0)  # Up action
                elif event.key == pygame.K_DOWN:
                    self.step(2)  # Down action

        # Update display
        pygame.display.flip()

    def render(self):
        """Render the maze using Pygame"""
        
        # Clear the screen
        self.screen.fill((255, 255, 255))

        # Draw the maze
        cell_size = min(self.screen_width, self.screen_height) // max(self.max_shape)
        for x in range(self.max_shape[0]):
            for y in range(self.max_shape[1]):
                if (x, y) == tuple(self.agent_pos):
                    color = (255, 0, 0)  # Red for agent
                elif (x, y) == tuple(self.goal_pos):
                    color = (0, 255, 0)  # Green for goal
                elif self.maze[x, y] == 1:
                    color = (0, 0, 0)  # Black for walls
                else:
                    color = (255, 255, 255)  # White for free space
                pygame.draw.rect(self.screen, color, (y * cell_size, x * cell_size, cell_size, cell_size))

        # Add text for controls
        font = pygame.font.Font(None, 36)
        controls = [
            "Controls:",
            "R - Reset",
            "Q - Quit",
            "Space - Pause/Resume",
            "Arrows - Move agent"
        ]
        
        for i, text in enumerate(controls):
            text_surface = font.render(text, True, (0, 0, 0))
            self.screen.blit(text_surface, (self.screen_width - 200, 20 + i * 30))

        # Handle events
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                return
            elif event.type == pygame.KEYDOWN:
                if event.key == pygame.K_r:
                    self.reset()
                elif event.key == pygame.K_q:
                    pygame.quit()
                    return
                elif event.key == pygame.K_SPACE:
                    self.pause()
                elif event.key == pygame.K_LEFT:
                    self.step(3,execute=True)
                elif event.key == pygame.K_RIGHT:
                    self.step(1,execute=True)
                elif event.key == pygame.K_UP:
                    self.step(0,execute=True)
                elif event.key == pygame.K_DOWN:
                    self.step(2,execute=True)
                

        # Update the display
        pygame.display.flip()

        # Capture the current frame and add it to the list of frames
        frame = pygame.surfarray.array3d(self.screen)
        self.frames.append(frame)

    def reset(self, seed=42):
        randomdeformation = random.choice(self.deformations)
        self.agent_pos = [np.random.randint(1, self.max_shape[0]-1), np.random.randint(1, self.max_shape[1]-1)]
        self.agent_orientation = random.choice(self.orientations)
        self.set_deformed_maze(randomdeformation)
        self.goal_pos = self.original_maze.shape * np.array([randomdeformation[1],randomdeformation[0]])
        self.theta = randomdeformation
        self.timestep = 0
        
        self.belief = np.ones(len(self.deformations)) / len(self.deformations)
        obs = OrderedDict({
                            "x": np.int64(self.agent_pos[0]),              # Values from 0 to 10
                            "y": np.int64(self.agent_pos[1]),              # Values from 0 to 10
                            "phi": np.int64(self.agent_orientation),             # Values from 0 to 4
                            "belief": self.belief , # Probability vector
                        })

        return obs, {}
    

class FULLGYMGridEnvDeform(gym.Env):
    
    def __init__(self, maze, l0,h0,l1,h1,render_mode = None):

        self.original_maze = maze
        self.original_maze_shape = maze.shape
 
        self.maze = maze
        self.maze_shape = maze.shape

        # list of possible actions
        self.actions = [0,1,2,3]
        # list of possible orientations
        self.orientations = [0,1,2,3]
        # list of possible observations
        self.obs = list(itertools.product([0,1], repeat=5))
        # list of possible deformations
        self.deformations = [(i,j) for i in range(l0,h0) for j in range(l1,h1)]

        
        # space in which every maze lives (is a 2d matrix)
        self.max_shape = self.original_maze.shape * np.array([h1-1,h0-1]) + np.array([2,2])
        # list of states
        self.states = [((x,y,phi),(i,j)) for x in range(1,self.max_shape[0]-1) for y in range(1,self.max_shape[1]-1) for phi in range(4) for i in range(l0,h0) for j in range(l1,h1)] 
        self.state_dict = {state : i for i, state in enumerate(self.states)}
        
        self.l0 = l0
        self.h0 = h0
        self.l1 = l1
        self.h1 = h1

        self.goal_pos = self.original_maze.shape - np.array([2,2])
        
        self.frames = []
        self.reset()

        self.render_mode = render_mode
        if self.render_mode == "human":
            self.set_rendering()

        # gym attributes
        self.action_space = gym.spaces.Discrete(4)
        self.observation_space =  Dict({
                                    "x": Discrete(self.max_shape[0]),              # Values from 0 to 10
                                    "y": Discrete(self.max_shape[1]),              # Values from 0 to 10
                                    "phi": Discrete(5),                             # Values from 0 to 4
                                    "theta": Box(low=min(l0,l1), high=max(h0,h1), shape=(2,), dtype=int)  # Probability vector
                        # Probability vector
                                })
    
    def step(self, a):

        """take action a from state s (if given) or from actual state of the maze 
        
        return the next state, the reward, if the episode is terminated, if the episode is truncated, info"""
        
        x, y = self.agent_pos
        phi = self.agent_orientation
        x_, y_, phi_ = x, y, phi

        actual_action = (a + phi) % 4
        
        if actual_action == 0:  # Move up
            new_pos = [x - 1, y]
        elif actual_action == 2:  # Move down
            new_pos = [x + 1, y]
        elif actual_action == 3:  # Move left
            new_pos = [x, y - 1]
        elif actual_action == 1:  # Move right
            new_pos = [x, y + 1]
        else:
            raise ValueError("Invalid Action")
        
        # Check if the new position is valid (inside the maze and not a wall)
        if 0 < new_pos[0] < self.max_shape[0]-1 and 0 < new_pos[1] < self.max_shape[1]-1:
            x_, y_ = new_pos
            self.agent_pos = new_pos

        phi_ = (phi + a) % 4
        
        self.agent_orientation = phi_
        
        terminated = np.all((x_,y_) == self.goal_pos)

        if np.all((x_,y_) == self.goal_pos):
            # if the agent is in the goal position
            reward =  1            
        elif np.all((x_,y_) == (x,y)):
            # if the agent has not moved (only at the boundary of the maze)
            reward =  -2 # -50/(self.max_shape[0]*self.max_shape[1])
        elif self.maze[x_, y_] == 1:
            # if the agent has entered a wall
            reward =  -2 # -50/(self.max_shape[0]*self.max_shape[1])
        elif self.maze[x_, y_] == 0:
            # if the agent has moved to a free cell
            reward =  -0.5 # -1/(self.max_shape[0]*self.max_shape[1])

        info = {}
        truncated = False 
                
        self.timestep += 1

        if self.render_mode == "human":
            self.render()

        new_beleif = self.update_belief()

        self.belief = new_beleif

        obs = OrderedDict({
                            "x": np.int64(x_),              # Values from 0 to 10
                            "y": np.int64(y_),              # Values from 0 to 10
                            "phi": np.int64(phi_),             # Values from 0 to 4
                            "theta": np.array(self.theta) , # Probability vector
                        })

        
        return obs, reward, terminated, truncated, info
    
    def set_render_mode(self, mode):
        self.render_mode = mode
        if self.render_mode == "human":
            self.set_rendering()
        
    def get_observation(self, s=None):

        if s is None:
            agent_pos = self.agent_pos
            agent_orientation = self.agent_orientation
        else:
            prior_state = self.get_state()
            self.set_deformed_maze(s[1])
            agent_pos = s[0][:2]
            agent_orientation = s[0][2]

        ind = [agent_pos + a for a in [np.array([0,-1]),
                                            np.array([-1,-1]),
                                            np.array([-1,0]),
                                            np.array([-1,+1]),
                                            np.array([0,+1]),
                                            np.array([+1,+1]),
                                            np.array([+1,0]),
                                            np.array([+1,-1])]]

        agent_obs = np.array([self.maze[tuple(ind[i%8])] 
                                                for i in range(2*agent_orientation, 2*agent_orientation+5)])
        
        if s is not None:
            self.set_state(prior_state)

        
        return agent_obs

    def set_state(self, s):
        theta0, theta1 = s[1][0], s[1][1]
        self.theta = (theta0, theta1)
        self.agent_pos = np.array(s[0][:2]) 
        self.agent_orientation = s[0][2]
        self.set_deformed_maze(s[1])

    def get_state(self):
        return (self.agent_pos[0],self.agent_pos[1], self.agent_orientation), self.theta

    def set_deformed_maze(self,thetas: tuple):
        self.theta = thetas
        self.maze = self.stretch_maze(thetas)
        # self.goal_pos = self.maze.shape - np.array([thetas[1],thetas[0]])
        self.goal_pos = self.original_maze.shape * np.array([thetas[1],thetas[0]])

        canva1 = np.ones(self.max_shape, dtype=int)  # Start with walls
        # Place the original maze in the canvas
        canva1[1:self.maze.shape[0] + 1, 1:self.maze.shape[1] + 1] = self.maze

        self.maze = canva1
   
    def stretch_maze(self, thetas):
        scale_x, scale_y = thetas
        maze = self.original_maze

        original_height, original_width = maze.shape
        # Calculate new dimensions
        new_height = original_height * scale_y
        new_width = original_width * scale_x
        
        # Create a new maze with stretched dimensions
        stretched_maze = np.ones((new_height, new_width), dtype=int)

        # Fill the new maze with values from the original maze
        for i in range(original_height):
            for j in range(original_width):
                if maze[i, j] == 0:  # Path cell
                    # Fill the corresponding region in the stretched maze
                    stretched_maze[i*scale_y:(i+1)*scale_y, j*scale_x:(j+1)*scale_x] = 0

        return stretched_maze
    
    def update_belief(self):
        """"
        perform update over theta
        
        $$b'_{x,a,o}(theta) = \eta \cdot p(o|x,theta) \cdot b(theta)$$
        
        """

        new_belief = np.zeros_like(self.belief)
        observation = self.get_observation()
        pos = (self.agent_pos[0],self.agent_pos[1],self.agent_orientation)

        for t, theta in enumerate(self.deformations):
            P_o_s_theta = np.all(self.get_observation(s = (pos,theta)) == observation) # 0 or 1 

            new_belief[t] = P_o_s_theta * self.belief[t]
        
        new_belief = new_belief / (np.sum(new_belief) + 1e-10)

        return new_belief

    def set_rendering(self):
        self.screen_width = 800
        self.screen_height = 600
        pygame.init()  # Initialize all pygame modules
        self.screen = pygame.display.set_mode((self.screen_width, self.screen_height))
        pygame.display.set_caption("Maze Environment")
        
        # Handle key events
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                return
            elif event.type == pygame.KEYDOWN:
                # Press 'r' to reset environment
                if event.key == pygame.K_r:
                    self.reset()
                # Press 'q' to quit
                elif event.key == pygame.K_q:
                    pygame.quit()
                    self.render_mode = None
                    return
                # Press 's' to save current state
                elif event.key == pygame.K_s:
                    self.save_state()
                # Press space to pause/resume
                elif event.key == pygame.K_SPACE:
                    self.pause()
                # Press arrow keys for manual control
                elif event.key == pygame.K_LEFT:
                    self.step(3)  # Left action
                elif event.key == pygame.K_RIGHT:
                    self.step(1)  # Right action
                elif event.key == pygame.K_UP:
                    self.step(0)  # Up action
                elif event.key == pygame.K_DOWN:
                    self.step(2)  # Down action

        # Update display
        pygame.display.flip()

    def render(self):
        """Render the maze using Pygame"""
        
        # Clear the screen
        self.screen.fill((255, 255, 255))

        # Draw the maze
        cell_size = min(self.screen_width, self.screen_height) // max(self.max_shape)
        for x in range(self.max_shape[0]):
            for y in range(self.max_shape[1]):
                if (x, y) == tuple(self.agent_pos):
                    color = (255, 0, 0)  # Red for agent
                elif (x, y) == tuple(self.goal_pos):
                    color = (0, 255, 0)  # Green for goal
                elif self.maze[x, y] == 1:
                    color = (0, 0, 0)  # Black for walls
                else:
                    color = (255, 255, 255)  # White for free space
                pygame.draw.rect(self.screen, color, (y * cell_size, x * cell_size, cell_size, cell_size))

        # Add text for controls
        font = pygame.font.Font(None, 36)
        controls = [
            "Controls:",
            "R - Reset",
            "Q - Quit",
            "Space - Pause/Resume",
            "Arrows - Move agent"
        ]
        
        for i, text in enumerate(controls):
            text_surface = font.render(text, True, (0, 0, 0))
            self.screen.blit(text_surface, (self.screen_width - 200, 20 + i * 30))

        # Handle events
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                return
            elif event.type == pygame.KEYDOWN:
                if event.key == pygame.K_r:
                    self.reset()
                elif event.key == pygame.K_q:
                    pygame.quit()
                    return
                elif event.key == pygame.K_SPACE:
                    self.pause()
                elif event.key == pygame.K_LEFT:
                    self.step(3,execute=True)
                elif event.key == pygame.K_RIGHT:
                    self.step(1,execute=True)
                elif event.key == pygame.K_UP:
                    self.step(0,execute=True)
                elif event.key == pygame.K_DOWN:
                    self.step(2,execute=True)
                

        # Update the display
        pygame.display.flip()

        # Capture the current frame and add it to the list of frames
        frame = pygame.surfarray.array3d(self.screen)
        self.frames.append(frame)

    def reset(self, seed=42):
        randomdeformation = random.choice(self.deformations)
        self.agent_pos = [np.random.randint(1, self.max_shape[0]-1), np.random.randint(1, self.max_shape[1]-1)]
        self.agent_orientation = random.choice(self.orientations)
        self.set_deformed_maze(randomdeformation)
        self.goal_pos = self.original_maze.shape * np.array([randomdeformation[1],randomdeformation[0]])
        self.theta = randomdeformation
        self.timestep = 0
        
        self.belief = np.ones(len(self.deformations)) / len(self.deformations)
        obs = OrderedDict({
                            "x": np.int64(self.agent_pos[0]),              # Values from 0 to 10
                            "y": np.int64(self.agent_pos[1]),              # Values from 0 to 10
                            "phi": np.int64(self.agent_orientation),             # Values from 0 to 4
                            "theta": np.array(self.theta) , # Probability vector
                        })

        return obs, {}
    
        

In [3]:
import numpy as np
import torch
import itertools
import matplotlib.pyplot as plt

from agents.dqn import DoubleDQNAgent
from eval import eval_agent, all_data
# maze size
N = 2

# thetas deformations (range(a,b),range(c,d))
l0 = 1
h0 = 10
l1 = 1
h1 = 10

maze = np.load(f"maze/maze_{N}.npy")
env = FULLGYMGridEnvDeform(maze,l0,h0,l1,h1)

states = [((x,y,phi),(i,j)) for x in range(1,env.max_shape[0]-1) for y in range(1,env.max_shape[1]-1) for phi in range(4) for i in range(l0,h0) for j in range(l1,h1)] 
positions = [(x,y,phi) for x in range(1,env.max_shape[0]-1) for y in range(1,env.max_shape[1]-1) for phi in range(4)]
actions = [0,1,2,3]
obs = list(itertools.product([0,1], repeat=5))
thetas = [(i,j) for i in range(l0,h0) for j in range(l1,h1)]

state_dict = {state: i for i, state in enumerate(states)}
position_dict = {position: i for i, position in enumerate(positions)}
obs_dict = {obs : i for i, obs in enumerate(obs)}

# Actions are: 0-listen, 1-open-left, 2-open-right
lenS = len(states)
lenP = len(positions)
lenA = len(actions)
lenO = len(obs)


In [4]:
from stable_baselines3.common.callbacks import BaseCallback
from wandb.integration.sb3 import WandbCallback
import wandb

class My_callback(BaseCallback):
    def __init__(self, verbose=0):
        super(My_callback, self).__init__(verbose)
    def _on_step(self) -> bool:
        if self.num_timesteps % 200 == 0:
            self.training_env.reset()
        return True
    def _on_rollout_end(self) -> None:
        print(f"Rollout end: {self.num_timesteps}")
        return True
    

total_timesteps = 100000
batch_size = 2000
n_steps = 2000

config = {
    "policy_type": "MultiInputPolicy",
    "env_name": "FULLGYMGridEnvDeform",
    "defo_range": (l0,h0,l1,h1),
    "maze_size": N,
    "total_timesteps": total_timesteps,
    "Batch_Size": batch_size,
    "PPO n_steps": n_steps
}

run = wandb.init(
    project="PPO",
    config=config,
    sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
    monitor_gym=True,  # auto-upload the videos of agents playing the game
    save_code=True,  # optional
)

callbacks = [My_callback(0), 
             WandbCallback(gradient_save_freq=100,
                            model_save_path=f"models/{run.id}",
                            verbose=2,
                            ),
            ]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmatteo-nunziante[0m ([33madv_topics[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# n_steps (int) – The number of steps to run for each environment per update 
# (i.e. rollout buffer size is n_steps * n_envs
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder

def make_env():
    N = 2

    # thetas deformations (range(a,b),range(c,d))
    l0 = 1
    h0 = 10
    l1 = 1
    h1 = 10
    
    maze = np.load(f"maze/maze_{N}.npy")
    env = FULLGYMGridEnvDeform(maze,l0,h0,l1,h1, render_mode="rgb_array")

    env = Monitor(env)  # record stats such as returns
    return env

env = DummyVecEnv([make_env])


model = PPO("MultiInputPolicy",env,n_steps=n_steps,batch_size=batch_size,verbose=1,tensorboard_log=f"runs/{run.id}", device="cpu")
model.learn(total_timesteps,progress_bar=True, callback=callbacks)
model.save(f"models/PPO_{run.id}")
env.close()
run.finish()


Using cpu device
Logging to runs/ex09yen9/PPO_1


Output()

In [None]:
model.save("ppo_MDP_1")

In [13]:
env = FULLGYMGridEnvDeform(maze,l0,h0,l1,h1,render_mode="human")
obs, _ = env.reset()
done = False
while True:
    action, _states = model.predict(obs, deterministic=False)
    obs, reward, done, _, info = env.step(action)
    if done:
        obs, _ = env.reset()
env.close()

error: display Surface quit

In [None]:
import gymnasium as gym

from stable_baselines3 import PPO

env = gym.make("CartPole-v1", render_mode="human")

model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10_000,progress_bar=True)

vec_env = model.get_env()
obs = vec_env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    vec_env.render()
    # VecEnv resets automatically
    # if done:
    #   obs = env.reset()

env.close()

# eval value function MDP

In [None]:
model = PPO("MultiInputPolicy", env, verbose=1)
model.load("ppo_MDP")

In [None]:
env = FULLGYMGridEnvDeform(maze,l0,h0,l1,h1,render_mode="human")
obs, _ = env.reset()
done = False
while True:
    action, _states = model.predict(obs, deterministic=False)
    obs, reward, done, _, info = env.step(action)
    if done:
        OBS, _ = env.reset()
env.close()

In [None]:

# fix orientation and deformation
deformation = (2, 1)

fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.flatten()

for orientation in range(4):
    Value_matrix_plot = np.zeros(env.maze.shape) - np.inf
    for s, state in enumerate(states):
        if state[1] == deformation and state[0][2] == orientation:
            Value_matrix_plot[state[0][0], state[0][1]] = state_value[s]
    
    ax = axes[orientation]
    ax.imshow(Value_matrix_plot)
    ax.set_title(f"Orientation: {orientation}")
    ax.set_xlabel("X")
    ax.set_ylabel("Y")

plt.suptitle("Value Function Matrices for Different Orientations")
plt.tight_layout()
plt.show()