# Second practical exercise: Grid World and Value iteration

Repo: https://github.com/KRLGroup/RL_2025.git

# A deterministic grid world

Finite grid with some obstacles inside. The agent can move up, left, right and down.

![](imgs/grid_world.png)

In [1]:
%pip install gymnasium

Note: you may need to restart the kernel to use updated packages.


In [2]:
#import
import gymnasium as gym
import numpy as np
from gymnasium import spaces
import random

In [4]:

# custom 2d grid world enviroment
class GridWorld(gym.Env):
    metadata = {'render.modes': ['console']}

    
    # actions available
    UP = 0
    LEFT = 1
    DOWN = 2
    RIGHT = 3


    def __init__(self, width, height):
        super(GridWorld, self).__init__()
        self.ACTION_NAMES = ["UP", "LEFT", "DOWN", "RIGHT"]
        self.num_actions = 4

        self.size = width * height  # size of the grid world
        self.num_states = self.size
        self.width = width
        self.height = height
        self.num_obstacles = int((width+height)/2)
        self.end_state = np.array([height - 1, width - 1], dtype=np.uint8) # goal state = bottom right cell

        # actions of agents : up, down, left and right
        self.action_space = spaces.Discrete(4)
        # observation : cell indices in the grid
        self.observation_space = spaces.MultiDiscrete([self.height, self.width])

        self.obstacles = np.zeros((height, width))

        for i in range(self.num_obstacles):
            obstacle = random.randrange(height) , random.randrange(width)
            while obstacle in [(0, 0),tuple(self.end_state)]:
                obstacle = random.randrange(height), random.randrange(width)
            self.obstacles[obstacle] = 1

        self.num_steps = 0
        self.max_steps = height*width

        self.current_state = np.zeros((2), np.uint8)#init state = [0,0]

        self.directions = np.array([
            [-1,0], #UP
            [0,-1], #LEFT
            [1,0], #DOWN
            [0,1] #RIGHT
        ])
        
    def step(self, action):
        s_prime = self.transition_function(self.current_state, action)
        reward = self.reward_function(s_prime)
        terminated, truncated = self.termination_condition(s_prime)

        self.current_state = s_prime
        self.num_steps += 1

        return self.current_state, reward, terminated, truncated, None
    
    
    def transition_function(self, s, a): # TODO
        
        # Q1
        # (a) -----------------------------------------
        # s_prime = s + a
        # if (s_prime < 0).any(): return s
        # if s_prime[0] >= self.width: return s    
        # if s_prime[1] >= self.height: return s 
        # if self.obstacles[s_prime[0], s_prime[1]] == 1: return s
        
        # (b) -----------------------------------------
        # s_prime = s + self.directions[a]
        # if (s_prime < 0).any(): return s_prime
        # if s_prime[0] <= self.height: return s  
        # if s_prime[1] <= self.width: return s 
        # if self.obstacles[s_prime[0], s_prime[1]] == 1: return s
        
        # (c) -----------------------------------------
        s_prime = s + self.directions[a]
        if (s_prime < 0).any(): return s
        if s_prime[0] >= self.height: return s
        if s_prime[1] >= self.width: return s
        if self.obstacles[s_prime[0], s_prime[1]] == 1: return s
        
        
        return s_prime

    
    def reward_function(self,s): # TODO
        
        # Q2
        # (a) -----------------------------------------
        # r = 0
        # if (s != self.end_state).all():
        #     r = 1

        # (b) -----------------------------------------
        r = 0
        if (s == self.end_state).all():
            r = 1

        # (c) -----------------------------------------
        # r = 1               
        # if (s == self.end_state).all():
        #     r = 0             

        return r

    def termination_condition(self, s):
        truncated = False
        terminated = False

        # Q3
        # (a)
        truncated = self.num_steps >= self.max_steps
        # (b) 
        # truncated = self.num_steps <= self.max_steps
        # (c) 
        # truncated = self.num_steps > 5
        #-----------------------------------------------------

        # Q4
        # (a) 
        # terminated = (s != self.end_state).any() 
        # (b) 
        # terminated = (s == self.end_state).any()
        # (c) 
        terminated = (s == self.end_state).all()

        return terminated, truncated
    
    def transition_probabilities(self, s, a):
        prob_next_state = np.zeros((self.height, self.width))
        s_prime = self.transition_function(s, a)

        prob_next_state[s_prime[0], s_prime[1]] = 1.0

        return prob_next_state#.flatten()
    
    def reset(self):
        self.current_state = np.zeros((2), np.uint8)
        self.num_steps = 0

        return self.current_state
    
    def reward_probabilities(self):
        rewards = np.zeros((self.num_states))
        i = 0
        for r in range(self.height):
            for c in range(self.width):
                state = np.array([r,c], dtype=np.uint8)
                rewards[i] = self.reward_function(state)
                i+=1

        return rewards
    
    def render(self):
        '''
            render the state
        '''

        row = self.current_state[0]
        col = self.current_state[1]

        for r in range(self.height):
            for c in range(self.width):
                if r == row and c == col:
                    print("| A ", end='')
                elif r == self.end_state[0] and c == self.end_state[1]:
                    print("| G ", end='')
                else:
                    if self.obstacles[r,c] == 1:
                        print('|///', end='')
                    else:
                        print('|___', end='')
            print('|')
        print('\n')

Simulate all the four actions

In [6]:
env = GridWorld(5,5)
env.reset()
env.render()

action_sequence = [0,1,2,3]

for a in action_sequence:
    print(env.ACTION_NAMES[a])
    env.step(a)
    env.render()

| A |___|___|///|___|
|___|___|___|___|___|
|___|___|___|___|___|
|///|___|___|___|___|
|///|___|///|///| G |


UP
| A |___|___|///|___|
|___|___|___|___|___|
|___|___|___|___|___|
|///|___|___|___|___|
|///|___|///|///| G |


LEFT
| A |___|___|///|___|
|___|___|___|___|___|
|___|___|___|___|___|
|///|___|___|___|___|
|///|___|///|///| G |


DOWN
|___|___|___|///|___|
| A |___|___|___|___|
|___|___|___|___|___|
|///|___|___|___|___|
|///|___|///|///| G |


RIGHT
|___|___|___|///|___|
|___| A |___|___|___|
|___|___|___|___|___|
|///|___|___|___|___|
|///|___|///|///| G |




Simulate a random episode

In [7]:
done = False
env.reset()
while not done:
    action = env.action_space.sample()
    print(env.ACTION_NAMES[action])
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    env.render()


LEFT
| A |___|___|///|___|
|___|___|___|___|___|
|___|___|___|___|___|
|///|___|___|___|___|
|///|___|///|///| G |


RIGHT
|___| A |___|///|___|
|___|___|___|___|___|
|___|___|___|___|___|
|///|___|___|___|___|
|///|___|///|///| G |


LEFT
| A |___|___|///|___|
|___|___|___|___|___|
|___|___|___|___|___|
|///|___|___|___|___|
|///|___|///|///| G |


DOWN
|___|___|___|///|___|
| A |___|___|___|___|
|___|___|___|___|___|
|///|___|___|___|___|
|///|___|///|///| G |


RIGHT
|___|___|___|///|___|
|___| A |___|___|___|
|___|___|___|___|___|
|///|___|___|___|___|
|///|___|///|///| G |


UP
|___| A |___|///|___|
|___|___|___|___|___|
|___|___|___|___|___|
|///|___|___|___|___|
|///|___|///|///| G |


UP
|___| A |___|///|___|
|___|___|___|___|___|
|___|___|___|___|___|
|///|___|___|___|___|
|///|___|///|///| G |


DOWN
|___|___|___|///|___|
|___| A |___|___|___|
|___|___|___|___|___|
|///|___|___|___|___|
|///|___|///|///| G |


LEFT
|___|___|___|///|___|
| A |___|___|___|___|
|___|___|___|___|

## A non deterministic grid world

The agent goes with probability p to the right cell, with probability 1 - p in a different cell

In [8]:
class NonDeterministicGridWorld(GridWorld):
    def __init__(self, width, height, p=0.8):
        super(NonDeterministicGridWorld, self).__init__(width, height)
        self.probability_right_action = p

    def transition_function(self, s, a):
        s_prime = s + self.directions[a, :]

        #with probability 1 - p diagonal movement
        if random.random() <= 1 - self.probability_right_action:
            if random.random() < 0.5:
                s_prime = s_prime + self.directions[(a+1)%self.num_actions, :]
            else:
                s_prime = s_prime + self.directions[(a-1)%self.num_actions, :]


        if s_prime[0] < self.height and s_prime[1] < self.width and (s_prime >= 0).all():
            if self.obstacles[s_prime[0], s_prime[1]] == 0 :
                return s_prime

        return s

    def transition_probabilities(self, s, a):
        cells = []
        probs = []
        prob_next_state = np.zeros((self.height, self.width))
        s_prime_right =  s + self.directions[a, :]
        if s_prime_right[0] < self.height and s_prime_right[1] < self.width and (s_prime_right >= 0).all():
            if self.obstacles[s_prime_right[0], s_prime_right[1]] == 0 :
                prob_next_state[s_prime_right[0], s_prime_right[1]] = self.probability_right_action
                cells.append(s_prime_right)
                probs.append(self.probability_right_action)

        s_prime = s_prime_right + self.directions[(a + 1) % self.num_actions, :]
        if s_prime[0] < self.height and s_prime[1] < self.width and (s_prime >= 0).all():
            if self.obstacles[s_prime[0], s_prime[1]] == 0 :
                prob_next_state[s_prime[0], s_prime[1]] = (1 - self.probability_right_action) / 2
                cells.append(s_prime.copy())
                probs.append((1 - self.probability_right_action) / 2)

        s_prime = s_prime_right + self.directions[(a - 1) % self.num_actions, :]
        if s_prime[0] < self.height and s_prime[1] < self.width and (s_prime >= 0).all():
            if self.obstacles[s_prime[0], s_prime[1]] == 0 :
                prob_next_state[s_prime[0], s_prime[1]] = (1 - self.probability_right_action) / 2
                cells.append(s_prime.copy())
                probs.append((1 - self.probability_right_action) / 2)

        #normalization
        sump = sum(probs)
        #for cell in cells:
        #    prob_next_state[cell[0], cell[1]] /= sump
        prob_next_state[s[0], s[1]] = 1 - sump
        return prob_next_state


Simulate a random episode

In [19]:
env = NonDeterministicGridWorld(3,5)
state = env.reset()
env.render()
#next state if we start from state 0,0 and we do action down
next_state_prob = env.transition_probabilities(state, 2)
print(next_state_prob)
print()
print(env.reward_probabilities())

| A |___|___|
|///|___|___|
|___|///|///|
|___|___|___|
|___|___| G |


[[0.9 0.  0. ]
 [0.  0.1 0. ]
 [0.  0.  0. ]
 [0.  0.  0. ]
 [0.  0.  0. ]]

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]


In [20]:
done = False
while not done:
    action = env.action_space.sample()
    print(env.ACTION_NAMES[action])
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    env.render()

UP
| A |___|___|
|///|___|___|
|___|///|///|
|___|___|___|
|___|___| G |


RIGHT
|___|___|___|
|///| A |___|
|___|///|///|
|___|___|___|
|___|___| G |


DOWN
|___|___|___|
|///| A |___|
|___|///|///|
|___|___|___|
|___|___| G |


RIGHT
|___|___| A |
|///|___|___|
|___|///|///|
|___|___|___|
|___|___| G |


RIGHT
|___|___| A |
|///|___|___|
|___|///|///|
|___|___|___|
|___|___| G |


LEFT
|___| A |___|
|///|___|___|
|___|///|///|
|___|___|___|
|___|___| G |


LEFT
| A |___|___|
|///|___|___|
|___|///|///|
|___|___|___|
|___|___| G |


LEFT
| A |___|___|
|///|___|___|
|___|///|///|
|___|___|___|
|___|___| G |


LEFT
| A |___|___|
|///|___|___|
|___|///|///|
|___|___|___|
|___|___| G |


UP
| A |___|___|
|///|___|___|
|___|///|///|
|___|___|___|
|___|___| G |


RIGHT
|___| A |___|
|///|___|___|
|___|///|///|
|___|___|___|
|___|___| G |


LEFT
| A |___|___|
|///|___|___|
|___|///|///|
|___|___|___|
|___|___| G |


RIGHT
|___| A |___|
|///|___|___|
|___|///|///|
|___|___|___|
|___|___| G |


VALUE ITERATION

![](imgs/value_iteration_1.png)

![](imgs/value_iteration_2.png)

![](imgs/value_iteration_3.png)

![](imgs/value_iteration_4.png)

![](imgs/value_iteration_5.png)

![](imgs/value_iteration_6.png)

![](imgs/value_iteration.png)

In [21]:
def value_iteration(env, gamma=0.99, iters=100):
    #initialize values
    values = np.zeros((env.num_states))
    best_actions = np.zeros((env.num_states), dtype=int)
    STATES = np.zeros((env.num_states, 2), dtype=np.uint8)
    REWARDS = env.reward_probabilities()
    print(REWARDS)
    i = 0
    for r in range(env.height):
        for c in range(env.width):
            state = np.array([r, c], dtype=np.uint8)
            STATES[i] = state
            i += 1
    
    for i in range(iters):
        v_old = values.copy()
        for s in range(env.num_states):
            state = STATES[s]

            if (state == env.end_state).all() or i >= env.max_steps or env.obstacles[state[0],state[1]]:
                continue # if we reach the termination condition, we cannot perform any action


            max_va = -np.inf
            best_a = 0
            for a in range(env.num_actions):
                next_state_prob = env.transition_probabilities(state, a).flatten()

                #Q5
                # (a)
                # va = (next_state_prob*(REWARDS + gamma*values)).sum()
                # (b) 
                # va = (REWARDS + gamma*v_old).sum()
                # (c) 
                va = (next_state_prob*(REWARDS + gamma*v_old)).sum()
                
                #Q6
                # (a) 
                if va > max_va:
                    max_va = va
                    best_a = a
                # (b) 
                # if va < max_va:
                    # max_va = a
                    # best_a = va
                # (c) 
                # if va > values:
                    # max_va = va
                    # best_a = a

            values[s] = max_va
            best_actions[s] = best_a

    return values.reshape((env.height, env.width)), best_actions.reshape((env.height, env.width))

estimate values

In [24]:
env = NonDeterministicGridWorld(10,10)
values, best_actions = value_iteration(env, iters=5000)

print(values)
print(best_actions)
env.render()

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[[0.76063179 0.7683087  0.77606355 0.78389467 0.79180482 0.79979324
  0.80782357 0.81566173 0.8230812  0.83127853 0.8396508  0.
  0.85321447 0.85324447 0.85644149]
 [0.76829658 0.77604799 0.78387688 0.79180482 0.79979476 0.80786555
  0.81600049 0.82396459 0.83129019 0.83966544 0.84813916 0.85654249
  0.86257645 0

simulate optimal policy


In [25]:
done = False
state = env.reset()
while not done:
    action = best_actions[state[0],state[1]]
    print("Action:",env.ACTION_NAMES[action])
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    env.render()


Action: RIGHT
|___| A |___|___|___|___|___|___|___|___|___|///|___|___|___|
|___|___|___|___|___|___|___|___|___|___|___|___|___|___|___|
|___|___|___|___|___|___|___|___|___|___|___|___|___|___|___|
|___|___|___|___|___|___|___|___|___|___|___|___|___|///|___|
|///|___|___|___|___|///|___|___|___|///|___|///|___|___|___|
|___|___|___|___|___|___|___|___|___|___|___|___|___|___|___|
|___|___|___|___|___|___|___|___|___|___|___|___|___|___|___|
|___|___|___|___|///|___|___|___|___|___|___|___|___|___|___|
|___|___|___|___|___|___|___|___|___|___|___|___|___|___|___|
|___|___|___|___|___|___|___|___|___|___|___|___|___|___|___|
|___|___|___|///|///|___|___|///|___|___|___|___|___|___|___|
|___|___|___|___|___|___|___|___|___|___|___|___|___|___|___|
|___|___|___|___|___|___|___|___|___|___|///|///|___|___|___|
|___|___|___|___|___|___|___|___|___|///|___|___|___|___|___|
|___|___|___|///|___|___|___|___|___|___|___|///|___|___| G |


Action: RIGHT
|___| A |___|___|___|___|___|___|___|___

In [None]:
from PIL import Image, ImageDraw

# Crea l'environment
env = NonDeterministicGridWorld(3, 5)

def create_grid_from_env(env, image_paths, output_path):
    """
    Crea un'immagine della griglia usando le informazioni dell'environment
    """
    img_dict = {
        'cell': Image.open(image_paths[0]),
        'start': Image.open(image_paths[1]),
        'obstacle': Image.open(image_paths[2]),
        'end': Image.open(image_paths[3])
    }
    
    # Crea l'immagine della griglia
    grid_img = Image.new('RGB', (env.width * 100, env.height * 100), color='white')
    
    # Itera su tutte le celle
    for r in range(env.height):
        for c in range(env.width):
            # Determina il tipo di cella
            if r == 0 and c == 0:
                cell_type = 'start'
            elif r == env.end_state[0] and c == env.end_state[1]:
                cell_type = 'end'
            elif env.obstacles[r, c] == 1:
                cell_type = 'obstacle'
            else:
                cell_type = 'cell'
            
            # Incolla l'immagine nella posizione corretta (colonna * 100, riga * 100)
            grid_img.paste(img_dict[cell_type], (c * 100, r * 100))
    
    grid_img.save(output_path)
    return grid_img

# Crea l'immagine della griglia
image_paths = ['cell.jpg', 'start.jpg', 'obstacle.jpg', 'end.jpg']
grid_img = create_grid_from_env(env, image_paths, 'grid_from_env.png')
print(f"Griglia creata: {env.height}x{env.width} con {int(env.obstacles.sum())} ostacoli")
env.render()



done = False
state = env.reset()
while not done:
    action = best_actions[state[0],state[1]]
    print("Action:",env.ACTION_NAMES[action])
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    

Griglia creata: 5x3 con 4 ostacoli
| A |///|___|
|///|___|___|
|___|___|///|
|___|///|___|
|___|___| G |




In [44]:
def create_red_circle_image(path='pawn_circle.png'):
    """Crea un'immagine 100x100 con un cerchio rosso 80x80 su sfondo trasparente"""
    img = Image.new('RGBA', (100, 100), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img)
    draw.ellipse((10, 10, 90, 90), fill=(255, 0, 0, 255))
    img.save(path)
    return img

def animate_pawn_on_grid(env, image_paths, pawn_img, output_gif):
    """
    Crea una GIF animata che sposta il cerchio rosso su ogni cella della griglia
    """
    img_dict = {
        'cell': Image.open(image_paths[0]),
        'start': Image.open(image_paths[1]),
        'obstacle': Image.open(image_paths[2]),
        'end': Image.open(image_paths[3])
    }
    
    frames = []
    
    # Itera su tutte le celle della griglia
    for r in range(env.height):
        for c in range(env.width):
            # Crea un frame per questa posizione
            frame = Image.new('RGB', (env.width * 100, env.height * 100), color='white')
            
            # Disegna tutte le celle della griglia
            for row in range(env.height):
                for col in range(env.width):
                    # Determina il tipo di cella
                    if row == 0 and col == 0:
                        cell_type = 'start'
                    elif row == env.end_state[0] and col == env.end_state[1]:
                        cell_type = 'end'
                    elif env.obstacles[row, col] == 1:
                        cell_type = 'obstacle'
                    else:
                        cell_type = 'cell'
                    
                    # Incolla l'immagine della cella
                    frame.paste(img_dict[cell_type], (col * 100, row * 100))
            
            # Sovrapponi il cerchio rosso sulla posizione corrente (c * 100, r * 100)
            frame.paste(pawn_img, (c * 100, r * 100), pawn_img)
            frames.append(frame)
    
    # Salva la GIF
    frames[0].save(output_gif, save_all=True, append_images=frames[1:], duration=200, loop=0)
    print(f"GIF creata: {output_gif} con {len(frames)} frames")

# Crea il cerchio rosso
pawn_img = create_red_circle_image()

# Crea la GIF animata
image_paths = ['cell.jpg', 'start.jpg', 'obstacle.jpg', 'end.jpg']
animate_pawn_on_grid(env, image_paths, pawn_img, 'animazione_griglia.gif')

GIF creata: animazione_griglia.gif con 15 frames


In [48]:
def animate_optimal_policy(env, best_actions, image_paths, pawn_img, output_gif):
    """
    Crea una GIF animata che mostra il cerchio rosso seguire la policy ottimale
    """
    img_dict = {
        'cell': Image.open(image_paths[0]),
        'start': Image.open(image_paths[1]),
        'obstacle': Image.open(image_paths[2]),
        'end': Image.open(image_paths[3])
    }
    
    frames = []
    
    # Simula l'episodio seguendo la policy ottimale
    done = False
    state = env.reset()
    step_count = 0
    max_steps = env.height * env.width  # Limite per evitare loop infiniti
    
    while not done and step_count < max_steps:
        # Crea un frame per questa posizione
        frame = Image.new('RGB', (env.width * 100, env.height * 100), color='white')
        
        # Disegna tutte le celle della griglia
        for row in range(env.height):
            for col in range(env.width):
                # Determina il tipo di cella
                if row == 0 and col == 0:
                    cell_type = 'start'
                elif row == env.end_state[0] and col == env.end_state[1]:
                    cell_type = 'end'
                elif env.obstacles[row, col] == 1:
                    cell_type = 'obstacle'
                else:
                    cell_type = 'cell'
                
                # Incolla l'immagine della cella
                frame.paste(img_dict[cell_type], (col * 100, row * 100))
        
        # Sovrapponi il cerchio rosso sulla posizione corrente dello stato
        frame.paste(pawn_img, (state[1] * 100, state[0] * 100), pawn_img)
        frames.append(frame)
        
        # Prendi la migliore azione per questo stato
        action = best_actions[state[0], state[1]]
        print(f"Step {step_count}: State [{state[0]},{state[1]}], Action: {env.ACTION_NAMES[action]}")
        
        # Esegui l'azione
        state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        step_count += 1
    
    # Aggiungi un ultimo frame con lo stato finale
    frame = Image.new('RGB', (env.width * 100, env.height * 100), color='white')
    for row in range(env.height):
        for col in range(env.width):
            if row == 0 and col == 0:
                cell_type = 'start'
            elif row == env.end_state[0] and col == env.end_state[1]:
                cell_type = 'end'
            elif env.obstacles[row, col] == 1:
                cell_type = 'obstacle'
            else:
                cell_type = 'cell'
            frame.paste(img_dict[cell_type], (col * 100, row * 100))
    frame.paste(pawn_img, (state[1] * 100, state[0] * 100), pawn_img)
    frames.append(frame)
    
    # Salva la GIF
    if len(frames) > 0:
        frames[0].save(output_gif, save_all=True, append_images=frames[1:], duration=300, loop=0)
        print(f"\nGIF creata: {output_gif} con {len(frames)} frames (percorso ottimale)")
    else:
        print("Nessun frame generato!")


env = NonDeterministicGridWorld(15,15)
# Crea il cerchio rosso
pawn_img = create_red_circle_image()

# Crea la GIF animata seguendo la policy ottimale
image_paths = ['cell.jpg', 'start.jpg', 'obstacle.jpg', 'end.jpg']
animate_optimal_policy(env, best_actions, image_paths, pawn_img, 'optimal_policy.gif')

Step 0: State [0,0], Action: RIGHT
Step 1: State [0,0], Action: RIGHT
Step 2: State [0,0], Action: RIGHT
Step 3: State [1,1], Action: DOWN
Step 4: State [2,1], Action: DOWN
Step 5: State [3,1], Action: RIGHT
Step 6: State [3,2], Action: DOWN
Step 7: State [3,2], Action: DOWN
Step 8: State [4,3], Action: DOWN
Step 9: State [5,3], Action: RIGHT
Step 10: State [5,4], Action: RIGHT
Step 11: State [5,5], Action: RIGHT
Step 12: State [5,6], Action: RIGHT
Step 13: State [5,7], Action: DOWN
Step 14: State [6,7], Action: RIGHT
Step 15: State [6,8], Action: DOWN
Step 16: State [7,8], Action: RIGHT
Step 17: State [7,9], Action: DOWN
Step 18: State [8,9], Action: RIGHT
Step 19: State [8,10], Action: DOWN
Step 20: State [9,10], Action: RIGHT
Step 21: State [10,11], Action: RIGHT
Step 22: State [9,12], Action: DOWN
Step 23: State [10,12], Action: DOWN
Step 24: State [11,12], Action: DOWN
Step 25: State [12,12], Action: DOWN
Step 26: State [13,11], Action: RIGHT
Step 27: State [13,12], Action: RIGHT


In [None]:
import gymnasium as gym
import swig


# Initialise the environment
env = gym.make("LunarLander-v3", render_mode="human")

# Reset the environment to generate the first observation
observation, info = env.reset(seed=42)
for _ in range(1000):
    # this is where you would insert your policy
    action = env.action_space.sample()

    # step (transition) through the environment with the action
    # receiving the next observation, reward and if the episode has terminated or truncated
    observation, reward, terminated, truncated, info = env.step(action)

    # If the episode has ended then we can reset to start a new episode
    if terminated or truncated:
        observation, info = env.reset()

env.close()

DependencyNotInstalled: Box2D is not installed, you can install it by run `pip install swig` followed by `pip install "gymnasium[box2d]"`