# Second practical exercise: Grid World and Value iteration

# A deterministic grid world

Finite grid with some obstacles inside. The agent can move up, left, right and down.

![](imgs/grid_world.png)

In [2]:
#import
import gym
import numpy as np
from gym import spaces
import random

In [35]:

# custom 2d grid world enviroment
class GridWorld(gym.Env):
    metadata = {'render.modes': ['console']}

    
    # actions available
    UP = 0
    LEFT = 1
    DOWN = 2
    RIGHT = 3


    def __init__(self, width, height):
        super(GridWorld, self).__init__()
        self.ACTION_NAMES = ["UP", "LEFT", "DOWN", "RIGHT"]
        self.num_actions = 4

        self.size = width * height  # size of the grid world
        self.num_states = self.size
        self.width = width
        self.height = height
        self.num_obstacles = int((width+height)/2)
        self.end_state = np.array([height - 1, width - 1], dtype=np.uint8) # goal state = bottom right cell

        # actions of agents : up, down, left and right
        self.action_space = spaces.Discrete(4)
        # observation : cell indices in the grid
        self.observation_space = spaces.MultiDiscrete([self.height, self.width])

        self.obstacles = np.zeros((height, width))

        for i in range(self.num_obstacles):
            self.obstacles[ random.randrange(height) , random.randrange(width)] = 1

        self.num_steps = 0
        self.max_steps = height*width

        self.current_state = np.zeros((2), np.uint8)#init state = [0,0]

        self.directions = np.array([
            [-1,0], #UP
            [0,-1], #LEFT
            [1,0], #DOWN
            [0,1] #RIGHT
        ])
        
    def step(self, action):
        s_prime = self.transition_function(self.current_state, action)
        reward = self.reward_function(s_prime)
        done = self.termination_condition(s_prime)

        self.current_state = s_prime
        self.num_steps += 1

        return self.current_state, reward, done, None
 
    def valid_act(self, s):
        for obst in self.obstacles:
            if (s==obst):
                return False
        if s[0] < 0 or s[0] > 4:
            return False
        if s[1] < 0 or s[1] > 4: 
            return False
        return True

    def transition_function(self, s, a):
        #s_prime =  np.zeros((2), np.uint8)
        s_prime = self.current_state + self.directions[a]
        if (self.valid_act(s_prime)):
            return s_prime
        print("\n______________non valida________________\n")
        return s
    
    
    def reward_function(self,s):
        if (s == self.end_state).all():
            reward = 1
        else: reward = 0
        return reward
    
    def termination_condition(self, s):
        done = False
        if (s == self.end_state).all():
            done = True
        done = self.num_steps > self.max_steps # or ???

        return done
    
    def reset(self):
        self.current_state = np.zeros((2), np.uint8)

        return self.current_state
    
    def render(self):
        '''
            render the state
        '''

        row = self.current_state[0]
        col = self.current_state[1]

        for r in range(self.height):
            for c in range(self.width):
                if r == row and c == col:
                    print("| A ", end='')
                elif r == self.end_state[0] and c == self.end_state[1]:
                    print("| G ", end='')
                else:
                    if self.obstacles[r,c] == 1:
                        print('|///', end='')
                    else:
                        print('|___', end='')
            print('|')
        print('\n')

Simulate all the four actions

In [36]:
env = GridWorld(3,5)
env.reset()
env.render()

action_sequence = [0,1,2,3]

for a in action_sequence:
    env.step(a)
    env.render()

| A |///|///|
|///|___|___|
|___|///|___|
|___|___|___|
|___|___| G |



______________non valida________________

| A |///|///|
|///|___|___|
|___|///|___|
|___|___|___|
|___|___| G |



______________non valida________________

| A |///|///|
|///|___|___|
|___|///|___|
|___|___|___|
|___|___| G |


|___|///|///|
| A |___|___|
|___|///|___|
|___|___|___|
|___|___| G |


|___|///|///|
|///| A |___|
|___|///|___|
|___|___|___|
|___|___| G |




  if (s==obst):


Simulate a random episode

In [37]:
done = False
while not done:
    action = env.action_space.sample()
    print(env.ACTION_NAMES[action])
    state, reward, done, _ = env.step(action)
    env.render()


LEFT
|___|///|///|
| A |___|___|
|___|///|___|
|___|___|___|
|___|___| G |


LEFT

______________non valida________________

|___|///|///|
| A |___|___|
|___|///|___|
|___|___|___|
|___|___| G |


LEFT

______________non valida________________

|___|///|///|
| A |___|___|
|___|///|___|
|___|___|___|
|___|___| G |


LEFT

______________non valida________________

|___|///|///|
| A |___|___|
|___|///|___|
|___|___|___|
|___|___| G |


RIGHT
|___|///|///|
|///| A |___|
|___|///|___|
|___|___|___|
|___|___| G |


LEFT
|___|///|///|
| A |___|___|
|___|///|___|
|___|___|___|
|___|___| G |


LEFT

______________non valida________________

|___|///|///|
| A |___|___|
|___|///|___|
|___|___|___|
|___|___| G |


UP
| A |///|///|
|///|___|___|
|___|///|___|
|___|___|___|
|___|___| G |


LEFT

______________non valida________________

| A |///|///|
|///|___|___|
|___|///|___|
|___|___|___|
|___|___| G |


RIGHT
|___| A |///|
|///|___|___|
|___|///|___|
|___|___|___|
|___|___| G |


RIGHT
|___|///|

  if (s==obst):


## A non deterministic grid world

The agent goes with probability p to the right cell, with probability 1 - p in a different cell

In [38]:
class NonDeterministicGridWorld(GridWorld):
    def __init__(self, width, height, p=0.8):
        super(NonDeterministicGridWorld, self).__init__(width, height)
        self.probability_right_action = p

    def transition_function(self, s, a):
        s_prime = s + self.directions[a, :]

        #with probability 1 - p diagonal movement
        if random.random() <= 1 - self.probability_right_action:
            if random.random() < 0.5:
                s_prime = s_prime + self.directions[(a+1)%self.num_actions, :]
            else:
                s_prime = s_prime + self.directions[(a-1)%self.num_actions, :]


        if s_prime[0] < self.height and s_prime[1] < self.width and (s_prime >= 0).all():
            if self.obstacles[s_prime[0], s_prime[1]] == 0 :
                return s_prime

        return s


Simulate a random episode

In [6]:
env = NonDeterministicGridWorld(3,5)
env.reset()
env.render()

| A |___|///|
|___|///|___|
|___|///|___|
|___|///|___|
|___|___| G |




In [7]:
done = False
while not done:
    action = env.action_space.sample()
    print(env.ACTION_NAMES[action])
    state, reward, done, _ = env.step(action)
    env.render()

UP
| A |___|///|
|___|///|___|
|___|///|___|
|___|///|___|
|___|___| G |


UP
| A |___|///|
|___|///|___|
|___|///|___|
|___|///|___|
|___|___| G |


DOWN
|___|___|///|
| A |///|___|
|___|///|___|
|___|///|___|
|___|___| G |


LEFT
|___|___|///|
| A |///|___|
|___|///|___|
|___|///|___|
|___|___| G |


DOWN
|___|___|///|
|___|///|___|
| A |///|___|
|___|///|___|
|___|___| G |


RIGHT
|___|___|///|
|___|///|___|
| A |///|___|
|___|///|___|
|___|___| G |


LEFT
|___|___|///|
|___|///|___|
| A |///|___|
|___|///|___|
|___|___| G |


LEFT
|___|___|///|
|___|///|___|
| A |///|___|
|___|///|___|
|___|___| G |


DOWN
|___|___|///|
|___|///|___|
|___|///|___|
| A |///|___|
|___|___| G |


UP
|___|___|///|
|___|///|___|
| A |///|___|
|___|///|___|
|___|___| G |


DOWN
|___|___|///|
|___|///|___|
|___|///|___|
| A |///|___|
|___|___| G |


DOWN
|___|___|///|
|___|///|___|
|___|///|___|
|___|///|___|
| A |___| G |


LEFT
|___|___|///|
|___|///|___|
|___|///|___|
|___|///|___|
| A |___| G |


DOWN