<a href="https://colab.research.google.com/github/dhyannn/reinforcment-learing/blob/main/514_RLlab6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

class GridworldEnv:
    """Deterministic grid world with a fixed start cell, one goal and obstacles.

    States are ``(x, y)`` tuples where ``x`` indexes the first grid axis (the
    printed row) and ``y`` the second (the printed column). The agent always
    starts at ``(0, 0)``. Obstacles do not block movement; stepping onto one
    simply yields a negative reward.

    Args:
        grid_size: ``(rows, cols)`` extent of the grid.
        goal: Terminal cell; reaching it gives reward 1 and ends the episode.
        obstacles: Iterable of penalised cells. ``None`` (the default) means
            no obstacles.
    """

    # Offset applied to (x, y) by each recognised action name.
    _MOVES = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
    }

    def __init__(self, grid_size=(5, 5), goal=(4, 4), obstacles=None):
        self.grid_size = grid_size
        self.goal = goal
        # Build a fresh list per instance. A literal ``[]`` default would be
        # shared by every environment created without an explicit argument.
        self.obstacles = [] if obstacles is None else list(obstacles)
        self.state = (0, 0)  # Starting position

    def reset(self):
        """Move the agent back to the start cell and return that state."""
        self.state = (0, 0)
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(new_state, reward, done)``.

        ``action`` is one of ``'up'``, ``'down'``, ``'left'``, ``'right'``;
        any other value leaves the position unchanged. Moves that would leave
        the grid are clamped to the border. The reward is 1 on the goal,
        -1 on an obstacle and 0 otherwise; ``done`` is True only on the goal.
        """
        dx, dy = self._MOVES.get(action, (0, 0))
        x, y = self.state

        # Clamp to the grid so the agent cannot walk off an edge.
        x = max(0, min(self.grid_size[0] - 1, x + dx))
        y = max(0, min(self.grid_size[1] - 1, y + dy))
        new_state = (x, y)

        done = new_state == self.goal
        if done:
            reward = 1
        elif new_state in self.obstacles:
            reward = -1
        else:
            reward = 0

        self.state = new_state
        return new_state, reward, done

    def render(self):
        """Print the grid: 1 = goal, -1 = obstacle, 2 = agent, 0 = empty."""
        grid = np.zeros(self.grid_size)
        grid[self.goal] = 1
        for obs in self.obstacles:
            grid[obs] = -1
        # Drawn last so the agent marker wins over whatever cell it stands on.
        x, y = self.state
        grid[x, y] = 2  # Agent position
        print(grid)

# Smoke-test the environment: one obstacle at (2, 2), agent at the start cell.
env = GridworldEnv(obstacles=[(2, 2)])
state = env.reset()
env.render()

# Take a single step to the right and show the resulting transition.
state, reward, done = env.step('right')
print(
    "State:", state,
    "Reward:", reward,
    "Done:", done,
)


[[ 2.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.]]
State: (0, 1) Reward: 0 Done: False


In [None]:
def value_iteration(env, gamma=0.9, theta=1e-6, success_prob=0.8):
    """Compute optimal state values for a grid world by value iteration.

    The transition model is built here from ``env.grid_size``, ``env.goal``
    and ``env.obstacles`` only; ``env`` is never stepped or mutated. Moves are
    clamped at the grid border, entering the goal gives reward 1, entering an
    obstacle gives reward -1, and every other move gives reward 0. The goal is
    terminal and keeps value 0.

    Stochasticity: the chosen action is executed with probability
    ``success_prob``; the remaining probability is split evenly over the other
    three actions. NOTE(review): the even split is a modelling assumption --
    the original code only stated "80% prob" for the intended move. Pass
    ``success_prob=1.0`` for fully deterministic moves.

    Args:
        env: Object exposing ``grid_size`` (rows, cols), ``goal`` and
            ``obstacles``.
        gamma: Discount factor applied to the successor state's value.
        theta: Convergence threshold; sweeps stop once the largest value
            change in a sweep falls below it.
        success_prob: Probability that the intended action is the one
            actually executed. Must lie in [0, 1].

    Returns:
        Dict mapping every ``(x, y)`` state to its estimated optimal value.

    Raises:
        ValueError: If ``success_prob`` is outside [0, 1].
    """
    if not 0.0 <= success_prob <= 1.0:
        raise ValueError("success_prob must be in [0, 1]")

    rows, cols = env.grid_size
    moves = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}
    actions = list(moves)
    slip_prob = (1.0 - success_prob) / (len(actions) - 1)

    def transition(state, action):
        """Return (next_state, reward) for actually executing ``action``."""
        dx, dy = moves[action]
        x = max(0, min(rows - 1, state[0] + dx))
        y = max(0, min(cols - 1, state[1] + dy))
        next_state = (x, y)
        if next_state == env.goal:
            reward = 1
        elif next_state in env.obstacles:
            reward = -1
        else:
            reward = 0
        return next_state, reward

    states = [(x, y) for x in range(rows) for y in range(cols)]
    # The dynamics never change, so tabulate them once instead of per sweep.
    model = {(s, a): transition(s, a) for s in states for a in actions}
    V = {state: 0.0 for state in states}

    def action_value(state, intended):
        """Expected one-step return of choosing ``intended`` in ``state``."""
        total = 0.0
        for executed in actions:
            prob = success_prob if executed == intended else slip_prob
            next_state, reward = model[state, executed]
            total += prob * (reward + gamma * V[next_state])
        return total

    while True:
        delta = 0.0
        for state in states:
            if state == env.goal:
                continue  # terminal state: value stays 0
            previous = V[state]
            # Bellman optimality backup: best expected return over actions.
            V[state] = max(action_value(state, action) for action in actions)
            delta = max(delta, abs(previous - V[state]))
        if delta < theta:
            break
    return V