
# Gridworld (5×5): Value Iteration → V* and Greedy Policy

- Actions: ↑ ↓ ← →
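- Hitting a wall (a move that would leave the grid): the agent stays in place and receives reward −1; every other ordinary move has reward 0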
- Teleporters: from **A**, every action moves the agent to **A'** with reward +10; from **B**, every action moves it to **B'** with reward +5
- Discount: γ = 0.9


In [None]:

import numpy as np
import matplotlib.pyplot as plt

# Grid dimensions (rows, columns) and the discount factor.
H = 5
W = 5
gamma = 0.9

# Teleporter cells as (row, col). Any action taken in A (resp. B) lands in
# A_prime (resp. B_prime) and pays A_reward (resp. B_reward).
A = (0, 1)
A_prime = (4, 1)
B = (0, 3)
B_prime = (2, 3)
A_reward = 10.0
B_reward = 5.0

# Moves as (d_row, d_col); A_NAMES[i] is the arrow glyph for ACTIONS[i].
ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)]
A_NAMES = ['↑', '↓', '←', '→']

def in_bounds(r, c):
    """Return True if (r, c) is a cell of the H x W grid, False otherwise."""
    row_inside = 0 <= r < H
    col_inside = 0 <= c < W
    return row_inside and col_inside

def step(state, aidx):
    """Apply one action and return ``(next_state, reward)``.

    Args:
        state: Current cell as a ``(row, col)`` pair.
        aidx: Index into ``ACTIONS`` selecting the move.

    Returns:
        A tuple ``((next_row, next_col), reward)``. From ``A`` or ``B`` the
        action is ignored and the agent is teleported with the matching
        reward. Otherwise a move inside the grid costs nothing, and a move
        that would leave the grid keeps the agent in place with reward -1.
    """
    row, col = state
    here = (row, col)

    # Teleporter cells override whatever action was chosen.
    if here == A:
        return A_prime, A_reward
    if here == B:
        return B_prime, B_reward

    d_row, d_col = ACTIONS[aidx]
    target = (row + d_row, col + d_col)
    if not in_bounds(*target):
        # Bumped into the boundary: no movement, penalised.
        return here, -1.0
    return target, 0.0

def _q_values(values, r, c):
    """Return the one-step lookahead value of every action from cell (r, c).

    Entry ``i`` is ``reward + gamma * values[next_state]`` for ``ACTIONS[i]``,
    with the transition supplied by ``step``.
    """
    outcomes = (step((r, c), aidx) for aidx in range(len(ACTIONS)))
    return [reward + gamma * values[nr, nc] for (nr, nc), reward in outcomes]


def value_iteration(theta=1e-6, max_iter=10000):
    """Run value iteration on the gridworld and extract a greedy policy.

    Each sweep is synchronous: every cell is backed up from a frozen copy of
    the previous sweep's values. Iteration stops once the largest change in
    a sweep falls below ``theta`` or after ``max_iter`` sweeps, whichever
    comes first.

    Args:
        theta: Convergence threshold on the largest per-sweep value change.
        max_iter: Upper bound on the number of sweeps.

    Returns:
        ``(V, Pi)`` where ``V`` is an ``(H, W)`` float array of state values
        and ``Pi`` is an ``(H, W)`` int array of indices into ``ACTIONS``.
        Ties between equally good actions go to the lowest index (the first
        maximum found by ``np.argmax``).
    """
    V = np.zeros((H, W), dtype=float)
    for _ in range(max_iter):
        delta = 0.0
        V_old = V.copy()  # freeze last sweep's values for a synchronous backup
        for r in range(H):
            for c in range(W):
                V[r, c] = np.max(_q_values(V_old, r, c))
                delta = max(delta, abs(V[r, c] - V_old[r, c]))
        if delta < theta:
            break

    # Greedy policy with respect to the final value estimate.
    Pi = np.zeros((H, W), dtype=int)
    for r in range(H):
        for c in range(W):
            Pi[r, c] = int(np.argmax(_q_values(V, r, c)))
    return V, Pi

# Solve the gridworld with the default tolerance and iteration cap.
V, Pi = value_iteration()
# Bare expression: in a notebook cell this displays the value array as output.
V


In [None]:

# Heatmap of the state values, with each cell's value printed on top of it.
plt.figure()
plt.imshow(V, origin="upper")
plt.colorbar(label="V*")
plt.title("Optimal State Values V*")
plt.xticks(range(W))
plt.yticks(range(H))
# plt.text takes (x, y), i.e. (column, row).
for r, c in np.ndindex(H, W):
    plt.text(c, r, f"{V[r, c]:.1f}", ha="center", va="center")
plt.show()


In [None]:

# Greedy action per cell, drawn as an arrow over the value heatmap.
plt.figure()
plt.imshow(V, origin="upper")
plt.title("Greedy Policy from V*")
plt.xticks(range(W))
plt.yticks(range(H))
# plt.text takes (x, y), i.e. (column, row); Pi holds indices into A_NAMES.
for r, c in np.ndindex(H, W):
    arrow = A_NAMES[Pi[r, c]]
    plt.text(c, r, arrow, ha="center", va="center")
plt.show()



**Exercises**
1. Change γ and re-run. How do values/policy shift?
2. Change the wall penalty to −2 or −5.
3. Remove the teleporters A and B, then compare the resulting values and policy with those of the teleporter world.
