In [6]:
import numpy as np
import matplotlib.pyplot as plt
from grid_world import standard_grid

In [7]:
# Convergence threshold: a policy-evaluation loop stops once the largest
# change in any V(s) during one full sweep over the states is below this value.
SMALL_ENOUGH = 1e-3

In [12]:
def print_values(V, g):
    """Print the value table as a text grid, one line per grid row.

    V maps (row, col) positions to numbers; positions missing from V are
    shown as 0. g must expose integer attributes `rows` and `cols`.
    Each cell is rendered with two decimals followed by '|'.
    """
    rule = "---------------------------"
    for row in range(g.rows):
        print(rule)
        cells = []
        for col in range(g.cols):
            value = V.get((row, col), 0)
            # A minus sign occupies one column, so only values that compare
            # >= 0 receive a leading space to keep the columns aligned.
            pad = " " if value >= 0 else ""
            cells.append("%s%.2f|" % (pad, value))
        print("".join(cells))


def print_policy(P, g):
    """Print the policy as a text grid, one line per grid row.

    P maps (row, col) positions to an action label; positions missing from
    P are shown as a single blank. g must expose integer attributes `rows`
    and `cols`.
    """
    rule = "---------------------------"
    for row in range(g.rows):
        print(rule)
        line = ""
        for col in range(g.cols):
            # each cell is the label padded by two spaces on both sides
            line += "  %s  |" % P.get((row, col), ' ')
        print(line)

In [15]:
grid = standard_grid()

# States are grid positions (i, j): there is a single "game piece" that
# occupies exactly one position at a time.
states = grid.all_states()

### uniformly random actions ###
# Discount factor. Assigned once at cell level (not inside the initialisation
# loop) so it is defined even when `states` is empty.
gamma = 1.0

# initialize V(s) = 0 for every state
V = {s: 0 for s in states}

# Number of full sweeps over the states performed before convergence.
loop_count = 0
# Iterative policy evaluation: sweep all states, updating V in place, until
# the largest change within one sweep falls below SMALL_ENOUGH.
while True:
    loop_count += 1
    biggest_change = 0
    for s in states:
        old_v = V[s]

        # Only states that have an entry in grid.actions are updated; every
        # other state keeps its initial V(s) = 0.
        if s in grid.actions:

            new_v = 0 # accumulates the expectation over the available actions
            p_a = 1.0 / len(grid.actions[s]) # each action has equal probability
            for a in grid.actions[s]:
                # Reset the grid to s before every trial move, because move()
                # changes the grid's current state.
                grid.set_state(s)
                r = grid.move(a)
                new_v += p_a * (r + gamma * V[grid.current_state()])
            V[s] = new_v
            biggest_change = max(biggest_change, np.abs(old_v - V[s]))

    if biggest_change < SMALL_ENOUGH:
        break
print("values for uniformly random actions:")
print_values(V, grid)
print("\n\n")

values for uniformly random actions:
---------------------------
-0.03| 0.09| 0.22| 0.00|
---------------------------
-0.16| 0.00|-0.44| 0.00|
---------------------------
-0.29|-0.41|-0.54|-0.77|





In [16]:
# Show how many full sweeps over the states the evaluation loop above performed.
loop_count

25

In [13]:
### fixed policy ###
# Deterministic policy: exactly one action label per listed position (i, j).
policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U',
}
print_policy(policy, grid)

# every state starts at V(s) = 0
V = dict.fromkeys(states, 0)

# discount factor below 1, to see how V(s) changes as we get further away
# from the reward
gamma = 0.9

# Sweep the states, updating V in place, until the largest change seen in a
# single sweep drops below SMALL_ENOUGH.
converged = False
while not converged:
    biggest_change = 0
    for s in states:
        # states without a policy entry are never updated and keep V(s) = 0
        if s not in policy:
            continue

        previous = V[s]
        grid.set_state(s)
        reward = grid.move(policy[s])
        V[s] = reward + gamma * V[grid.current_state()]

        delta = np.abs(previous - V[s])
        if delta > biggest_change:
            biggest_change = delta

    converged = biggest_change < SMALL_ENOUGH

print("values for fixed policy:")
print_values(V, grid)

---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  R  |  U  |
values for fixed policy:
---------------------------
 0.81| 0.90| 1.00| 0.00|
---------------------------
 0.73| 0.00|-1.00| 0.00|
---------------------------
 0.66|-0.81|-0.90|-1.00|
