### Iterative policy evaluation

$v_{k+1}(s) = \sum\limits_{a} \pi (a|s) \sum\limits_{s', r} p(s', r|s, a)[r + \gamma v_{k} (s')] $

In [1]:
import numpy as np

In [52]:
# Set up variables for problem
pi = 0.25
trans_mat = 1.0
reward = -1.0
gamma = 1.0
v0 =np.zeros((4, 4), dtype=np.float32)

In [66]:
def iterative_policy_evaluation(v0, iters=1, round_digits=2):
    """Iterative policy evaluation."""
    rows = v0.shape[0]
    cols = v0.shape[1]

    v = v0.copy()
    for i in range(iters):
        v_prev = v.copy()
        v = np.zeros_like(v_prev)

        for r in range(rows):
            for c in range(cols):
                if (r==0 and c==0) or (r==rows-1 and c==cols-1):
                    continue
                v_s = 0
                v_s += pi * trans_mat * (reward + (gamma * v_prev[max(r-1, 0), c]))
                v_s += pi * trans_mat * (reward + (gamma * v_prev[r, min(c+1, cols-1)]))
                v_s += pi * trans_mat * (reward + (gamma * v_prev[min(r+1, rows-1), c]))
                v_s += pi * trans_mat * (reward + (gamma * v_prev[r, max(c-1, 0)]))

                v[r, c] = np.round(v_s, round_digits)
        
        print(f'Values after iteration {i}')
        print(v, '\n')
        
    return v

In [67]:
v_new = iterative_policy_evaluation(v0, iters=4)

Values after iteration 0
[[ 0. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1.  0.]] 

Values after iteration 1
[[ 0.   -1.75 -2.   -2.  ]
 [-1.75 -2.   -2.   -2.  ]
 [-2.   -2.   -2.   -1.75]
 [-2.   -2.   -1.75  0.  ]] 

Values after iteration 2
[[ 0.   -2.44 -2.94 -3.  ]
 [-2.44 -2.88 -3.   -2.94]
 [-2.94 -3.   -2.88 -2.44]
 [-3.   -2.94 -2.44  0.  ]] 

Values after iteration 3
[[ 0.   -3.07 -3.85 -3.97]
 [-3.07 -3.72 -3.91 -3.85]
 [-3.85 -3.91 -3.72 -3.07]
 [-3.97 -3.85 -3.07  0.  ]] 

