In [None]:
import numpy as np
import sys
import os

# Make the parent of the current working directory importable so that the
# `lib` package imported below can be resolved from this notebook.
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_of_cwd)

# Append only once so re-running the cell does not grow sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

print("Updated sys.path:", sys.path)



from lib.envs.gridworld import GridworldEnv

Updated sys.path: ['/Users/macbook/Downloads/reinforcement-learning/DP', '/opt/anaconda3/lib/python312.zip', '/opt/anaconda3/lib/python3.12', '/opt/anaconda3/lib/python3.12/lib-dynload', '', '/opt/anaconda3/lib/python3.12/site-packages', '/opt/anaconda3/lib/python3.12/site-packages/aeosa', '/Users/macbook/Downloads', '/Users/macbook', '/Users']


In [236]:
# Build the gridworld environment using the constructor's default arguments.
env = GridworldEnv()

In [None]:
def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):
    """
    Evaluate a policy given an environment and a full description of the environment's dynamics.

    Runs iterative policy evaluation with in-place sweeps. On every sweep each
    state's value is replaced by the Bellman expectation backup

        V(s) = sum_a policy[s][a] * sum_(p, s', r) p * (r + discount_factor * V(s'))

    where the inner sum ranges over every transition tuple in env.P[s][a].
    Sweeps repeat until the largest per-state change is below ``theta``.

    Args:
        policy: [S, A] shaped matrix representing the policy; policy[s][a] is the
            probability of taking action a in state s.
        env: OpenAI env. env.P represents the transition probabilities of the environment.
            env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).
            env.nS is a number of states in the environment.
            env.nA is a number of actions in the environment.
        discount_factor: Gamma discount factor.
        theta: We stop evaluation once our value function change is less than theta for all states.

    Returns:
        Vector of length env.nS representing the value function.
    """
    # Start with an all-zero value function.
    V = np.zeros(env.nS)

    while True:
        # Largest absolute update seen during this sweep. Tracking the maximum
        # means that once it is below theta, every state's change is below theta.
        max_change = 0.0

        for s in range(env.nS):
            new_value = 0.0
            for a in range(env.nA):
                # Probability that the policy picks action a in state s.
                action_prob = policy[s][a]

                # Accumulate over *every* possible outcome of (s, a); the
                # transition probability weights both the immediate reward and
                # the discounted value of the successor state.
                for trans_prob, next_state, reward, _done in env.P[s][a]:
                    new_value += action_prob * trans_prob * (
                        reward + discount_factor * V[next_state]
                    )

            max_change = max(max_change, abs(new_value - V[s]))
            # In-place update: later states in this sweep already see the new value.
            V[s] = new_value

        if max_change < theta:
            break

    return V

In [238]:
# Uniform random policy: in every state each action is chosen with probability 1 / nA.
random_policy = np.full((env.nS, env.nA), 1.0 / env.nA)

# State values under that policy.
v = policy_eval(random_policy, env)

print(v)

1.8984375
1.724609375
1.472412109375
1.4061737060546875
1.3317079544067383
1.2421786189079285
1.1491830237209797
1.0584387693088502
0.9725181825488107
0.8924059502996897
0.8183112404493045
0.7500779695562692
0.6873890613115314
0.6298669391432661
0.5771221272796652
0.5287760186094594
0.48447085001008716
0.4438733987055308
0.4066756472311397
0.372594031842965
0.3413680735003908
0.3127587833753509
0.2865470318891994
0.26253196938627354
0.24052953664261167
0.2203710789440123
0.2019020656849193
0.18498091196034494
0.16947789626083676
0.1552741675450946
0.1422608348641461
0.13033813295070473
0.11941465757337255
0.10940666489176465
0.10023742949022107
0.0918366561932622
0.08413994116554946
0.07708827817126718
0.07062760621066388
0.06470839506731352
0.05928526558949798
0.05431664179498341
0.049764432132345604
0.04559373745524553
0.04177258347101542
0.03827167561236067
0.035064174452937635
0.03212548994506648
0.029433092902149127
0.02696634228047401
0.024706326936318135
0.022635720645226343
0.0

In [239]:
# Test: the evaluated value function must match the reference values to two
# decimal places.
# Reference values are listed four per row (16 states; presumably a 4x4 grid — confirm).
expected_v = np.array([
      0, -14, -20, -22,
    -14, -18, -20, -20,
    -20, -20, -18, -14,
    -22, -20, -14,   0,
])
np.testing.assert_array_almost_equal(v, expected_v, decimal=2)