In [3]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym # Import the necessary libraries


In [4]:
# Create the 4x4, non-slippery FrozenLake environment.
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode="human")
# The reset call is currently left disabled:
# state, info = env.reset()
# Number of discrete states and number of discrete actions.
state_space = env.observation_space.n
action_space = env.action_space.n

# Policy Iteration 

In [39]:
# Arrow glyph used to display each action index when a policy is printed.
policy_icons = dict([
    (3, "⬆️"),  # up
    (2, "➡️"),  # right
    (1, "⬇️"),  # down
    (0, "⬅️"),  # left
])

In [40]:
# Draw one random action per state, then show that policy as arrow icons.
policy = np.random.randint(0, action_space, size=state_space)
policied = np.array([policy_icons[chosen] for chosen in policy], dtype=object)
policied

array(['⬇️', '⬆️', '⬇️', '⬅️', '➡️', '➡️', '⬇️', '➡️', '⬇️', '⬆️', '⬅️',
       '⬅️', '⬇️', '➡️', '⬅️', '➡️'], dtype=object)

In [41]:
# Rebind `env` to its `.unwrapped` form; the cells below index the
# transition table directly as `env.P[state][action]`.
env = env.unwrapped

In [42]:
def calculate_value_function(policy, env, gamma=0.9, theta=1e-10):
    """Evaluate a deterministic policy with in-place iterative policy evaluation.

    Args:
        policy: Sequence indexed by state giving the action taken in that state.
        env: Environment exposing a tabular model ``P`` where
            ``P[state][action]`` is a list of
            ``(prob, next_state, reward, done)`` tuples, plus
            ``observation_space.n``. A wrapped environment is accepted: the
            model is read from ``env.unwrapped`` when that attribute exists.
        gamma: Discount factor applied to the value of the next state.
        theta: Sweeps stop once the largest per-state change in a sweep is
            below this threshold.

    Returns:
        1-D float array holding the estimated value of every state under
        ``policy``.
    """
    # Resolve the base environment here so the function does not depend on the
    # caller having rebound ``env = env.unwrapped`` beforehand.
    model = getattr(env, "unwrapped", env)
    transitions = model.P
    n_states = model.observation_space.n

    value_function = np.zeros(n_states)
    while True:
        delta = 0
        for state in range(n_states):
            v = 0
            for prob, next_state, reward, done in transitions[state][policy[state]]:
                # ``(not done)`` zeroes the bootstrap term on terminal transitions.
                v += prob * (reward + gamma * value_function[next_state] * (not done))
            delta = max(delta, abs(value_function[state] - v))
            # In-place update: later states in this sweep already see the new value.
            value_function[state] = v
        if delta < theta:
            break
    return value_function


In [43]:
def improve_policy(value_function, env, gamma=0.9):
    """Return the policy that is greedy with respect to ``value_function``.

    Args:
        value_function: Array indexed by state with the current value estimates.
        env: Environment exposing a tabular model ``P`` where
            ``P[state][action]`` is a list of
            ``(prob, next_state, reward, done)`` tuples, plus
            ``observation_space.n`` and ``action_space.n``. A wrapped
            environment is accepted: the model is read from ``env.unwrapped``
            when that attribute exists.
        gamma: Discount factor applied to the value of the next state.

    Returns:
        1-D int array giving, for every state, the action with the highest
        one-step lookahead value. Ties resolve to the lowest action index
        (``np.argmax`` behaviour).
    """
    # Resolve the base environment here so the function does not depend on the
    # caller having rebound ``env = env.unwrapped`` beforehand.
    model = getattr(env, "unwrapped", env)
    transitions = model.P
    n_states = model.observation_space.n
    n_actions = model.action_space.n

    policy = np.zeros(n_states, dtype=int)
    for state in range(n_states):
        action_values = np.zeros(n_actions)
        for action in range(n_actions):
            for prob, next_state, reward, done in transitions[state][action]:
                # ``(not done)`` zeroes the bootstrap term on terminal transitions.
                action_values[action] += prob * (
                    reward + gamma * value_function[next_state] * (not done)
                )
        policy[state] = np.argmax(action_values)
    return policy


In [44]:
def policy_iteration(env, gamma=0.9, theta=1e-10):
    """Run policy iteration from a random initial policy.

    Alternates policy evaluation (``calculate_value_function``) and greedy
    policy improvement (``improve_policy``) until the policy is stable.

    Args:
        env: Environment passed through to the evaluation and improvement
            steps; must also expose ``action_space.n`` and
            ``observation_space.n``.
        gamma: Discount factor.
        theta: Convergence threshold for policy evaluation; also used as the
            minimum value gain that counts as a real improvement.

    Returns:
        Tuple ``(policy, value_function)``: the final policy (one action index
        per state) and the state values of that policy.
    """
    policy = np.random.randint(env.action_space.n, size=env.observation_space.n)
    value_function = calculate_value_function(policy, env, gamma, theta)
    while True:
        new_policy = improve_policy(value_function, env, gamma)
        if np.array_equal(policy, new_policy):
            break
        new_value_function = calculate_value_function(new_policy, env, gamma, theta)
        # Comparing policies alone can loop forever when improvement keeps
        # switching between equally good policies. If the greedy policy's
        # values are not better than the current ones, the current values
        # already satisfy the optimality condition, so stop as well.
        gain = np.max(new_value_function - value_function)
        policy, value_function = new_policy, new_value_function
        if gain <= theta:
            break
    return policy, value_function


In [45]:
# Solve the MDP, then print the resulting policy (as arrows) and its state
# values laid out on the map grid.
policy, value_function = policy_iteration(env)
policied = np.array([policy_icons[action] for action in policy], dtype=object)

# Take the grid shape from the map description instead of hard-coding 4x4, so
# the cell keeps working when a different map is selected in gym.make().
grid_shape = env.unwrapped.desc.shape

print("Optimal Policy:")
print(policied.reshape(grid_shape))
print("\nValue Function:")
print(value_function.reshape(grid_shape))
# The environment is not closed here: later cells still read env.P.


Optimal Policy:
[['⬇️' '➡️' '⬇️' '⬅️']
 ['⬇️' '⬅️' '⬇️' '⬅️']
 ['➡️' '⬇️' '⬇️' '⬅️']
 ['⬅️' '➡️' '➡️' '⬅️']]

Value Function:
[[0.59049 0.6561  0.729   0.6561 ]
 [0.6561  0.      0.81    0.     ]
 [0.729   0.81    0.9     0.     ]
 [0.      0.9     1.      0.     ]]


In [46]:
# Inspect the model entry for state 0, action 1. Each entry is unpacked as
# (prob, next_state, reward, done) by the evaluation/improvement functions.
env.P[0][1]

[(1.0, 4, 0.0, False)]

In [28]:
# Evaluate the current `policy` with the default gamma/theta and display it.
# NOTE(review): this cell's execution count (28) is lower than that of the
# cells above it, so the output shown below presumably comes from an earlier
# `policy`, not the one returned by policy_iteration — re-run to confirm.
v = calculate_value_function(policy, env)
v

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])