In [19]:
import gymnasium as gym
import numpy as np

In [20]:
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True)

In [21]:
import numpy as np

def policy_iteration(env, policy, discount_factor=0.9, theta=1e-8, max_iterations=1000):
    """
    Policy Iteration algorithm for solving Markov Decision Processes (MDPs).

    Args:
        env (gym.Env): Initialized OpenAI gym environment object.
        policy (np.ndarray): 2D array of size n(S) x n(A), representing the initial policy.
        discount_factor (float): MDP discount factor.
        theta (float): Threshold for value function change.
        max_iterations (int): Maximum number of iterations.

    Returns:
        np.ndarray: Optimal policy.
        np.ndarray: Optimal value function.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    value_func = np.zeros(n_states)
    policy_stable = False

    for i in range(max_iterations):
        value_func_old = value_func.copy()

        # Policy Evaluation
        for state in range(n_states):
            value_func[state] = sum([
                policy[state, action] *
                sum([prob * (reward + discount_factor * value_func_old[next_state])
                     for prob, next_state, reward, _ in env.P[state][action]])
                for action in range(n_actions)
            ])

        # Policy Improvement
        policy_stable = True
        for state in range(n_states):
            old_action = np.argmax(policy[state])
            best_action_value = max([
                sum([prob * (reward + discount_factor * value_func[next_state])
                     for prob, next_state, reward, _ in env.P[state][action]])
                for action in range(n_actions)
            ])
            best_action = np.argmax([
                sum([prob * (reward + discount_factor * value_func[next_state])
                     for prob, next_state, reward, _ in env.P[state][action]])
                for action in range(n_actions)
            ])
            policy[state] = np.eye(n_actions)[best_action]
            if old_action != best_action:
                policy_stable = False

        if policy_stable:
            break

        if np.max(np.abs(value_func_old - value_func)) < theta:
            break

    return policy, value_func

In [22]:
import numpy as np

def value_iteration(env, discount_factor=0.9, theta=1e-8, max_iterations=1000):
    """
    Value Iteration algorithm for solving Markov Decision Processes (MDPs).

    Args:
        env (gym.Env): Initialized OpenAI gym environment object.
        discount_factor (float): MDP discount factor.
        theta (float): Threshold for value function change.
        max_iterations (int): Maximum number of iterations.

    Returns:
        np.ndarray: Optimal policy.
        np.ndarray: Optimal value function.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    value_func = np.zeros(n_states)
    policy = np.zeros([n_states, n_actions])

    for i in range(max_iterations):
        value_func_old = value_func.copy()

        for state in range(n_states):
            q_values = [
                sum([prob * (reward + discount_factor * value_func_old[next_state])
                     for prob, next_state, reward, _ in env.P[state][action]])
                for action in range(n_actions)
            ]
            value_func[state] = max(q_values)
            policy[state] = np.eye(n_actions)[np.argmax(q_values)]

        if np.max(np.abs(value_func_old - value_func)) < theta:
            break

    return policy, value_func

In [23]:
def run_frozen_lake(algorithm, discount_factor=0.9, theta=1e-8, max_iterations=1000):
    """
    Returns:
        np.ndarray: Optimal policy.
        np.ndarray: Optimal value function.
    """
    env = gym.make('FrozenLake-v1')
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    if algorithm == 'policy_iteration':
        initial_policy = np.ones([n_states, n_actions]) / n_actions  # Uniform random policy
        policy, value_func = policy_iteration(env, initial_policy, discount_factor, theta, max_iterations)
    elif algorithm == 'value_iteration':
        policy, value_func = value_iteration(env, discount_factor, theta, max_iterations)
    else:
        raise ValueError("Invalid algorithm specified. Choose 'policy_iteration' or 'value_iteration'.")

    print(f"Optimal Policy (Algorithm: {algorithm}):")
    print(policy)
    print(f"Optimal Value Function (Algorithm: {algorithm}):")
    print(value_func)

    env.close()
    return policy, value_func

# Run Policy Iteration
policy_pi, value_func_pi = run_frozen_lake('policy_iteration')

# Run Value Iteration
policy_vi, value_func_vi = run_frozen_lake('value_iteration')

Optimal Policy (Algorithm: policy_iteration):
[[0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]
Optimal Value Function (Algorithm: policy_iteration):
[0.00479925 0.00786375 0.02588175 0.00903825 0.0165375  0.
 0.0702255  0.         0.05735925 0.15865125 0.23084275 0.
 0.         0.29375125 0.58571608 0.        ]
Optimal Policy (Algorithm: value_iteration):
[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]
Optimal Value Function (Algorithm: value_iteration):
[0.06889084 0.06141452 0.07440972 0.05580728 0.09185448 0.
 0.11220819 0.         0.14543631 0.24749692 0.29961757 0.
 0.         0.37993588 0.63902014 0.  

In [24]:
def run_episodes(env, policy, n_episodes=5):
    """
    Runs the specified policy on the environment for n_episodes.

    Args:
        env (gym.Env): Initialized OpenAI gym environment object.
        policy (np.ndarray): Optimal policy.
        n_episodes (int): Number of episodes to run.

    Returns:
        int: Number of wins.
        float: Average return.
    """
    wins = 0
    returns = []

    for _ in range(n_episodes):
        state, info = env.reset()
        episode_return = 0
        truncated, terminated = False, False

        while not truncated or terminated:
            action = np.random.choice(env.action_space.n, p=policy[state])
            state, reward, truncated, terminated, _ = env.step(action)
            episode_return += reward

        if episode_return == 1.0:
            wins += 1

        returns.append(episode_return)

    average_return = np.mean(returns)

    return wins, average_return

# Run Policy Iteration
policy_pi, _ = run_frozen_lake('policy_iteration')
env = gym.make('FrozenLake-v1', render_mode="human")
wins_pi, avg_return_pi = run_episodes(env, policy_pi)
env.close()

# Run Value Iteration
policy_vi, _ = run_frozen_lake('value_iteration')
env = gym.make('FrozenLake-v1')
wins_vi, avg_return_vi = run_episodes(env, policy_vi)
env.close()

print(f"Policy Iteration: Wins = {wins_pi}, Average Return = {avg_return_pi:.3f}")
print(f"Value Iteration: Wins = {wins_vi}, Average Return = {avg_return_vi:.3f}")

Optimal Policy (Algorithm: policy_iteration):
[[0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]
Optimal Value Function (Algorithm: policy_iteration):
[0.00479925 0.00786375 0.02588175 0.00903825 0.0165375  0.
 0.0702255  0.         0.05735925 0.15865125 0.23084275 0.
 0.         0.29375125 0.58571608 0.        ]
Optimal Policy (Algorithm: value_iteration):
[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]
Optimal Value Function (Algorithm: value_iteration):
[0.06889084 0.06141452 0.07440972 0.05580728 0.09185448 0.
 0.11220819 0.         0.14543631 0.24749692 0.29961757 0.
 0.         0.37993588 0.63902014 0.  