In [2]:
import gymnasium as gym
import numpy as np
import random

# Create the FrozenLake environment (as a Grid World substitute)
env_grid = gym.make("FrozenLake-v1", is_slippery=False)


In [3]:
def value_iteration(env, gamma=0.99, theta=1e-9):
    value_table = np.zeros(env.observation_space.n)
    while True:
        delta = 0
        for state in range(env.observation_space.n):
            Q_values = np.zeros(env.action_space.n)
            for action in range(env.action_space.n):
                for prob, next_state, reward, done in env.P[state][action]:
                    Q_values[action] += prob * (reward + gamma * value_table[next_state])
            max_value = np.max(Q_values)
            delta = max(delta, np.abs(max_value - value_table[state]))
            value_table[state] = max_value
        if delta < theta:
            break
    policy = np.zeros(env.observation_space.n)
    for state in range(env.observation_space.n):
        Q_values = np.zeros(env.action_space.n)
        for action in range(env.action_space.n):
            for prob, next_state, reward, done in env.P[state][action]:
                Q_values[action] += prob * (reward + gamma * value_table[next_state])
        policy[state] = np.argmax(Q_values)
    return policy, value_table

policy_grid_vi, value_table_grid = value_iteration(env_grid)
print("Optimal Policy (Grid World - Value Iteration):", policy_grid_vi)
print("Value Table (Grid World - Value Iteration):", value_table_grid)


Optimal Policy (Grid World - Value Iteration): [1. 2. 1. 0. 1. 0. 1. 0. 2. 1. 1. 0. 0. 2. 2. 0.]
Value Table (Grid World - Value Iteration): [0.95099005 0.96059601 0.970299   0.96059601 0.96059601 0.
 0.9801     0.         0.970299   0.9801     0.99       0.
 0.         0.99       1.         0.        ]


  logger.warn(


In [4]:
def policy_iteration(env, gamma=0.99):
    policy = np.random.choice(env.action_space.n, env.observation_space.n)
    value_table = np.zeros(env.observation_space.n)
    while True:
        # Policy Evaluation
        while True:
            delta = 0
            for state in range(env.observation_space.n):
                action = policy[state]
                value = 0
                for prob, next_state, reward, done in env.P[state][action]:
                    value += prob * (reward + gamma * value_table[next_state])
                delta = max(delta, np.abs(value - value_table[state]))
                value_table[state] = value
            if delta < 1e-9:
                break
        # Policy Improvement
        policy_stable = True
        for state in range(env.observation_space.n):
            old_action = policy[state]
            action_values = np.zeros(env.action_space.n)
            for action in range(env.action_space.n):
                for prob, next_state, reward, done in env.P[state][action]:
                    action_values[action] += prob * (reward + gamma * value_table[next_state])
            new_action = np.argmax(action_values)
            if old_action != new_action:
                policy_stable = False
            policy[state] = new_action
        if policy_stable:
            break
    return policy, value_table

policy_grid_pi, value_table_grid = policy_iteration(env_grid)
print("Optimal Policy (Grid World - Policy Iteration):", policy_grid_pi)
print("Value Table (Grid World - Policy Iteration):", value_table_grid)


Optimal Policy (Grid World - Policy Iteration): [1 2 1 0 1 0 1 0 2 1 1 0 0 2 2 0]
Value Table (Grid World - Policy Iteration): [0.95099005 0.96059601 0.970299   0.96059601 0.96059601 0.
 0.9801     0.         0.970299   0.9801     0.99       0.
 0.         0.99       1.         0.        ]


In [5]:
def q_learning(env, episodes=1000, alpha=0.1, gamma=0.99, epsilon=0.1):
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    for _ in range(episodes):
        state = env.reset()[0]
        done = False
        while not done:
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state])
            next_state, reward, done, _, _ = env.step(action)
            best_next_action = np.argmax(Q[next_state])
            Q[state, action] = Q[state, action] + alpha * (reward + gamma * Q[next_state, best_next_action] - Q[state, action])
            state = next_state
    policy = np.argmax(Q, axis=1)
    return policy, Q

policy_grid_ql, Q_table_grid = q_learning(env_grid)
print("Optimal Policy (Grid World - Q-Learning):", policy_grid_ql)
print("Q-Table (Grid World - Q-Learning):", Q_table_grid)


Optimal Policy (Grid World - Q-Learning): [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Q-Table (Grid World - Q-Learning): [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [6]:
def epsilon_greedy_policy(Q, state, epsilon):
    if random.uniform(0, 1) < epsilon:
        return np.random.choice(len(Q[state]))
    else:
        return np.argmax(Q[state])

# Example usage
state = env_grid.reset()[0]
action = epsilon_greedy_policy(Q_table_grid, state, epsilon=0.1)
print("Selected Action (Epsilon-Greedy):", action)


Selected Action (Epsilon-Greedy): 0


In [7]:
def ucb_selection(Q, N, state, c=1):
    total_counts = np.sum(N[state]) + 1e-10
    ucb_values = Q[state] + c * np.sqrt(np.log(total_counts) / (N[state] + 1e-10))
    return np.argmax(ucb_values)

def ucb_learning(env, episodes=1000, alpha=0.1, gamma=0.99, c=1):
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    N = np.zeros((env.observation_space.n, env.action_space.n))
    for _ in range(episodes):
        state = env.reset()[0]
        done = False
        while not done:
            action = ucb_selection(Q, N, state, c)
            next_state, reward, done, _, _ = env.step(action)
            best_next_action = np.argmax(Q[next_state])
            Q[state, action] = Q[state, action] + alpha * (reward + gamma * Q[next_state, best_next_action] - Q[state, action])
            N[state, action] += 1
            state = next_state
    policy = np.argmax(Q, axis=1)
    return policy, Q

policy_grid_ucb, Q_table_grid = ucb_learning(env_grid)
print("Optimal Policy (Grid World - UCB):", policy_grid_ucb)
print("Q-Table (Grid World - UCB):", Q_table_grid)


  ucb_values = Q[state] + c * np.sqrt(np.log(total_counts) / (N[state] + 1e-10))


Optimal Policy (Grid World - UCB): [2 2 1 0 1 0 1 0 2 2 1 0 0 2 2 0]
Q-Table (Grid World - UCB): [[0.00838082 0.00691285 0.95099005 0.00838082]
 [0.00717478 0.         0.96059601 0.00931946]
 [0.00701678 0.970299   0.00232261 0.01319199]
 [0.00581283 0.         0.00165103 0.00165103]
 [0.00524838 0.00884098 0.         0.00636597]
 [0.         0.         0.         0.        ]
 [0.         0.9801     0.         0.00709953]
 [0.         0.         0.         0.        ]
 [0.00533594 0.         0.01384328 0.0032767 ]
 [0.00311929 0.0176995  0.03228587 0.        ]
 [0.00801649 0.99       0.         0.970299  ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.00831027 0.46414632 0.00309614]
 [0.11611211 0.99       1.         0.9801    ]
 [0.         0.         0.         0.        ]]
