In [74]:
import numpy as np
# use gym==0.17.3 for better compatibility
import gym
import time
from IPython.display import display, clear_output
import ipywidgets as widgets

# Q-Table approach on Frozen Lake

In [75]:
env = gym.make('FrozenLake-v0',is_slippery=False)
# not slippery at first
# Number of states and actions
n_states = env.observation_space.n
n_actions = env.action_space.n

In [76]:
env.reset()
env.render(mode="human")


[41mS[0mFFF
FHFH
FFFH
HFFG


S = starting fied, F = frozen fields that can be stepped on, H = holes (game ends), G = goal

In [77]:
# function to watch the agent play
def watch_agent_play(env, policy, num_episodes=1, delay=0.5):
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            clear_output(wait=True)
            env.render()
            display(time.sleep(delay))  # Sleep for a short period to make the display more understandable
            action = np.argmax(policy[state, :])
            state, _, done, _ = env.step(action)
# random argmax
def random_argmax(b):
    """ a random tie-breaking argmax"""
    return np.random.choice(np.flatnonzero(b == b.max()))

# Without Bayesian

In [78]:
# Initialize Q-values and policy
Q_values = np.zeros((n_states, n_actions))
policy = np.ones((n_states, n_actions)) / n_actions  # Start with a uniform policy

# Hyperparameters
alpha = 0.1
gamma = 0.99
epsilon = 0.1
print(Q_values.shape)
print(policy.shape)

(16, 4)
(16, 4)


In [79]:
# Q-learning with epsilon-greedy action selection
# 10 000 episodes, episodes finish when dropping or winning
for episode in range(10000):
    state = env.reset()
    done = False

    while not done:
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = random_argmax(Q_values[state, :])

        next_state, reward, done, _ = env.step(action)
        best_next_action = random_argmax(Q_values[next_state, :])
        
        td_target = reward + gamma * Q_values[next_state, best_next_action]
        td_error = td_target - Q_values[state, action]
        
        Q_values[state, action] += alpha * td_error
        
        state = next_state

    # Update the policy using the Q-values (argmax of Q-table)
    policy = np.zeros((n_states, n_actions))
    best_actions = [random_argmax(row) for row in Q_values]
    policy[range(n_states), best_actions] = 1


In [80]:
print(policy)

[[0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]


In [81]:
# watch agent play
watch_agent_play(env, policy, num_episodes=1)

  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG


None

# With Bayesian

Some adjustments have to be made:  
Based on the Bayesian Q-Learning paper with assumption that R_sa ha normal distribution.
### Policy representation:
Q-values into distribution. Simple example here: Gaussian distribution for each Q-value characterized by mean (mu) and precision (basically the opposite of standard deviation) (sigma) (twin table)
### Prior Distribution:
Initialize with means zero (as usual), larger sigma suggests more uncertainty and vice versa
### Update:
Updating mechanisma for mu is similar to traditional Q-learning. The Temporal Difference gets computed and this error is used to corect the current Q-value estimate.TDerror​=(reward+γ×V(next_state))−V(state)    
Adjusting sigma (Precision in this case) should be experimented on but I chose the approach that it can grow and shrink.
### Sampling:
When selecting an action we sample from the gaussian distributions for each Q-value and select based on the sampled value.

In [107]:
n_states = env.observation_space.n
n_actions = env.action_space.n
alpha = 0.1
gamma = 0.95
epsilon = 0.1

# Initialize Q-values (means) and their uncertainties (standard deviations)
Q_means = np.zeros((n_states, n_actions))
Q_sigmas = np.full((n_states, n_actions), 1.0)  # starting with high uncertainty

In [108]:
for episode in range(10000):
    state = env.reset()
    done = False

    while not done:
        # Epsilon-greedy action selection
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            # Q-value sampling for action selection
            sampled_Qs = Q_means[state, :] + np.random.randn(n_actions) * Q_sigmas[state, :]
            action = random_argmax(sampled_Qs)

        next_state, reward, done, _ = env.step(action)
        
        # Q-learning update for means
        best_next_action = random_argmax(Q_means[next_state, :])
        td_target = reward + gamma * Q_means[next_state, best_next_action]
        td_error = td_target - Q_means[state, action]
        Q_means[state, action] += alpha * td_error
        
        # Adaptive precision update
        relative_td_error = abs(td_error) / (abs(Q_means[state, action]) + 1e-10)
        alpha_for_precision = 0.1
        Q_sigmas[state, action] *= (1 - alpha_for_precision) + alpha_for_precision * relative_td_error
        
        state = next_state
    
    policy = np.zeros((n_states, n_actions))
    best_actions = [random_argmax(row) for row in Q_values]
    policy[range(n_states), best_actions] = 1


In [None]:
# watch agent play
watch_agent_play(env, policy, num_episodes=1)