Setup

In [1]:
import gymnasium as gym
import numpy as np
import random
import pickle
from gymnasium.envs.toy_text.frozen_lake import generate_random_map


# One seed drives every source of randomness that this cell can reach, so that
# "Restart Kernel -> Run All" reproduces the same map and the same training run.
SEED = 42
random.seed(SEED)  # epsilon-greedy draws in the Training cell use `random`

# 4x4 random layout; `p` is forwarded to generate_random_map (per the
# Gymnasium docs it is the probability that a tile is frozen).
random_map = generate_random_map(size=4, p=0.8, seed=SEED)
print("Generated Map:")
print("\n".join(random_map))

# Training environment (no rendering). With is_slippery=True the observed
# transitions do not always follow the chosen action.
env = gym.make("FrozenLake-v1", desc=random_map, is_slippery=True)
env.action_space.seed(SEED)  # makes env.action_space.sample() repeatable
env.reset(seed=SEED)         # seeds the env RNG; later reset() calls keep using it

n_states = env.observation_space.n
n_actions = env.action_space.n

# Q-learning hyperparameters (read by the Training cell).
alpha = 0.8        # learning rate applied to the TD error
gamma = 0.95       # discount factor on the bootstrapped next-state value
episodes = 20000   # number of training episodes
max_steps = 100    # per-episode step cap in the training loop
eps_start = 1.0    # exploration rate at episode 0
eps_end = 0.01     # floor for the exploration rate
eps_decay = 0.999  # per-episode multiplicative decay of the exploration rate

# Tabular action-value estimates, one row per state and one column per action.
Q = np.zeros((n_states, n_actions))
# Undiscounted return of every training episode, appended in order.
rewards_all_episodes = []

Generated Map:
SHFF
FHHF
FFFF
FFHG


Training

In [3]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads: env, Q, rewards_all_episodes and the hyperparameters from the Setup cell.
# Writes: Q (in place), rewards_all_episodes, two files on disk, and finally
# rebinds `env` / `state` / `done` for the Testing cell.
print("Training started")

for episode in range(episodes):
  state, _ = env.reset()
  done = False
  total_reward = 0
  # Exponentially decayed exploration rate, never below eps_end.
  epsilon = max(eps_end, eps_start * (eps_decay ** episode))

  if episode % 2000 == 0:
    print(f"Episode {episode}, epsilon={epsilon:.4f}")

  for step in range(max_steps):
    # Explore with probability epsilon, otherwise act greedily w.r.t. Q.
    if random.uniform(0, 1) < epsilon:
      action = env.action_space.sample()
    else:
      action = np.argmax(Q[state])

    next_state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated

    old_value = Q[state, action]
    # A terminal state has no future return, so do not bootstrap from it.
    # Truncation (time limit) is not terminal and still bootstraps.
    next_max = 0.0 if terminated else np.max(Q[next_state])
    Q[state, action] = old_value + alpha * (reward + gamma * next_max - old_value)

    # Sample trace: first three transitions of every 5000th episode.
    if episode % 5000 == 0 and step < 3:
      print(f"s={state}, a={action}, r={reward}, s'={next_state}, Q={Q[state, action]:.3f}")

    state = next_state
    total_reward += reward

    if done:
      break

  rewards_all_episodes.append(total_reward)

print("Training finished")

# Persist the learned table in both NumPy and pickle formats.
np.save("qtable_random_frozenlake.npy", Q)

with open("qtable_random_frozenlake.pkl", "wb") as f:
  pickle.dump(Q, f)

print(Q)
print("Q-table saved")

# Release the training environment before `env` is rebound below.
env.close()

# NOTE: from here on `env` is the rendering environment used by the Testing
# cell. Re-run the Setup cell before running this cell again, otherwise the
# loop above would train against the rendering environment.
env = gym.make("FrozenLake-v1", desc=random_map, is_slippery=True, render_mode="human")
state, _ = env.reset()
done = False

Training started
Episode 0, epsilon=1.0000
s=0, a=1, r=0, s'=4, Q=0.005
s=4, a=1, r=0, s'=5, Q=0.001
Episode 2000, epsilon=0.1352
Episode 4000, epsilon=0.0183
s=0, a=1, r=0, s'=1, Q=0.000
Episode 6000, epsilon=0.0100
Episode 8000, epsilon=0.0100
Episode 10000, epsilon=0.0100
s=0, a=0, r=0, s'=0, Q=0.037
s=0, a=0, r=0, s'=0, Q=0.036
s=0, a=0, r=0, s'=0, Q=0.034
Episode 12000, epsilon=0.0100
Episode 14000, epsilon=0.0100
s=0, a=0, r=0, s'=4, Q=0.174
s=4, a=0, r=0, s'=8, Q=0.098
s=8, a=1, r=0, s'=12, Q=0.066
Episode 16000, epsilon=0.0100
Episode 18000, epsilon=0.0100
Training finished
[[5.14106814e-02 1.02367361e-03 2.06103398e-03 2.45600251e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 9.67580853e-02]
 [6.33998233e-02 4.10783082e-03 1.02751402e-01 5.04970762e-02]
 [2.28973939e-01 1.77159951e-04 2.79837170e-03 1.48010393e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0

Testing

In [None]:
# Greedy rollout of the learned Q-table in a rendering environment.
print("Testing started")

# Build and reset the rendering environment here so that this cell can be
# re-run on its own; the previous `env` is closed first so no window leaks.
env.close()
env = gym.make("FrozenLake-v1", desc=random_map, is_slippery=True, render_mode="human")
state, _ = env.reset()

max_test_steps = 50

try:
  for step in range(max_test_steps):
    # Always exploit: pick the highest-valued action for the current state.
    action = np.argmax(Q[state])
    next_state, reward, terminated, truncated, _ = env.step(action)
    # `state` printed here is the state the action was taken from.
    print(f"step={step}, state={state}, action={action}, reward={reward}")
    state = next_state
    if terminated or truncated:
      print("Success" if reward == 1 else "Failure")
      break
  else:
    # The loop ran out of steps without the episode ending.
    print(f"No terminal state reached within {max_test_steps} steps")
finally:
  # Always release the render window, even if a step raised.
  env.close()

Testing started
step=0, state=0, action=0, reward=0
step=1, state=0, action=0, reward=0
step=2, state=4, action=2, reward=0
step=3, state=0, action=0, reward=0
step=4, state=0, action=0, reward=0
step=5, state=0, action=0, reward=0
step=6, state=0, action=0, reward=0
step=7, state=4, action=2, reward=0
step=8, state=8, action=2, reward=0
step=9, state=4, action=2, reward=0
step=10, state=0, action=0, reward=0
step=11, state=0, action=0, reward=0
step=12, state=4, action=2, reward=0
step=13, state=0, action=0, reward=0
step=14, state=0, action=0, reward=0
step=15, state=0, action=0, reward=0
step=16, state=4, action=2, reward=0
step=17, state=5, action=1, reward=0
step=18, state=9, action=0, reward=0
step=19, state=13, action=2, reward=0
step=20, state=9, action=0, reward=0
step=21, state=8, action=2, reward=0
step=22, state=9, action=0, reward=0
step=23, state=13, action=2, reward=0
step=24, state=14, action=1, reward=1
Success
