In [None]:
import gymnasium as gym
import numpy as np


In [None]:
env = gym.make("Taxi-v3", max_episode_steps=150)

q_table = np.zeros((env.observation_space.n, env.action_space.n))  # type: ignore
q_table.shape

(500, 6)

In [None]:
lr = 0.1  # learning rate
gamma = 0.9  # discount factor for future rewards
epochs = 20000  # number of training episodes

checks = 10
check_freq = int(epochs / checks)
window_rewards = 0

# exploration
epsilon = 1  # probability of taking a random action
min_epsilon = 0.1
epsilon_decay = 0.00005


for epoch in range(epochs):
    episode_reward = 0
    obs, info = env.reset()
    done = False

    while not done:
        if np.random.rand() < epsilon:  # Explore
            action = env.action_space.sample()
        else:  # Exploit
            action = np.argmax(q_table[obs])

        next_observation, reward, done, truncated, info = env.step(action)
        episode_reward += reward  # pyright: ignore[reportOperatorIssue]

        max_next_reward = q_table[next_observation][
            np.argmax(q_table[next_observation])
        ]

        # Update Q-table
        q_table[obs][action] += lr * (
            reward + gamma * max_next_reward - q_table[obs][action]
        )

        obs = next_observation

    window_rewards += episode_reward
    epsilon = max(min_epsilon, epsilon - epsilon_decay)

    if epoch % check_freq == 0 and epoch != 0:
        print(
            f"{epoch = }, {epsilon = :.2f}, mean_reward = {window_rewards / check_freq}"
        )
        window_rewards = 0

env.close()


epoch = 2000, epsilon = 0.90, mean_reward = -2766.302
epoch = 4000, epsilon = 0.80, mean_reward = -460.1395
epoch = 6000, epsilon = 0.70, mean_reward = -188.7245
epoch = 8000, epsilon = 0.60, mean_reward = -102.644
epoch = 10000, epsilon = 0.50, mean_reward = -61.328
epoch = 12000, epsilon = 0.40, mean_reward = -36.9845
epoch = 14000, epsilon = 0.30, mean_reward = -20.093
epoch = 16000, epsilon = 0.20, mean_reward = -9.1675
epoch = 18000, epsilon = 0.10, mean_reward = -0.8895


In [None]:
eval_reward = 0
eval_episodes = 100

env = gym.make("Taxi-v3", max_episode_steps=50)
# env = gym.make("Taxi-v3", max_episode_steps=50, render_mode="human")

# evaluate the agent
for _ in range(eval_episodes):
    obs, info = env.reset()
    done = False
    episode_reward = 0

    while not done:
        # action = greedy_policy(q_table, obs, info["action_mask"])
        action = np.argmax(q_table[obs, np.where(info["action_mask"] == 1)[0]])
        next_observation, reward, done, truncated, info = env.step(action)
        episode_reward += reward  # pyright: ignore[reportOperatorIssue]
        obs = next_observation

    eval_reward += episode_reward

print(f"Mean eval reward: {eval_reward / eval_episodes}")
env.close()


Mean eval reward: 7.83


In [9]:
# save the Q-table to a file
np.save("q_table.npy", q_table)