In [None]:
import gymnasium as gym


env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode="rgb_array")

observation, info = env.reset()

for _ in range(100):
    action = env.action_space.sample()
    print(f"Action: {action}")
    observation, reward, terminated, truncated, info = env.step(action)

    if terminated or truncated:
        observation, info = env.reset()
        print("Environment reset")

env.close()

Action/Ob Space Inspection

In [2]:
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode="rgb_array")
# create multiple environments
# env = make_vec_env("LunarLander-v2", n_envs=16)
env.reset()
print("_____OBSERVATION SPACE_____ \n")
print("Observation Space Shape", env.observation_space.shape)
print("Sample observation", env.observation_space.sample())  # Get a random observation

print("\n _____ACTION SPACE_____ \n")
print("Action Space Shape", env.action_space.n)
print("Action Space Sample", env.action_space.sample())  # Take a random action

_____OBSERVATION SPACE_____ 

Observation Space Shape ()
Sample observation 8

 _____ACTION SPACE_____ 

Action Space Shape 4
Action Space Sample 3


Model

In [3]:
import gymnasium as gym
import numpy as np
import tqdm

In [4]:
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode="rgb_array")
env.reset()

state_space = env.observation_space.n
action_space = env.action_space.n

def initialize_q_table(state_space, action_space):
    return np.zeros((state_space, action_space))

q_table = initialize_q_table(state_space, action_space)


In [5]:
# policies
def greedy_policy(q_table, state):
    return np.argmax(q_table[state, :])

def epsilon_greedy_policy(q_table, state, epsilon):
    random_num = np.random.random()
    if random_num > epsilon:
        action = greedy_policy(q_table, state)
    else:
        action = env.action_space.sample()
    return action

In [6]:
# hyperparameters
n_training_episodes = 10000
learning_rate = 0.7

n_eval_episodes = 100

env_id = "FrozenLake-v1"
max_steps = 99
gamma = 0.95
eval_seed = []

max_epsilon = 1.0
min_epsilon = 0.05
epsilon_decay = 0.0005

In [9]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable):
    for episode in range(n_training_episodes):
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
        state, info = env.reset()
        step = 0
        terminated = False
        truncated = False

        for step in range(max_steps):
            action = epsilon_greedy_policy(Qtable, state, epsilon)
            observation, reward, terminated, truncated, info = env.step(action)
            Qtable[state, action] = Qtable[state, action] + learning_rate * (reward + gamma * np.max(Qtable[observation, :]) - Qtable[state, action])
            if terminated or truncated:
                break
            state = observation
    return Qtable


In [10]:
q_table = train(n_training_episodes, min_epsilon, max_epsilon, epsilon_decay, env, max_steps, q_table)

Evaluate

In [14]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
  """
  Evaluate the agent for ``n_eval_episodes`` episodes and returns average reward and std of reward.
  :param env: The evaluation environment
  :param max_steps: Maximum number of steps per episode
  :param n_eval_episodes: Number of episode to evaluate the agent
  :param Q: The Q-table
  :param seed: The evaluation seed array (for taxi-v3)
  """
  episode_rewards = []
  for episode in range(n_eval_episodes):
    if seed:
      state, info = env.reset(seed=seed[episode])
    else:
      state, info = env.reset()
    step = 0
    truncated = False
    terminated = False
    total_rewards_ep = 0

    for step in range(max_steps):
      # Take the action (index) that have the maximum expected future reward given that state
      action = greedy_policy(Q, state)
      new_state, reward, terminated, truncated, info = env.step(action)
      total_rewards_ep += reward

      if terminated or truncated:
        break
      state = new_state
    episode_rewards.append(total_rewards_ep)
  mean_reward = np.mean(episode_rewards)
  std_reward = np.std(episode_rewards)

  return mean_reward, std_reward

In [15]:
# Evaluate our Agent
mean_reward, std_reward = evaluate_agent(env, max_steps, n_eval_episodes, q_table, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

Mean_reward=1.00 +/- 0.00


Run Model

In [22]:
from gymnasium.envs.toy_text.frozen_lake import generate_random_map


# Run the model for a few episodes
for episode in range(5):
    # Create a new environment for inference
    env = gym.make("FrozenLake-v1", desc=generate_random_map(size=4), is_slippery=False, render_mode="human")

    # Reset the environment
    obs, _ = env.reset()

    episode_reward = 0
    done = False
    truncated = False
    
    while not (done or truncated):
        # Get the model's action
        action = greedy_policy(q_table, obs)
        
        # Take the action in the environment
        obs, reward, done, truncated, info = env.step(action)
        episode_reward += reward
    
    print(f"Episode {episode + 1} reward: {episode_reward}")
    
    # Reset the environment for the next episode
    obs, _ = env.reset()

# Close the environment
env.close()


Episode 1 reward: 0.0
Episode 2 reward: 1.0
Episode 3 reward: 0.0
Episode 4 reward: 0.0
Episode 5 reward: 0.0
