In [None]:
import gymnasium as gym


# Number of random actions taken in this smoke test of the environment.
n_random_steps = 100

# Non-slippery 4x4 FrozenLake using the "rgb_array" render mode.
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode="rgb_array")

try:
    observation, info = env.reset()

    for _ in range(n_random_steps):
        # Sample a random action from the action space; nothing is learned here.
        action = env.action_space.sample()
        print(f"Action: {action}")
        observation, reward, terminated, truncated, info = env.step(action)

        # Start a new episode as soon as the current one ends.
        if terminated or truncated:
            observation, info = env.reset()
            print("Environment reset")
finally:
    # Close the environment even if reset()/step() raised above.
    env.close()

Action/Observation Space Inspection

In [23]:
# Slippery 4x4 FrozenLake, created here only to inspect its spaces.
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=True,
    render_mode="rgb_array",
)
env.reset()

# Observation space: its shape plus one randomly sampled observation.
print("_____OBSERVATION SPACE_____ \n")
print("Observation Space Shape", env.observation_space.shape)
print("Sample observation", env.observation_space.sample())

# Action space: despite the "Shape" label, the first line prints the number
# of actions (`.n`); the second prints one randomly sampled action.
print("\n _____ACTION SPACE_____ \n")
print("Action Space Shape", env.action_space.n)
print("Action Space Sample", env.action_space.sample())

_____OBSERVATION SPACE_____ 

Observation Space Shape ()
Sample observation 8

 _____ACTION SPACE_____ 

Action Space Shape 4
Action Space Sample 2


Model

In [1]:
import gymnasium as gym
import numpy as np

In [25]:
# NOTE(review): `desc` is never passed to gym.make() below, so this layout has
# no effect on the environment that is created - confirm whether `desc=desc`
# was intended.
desc = [
    "SFFF",
    "FHFH",
    "FHFH",
    "HFFG",
]

# Slippery 4x4 FrozenLake used for training and evaluation.
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=True,
    render_mode="rgb_array",
)
env.reset()

# Sizes of the discrete spaces; they become the Q-table's dimensions.
state_space = env.observation_space.n
action_space = env.action_space.n

def initialize_q_table(state_space, action_space):
    """Create an all-zero Q-table.

    :param state_space: Number of rows (one per state)
    :param action_space: Number of columns (one per action)
    :return: NumPy array of zeros with shape ``(state_space, action_space)``
    """
    table_shape = (state_space, action_space)
    return np.zeros(table_shape)

# Start from an all-zero table sized to this environment's state/action counts.
q_table = initialize_q_table(state_space, action_space)


In [12]:
# policies
def greedy_policy(q_table, state):
    """Return the index of the highest-valued action for ``state``.

    :param q_table: 2-D array indexed as ``q_table[state, action]``
    :param state: Row index of the current state
    :return: Column index of the largest entry in that row, as chosen by
        ``np.argmax``
    """
    return np.argmax(q_table[state, :])


def epsilon_greedy_policy(q_table, state, epsilon):
    """Choose an action, exploring at random or exploiting the Q-table.

    A number is drawn from ``np.random.random()``; if it is greater than
    ``epsilon`` the greedy action is returned, otherwise a random action index
    is returned.

    The random action is drawn from the Q-table's own column count instead of
    a module-level ``env``. The function therefore does not depend on which
    environment the notebook's ``env`` name happens to be bound to when it is
    called.

    :param q_table: 2-D array indexed as ``q_table[state, action]``
    :param state: Row index of the current state
    :param epsilon: Exploration threshold compared against the random draw
    :return: Index of the selected action
    """
    if np.random.random() > epsilon:
        # Exploit: follow the current value estimates.
        return greedy_policy(q_table, state)

    # Explore: any column of the table is a valid action index.
    n_actions = q_table.shape[1]
    return np.random.randint(n_actions)

In [26]:
# hyperparameters
# --- Training ---
n_training_episodes = 10_000  # number of episodes passed to train()
learning_rate = 0.7           # step size of the Q-value update
gamma = 0.95                  # discount on the next state's best Q-value

# --- Exploration schedule (epsilon decays exponentially with the episode index) ---
max_epsilon = 1.0
min_epsilon = 0.05
epsilon_decay = 0.0005

# --- Environment ---
env_id = "FrozenLake-v1"      # NOTE(review): not referenced by any code visible in this notebook
max_steps = 99                # cap on steps per episode (training and evaluation)

# --- Evaluation ---
n_eval_episodes = 100
eval_seed = []                # empty -> evaluate_agent resets without a seed

In [27]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable):
    """Run tabular Q-learning on ``env`` and return the updated table.

    Besides its arguments, this function reads the module-level
    ``learning_rate`` and ``gamma`` and selects actions through the
    module-level ``epsilon_greedy_policy``.

    :param n_training_episodes: Number of episodes to run
    :param min_epsilon: Value epsilon decays towards
    :param max_epsilon: Epsilon used for the first episode
    :param decay_rate: Exponential decay rate applied per episode index
    :param env: Environment providing ``reset()`` and ``step(action)``
    :param max_steps: Maximum number of steps per episode
    :param Qtable: Q-table indexed as ``Qtable[state, action]``; updated in place
    :return: The same ``Qtable`` object that was passed in
    """
    epsilon_span = max_epsilon - min_epsilon

    for episode in range(n_training_episodes):
        # Exploration shrinks exponentially from max_epsilon towards min_epsilon.
        epsilon = min_epsilon + epsilon_span * np.exp(-decay_rate * episode)
        state, _ = env.reset()

        for _ in range(max_steps):
            action = epsilon_greedy_policy(Qtable, state, epsilon)
            next_state, reward, terminated, truncated, _ = env.step(action)

            # Move the current estimate towards the one-step bootstrapped target.
            td_target = reward + gamma * np.max(Qtable[next_state, :])
            Qtable[state, action] += learning_rate * (td_target - Qtable[state, action])

            if terminated or truncated:
                break
            state = next_state

    return Qtable


In [28]:
# train() updates q_table in place and returns the same array, so re-running
# this cell continues from the already-trained values rather than from zeros.
q_table = train(n_training_episodes, min_epsilon, max_epsilon, epsilon_decay, env, max_steps, q_table)

In [29]:
# Display the learned Q-values (rows index states, columns index actions).
q_table

array([[1.21722024e-01, 1.05113464e-01, 1.05741147e-01, 9.77178444e-02],
       [3.68374124e-02, 8.40326801e-02, 3.49261921e-03, 1.12802095e-01],
       [1.98353595e-02, 3.11127810e-02, 2.16234886e-02, 8.63071611e-02],
       [3.17553546e-03, 8.28304514e-03, 2.25613064e-04, 6.77094654e-02],
       [1.71238406e-01, 4.55057268e-02, 4.37272370e-02, 1.06350616e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.39135837e-02, 4.48263161e-03, 3.11578411e-03, 1.35992016e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.77657128e-02, 6.91186954e-02, 9.32231490e-02, 2.26787165e-01],
       [1.41175094e-01, 4.13246955e-01, 1.13787193e-01, 1.13323961e-01],
       [1.46569025e-01, 5.90182649e-03, 2.55999895e-02, 9.03568821e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.78687201e-02, 1.75143469e-01, 7.39066821e

Evaluate

In [30]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
    """
    Evaluate the greedy policy for ``n_eval_episodes`` episodes and return the
    mean and standard deviation of the per-episode total reward.

    :param env: The evaluation environment
    :param max_steps: Maximum number of steps per episode
    :param n_eval_episodes: Number of episodes to evaluate the agent
    :param Q: The Q-table
    :param seed: Optional sequence of per-episode reset seeds (list, tuple or
        NumPy array), indexed by episode number, so it needs at least
        ``n_eval_episodes`` entries. ``None`` or an empty sequence resets the
        environment without a seed.
    :return: ``(mean_reward, std_reward)`` over all evaluated episodes
    """
    # Decide by length, not truthiness: `if seed:` raises ValueError for NumPy
    # arrays with more than one element and misreads a one-element array [0].
    try:
        use_seeds = len(seed) > 0
    except TypeError:
        # Not a sized container (e.g. None): fall back to plain truthiness.
        use_seeds = bool(seed)

    episode_rewards = []
    for episode in range(n_eval_episodes):
        if use_seeds:
            episode_seed = seed[episode]
            # Hand NumPy integer scalars to reset() as plain Python ints.
            if isinstance(episode_seed, np.integer):
                episode_seed = int(episode_seed)
            state, info = env.reset(seed=episode_seed)
        else:
            state, info = env.reset()

        total_rewards_ep = 0
        for _ in range(max_steps):
            # Take the action (index) with the highest Q-value in this state
            action = greedy_policy(Q, state)
            state, reward, terminated, truncated, info = env.step(action)
            total_rewards_ep += reward

            if terminated or truncated:
                break
        episode_rewards.append(total_rewards_ep)

    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    return mean_reward, std_reward

In [31]:
# Evaluate the trained Q-table with the greedy policy and report the mean and
# standard deviation of the per-episode reward. With the default empty
# eval_seed, episodes are reset without a fixed seed.
mean_reward, std_reward = evaluate_agent(env, max_steps, n_eval_episodes, q_table, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

Mean_reward=0.53 +/- 0.50


Run Model

In [33]:
# Run the trained greedy policy for a few on-screen episodes.
n_demo_episodes = 2

# Create ONE rendering environment and reuse it for every episode. Creating a
# new one per episode left all but the last unclosed. It has its own name so
# the `env` used by the training/evaluation cells is not replaced.
demo_env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True, render_mode="human")

try:
    for episode in range(n_demo_episodes):
        # Start a fresh episode
        obs, info = demo_env.reset()

        episode_reward = 0
        done = False
        truncated = False

        while not (done or truncated):
            # Act greedily with respect to the learned Q-table
            action = greedy_policy(q_table, obs)

            # Take the action in the environment
            obs, reward, done, truncated, info = demo_env.step(action)
            episode_reward += reward

        print(f"Episode {episode + 1} reward: {episode_reward}")
finally:
    # Close the environment even if an episode raised
    demo_env.close()


Episode 1 reward: 0.0
Episode 2 reward: 1.0
