In [1]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [2]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [3]:
def execute_simulation(total_episodes, training_mode=True, visualize=False, seed=None):
    """Train or evaluate a tabular Q-learning agent on slippery FrozenLake-v1 (4x4).

    Args:
        total_episodes: Number of episodes to run.
        training_mode: If True, start from an all-zero Q-table, act
            epsilon-greedily, update the table after every step and pickle it
            to ``frozen_lake4x4.pkl`` when done. If False, load the table from
            that file and act greedily without updating it.
        visualize: If True, the environment is created with
            ``render_mode='human'``; otherwise no render mode is requested.
        seed: Optional integer used to seed the exploration generator, the
            action-space sampler and the first environment reset so a run can
            be reproduced. ``None`` (default) keeps the run unseeded.

    Side effects:
        Saves a plot of the rolling reward sum to ``frozen_lake4x4.png`` and,
        in training mode, writes the Q-table to ``frozen_lake4x4.pkl``.

    Raises:
        FileNotFoundError: In evaluation mode when ``frozen_lake4x4.pkl`` does
            not exist (raised by ``open``).
    """
    environment = gym.make(
        'FrozenLake-v1',
        map_name="4x4",
        is_slippery=True,
        render_mode='human' if visualize else None,
    )

    # Everything that touches the environment lives inside try/finally so the
    # environment is closed even if loading the Q-table or a step fails.
    try:
        if training_mode:
            q_matrix = np.zeros((environment.observation_space.n, environment.action_space.n))
        else:
            # SECURITY: pickle.load can execute arbitrary code. Only load a
            # Q-table file that this notebook produced itself.
            with open('frozen_lake4x4.pkl', 'rb') as file:
                q_matrix = pickle.load(file)

        learning_rate = 0.9
        discount_rate = 0.9
        exploration_rate = 1.0
        # Epsilon drops linearly by this amount per episode, so it needs
        # 1 / decay_rate = 10,000 episodes to reach zero.
        decay_rate = 0.0001
        random_generator = np.random.default_rng(seed)
        if seed is not None:
            # Random actions come from the action space's own sampler.
            environment.action_space.seed(seed)

        episode_rewards = np.zeros(total_episodes)

        for episode in range(total_episodes):
            # Seed only the first reset; re-seeding every episode would replay
            # the same environment randomness each time.
            if episode == 0 and seed is not None:
                current_state = environment.reset(seed=seed)[0]
            else:
                current_state = environment.reset()[0]
            game_over = False
            timeout = False

            while not game_over and not timeout:
                if training_mode and random_generator.random() < exploration_rate:
                    chosen_action = environment.action_space.sample()
                else:
                    chosen_action = np.argmax(q_matrix[current_state, :])

                next_state, reward, game_over, timeout, _ = environment.step(chosen_action)

                if training_mode:
                    # Q-learning update toward reward + discounted best next value.
                    q_matrix[current_state, chosen_action] += learning_rate * (
                        reward
                        + discount_rate * np.max(q_matrix[next_state, :])
                        - q_matrix[current_state, chosen_action]
                    )

                current_state = next_state

            exploration_rate = max(exploration_rate - decay_rate, 0)

            # Once exploration has stopped, switch to a very small step size.
            if exploration_rate == 0:
                learning_rate = 0.0001

            # Records the reward returned by the episode's final step.
            episode_rewards[episode] = reward
    finally:
        environment.close()

    # Rolling sum over the current episode plus up to 100 preceding ones.
    accumulated_rewards = np.zeros(total_episodes)
    for i in range(total_episodes):
        accumulated_rewards[i] = np.sum(episode_rewards[max(0, i - 100):(i + 1)])

    # Use a dedicated figure so repeated calls do not draw on top of an
    # earlier call's axes.
    figure, axes = plt.subplots()
    axes.plot(accumulated_rewards)
    axes.set_xlabel('Episode')
    axes.set_ylabel('Reward sum over trailing window')
    axes.set_title('FrozenLake 4x4 Q-learning')
    figure.savefig('frozen_lake4x4.png')

    if training_mode:
        with open("frozen_lake4x4.pkl", "wb") as file:
            pickle.dump(q_matrix, file)


In [None]:
# Train without rendering: visualize=True creates the environment with
# render_mode='human', which renders every step of every training episode.
# Note that with a per-episode epsilon decay of 0.0001, 1000 episodes only
# lower the exploration rate from 1.0 to 0.9.
execute_simulation(1000, training_mode=True, visualize=False)