In [1]:
import numpy as np
import gymnasium as gym
import time
import random
from IPython.display import clear_output

In [2]:
env = gym.make("FrozenLake-v1", render_mode="ansi")  # Set render mode during initialization
state_space_size = env.observation_space.n
action_space_size = env.action_space.n
q_table = np.zeros((state_space_size, action_space_size))

In [3]:
num_episodes = 10000
max_steps_per_episodes = 100
learning_rate = 0.1
discount_rate = 0.99
exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

In [4]:
# Q-learning algorithm
rewards_all_episodes = []

for episode in range(num_episodes):
    state, _ = env.reset()  # Reset the environment for the new episode
    done = False
    reward_current_episode = 0

    for step in range(max_steps_per_episodes):
        exploration_rate_threshold = random.uniform(0, 1)

        if exploration_rate_threshold > exploration_rate:
            # Exploit: Choose the action with the highest Q-value
            action = np.argmax(q_table[state, :])
        else:
            # Explore: Choose a random action
            action = env.action_space.sample()

        # Take the action and observe the result
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Update Q-table for Q(s, a)
        q_table[state, action] = q_table[state, action] + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]) - q_table[state, action])

        # Move to the next state
        state = new_state
        reward_current_episode += reward

        if done:
            break

    # Exploration decay rate
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
    
    rewards_all_episodes.append(reward_current_episode)

# Display the average reward per thousand episodes
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes), num_episodes / 1000)
count = 1000
print("********Average reward per thousand episodes********\n")
for r in rewards_per_thousand_episodes:
    print(count, ": ", str(sum(r / 1000)))
    count += 1000

# Display the final Q-table
print("\n\n********Q-table********\n")
print(q_table)

********Average reward per thousand episodes********

1000 :  0.05600000000000004
2000 :  0.20700000000000016
3000 :  0.4030000000000003
4000 :  0.5680000000000004
5000 :  0.6350000000000005
6000 :  0.6630000000000005
7000 :  0.6770000000000005
8000 :  0.6760000000000005
9000 :  0.6830000000000005
10000 :  0.6950000000000005


********Q-table********

[[0.55991633 0.51888296 0.51738445 0.5172599 ]
 [0.38825556 0.32242106 0.30016811 0.51704216]
 [0.43137211 0.43241209 0.38449642 0.48655849]
 [0.35305305 0.24254757 0.28720969 0.45650481]
 [0.57811256 0.38143188 0.23017874 0.31254987]
 [0.         0.         0.         0.        ]
 [0.17573808 0.14530394 0.3980261  0.09731311]
 [0.         0.         0.         0.        ]
 [0.33773014 0.25552165 0.53690488 0.61814273]
 [0.44207826 0.67355757 0.43370435 0.3823173 ]
 [0.6223053  0.39640149 0.39400322 0.34222534]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.62561678 0.56802145 0.77619992 

In [6]:
import time
from IPython.display import clear_output

# Test the trained agent with a few episodes
for episode in range(3):
    state, _ = env.reset()
    done = False
    print("******Episode ", episode + 1, "******\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episodes):
        # Remove clear_output and make sure rendering happens correctly
        env.render()  # Render the environment
        time.sleep(0.3)

        action = np.argmax(q_table[state, :])  # Take the action with the highest Q-value
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated  # Check if the episode is done

        if done:
            env.render()  # Render the final state of the episode
            if reward == 1:
                print("****You reached the goal!****")
            else:
                print("****You fell through a hole!****")
            time.sleep(3)
            break

        state = new_state

# Close the environment
env.close()


******Episode  1 ******




****You fell through a hole!****
******Episode  2 ******




****You reached the goal!****
******Episode  3 ******




****You fell through a hole!****
