In [2]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

In [22]:
# Shared FrozenLake environment used by the rest of the notebook.
# render_mode="human" asks gym to draw the game on screen, which the
# "watch the agent play" cell at the end relies on.
env = gym.make(
    "FrozenLake-v1",
    render_mode="human",
)

In [None]:
# Read the table dimensions from the environment: `.n` on the observation
# space gives the number of states, `.n` on the action space the number of
# actions.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Q-table with one row per state and one column per action; every Q(s, a)
# estimate starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))
print(q_table)

In [11]:
# --- Training schedule ---
# Number of episodes to train for, and the per-episode step cap used by the
# inner loops of the training and playback cells.
num_episodes, max_steps_per_episode = 8_000, 1_000

# --- Q-learning update ---
learning_rate = 0.1    # weight of the new estimate when blending into Q(s, a)
discount_rate = 0.99   # multiplier applied to the best next-state Q-value

# --- Epsilon-greedy exploration ---
# NOTE: `max_exploratin_rate` is misspelled, but the training cell reads it
# under exactly this name, so it must not be renamed here alone.
max_exploratin_rate = 1
exploration_rate = max_exploratin_rate   # start fully exploratory
exploration_decay_rate = 0.015           # exponent coefficient of the per-episode decay

In [None]:
rewards_all_episodes = []

# Train on a dedicated environment created WITHOUT a render mode. The shared
# `env` is built with render_mode="human", which draws the game while it is
# stepped; that is what the playback cell wants, but it makes thousands of
# training episodes impractically slow. Keep this id in sync with the id used
# for `env` earlier in the notebook.
train_env = gym.make('FrozenLake-v1')

# Q-Learning Algorithm
try:
    for episode in range(num_episodes):
        # reset() returns a bare observation on older gym releases and an
        # (observation, info) tuple on gym >= 0.26 / gymnasium; accept both.
        reset_result = train_env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

        done = False
        rewards_current_episode = 0

        for step in range(max_steps_per_episode):

            # Exploration-exploitation trade-off (epsilon-greedy)
            exploration_rate_threshold = random.uniform(0, 1)
            if exploration_rate_threshold > exploration_rate:
                # Exploit: choose the action with the highest Q-value for the current state
                action = np.argmax(q_table[state, :])
            else:
                # Explore: sample an action uniformly at random
                action = train_env.action_space.sample()

            # Take the chosen action. step() returns
            # (obs, reward, done, info) on the older API and
            # (obs, reward, terminated, truncated, info) on the newer one.
            step_result = train_env.step(action)
            new_state, reward = step_result[0], step_result[1]
            if len(step_result) == 5:
                done = step_result[2] or step_result[3]
            else:
                done = step_result[2]

            # Update the Q-table for Q(s, a): blend the old estimate with the
            # one-step target  reward + discount * max_a' Q(s', a').
            q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
                learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

            state = new_state
            rewards_current_episode += reward

            if done:
                break

        # Exploration rate decay (exponential in the episode index)
        exploration_rate = max_exploratin_rate * np.exp(-exploration_decay_rate * episode)

        rewards_all_episodes.append(rewards_current_episode)
finally:
    # Always release the training environment, even if training is interrupted.
    train_env.close()

# Calculate and print the average reward per thousand episodes.
# Slicing (rather than np.split with a float section count) also works when
# num_episodes is not a multiple of 1000: the last bucket is simply shorter
# and is averaged over its real length.
rewards_array = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    rewards_array[start:start + 1000]
    for start in range(0, len(rewards_array), 1000)
]
count = 0
print('-------------------Average reward per thousand episodes---------------------')
for r in rewards_per_thousand_episodes:
    count += len(r)  # label each bucket with the index of its last episode
    print(count, ':', str(r.sum() / len(r)))

# Print updated Q-table
print('\n\n--------------Q-Table----------------')
print(q_table)

In [23]:
# Watch the agent play the game: run three greedy episodes with the learned
# Q-table, rendering each step. The environment is closed in `finally` so the
# render window is released even if playback is interrupted or raises.
try:
    for episode in range(3):
        # reset() returns a bare observation on older gym releases and an
        # (observation, info) tuple on gym >= 0.26 / gymnasium; accept both.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False
        print("-----------Episode", episode+1, "---------\n\n\n")
        time.sleep(1)

        for step in range(max_steps_per_episode):
            clear_output(wait=True)
            env.render()
            time.sleep(0.3)

            # Always exploit: pick the best-known action for this state.
            action = np.argmax(q_table[state, :])

            # step() returns (obs, reward, done, info) on the older API and
            # (obs, reward, terminated, truncated, info) on the newer one.
            step_result = env.step(action)
            new_state, reward = step_result[0], step_result[1]
            if len(step_result) == 5:
                terminated, truncated = step_result[2], step_result[3]
                done = terminated or truncated
                timed_out = truncated and not terminated
            else:
                done = step_result[2]
                # Older gym's time-limit wrapper is documented to flag a
                # step-limit cut-off through this info key.
                timed_out = bool(step_result[3].get("TimeLimit.truncated", False))

            if done:
                clear_output(wait=True)
                env.render()
                if reward == 1:
                    print("****** You reached the goal! *******")
                elif timed_out:
                    # The episode was cut off by the step limit, not ended by a hole.
                    print("****** You ran out of steps! *******")
                else:
                    print("****** You fell through a hole! *******")
                time.sleep(2)
                clear_output(wait=True)
                break

            state = new_state
finally:
    env.close()

****** You reached the goal! *******
