## Libraries

In [1]:
import numpy as np 
import gym
import random
import time
from IPython.display import clear_output

## Creating The Environment

In [2]:
env = gym.make("FrozenLake-v0")

## Creating the Q-Table

In [3]:
actionSpaceSize = env.action_space.n
stateSpaceSize = env.observation_space.n

qTable = np.zeros((stateSpaceSize, actionSpaceSize))

print(qTable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


## Initializing Q-Learning Parameters

In [9]:
numEpisodes = 10000
maxStepsPerEpisode = 100

learningRate = 0.1
discountRate = 0.99

explorationRate = 1
maxExplorationRate = 1
minExplorationRate = 0.01
explorationDecayRate = 0.01

## Q-Learning Algorithm Training Loop

In [12]:
rewardsAllEpisodes = []

# Q-Learning Algorithm
for episode in range(numEpisodes):
    # initialize new episode params
    state = env.reset()
    done = False
    rewardsCurrentEpisode = 0

    for step in range(maxStepsPerEpisode):
        # Exploration-exploitation trade-off
        explorationRateThreshold = random.uniform(0, 1)
        if (explorationRateThreshold > explorationRate):
            action = np.argmax(qTable[state,:])
        else:
            action = env.action_space.sample()

        # Take new action
        newState, reward, done, info = env.step(action)

        # Update Q-table for Q(s,a)
        qTable[state, action] = qTable[state, action] * (1 - learningRate) + learningRate * (reward + discountRate * np.max(qTable[newState, :]))

        # Set new state
        state = newState
        rewardsCurrentEpisode += reward

        if (done == True):
            break
        
    # Exploration rate decay
    explorationRate = minExplorationRate + (maxExplorationRate - minExplorationRate) * np.exp(-explorationDecayRate * episode)

    # Add current episode reward to total rewards list
    rewardsAllEpisodes.append(rewardsCurrentEpisode)


# Calculate and print the average reward per thousand episodes
rewardsPerThousandEpisodes = np.split(np.array(rewardsAllEpisodes), numEpisodes/1000)
count = 1000

print("********* Average reward per thousand episodes *********\n")

for r in rewardsPerThousandEpisodes:
    print(f"{count}: {str(sum(r/1000))}")
    count += 1000

********* Average reward per thousand episodes *********

1000: 0.5280000000000004
2000: 0.6740000000000005
3000: 0.6900000000000005
4000: 0.7060000000000005
5000: 0.6630000000000005
6000: 0.6780000000000005
7000: 0.6780000000000005
8000: 0.6530000000000005
9000: 0.6550000000000005
10000: 0.6860000000000005


## Watch the Agent Play the Game

In [15]:
for episode in range(3):
    # initialize new episode params
    state = env.reset()
    done = False
    print(f"***** EPISODE {episode + 1} *****\n\n\n\n")
    time.sleep(1)

    for step in range(maxStepsPerEpisode):
        # Show current state of environment on screen
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # Choose action with highest Q-value for current state
        action = np.argmax(qTable[state, :])
        # Take new action
        newState, reward, done, info = env.step(action)

        if done:
            if (reward == 1):
                # agent reached goal and won episode
                print("****You reached the goal!****")
                time.sleep(3)
            else:
                # Agent stepped in a hole and lost episode
                print("****You fell through the hole!****")
                time.sleep(3)
                clear_output(wait=True)
            break

        # set new state
        state = newState

env.close()

  (Left)
SFFF
FH[41mF[0mH
FFFH
HFFG
****You fell through the hole!****
