# Frozen Lake

The following is a solution to the [FrozenLake](https://gym.openai.com/envs/FrozenLake-v0/) environment using q-learning, adopted from Thomas Simonini's RL blog posts [here](https://www.freecodecamp.org/news/diving-deeper-into-reinforcement-learning-with-q-learning-c18d0db58efe/).


In [3]:
import numpy as np
import gym
import random

In [42]:
# Setup environment
env = gym.make("FrozenLake-v0")

action_size = env.action_space.n
state_size = env.observation_space.n

qtable = np.zeros((state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [44]:
# Create hyperparameters
total_episodes = 15000        # Total episodes
learning_rate = 0.8           # Learning rate
max_steps = 200                # Max steps per episode
gamma = 0.95                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.005             # Exponential decay rate for exploration prob

In [54]:
# Implement q-learning algorithm

"""
1. Implement Q-values (Q(s, a)) arbitrarily for all state-action pairs (done)
2. For life or until learning is stopped...
3. Choose an action (a) in the current world state (s) based on current Q-value estimates 
4. Take the action (a) and observe the outcome state (s') and reward (r)
5. Update Q(s, a) := Q(s, a) + learning_rate(r + \gamma )
"""
rewards = []

def exploit():
    """Returns True if we are to take an action."""
    # Select a random number
    n = random.uniform(0, 1)
    return n > epsilon # If n > epislon, we exploit.

def update_qtable(state, action, new_state):
    qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])

for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for s in range(max_steps):
        # Choose an action.
        if exploit():
            # Exploit.
            action = np.argmax(qtable[state, :])
        else:
            # Explore; choose and take a random action.
            action = env.action_space.sample()
            
        new_state, reward, done, info = env.step(action)
            
        # Update the qtable.
        update_qtable(state, action, new_state)
            
        # Update other variables.
        total_rewards += reward
        state = new_state
            
        # Check if we're dead.
        if done == True:
            break

    # Reduce epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)
    
    # Print score every few steps to make sure that we're learning.
    if episode % 1000 == 0:
        average_score = round(sum(rewards)/total_episodes, 2)
        print("Episode {episode}: {average_score}".format(episode=episode, average_score=average_score))

print(qtable)

Episode 0: 0.0
Episode 1000: 0.0
Episode 2000: 0.01
Episode 3000: 0.02
Episode 4000: 0.03
Episode 5000: 0.04
Episode 6000: 0.05
Episode 7000: 0.05
Episode 8000: 0.06
Episode 9000: 0.07
Episode 10000: 0.08
Episode 11000: 0.09
Episode 12000: 0.09
Episode 13000: 0.1
Episode 14000: 0.11
Episode 15000: 0.12
Episode 16000: 0.13
Episode 17000: 0.13
Episode 18000: 0.14
Episode 19000: 0.15
Episode 20000: 0.16
Episode 21000: 0.17
Episode 22000: 0.18
Episode 23000: 0.18
Episode 24000: 0.19
Episode 25000: 0.2
Episode 26000: 0.21
Episode 27000: 0.22
Episode 28000: 0.23
Episode 29000: 0.23
Episode 30000: 0.24
Episode 31000: 0.25
Episode 32000: 0.26
Episode 33000: 0.27
Episode 34000: 0.28
Episode 35000: 0.28
Episode 36000: 0.29
Episode 37000: 0.3
Episode 38000: 0.31
Episode 39000: 0.32
Episode 40000: 0.33
Episode 41000: 0.33
Episode 42000: 0.34
Episode 43000: 0.35
Episode 44000: 0.36
Episode 45000: 0.36
Episode 46000: 0.37
Episode 47000: 0.38
Episode 48000: 0.39
Episode 49000: 0.4
Episode 50000: 0.41

In [59]:
# Play frozen lake

env.reset()

for episode in range(10):
    state = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])
        
        new_state, reward, done, info = env.step(action)
        
        if done:
            # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            env.render()
            
            # We print the number of step it took.
            print("Number of steps", step)
            break
        state = new_state
env.close()

****************************************************
EPISODE  0
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 74
****************************************************
EPISODE  1
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 12
****************************************************
EPISODE  2
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 10
****************************************************
EPISODE  3
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 54
****************************************************
EPISODE  4
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 16
****************************************************
EPISODE  5
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 13
****************************************************
EPISODE  6
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 84
****************************************************
EPISODE  7
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 21
********************************