# Frozen Lake environment test for Risk DRL

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt 
import torch
import random

Create environment.

In [2]:
# Make environment
env = gym.make("FrozenLake-v0") 

# Random seed
env.seed(0)

# Reset and render environment
env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


Meaning of characters in the environment:
- S: initial state
- F: frozen lake
- H: hole
- G: the goal
- Red square: indicates the current position of the player

Show the action and observation space

In [3]:
print("Action space:", env.action_space)
print("Observation space:", env.observation_space)

Action space: Discrete(4)
Observation space: Discrete(16)


## Random exploration routine
Make random actions and explore the observation space

In [4]:
def random_exploration(env, MAX_ITERATIONS):
    env.reset()
    for i in range(MAX_ITERATIONS):
        # Sample random action
        random_action = env.action_space.sample()

        # Make action in the environment
        new_state, reward, done, info = env.step(random_action)
        env.render()
        
        # If done, break the loop
        if done:
            break

In [5]:
MAX_ITERATIONS = 10
random_exploration(env, MAX_ITERATIONS)

  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Down)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG


# Vanilla Q-learning

## Epsilon-greedy functions

Epsilon-greedy action selection function

In [6]:
def eps_greedy(env, curr_state, q_matrix, epsilon):
    # random number to compare
    random_roulette = random.uniform(0, 1)

    # epsilon roulette
    if (epsilon < random_roulette):

        # Take greedy action
        action = np.argmax(q_matrix[curr_state, :])
        
    else:

        # Take random action
        action = env.action_space.sample
    return action 

Epsilon update function

In [7]:
def epsilon_update(epsilon, EPSILON_MIN, EPSILON_DECAY):
    epsilon = max(EPSILON_MIN, epsilon * EPSILON_DECAY)
    return epsilon

## Q-value functions

Update Q-matrix

In [8]:
def update_q(q_matrix, curr_state, new_state, curr_action, reward, learning_rate, gamma):

    # Compute value function
    value_function = np.max(q_matrix[new_state, :])

    # Update Q-values
    q_matrix[curr_state, curr_action] = q_matrix[curr_state, curr_action] + learning_rate * (reward + gamma * value_function - q_matrix[curr_state, curr_action])
    print(reward)

    return q_matrix

Q-learning RL

In [9]:
def qlearning_exploration(env, q_matrix, EPISODES, MAX_ITERATIONS, epsilon, learning_rate, gamma):
    
    for i in range(EPISODES):
        curr_state = env.reset()

        for t in range(MAX_ITERATIONS):
            # Get epsilon-greedy action
            curr_action = eps_greedy(env, curr_state, q_matrix, epsilon)
            # print(curr_action)

            # Make action in the environment
            new_state, reward, done, info = env.step(curr_action)
            env.render()

            # Update Q-matrix
            q_matrix = update_q(q_matrix, curr_state, new_state, curr_action, reward, learning_rate, gamma)
            curr_state = new_state
            # print(q_matrix)

            # Update 
            epsilon = epsilon_update(epsilon, EPSILON_MIN, EPSILON_DECAY)

            # If done, break the loop
            if done:
                break

## Call the learning routine

In [10]:
# Define maximum number of iterations and number of episodes
MAX_ITERATIONS = 100
EPISODES = 100

# epsilon-greedy parameter
EPSILON_MIN = 0.0001
EPSILON_DECAY = 0.9999
epsilon = 0.2

# learning rate parameter
learning_rate = 0.1

# Discount factor
gamma = 0.9999

# Size of observation/action space
state_size = env.observation_space.n
action_size = env.action_space.n

# Create Q-matrix
q_matrix = np.zeros((state_size, action_size))

# Call the learning routine
qlearning_exploration(env, q_matrix, EPISODES, MAX_ITERATIONS, epsilon, learning_rate, gamma)

  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
0.0
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
0.0
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
0.0
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
0.0
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
0.0
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
0.0
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
0.0
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
0.0
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
0.0
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
0.0
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
0.0
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
0.0
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
0.0
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
0.0


KeyError: <bound method Discrete.sample of Discrete(4)>