In [2]:
import numpy as np
import gym
import random

In [5]:
env = gym.make("FrozenLake-v0")

In [6]:
action_size = env.action_space.n
state_size = env.observation_space.n
print(state_size,action_size)

(16, 4)


In [7]:
qtable = np.zeros((state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [21]:
total_episodes = 15000
learning_rate = 0.8
max_steps = 99
gamma = 0.95

# Exploration params
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005 # exponential decay rate for exploration prob

In [34]:
rewards = []
# 2. Run till forever or until learning is stopped
for episode in range(total_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        # 3. choose an action a in the current world state (s)
        exp_tradeoff = random.uniform(0, 1)
        
        # if exploration tradeoff > epsilon = exploitation (taking the biggest Q value for this state)
        if exp_tradeoff > epsilon:
            action = np.argmax(qtable[state,:])

        # else, doing random choice = exploitation
        else:
            action = env.action_space.sample()
        

        # Take the action (a) and observe the outcome state(s') and reward (r)
        new_state, reward, done, info = env.step(action)
        
        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)] (w/ Bellman Equation)
        # qtable[new_state,:] : all the actions we can take from new state
        qtable[state, action] = qtable[state, action] + learning_rate  * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
        total_rewards += reward
        
        # Our new_state is state
        state = new_state
        
        if done == True:
            break
            
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    rewards.append(total_rewards)
    
print("Score over time:", str(sum(rewards)/total_episodes))
print(qtable)
        

('Score over time:', '0.479266666667')
[[1.03152121e-01 9.95906999e-02 7.72235724e-02 7.93208796e-02]
 [6.74643176e-03 3.99772215e-03 4.35975063e-03 9.64659559e-02]
 [3.41145262e-03 8.86059183e-03 4.89456558e-03 4.30552610e-02]
 [3.69093434e-03 3.20176134e-03 8.02524386e-04 3.67885091e-02]
 [1.01143502e-01 1.10399672e-03 6.15961069e-03 1.22246598e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.68770636e-02 1.08157622e-07 2.40630746e-04 3.80308832e-07]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.35299623e-04 4.41967265e-02 2.80232798e-02 8.48357368e-02]
 [2.44424662e-02 6.32075510e-01 4.41352239e-04 1.65393536e-02]
 [7.43279293e-01 1.45410929e-02 9.37756463e-04 1.33663801e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.46357125e-02 2.12811817e-02 8.64510107e-01 2.55184710e-02]
 [1.83744005e-01 9.65165650e-01 3.37403521e-01 3.86667047e-01]
 [0.00000000e+00

In [38]:
env.reset()

for episode in range(5):
    state = env.reset()
    step = 0
    done = False
    print("*************************************************")
    print("EPISODE", episode)
    
    for step in range(max_steps):
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])
        new_state, reward, done, info = env.step(action)
        
        if done:
            # we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            env.render()
            
            # print the number of step it took
            print("Number of steps:", step)
            break
        state = new_state
env.close()

*************************************************
('EPISODE', 0)
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
('Number of steps:', 53)
*************************************************
('EPISODE', 1)
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
('Number of steps:', 23)
*************************************************
('EPISODE', 2)
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
('Number of steps:', 8)
*************************************************
('EPISODE', 3)
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
('Number of steps:', 18)
*************************************************
('EPISODE', 4)
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
('Number of steps:', 12)
