In [2]:
import numpy as np
import gym
import random

In [5]:
env = gym.make("FrozenLake-v0")

In [6]:
action_size = env.action_space.n
state_size = env.observation_space.n
print(state_size,action_size)

(16, 4)


In [7]:
qtable = np.zeros((state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [17]:
total_episodes = 15000
learning_rate = 0.8
max_steps = 99
gamma = 0.95

# Exploration params
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005 # exponential decay rate for exploration prob

In [19]:
rewards = []
# 2. Run till forever or until learning is stopped
for episode in range(total_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        # 3. choose an action a in the current world state (s)
        exp_tradeoff = random.uniform(0, 1)
        
        # if exploration tradeoff > epsilon = exploitation (taking the biggest Q value for this state)
        if exp_tradeoff > epsilon:
            action = np.argmax(qtable[state,:])

        # else, doing random choice = exploitation
        else:
            action = env.action_space.sample()
        
        
        # Take the action (a) and observe the outcome state(s') and reward (r)
        new_state, reward, done, info = env.step(action)
        
        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)] (w/ Bellman Equation)
        # qtable[new_state,:] : all the actions we can take from new state
        qtable[state, action] = qtable[state, action] + learning_rate  * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
        total_rewards += reward
        
        # Our new_state is state
        state = new_state
        
        if done == True:
            break
            
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    rewards.append(total_rewards)
    
print("Score over time:", str(sum(rewards)/total_episodes))
print(qtable)
        

('Score over time:', '0.0238')
[[-2.59349524e+200 -9.58508399e+199 -1.26178150e+200  1.12172502e+200]
 [-1.35241977e+200 -2.92754826e+199 -4.05048212e+199 -7.52938366e+199]
 [-1.67225833e+199 -3.23475973e+199 -1.00782213e+198 -1.33459205e+199]
 [-1.45663125e+192 -3.95654395e+188  1.08504817e+072 -2.86853069e+197]
 [-9.43380096e+199 -8.72024504e+199 -6.14937075e+199 -1.33932020e+200]
 [ 0.00000000e+000  0.00000000e+000  0.00000000e+000  0.00000000e+000]
 [-1.76071768e+198 -1.48661097e+198 -6.45285806e+198 -3.16342795e+197]
 [ 0.00000000e+000  0.00000000e+000  0.00000000e+000  0.00000000e+000]
 [-6.75878526e+199  4.29521353e+198 -1.25568843e+199 -3.45930724e+199]
 [-6.73405547e+198 -6.58016698e+198 -8.26500286e+198 -9.97846238e+198]
 [-1.40908142e+198 -3.59034483e+198 -1.62961692e+198 -1.38497617e+198]
 [ 0.00000000e+000  0.00000000e+000  0.00000000e+000  0.00000000e+000]
 [ 0.00000000e+000  0.00000000e+000  0.00000000e+000  0.00000000e+000]
 [ 1.32470604e+198 -4.73090637e+198 -3.4712994

In [20]:
env.reset()

for episode in range(5):
    state = env.reset()
    step = 0
    done = False
    print("*************************************************")
    print("EPISODE", episode)
    
    for step in range(max_steps):
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])
        new_state, reward, done, info = env.step(action)
        
        if done:
            # we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            env.render()
            
            # print the number of step it took
            print("Number of steps:", step)
            break
        state = new_state
env.close()

*************************************************
('EPISODE', 0)
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
('Number of steps:', 5)
*************************************************
('EPISODE', 1)
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
('Number of steps:', 1)
*************************************************
('EPISODE', 2)
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
('Number of steps:', 1)
*************************************************
('EPISODE', 3)
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
('Number of steps:', 5)
*************************************************
('EPISODE', 4)
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
('Number of steps:', 5)
