Environment reference (FrozenLake-v0): https://github.com/openai/gym/wiki/FrozenLake-v0

In [1]:
import gym
import numpy as np
import time
import random
from IPython.display import clear_output

In [2]:
# Create the environment registered under the id 'FrozenLake-v0'.
env=gym.make('FrozenLake-v0')
# Reset the environment; as the last expression of the cell, the value
# returned by reset() (used as the starting state elsewhere in this notebook)
# is displayed as the cell output.
env.reset()

0

In [3]:
# Table dimensions: one row per discrete state, one column per discrete action.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Every state-action value starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [11]:
# --- Training length ---
num_episodes = 15000            # number of episodes to train for
max_steps_per_episode = 99      # cap on the steps taken within one episode

# --- Q-learning update ---
learning_rate = 0.8             # weight given to the new estimate in each update
discount_rate = 0.95            # discount applied to the next state's best value

# --- Epsilon-greedy exploration schedule ---
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.005  # or 0.001
exploration_rate = max_exploration_rate  # exploration starts at its maximum

In [12]:
# Total reward obtained in each episode, in episode order.
rewards_all_episodes = []

# Tabular Q-learning with an epsilon-greedy behaviour policy.
for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # Exploit (greedy action) when the uniform draw exceeds the current
        # exploration rate; otherwise explore with a randomly sampled action.
        exploration_rate_threshold = random.uniform(0, 1)
        action = (
            np.argmax(q_table[state, :])
            if exploration_rate_threshold > exploration_rate
            else env.action_space.sample()
        )

        new_state, reward, done, info = env.step(action)

        # Move Q(state, action) towards reward + discounted best next value.
        old_value = q_table[state, action]
        td_target = reward + discount_rate * np.max(q_table[new_state, :])
        q_table[state, action] = old_value + learning_rate * (td_target - old_value)

        rewards_current_episode += reward
        state = new_state

        if done:
            break

    rewards_all_episodes.append(rewards_current_episode)

    # Exponentially decay the exploration rate towards its minimum as the
    # episode index grows.
    decay_factor = np.exp(-exploration_decay_rate * episode)
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * decay_factor
    

# After all episodes: report the average reward over consecutive blocks of
# 1000 episodes, then show the learned Q-table.
episodes_per_block = 1000
rewards_array = np.asarray(rewards_all_episodes, dtype=float)

# Slice at integer offsets rather than passing np.split a float section count
# (num_episodes/1000). Slicing also copes with an episode count that is not an
# exact multiple of 1000, where np.split would raise; the final block is then
# simply shorter.
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_block]
    for start in range(0, len(rewards_array), episodes_per_block)
]

print('Avg reward per 1000 episodes')
for block_index, r in enumerate(rewards_per_thousand_episodes, start=1):
    # Label each block with the index of its last episode (capped for a short
    # final block).
    count = min(block_index * episodes_per_block, len(rewards_array))
    # r.mean() averages over the block's actual length and avoids the rounding
    # noise of summing r/1000 element by element.
    print(count, ':', str(r.mean()))

print('q-table')
print(q_table)
    

Avg reward per 1000 episodes
1000 : 0.19500000000000015
2000 : 0.47100000000000036
3000 : 0.5440000000000004
4000 : 0.4940000000000004
5000 : 0.4880000000000004
6000 : 0.5210000000000004
7000 : 0.5470000000000004
8000 : 0.46700000000000036
9000 : 0.47000000000000036
10000 : 0.5240000000000004
11000 : 0.5240000000000004
12000 : 0.4890000000000004
13000 : 0.48000000000000037
14000 : 0.5140000000000003
15000 : 0.4850000000000004
q-table
[[1.73211729e-01 7.05297359e-02 6.88991873e-02 5.81112148e-02]
 [1.64621289e-02 6.77759942e-03 1.44131563e-03 1.61247959e-01]
 [6.19247918e-02 1.34525849e-03 2.91145828e-03 3.16872220e-02]
 [2.32222969e-02 2.90490173e-03 7.04893052e-03 4.52997110e-02]
 [1.97818527e-01 2.58163837e-03 2.89895601e-02 3.74514273e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.56284167e-02 1.09089164e-06 4.66036263e-05 5.87358682e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.19467001e-02 1.84788270e-03 2.00034000e-02 3.89250409e

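Now let's watch the trained agent play, choosing at every step the action with the highest Q-value for its current state.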

In [10]:
# Replay a couple of episodes with the greedy policy read from q_table,
# rendering the board after every move.
for episode in range(2):
    state = env.reset()
    done = False
    print('Episode', episode+1, "***\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Redraw the board in place so the output cell animates.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # Greedy action for the state the agent is currently in.
        action = np.argmax(q_table[state, :])
        print(action)
        new_state, reward, done, info = env.step(action)

        if done:
            clear_output(wait=True)
            env.render()
            if reward == 1:
                print('You reached the goal! AWESOME!!!')
            else:
                print('You fell through the hole.')
            # Leave the final message on screen before clearing it.
            time.sleep(3)
            clear_output(wait=True)
            break

        # Advance to the new state inside the step loop. Previously this
        # assignment ran only once per episode (after the step loop), so every
        # action was chosen from the episode's starting state.
        state = new_state

env.close()

  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
You fell through the hole.


In [7]:
# Zero the stored values for actions 0 and 3 in state 0 (in place), then
# display the whole table as the cell's output.
q_table[0, [0, 3]] = 0
q_table

array([[0.00000000e+00, 9.84387316e-02, 5.30953679e-02, 0.00000000e+00],
       [1.55591289e-02, 1.53927853e-02, 1.63976664e-02, 3.05635428e-02],
       [7.39991825e-03, 1.91580544e-02, 9.78082293e-03, 2.00496930e-02],
       [1.02737735e-02, 5.65908111e-03, 1.71923051e-03, 2.09966914e-02],
       [1.94593760e-01, 2.81938310e-03, 4.23260203e-02, 4.31782079e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.60732079e-04, 3.12689358e-06, 7.36007861e-04, 9.25999515e-07],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.27838248e-02, 4.09561223e-02, 5.44863298e-02, 1.46562617e-01],
       [2.63873734e-02, 4.16752025e-02, 1.65742258e-02, 4.36006856e-03],
       [1.11411153e-02, 2.86908620e-03, 4.31673576e-03, 2.02188453e-03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.68424241e-02, 2.64240277e-02, 5.69993195e

In [8]:
rewards_all_episodes = []

# Q-learning algorithm
for episode in range(num_episodes):
    # initialize new episode params

    for step in range(max_steps_per_episode): 

    # Exploration-exploitation trade-off
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state,:]) 
        else:
            action = env.action_space.sample()
