https://github.com/openai/gym/wiki/FrozenLake-v0

In [1]:
import gym
import numpy as np
import time
import random
from IPython.display import clear_output

In [2]:
# Create the FrozenLake environment and put it into its initial state.
# reset() is left as the cell's last expression so its return value
# (the initial observation) is displayed as the cell output.
env = gym.make('FrozenLake-v0')
env.reset()

0

In [3]:
# Sizes of the discrete action and observation spaces reported by the env.
action_space_size = env.action_space.n
state_space_size = env.observation_space.n

# Q-table: one row per state, one column per action, initialised to zero.
q_table_shape = (state_space_size, action_space_size)
q_table = np.zeros(q_table_shape)
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [10]:
# --- Training budget ---------------------------------------------------
num_episodes = 10000           # number of training episodes
max_steps_per_episode = 100    # hard cap on steps within one episode

# --- Q-learning update -------------------------------------------------
learning_rate = 0.8            # weight given to the new estimate
discount_rate = 0.95           # weight given to future reward

# --- Epsilon-greedy exploration schedule -------------------------------
# exploration_rate starts at the maximum and is decayed exponentially
# towards the minimum by the training loop.
exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = .01
exploration_decay_rate = .001

In [11]:
rewards_all_episodes = []

# Start each run of this cell from the top of the epsilon schedule, so that
# re-running the cell does not begin with a stale, already-decayed rate.
# (On a fresh kernel this is a no-op: the config cell sets the same value.)
exploration_rate = max_exploration_rate

# Q-learning algorithm
for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):

        # Epsilon-greedy action selection: exploit the current Q estimates
        # when the random draw exceeds exploration_rate, otherwise explore
        # with a uniformly sampled action.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Update the Q-table: blend the old estimate with the bootstrapped
        # target (immediate reward + discounted best value of the next state).
        td_target = reward + discount_rate * np.max(q_table[new_state, :])
        q_table[state, action] = (
            q_table[state, action] * (1 - learning_rate)
            + learning_rate * td_target
        )

        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # Exponentially decay the exploration rate from max towards min as the
    # episode index grows.
    exploration_rate = min_exploration_rate + (
        max_exploration_rate - min_exploration_rate
    ) * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)


# After all episodes: average reward over consecutive windows of up to 1000
# episodes. Slicing (rather than np.split with a float section count) also
# works when num_episodes is not a multiple of 1000; a trailing partial
# window is averaged over its actual length and labelled with the real
# episode count.
rewards_array = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    rewards_array[start:start + 1000]
    for start in range(0, len(rewards_array), 1000)
]
count = 0
print('Avg reward per 1000 episodes')
for r in rewards_per_thousand_episodes:
    count += len(r)
    print(count, ':', str(r.mean()))

print('q-table')
print(q_table)
    

Avg reward per 1000 episodes
1000 : 0.03200000000000002
2000 : 0.08700000000000006
3000 : 0.16100000000000012
4000 : 0.2950000000000002
5000 : 0.3830000000000003
6000 : 0.4910000000000004
7000 : 0.46900000000000036
8000 : 0.47400000000000037
9000 : 0.4980000000000004
10000 : 0.46100000000000035
q-table
[[2.73320637e-01 4.13300722e-02 4.50751374e-02 6.51525140e-02]
 [1.74928705e-02 1.70932982e-02 9.01888102e-03 5.63473771e-02]
 [6.47406511e-03 3.48646101e-03 4.33025426e-03 1.44413043e-02]
 [3.90438214e-04 1.33734517e-03 1.82008789e-03 1.58995824e-02]
 [4.39217180e-01 3.86438839e-02 2.81922482e-03 2.50535458e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.36807097e-05 1.42485315e-05 1.21417968e-04 1.74725502e-08]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.39490261e-02 4.73018627e-04 4.54004683e-04 4.49439909e-01]
 [3.82223053e-03 6.86158380e-01 1.97224653e-02 2.94628896e-02]
 [1.42431954e-01 1.90209270e-03 8.36198354e-04 9.18541964e-04]
 [0

Now watch the trained agent play, always taking the greedy action from the learned Q-table.

In [12]:
# Replay a couple of episodes with the learned Q-table, always taking the
# greedy (argmax) action, and render each step in place.
for episode in range(2):
    state = env.reset()
    done = False
    print('Episode', episode+1, "***\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Redraw the board in place so the output looks like an animation.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        action = np.argmax(q_table[state, :])
        print(action)
        new_state, reward, done, info = env.step(action)

        if done:
            clear_output(wait=True)
            env.render()
            # NOTE(review): `done` with reward != 1 is reported as a fall;
            # if the environment can also end an episode for another reason
            # (e.g. a step limit), this message would be misleading — confirm.
            if reward == 1:
                print('You reached the goal! AWESOME!!!')
            else:
                print('You fell through the hole.')
            time.sleep(3)
            clear_output(wait=True)
            break

        # Advance to the next state *inside* the step loop; otherwise every
        # action would be chosen from the episode's starting state.
        state = new_state

env.close()

  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
You fell through the hole.


In [7]:
# Manually zero the start-state (row 0) Q-values for actions 0 and 3,
# then display the resulting table as the cell output.
q_table[0, [0, 3]] = 0
q_table

array([[0.00000000e+00, 2.90650006e-01, 5.22125288e-01, 0.00000000e+00],
       [6.70978393e-02, 8.12194220e-04, 1.93956823e-02, 5.89523441e-01],
       [1.10974466e-01, 9.22953159e-02, 9.32758899e-02, 2.80063211e-01],
       [2.93015834e-02, 7.01149305e-03, 3.08735615e-02, 2.20756039e-01],
       [6.59379758e-01, 1.35504478e-01, 4.76926286e-01, 1.08108660e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.61362957e-02, 5.20324226e-09, 9.35781575e-04, 2.40236821e-06],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.60891902e-03, 5.15468621e-01, 1.68087870e-02, 8.65609207e-01],
       [1.23008304e-03, 9.28298138e-01, 1.24863635e-01, 1.38745333e-01],
       [3.33435842e-01, 9.43860907e-04, 6.27862558e-05, 3.45342035e-03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.97350868e-02, 3.85121083e-02, 8.75811346e

In [8]:
rewards_all_episodes = []

# Q-learning algorithm
for episode in range(num_episodes):
    # initialize new episode params

    for step in range(max_steps_per_episode): 

    # Exploration-exploitation trade-off
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state,:]) 
        else:
            action = env.action_space.sample()
