In [2]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

In [3]:
# Build the "FrozenLake-v0" environment through gym's registry.
# The later cells read its spaces, train on it and render it via `my_env`.
my_env = gym.make("FrozenLake-v0")

In [4]:
# Table dimensions come straight from the environment's discrete spaces.
size_of_state_space = my_env.observation_space.n
size_of_action_space = my_env.action_space.n

# One row per state, one column per action; every entry starts at zero.
q_table = np.zeros(shape=(size_of_state_space, size_of_action_space))

# Show the untrained table so it can be compared with the trained one later.
print("The initial empty q-table")
print(q_table)

The initial empty q-table
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


Parameters to be used

In [5]:
# --- Training schedule ---
number_of_episodes = 10000         # episodes run by the training loop
maximum_steps_per_episode = 100    # upper bound on steps inside one episode

# --- Q-learning update ---
learning_rate = 0.1                # weight of the new estimate in the update
discount_rate = 0.99               # weight of the best next-state value

# --- Epsilon-greedy exploration ---
maximum_exploration_rate = 1
minimum_exploration_rate = 0.01
rate_of_exploration_decay = 0.001  # exponent factor of the per-episode decay
exploration_rate = maximum_exploration_rate  # training starts fully exploratory


Now the training code

In [6]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
#
# Every run of this cell starts from the same state: the reward log, the
# exploration rate and the Q-table are all reset here. Re-running the cell
# therefore retrains from scratch instead of pairing a fresh reward log with
# an already-trained table and a stale exploration rate.
all_episode_rewards = []
exploration_rate = maximum_exploration_rate
q_table = np.zeros_like(q_table)

# Loop over all episodes
for episode in range(number_of_episodes):
    state = my_env.reset()

    finished = False
    rewards_current_episode = 0

    # Loop over the steps of the current episode
    for step in range(maximum_steps_per_episode):

        # Exploration-exploitation trade-off: act greedily when the random
        # draw exceeds the current exploration rate, otherwise sample a
        # random action from the environment.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = my_env.action_space.sample()

        new_state, reward, finished, info = my_env.step(action)

        # Update q(s, a): blend the old value with the new target
        # reward + discount_rate * max_a' q(s', a').
        best_next_value = np.max(q_table[new_state, :])
        q_table[state, action] = (
            (1 - learning_rate) * q_table[state, action]
            + learning_rate * (reward + discount_rate * best_next_value)
        )
        state = new_state
        rewards_current_episode += reward

        if finished:
            break

    # Exploration rate decay: exponential in the episode index, moving from
    # maximum_exploration_rate towards minimum_exploration_rate.
    exploration_rate = minimum_exploration_rate + (
        maximum_exploration_rate - minimum_exploration_rate
    ) * np.exp(-rate_of_exploration_decay * episode)

    all_episode_rewards.append(rewards_current_episode)
        


In [7]:
# Calculate and print the average reward per block of one thousand episodes.
#
# The blocks are cut by slicing rather than np.split, so the section count is
# never a float and a run whose length is not a multiple of the block size
# still reports (the last, shorter block is averaged over its own length).
# Using .mean() avoids the rounding noise of summing pre-divided floats.
episodes_per_block = 1000
rewards_array = np.asarray(all_episode_rewards, dtype=float)
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_block]
    for start in range(0, len(rewards_array), episodes_per_block)
]

count = 0
print("********Average rewards per thousand episodes********\n")
for r in rewards_per_thousand_episodes:
    # Label each block with the index of its last episode (1000, 2000, ...).
    count += len(r)
    print(count, ": ", str(r.mean()))


# Print the updated q-table
print("\n\n*******the updated Q-TABLE after training*******\n")
print(q_table)

********Average rewards per thousand episodes********

1000 :  0.05200000000000004
2000 :  0.23300000000000018
3000 :  0.4270000000000003
4000 :  0.5230000000000004
5000 :  0.6210000000000004
6000 :  0.6630000000000005
7000 :  0.6790000000000005
8000 :  0.7050000000000005
9000 :  0.6680000000000005
10000 :  0.6500000000000005


*******the updated Q-TABLE after training*******

[[0.51986909 0.43379899 0.45420433 0.44421998]
 [0.30882641 0.2651956  0.32296555 0.39995914]
 [0.35538308 0.2308098  0.25625948 0.27101558]
 [0.11474967 0.041074   0.03321536 0.1121695 ]
 [0.54074005 0.36489186 0.32892762 0.38870645]
 [0.         0.         0.         0.        ]
 [0.37718596 0.1179113  0.15457639 0.07974381]
 [0.         0.         0.         0.        ]
 [0.41124377 0.40513504 0.29805339 0.56333238]
 [0.35834963 0.60458632 0.42061289 0.39198793]
 [0.60847364 0.28716834 0.40044903 0.26106866]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.57423

Everything worked up to this point!

Now that we are done with the training, we begin to test our trained agent.


In [8]:
# Watch the trained agent play: always take the greedy action from q_table,
# render every step, and count how many episodes end with a reward of 1.
win = 0
num_episd = 10
for the_episode in range(num_episd):
    game_state = my_env.reset()
    game_finished = False
    print("*******EPISODE  ", the_episode+1, "*********\n\n\n\n")
    time.sleep(2)

    for step in range(maximum_steps_per_episode):
        # Redraw the board in place so the output reads like an animation.
        clear_output(wait=True)
        my_env.render()
        time.sleep(0.6)

        game_action = np.argmax(q_table[game_state, :])
        game_new_state, game_reward, game_finished, game_info = my_env.step(game_action)

        if game_finished:
            clear_output(wait=True)
            my_env.render()
            if game_reward == 1:
                win += 1
                print("*****WOW!: You reached the goal!******")
            elif game_info.get("TimeLimit.truncated", False):
                # NOTE(review): assumes the installed gym's TimeLimit wrapper
                # sets this info key on time-outs; when the key is absent the
                # original "hole" message below is used -- confirm for the
                # pinned gym version.
                print("*****LOST: You ran out of steps******")
            else:
                print("*****LOST: You fell through a hole******")
            time.sleep(3.1)
            clear_output(wait=True)
            break

        game_state = game_new_state
    else:
        # The step budget ran out before the environment reported an end;
        # report it instead of moving on silently (still counted as a failure).
        print("*****LOST: You ran out of steps******")
        time.sleep(3.1)
        clear_output(wait=True)

my_env.close()

# Guard the percentage against an empty test run (num_episd == 0).
accuracy = (win / num_episd) * 100 if num_episd else 0.0
print("Number of of Trials: %d\n" %num_episd)
print("Number of Successes: %d\n" %win)
print("Number of Failures: %d\n" %(num_episd-win))
print("Testing accuracy: %2.2f" %accuracy, "%")

Number of of Trials: 10

Number of Successes: 7

Number of Failures: 3

Testing accuracy: 70.00 %
