Import the dependencies

In [34]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

Create the environment

In [35]:
# Build the environment from its registered Gym id; every later cell
# (sizing the Q-table, training, and the demo run) goes through this object.
env = gym.make("FrozenLake-v0")

Create the Q-table and initialize it 

Now we'll create our Q-table. To know how many rows (states) and columns (actions) it needs, we first have to determine the state_size and the action_size.
OpenAI Gym provides us a way to do that: env.action_space.n and env.observation_space.n

In [36]:
# Both spaces expose their cardinality as `.n`:
# observations give the number of Q-table rows, actions the number of columns.
state_size = env.observation_space.n
action_size = env.action_space.n

In [37]:
# Start with no knowledge: a (states x actions) table where every Q-value is 0.
q_table = np.zeros(shape=(state_size, action_size))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


#### Algorithm parameters
Here, we'll specify the hyperparameters 

In [38]:
# --- Training budget ---
num_episodes = 10000            # how many episodes the agent plays in total
max_steps_per_episode = 100     # hard cap on time-steps inside one episode

# --- Q-learning update ---
learning_rate = 0.1             # weight given to the newly learned value (alpha)
discount_rate = 0.99            # weight given to future rewards (gamma)

# --- Epsilon-greedy exploration schedule ---
max_exploration_rate = 1        # exploration probability at the start of training
min_exploration_rate = 0.01     # floor the exploration probability decays towards
exploration_decay_rate = 0.001  # exponential decay rate applied per episode
exploration_rate = max_exploration_rate  # current epsilon; starts fully exploratory

Coding The Q-Learning Algorithm Training Loop
### Q-learning algorithm
for episode in range(num_episodes):
    # initialize new episode params

    for step in range(max_steps_per_episode): 
        # Exploration-exploitation trade-off
        # Take new action
        # Update Q-table
        # Set new state
        # Add new reward        

    # Exploration rate decay   
    # Add current episode reward to total rewards list

In [39]:
# Total reward collected in each episode, in play order.
rewards_all_episodes = []

# Q-learning training.
# Outer loop: one iteration per episode.
# Inner loop: one iteration per time-step within that episode.
for episode in range(num_episodes):
    # Fresh episode: reset the environment and the per-episode bookkeeping.
    state = env.reset()
    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):

        # Epsilon-greedy choice: draw a number in [0, 1]; if it does not exceed
        # the current exploration rate we explore, otherwise we exploit.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold <= exploration_rate:
            # Exploration: pick a random action.
            action = env.action_space.sample()
        else:
            # Exploitation: pick the action with the largest Q-value in this state.
            action = np.argmax(q_table[state])

        # Apply the action. step() hands back the next state, the reward,
        # a flag telling whether the episode ended, and diagnostic info.
        new_state, reward, done, info = env.step(action)

        # Q(s, a) <- (1 - alpha) * old_value + alpha * learned_value,
        # where learned_value = r + gamma * max_a' Q(s', a').
        old_value = q_table[state, action]
        learned_value = reward + discount_rate * np.max(q_table[new_state, :])
        q_table[state, action] = old_value * (1 - learning_rate) + learning_rate * learned_value

        # Move on to the next state and accumulate this step's reward.
        state = new_state
        rewards_current_episode += reward

        # The environment reported the end of the episode: stop stepping.
        if done:
            break

    # Exponentially decay the exploration rate from max towards min,
    # because less and less exploration is needed as training progresses.
    decay_factor = np.exp(-exploration_decay_rate * episode)
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * decay_factor
    rewards_all_episodes.append(rewards_current_episode)
 # Calculate and print the average reward per thousand episodes
# Group the per-episode rewards into consecutive chunks of `episodes_per_chunk`
# and report the mean reward of each chunk.
#
# Slicing with a fixed chunk size (instead of handing np.split a float section
# count) keeps every chunk exactly `episodes_per_chunk` long, so the printed
# figure really is a per-thousand average even when num_episodes is not a
# multiple of 1000; a shorter final chunk is averaged over its own length.
episodes_per_chunk = 1000
rewards_array = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_chunk]
    for start in range(0, len(rewards_array), episodes_per_chunk)
]
count = 0

print("********Average reward per thousand episodes********\n")
for r in rewards_per_thousand_episodes:
    # Label each line with the number of episodes played so far.
    count += len(r)
    # mean() avoids the float noise of summing a thousand pre-divided terms.
    print(count, ": ", str(r.mean()))

********Average reward per thousand episodes********

1000 :  0.058000000000000045
2000 :  0.22800000000000017
3000 :  0.4140000000000003
4000 :  0.5700000000000004
5000 :  0.6470000000000005
6000 :  0.6560000000000005
7000 :  0.6850000000000005
8000 :  0.7040000000000005
9000 :  0.6620000000000005
10000 :  0.6650000000000005


From this printout, we can see our average reward per thousand episodes did indeed progress over time. 
When the algorithm first started training, the first thousand episodes only averaged a reward of 0.05, 
but by the time it got to its last thousand episodes, the reward drastically improved to 0.65. 

Our agent played 10,000 episodes. At each time step within an episode, the agent received a reward of 1 if it reached the frisbee, otherwise, it received a reward of 0. If the agent did indeed reach the frisbee, then the episode finished at that time-step.

So, that means for each episode, the total reward received by the agent for the entire episode is either 1 or 0. So, for the first thousand episodes, we can interpret this score as meaning that 5% of the time, the agent received a reward of 1 and won the episode. And by the last thousand episodes from a total of 10,000, the agent was winning 65% of the time.

By analyzing the grid of the game, we can see it's a lot more likely that the agent would fall in a hole or perhaps reach the max time steps than it is to reach the frisbee, so reaching the frisbee 65% of the time isn't too shabby, especially since the agent had no explicit instructions to reach the frisbee. It learned that this is the correct thing to do.

In [40]:
# Show the Q-table as it stands after training, under a banner.
print("\n\n********Q-table********\n", q_table, sep="\n")



********Q-table********

[[0.56651888 0.47726023 0.47557092 0.47259415]
 [0.32582891 0.30168938 0.34903522 0.48050215]
 [0.40513376 0.40875344 0.40643236 0.43960892]
 [0.34950421 0.2179258  0.22658587 0.42347658]
 [0.57995993 0.41259932 0.38440829 0.43633523]
 [0.         0.         0.         0.        ]
 [0.18186762 0.16966767 0.33757415 0.12303268]
 [0.         0.         0.         0.        ]
 [0.37902219 0.40392625 0.3995311  0.62214574]
 [0.38632498 0.67204516 0.31229379 0.40910617]
 [0.61871364 0.43214808 0.34375371 0.32742847]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.57159051 0.49424274 0.75323657 0.38604726]
 [0.68890127 0.86400588 0.69426378 0.70536482]
 [0.         0.         0.         0.        ]]


#### Watch our agent play Frozen Lake by playing the best action 
#### from each state according to the Q-table
for episode in range(3):
    # initialize new episode params

    for step in range(max_steps_per_episode):        
        # Show current state of environment on screen
        # Choose action with highest Q-value for current state       
        # Take new action

        if done:
            if reward == 1:
                # Agent reached the goal and won episode
            else:
                # Agent stepped in a hole and lost episode            

        # Set new state

env.close()

In [None]:
# Watch the agent play three episodes, always taking the action with the
# highest Q-value for the state it is currently in.
for episode in range(3):
    # Fresh episode: reset the environment and announce the episode number.
    state = env.reset()
    done = False
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Redraw the board in place and pause so each move is visible.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # Greedy action for the *current* state, then apply it.
        action = np.argmax(q_table[state, :])
        new_state, reward, done, info = env.step(action)

        if done:
            # Show the final board together with the outcome message.
            clear_output(wait=True)
            env.render()
            if reward == 1:
                print("****You reached the goal!****")
            else:
                # NOTE(review): any finished episode without reward 1 lands here;
                # if the environment also ends episodes on a step limit, that
                # case is reported as a hole too - confirm if it matters.
                print("****You fell through a hole!****")
            time.sleep(3)
            clear_output(wait=True)
            break

        # Advance to the state we just reached. This must happen inside the
        # step loop; otherwise every action is chosen from the episode's
        # initial state and the agent never follows its learned policy.
        state = new_state

env.close()