In [11]:
import gym
import numpy as np
# Build the FrozenLake-v1 environment once; later cells use this notebook-level
# `env` (it is passed to value_iteration and read as a global by extract_policy).
env = gym.make('FrozenLake-v1')


In [12]:
def value_iteration(env, gamma=1.0, num_iterations=1000, threshold=1e-20):
    """Compute the optimal state-value function of a tabular MDP by value iteration.

    Each sweep applies the Bellman optimality backup to every state, reading the
    values from the *previous* sweep (synchronous update):

        V_new(s) = max_a  sum_{s'} P(s' | s, a) * (r + gamma * V_old(s'))

    Parameters
    ----------
    env : object
        Environment exposing ``observation_space.n`` (number of states),
        ``action_space.n`` (number of actions) and a transition model ``P``
        such that ``P[s][a]`` is an iterable of 4-tuples whose first three
        items are (transition probability, next state, reward). The fourth
        item is ignored.
    gamma : float, optional
        Discount factor. Defaults to 1.0.
    num_iterations : int, optional
        Maximum number of sweeps over the state space. Defaults to 1000.
    threshold : float, optional
        Convergence tolerance: iteration stops as soon as the summed absolute
        change of the value table in one sweep is <= ``threshold``.
        Defaults to 1e-20.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``env.observation_space.n`` holding the
        value of every state after the last sweep performed.
    """
    # gym.make may hand back the environment inside wrapper objects; `unwrapped`
    # reaches the base environment that owns the transition model. Objects that
    # have no `unwrapped` attribute (e.g. plain test doubles) are used as-is.
    model = getattr(env, "unwrapped", env).P
    num_states = env.observation_space.n
    num_actions = env.action_space.n

    # Start with every state valued at zero.
    value_table = np.zeros(num_states)

    for _ in range(num_iterations):
        # Snapshot of the values from the previous sweep; all backups in this
        # sweep read from the snapshot so the update order does not matter.
        previous_values = value_table.copy()

        for s in range(num_states):
            # Q(s, a) for every action, then keep the best one as V(s).
            value_table[s] = max(
                sum(prob * (r + gamma * previous_values[s_])
                    for prob, s_, r, _unused in model[s][a])
                for a in range(num_actions)
            )

        # Converged once a full sweep changes the table by no more than `threshold`.
        if np.sum(np.fabs(previous_values - value_table)) <= threshold:
            break

    return value_table

In [13]:
def extract_policy(value_table, env=None, gamma=1.0):
    """Derive a greedy policy from a state-value table.

    For every state the action values

        Q(s, a) = sum_{s'} P(s' | s, a) * (r + gamma * V(s'))

    are computed from ``value_table`` and the action with the largest Q value
    is selected (ties go to the lowest action index, as with ``np.argmax``).

    Parameters
    ----------
    value_table : array-like
        State values indexed by state, e.g. the result of ``value_iteration``.
    env : object, optional
        Environment exposing ``observation_space.n``, ``action_space.n`` and a
        transition model ``P`` where ``P[s][a]`` is an iterable of 4-tuples
        whose first three items are (transition probability, next state,
        reward). When omitted, the module-level ``env`` is used, which keeps
        the original one-argument call working.
    gamma : float, optional
        Discount factor; should match the one used to compute ``value_table``.
        Defaults to 1.0.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``env.observation_space.n``; entry ``s`` is
        the index of the greedy action in state ``s``.

    Raises
    ------
    NameError
        If ``env`` is not given and no module-level ``env`` exists.
    """
    if env is None:
        # Backward-compatible fallback to the notebook-level environment. The
        # parameter shadows the global name, hence the explicit lookup.
        try:
            env = globals()["env"]
        except KeyError:
            raise NameError(
                "extract_policy: no `env` argument given and no global `env` is defined"
            ) from None

    # gym.make may hand back the environment inside wrapper objects; `unwrapped`
    # reaches the base environment that owns the transition model. Objects that
    # have no `unwrapped` attribute are used as-is.
    model = getattr(env, "unwrapped", env).P
    num_states = env.observation_space.n
    num_actions = env.action_space.n

    # Float dtype (np.zeros default) is kept so existing callers and recorded
    # outputs see the same array type as before.
    policy = np.zeros(num_states)

    for s in range(num_states):
        # Q value of every action available in state s.
        Q_values = [
            sum(prob * (r + gamma * value_table[s_])
                for prob, s_, r, _unused in model[s][a])
            for a in range(num_actions)
        ]

        # Greedy choice: the action with the highest Q value.
        policy[s] = np.argmax(Q_values)

    return policy

In [14]:
# Step 1: compute the state values for the FrozenLake environment created above.
optimal_value_function = value_iteration(env=env)
# Step 2: pick, for every state, the action with the highest Q value under those
# state values. No environment is passed here, so extract_policy works on the
# notebook-level `env`.
optimal_policy = extract_policy(optimal_value_function)
# One entry per state: the index of the selected action (stored as a float).
# NOTE(review): the meaning of each action index is defined by the FrozenLake
# environment, not by this notebook — confirm against the gym documentation.
print(optimal_policy)


[0. 3. 3. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]
