In [1]:
import gym

In [2]:
import numpy as np
# Create the FrozenLake environment.
env = gym.make('FrozenLake-v1')
# Number of states, read from the observation space's `.n` attribute;
# the rest of the notebook indexes states as 0 .. states-1.
states = env.observation_space.n  # `.n` is used when the observation space is discrete

In [3]:
# Number of actions available in the environment.
actions = env.action_space.n

# Initialize the value table to zeros: one row per state, shape (states, 1).
value_table = np.zeros((states,1))

In [4]:
def value_iterations(env, n_iterations, gamma=1.0, threshold=1e-30):
    """Compute state values for `env` with synchronous value iteration.

    The function is self-contained: the number of states/actions and the
    transition table are read from `env`, and a fresh zero-initialized value
    table is created on every call, so re-running the cell always yields the
    same result and no notebook-level array is modified.

    Parameters
    ----------
    env : environment exposing `observation_space.n`, `action_space.n` and,
        on its unwrapped base environment, a transition table `P` where
        `P[state][action]` is an iterable of
        `(probability, next_state, reward, flag)` tuples.
    n_iterations : int
        Maximum number of sweeps over all states.
    gamma : float
        Discount factor applied to the value of the next state.
    threshold : float
        Iteration stops early once the summed absolute change of the value
        table in one sweep is less than or equal to this number.

    Returns
    -------
    numpy.ndarray of shape (n_states, 1)
        The value of each state after the last sweep.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    # `unwrapped` reaches the base environment no matter how many wrappers
    # surround it, unlike a fixed `env.env` hop.
    transitions = env.unwrapped.P

    values = np.zeros((n_states, 1))
    for _ in range(n_iterations):
        # Snapshot of the previous sweep: every backup in this sweep reads
        # from it, so updates within a sweep do not influence each other.
        previous = values.copy()
        for state in range(n_states):
            q_values = []
            for action in range(n_actions):
                q = 0.0
                for prob, next_state, reward, _flag in transitions[state][action]:
                    q += prob * (reward + gamma * previous[next_state, 0])
                q_values.append(q)
            values[state, 0] = max(q_values)

        if np.sum(np.fabs(previous - values)) <= threshold:
            break
    return values

In [5]:
def extract_policy(value_table, gamma=1.0, environment=None):
    """Derive the greedy policy implied by a table of state values.

    For every state the expected return of each action is computed from the
    transition table, and the action with the highest expected return is
    selected (ties go to the lowest action index, as with `np.argmax`).

    Parameters
    ----------
    value_table : array-like
        One value per state; both shape (n_states,) and (n_states, 1) are
        accepted.
    gamma : float
        Discount factor applied to the value of the next state.
    environment : optional
        Environment to read the state/action counts and transition table
        from. When omitted, the notebook-level `env` is used, which keeps
        existing calls `extract_policy(value_table)` working unchanged.

    Returns
    -------
    numpy.ndarray of shape (n_states,), dtype float
        The selected action index for each state.
    """
    if environment is None:
        environment = env  # notebook-level environment

    n_states = environment.observation_space.n
    n_actions = environment.action_space.n
    # `unwrapped` reaches the base environment regardless of wrapper depth.
    transitions = environment.unwrapped.P

    # Flatten so each lookup yields a scalar; adding a size-1 array into a
    # scalar slot is deprecated in recent NumPy releases.
    values = np.asarray(value_table, dtype=float).reshape(-1)

    policy = np.zeros(n_states)
    for state in range(n_states):
        q_values = np.zeros(n_actions)
        for action in range(n_actions):
            for prob, next_state, reward, _flag in transitions[state][action]:
                q_values[action] += prob * (reward + gamma * values[next_state])
        policy[state] = np.argmax(q_values)
    return policy
# Run value iteration for at most 10000 sweeps, then derive the greedy
# policy (best action index per state) from the resulting value table.
value_table = value_iterations(env,10000)
policy = extract_policy(value_table)
print(policy)

[0. 3. 3. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]


In [6]:
# Display the environment; the recorded output is the text map of the lake
# with one cell highlighted.
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [7]:
# Show the state values in rows of 4 so they line up with the rendered map.
print(value_table.reshape([4, -1]))

[[0.82352941 0.82352941 0.82352941 0.82352941]
 [0.82352941 0.         0.52941176 0.        ]
 [0.82352941 0.82352941 0.76470588 0.        ]
 [0.         0.88235294 0.94117647 0.        ]]


In [8]:
# Same 4-row layout for the policy; entries are action indices stored as floats.
print(policy.reshape([4, -1]))

[[0. 3. 3. 3.]
 [0. 0. 0. 0.]
 [3. 1. 0. 0.]
 [0. 2. 1. 0.]]


In [11]:
# Inspect the transition table for state 5. Each entry is a
# (probability, next_state, reward, flag) tuple; here every action leads back
# to state 5 with probability 1.0 and reward 0.
env.env.P[5]

{0: [(1.0, 5, 0, True)],
 1: [(1.0, 5, 0, True)],
 2: [(1.0, 5, 0, True)],
 3: [(1.0, 5, 0, True)]}