In [1]:
import gym
import numpy as np

In [2]:
env = gym.make('FrozenLake-v0')

In [3]:
print("observation_space: ", env.observation_space.n)
print("action_space: ", env.action_space.n)


observation_space:  16
action_space:  4


In [4]:

def value_iteration(env, gamma=1.0):
    value_table = np.zeros(env.observation_space.n)
    no_of_iterations = 10000
    threshold = 1e-20
    
    for i in range(no_of_iterations):
        updated_value_table = value_table.copy()

        for state in range(env.observation_space.n):
            Q_value = []
            for action in range(env.action_space.n):
                next_states_rewards = []
                for next_sr in env.P[state][action]:
                    trans_prob, next_state, reward_prob, _ = next_sr
                    next_states_rewards.append((trans_prob* (reward_prob+gamma*updated_value_table[next_state])))
                Q_value.append(np.sum(next_states_rewards))
            value_table[state] = max(Q_value)
#         print(value_table)
#         print()
        if(np.sum(np.fabs(updated_value_table-value_table))<=threshold):
            print("Value-iteration converged at iteration# %d."%(i+1))
            break
            
    return value_table

            

In [5]:
optimal_value_function = value_iteration(env)

Value-iteration converged at iteration# 1373.


In [6]:
def extract_policy(value_table, gamma = 1.0):
    policy = np.zeros(env.observation_space.n)
    for state in range(env.observation_space.n):
        Q_table = np.zeros(env.action_space.n)
        for action in range(env.action_space.n):
            for next_sr in env.P[state][action]:
                trans_prob, next_state, reward_prob, _ = next_sr
                Q_table[action]+=(trans_prob*(reward_prob+gamma*value_table[next_state]))
        policy[state] = np.argmax(Q_table)
    return  policy

In [7]:
optimal_policy_function = extract_policy(optimal_value_function)
print(optimal_policy_function)

[0. 3. 3. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]


In [8]:
for state in range(env.observation_space.n):
    for action in range(env.action_space.n):
        print("state: {}, action: {}".format(state, action))
        print(env.P[state][action])
        print()

state: 0, action: 0
[(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 4, 0.0, False)]

state: 0, action: 1
[(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 4, 0.0, False), (0.3333333333333333, 1, 0.0, False)]

state: 0, action: 2
[(0.3333333333333333, 4, 0.0, False), (0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False)]

state: 0, action: 3
[(0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False)]

state: 1, action: 0
[(0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 5, 0.0, True)]

state: 1, action: 1
[(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 5, 0.0, True), (0.3333333333333333, 2, 0.0, False)]

state: 1, action: 2
[(0.3333333333333333, 5, 0.0, True), (0.3333333333333333, 2, 0.0, False), (0.3333333333333333, 1, 0.0, False)]

state: 1, action: 3
[(0.3333333333333333, 2, 0.0, False), (0.33333333333

In [40]:
def compute_value_function(policy, gamma = 1.0):
    value_table = np.zeros(env.nS)
    threshold = 1e-10
    while True:
        updated_value_table = value_table.copy()
        for state in range(env.nS):
            action = policy[state]
                
            value_table[state] = sum([trans_prob * (reward_prob + gamma *updated_value_table[next_state])\
                                      for trans_prob, next_state, reward_prob, _ in env.P[state][action]])

        #print(np.sum(np.fabs(updated_value_table-value_table)))
        if(np.sum(np.fabs(updated_value_table-value_table))<=threshold):
            break
    return value_table
                                      

In [41]:
print(env.nS)

16


In [42]:
random_policy = np.zeros(env.nS)
optimize_value_table = compute_value_function(random_policy)

In [43]:
def policy_iteration(env,gamma = 1.0):
    random_policy = np.zeros(env.observation_space.n)
    no_of_iterations = 200
    for i in range(no_of_iterations):
        new_value_function = compute_value_function(random_policy, gamma)
        new_policy = extract_policy(new_value_function, gamma)
        if(np.all(random_policy==new_policy)):
            print('Policy-Iteration converged at step %d.' %(i+1))
            break
        if(i%10==0):
            print('iter: ',i,new_policy)
        random_policy = new_policy
    return new_policy

In [44]:
optimal_policy = policy_iteration(env,gamma=1.0)

iter:  0 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
Policy-Iteration converged at step 7.


In [45]:
print(optimal_policy)

[0. 3. 3. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]


In [52]:
# frozen-lake-ex2.py
import gym
 
MAX_ITERATIONS = 10
 
env = gym.make("FrozenLake-v0")
env.reset()
env.render()
for i in optimal_policy:
    random_action = env.action_space.sample()
    print(i)
    new_state, reward, done, info = env.step(
       int(i))
    env.render()
    if done:
        break


[41mS[0mFFF
FHFH
FFFH
HFFG
0.0
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
3.0
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
3.0
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
3.0
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
0.0
  (Left)
SF[41mF[0mF
FHFH
FFFH
HFFG
0.0
  (Left)
SF[41mF[0mF
FHFH
FFFH
HFFG
0.0
  (Left)
S[41mF[0mFF
FHFH
FFFH
HFFG
0.0
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
