In [16]:
import gym
import numpy as np
from IPython import display
import matplotlib.pyplot as plt
import time
%matplotlib inline

In [17]:
# Build FrozenLake and keep the underlying (unwrapped) environment object.
env = gym.make('FrozenLake-v0').unwrapped
print(env)

<FrozenLakeEnv<FrozenLake-v0>>


In [18]:
# Report the sizes of the discrete state and action spaces.
print("number of states: ", env.observation_space.n)
print("number of actions: ", env.action_space.n)

number of states:  16
number of actions:  4


In [19]:
# Print the text rendering of the grid in the environment's current state.
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [20]:
def value_iteration(env, gamma = 1.0, no_of_iterations = 10000, threshold = 1e-20):
    """Compute state values by synchronous value iteration.

    Every sweep applies the Bellman optimality backup to each state, reading
    from a snapshot of the values produced by the previous sweep.

    Args:
        env: Environment exposing `observation_space.n`, `action_space.n` and
            a transition table `P[state][action]` whose entries are 4-tuples
            `(trans_prob, next_state, reward, _)`; the fourth element is
            ignored here.
        gamma: Discount factor applied to the value of the next state.
        no_of_iterations: Maximum number of sweeps over the state space.
        threshold: Iteration stops once the summed absolute change between
            two consecutive sweeps is less than or equal to this value.

    Returns:
        np.ndarray of length `env.observation_space.n` holding the value of
        each state after the last sweep.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # All state values start at zero.
    value_table = np.zeros(n_states)

    for i in range(no_of_iterations):
        # Snapshot of the previous sweep; all backups in this sweep read from it.
        previous_values = np.copy(value_table)

        for state in range(n_states):
            # Q(s, a) for every action: expected reward plus discounted value
            # of the successor, summed over the listed transitions.
            q_values = [
                np.sum([
                    trans_prob * (reward + gamma * previous_values[next_state])
                    for trans_prob, next_state, reward, _ in env.P[state][action]
                ])
                for action in range(n_actions)
            ]

            # The state's value is the best achievable Q value.
            value_table[state] = max(q_values)

        if np.sum(np.fabs(previous_values - value_table)) <= threshold:
            print('Value-iteration converged at iteration# %d.'%(i+1))
            break

    return value_table
                

In [21]:
def extract_policy(value_table, gamma = 1.0, env = None):
    """Derive the greedy policy for a given state-value table.

    For each state, the Q value of every action is accumulated over the
    transitions listed in `env.P[state][action]`, and the action with the
    largest Q value is selected (first one wins on ties, per `np.argmax`).

    Args:
        value_table: Indexable collection of state values, indexed by state.
        gamma: Discount factor applied to the value of the next state.
        env: Environment exposing `observation_space.n`, `action_space.n` and
            the transition table `P`. When omitted, the module-level `env`
            is used, which keeps existing two-argument calls working.

    Returns:
        np.ndarray of dtype int32 with one action index per state.

    Raises:
        NameError: If `env` is omitted and no module-level `env` exists.
    """
    if env is None:
        # Fall back to the notebook-level environment for existing callers.
        try:
            env = globals()["env"]
        except KeyError:
            raise NameError("name 'env' is not defined") from None

    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Policy starts as action 0 everywhere and is overwritten state by state.
    policy = np.zeros(n_states, np.int32)

    for state in range(n_states):
        q_values = np.zeros(n_actions)

        for action in range(n_actions):
            # Accumulate in transition order so ties resolve deterministically.
            for trans_prob, next_state, reward, _ in env.P[state][action]:
                q_values[action] += (trans_prob*(reward+gamma*value_table[next_state]))

        # Greedy choice for this state.
        policy[state] = np.argmax(q_values)

    return policy

In [22]:
def play(env, optimal_policy, max_step=1000, delay=1):
    """Roll out a policy for one episode, rendering every step.

    The environment is reset first so that the state used to index the
    policy always matches the environment's actual position, even when this
    function is called repeatedly on the same environment object.

    Args:
        env: Environment providing `reset()`, `render()` and `step(action)`.
            Assumes `reset()` returns the initial state and `step()` returns
            a 4-tuple `(state, reward, done, info)`, matching the unpacking
            below.
        optimal_policy: Sequence mapping a state index to the action to take.
        max_step: Upper bound on the number of steps taken.
        delay: Seconds to pause after each render so the frame is visible.

    Returns:
        None. The function only renders and steps the environment.
    """
    state = env.reset()

    for _step in range(max_step):
        env.render()
        time.sleep(delay)
        # Clear lazily (wait=True) so the next frame replaces this one
        # without flicker.
        display.clear_output(wait=True)
        state, _, done, _ = env.step(optimal_policy[state])

        if done:
            # Show the terminal position before leaving the loop.
            env.render()
            break

In [23]:
# Run value iteration on the environment with gamma = 1.0 (no discounting).
optimal_value_function = value_iteration(env=env, gamma=1.0)

Value-iteration converged at iteration# 1373.


In [24]:
# Derive the greedy policy from the computed state values.
optimal_policy = extract_policy(optimal_value_function, gamma=1.0)

In [25]:
# One action index per state.
print(optimal_policy)

[0 3 3 3 0 0 0 0 3 1 0 0 0 2 1 0]


In [26]:
# Roll out the extracted policy in the environment, rendering each step.
play(env, optimal_policy)

<matplotlib.figure.Figure at 0x405d37c940>

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m


<matplotlib.figure.Figure at 0x405d37c940>