# Monte Carlo Prediction
---
> Monte Carlo
1. Policy Iteration
2. Value Iteration

* 불확실성을 예측하기 위해 sampling 사용 (trial - error)
* Value Function 을 사용하는 것은 model - free 에서 못쓰므로 여기서부터 Q Function 사용

In [3]:
import gym
import numpy as np
import random
from gym.envs.registration import register

In [4]:
'''
환경셋팅 한 후에 환경을 추가등록한다.
'''

register(
    id='FrozenLake-v1',
    entry_point="gym.envs.toy_text:FrozenLakeEnv",
    kwargs={'map_name':'4x4','is_slippery':False})

In [5]:
'''
환경 생성
'''
env = gym.make('FrozenLake-v1')

In [185]:
q_table = np.zeros([env.action_space.n, env.observation_space.n], dtype = np.float16)
gamma = .9
epsilon = 1
episode = 0
max_episode = 1000
history = np.zeros([1, 3])

state = env.reset()
action = env.action_space.sample()
step = 0

'''
compute discounted
'''
def discounted_reward(rewards):
    discounted = np.zeros_like(rewards)
    sum_tmp = 0
    
    for i in reversed(range(0, len(rewards))):
        sum_tmp = gamma * sum_tmp + rewards[i]
        discounted[i] = sum_tmp
    
    return discounted

'''
main
'''
while(episode < max_episode):
    
    step += 1
    state_next, reward, done, _ = env.step(action)
        
    if(random.random() > epsilon):
        action_next = np.argmax(q_table[ : , state_next])
    else:
        action_next = env.action_space.sample()
    
    
    state_old = state
    action_old = action
    
    state = state_next
    action = action_next
    
    # history 쌓기
    history = np.vstack((history, [state_old, action_old, reward]))
    
    # episode 끝나면 업데이트
    if(done):
        if(reward):
            if(epsilon > .1):
                epsilon = 1 / (episode/(max_episode/10) + 1)
            else:
                epsilon = .1
        else:
            history[-1, 2] = -1
            
        epr = discounted_reward(history[ : , 2])
        
        # M.C 업데이트
        for i in range(len(history)):
            s = int(history[i, 0])
            a = int(history[i, 1])
            r = epr[i]
            q_table[a, s] += gamma * (r - q_table[a, s])
            
        step = 0
        episode += 1
        env.reset()
        history = np.zeros([1, 3])

env.close()

In [186]:
q_table

array([[ 0.531   , -0.2311  , -0.8813  , -0.5576  , -0.815   , -0.04794 ,
        -1.      , -0.589   , -0.89    , -0.6646  ,  0.7134  ,  0.0314  ,
        -0.3142  , -1.      ,  0.81    ,  0.014175],
       [ 0.567   , -1.      , -0.4736  , -1.      ,  0.6562  , -0.5312  ,
        -0.729   , -0.5884  , -1.      ,  0.81    ,  0.9     , -0.06555 ,
        -0.5312  ,  0.81    ,  0.9     ,  0.584   ],
       [-0.1616  , -0.7915  , -0.619   , -0.89    , -1.      , -0.07764 ,
        -1.      , -0.817   ,  0.729   ,  0.655   , -1.      , -0.833   ,
        -0.8916  ,  0.9     ,  1.      , -0.01907 ],
       [ 0.3845  , -0.089   , -0.8013  , -0.2213  , -0.7373  ,  0.1327  ,
        -0.5195  , -0.03546 ,  0.4521  , -1.      ,  0.729   , -0.601   ,
        -0.01228 ,  0.715   ,  0.81    , -0.2607  ]], dtype=float16)

In [188]:
s = env.reset()

step = 1
while(True):
    env.render()
    a = np.argmax(q_table[ : , s])
    s,r,d,_ = env.step(a)
    if(d):
        env.render()
        break
env.close()


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
