In [57]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from gym.envs.registration import register
%matplotlib inline

In [59]:
register(
    id = 'FrozenLake-v3',
    entry_point = 'gym.envs.toy_text:FrozenLakeEnv',
    kwargs={
        'map_name': '4x4',
        'is_slippery': False
    }
)

env = gym.make("FrozenLake-v3")

In [60]:
print("env.observation_space.n", env.observation_space.n)
print("env.action_space.n", env.action_space.n)
Q = np.zeros([env.observation_space.n, env.action_space.n])
print(Q)

env.observation_space.n 16
env.action_space.n 4
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [61]:
observation = env.reset()
print(observation)

0


In [62]:
Q[observation, :]

array([0., 0., 0., 0.])

In [63]:
next_observation, reward, done, information = env.step(1)
print("next_observation", next_observation)

next_observation 4


In [71]:
# 테이블을 모두 0으로 초기화
Q = np.zeros([env.observation_space.n, env.action_space.n])

# 학습 매캐변수를 설정한다.
learning_rate = .85
discount_reward = .7
num_episodes = 5000
max_step = 100
max_early_stop_length = 20

# 보상의 총합계를 담을 리스트를 생성한다.
rewardList = []

early_stop_length = 0
stopped_episode_num = 0
for current_episode_num in range(num_episodes):
  # 환경을 리셋하고 첫 번째 새로운 관찰(observation)을 얻는다.
  observation = env.reset()
  rewardAll = 0
  done = False
  step = 0
  # Q 테이블 학습 알고리즘
  while step < max_step:
    # Q 테이블로부터 (노이즈와 함께) 그리디하게 액션을 선택
    action = np.argmax(Q[observation, :] + 
                       np.random.randn(1, env.action_space.n) * 
                       ( 1. / (current_episode_num + 1)))
    
    # 환경으로부터 새로운 상태와 보상을 얻는다.
    next_observation, reward, done, information = env.step(action)
    
    # 새로운 지식을 통해 Q 테이블을 업데이트한다.
    Q[observation, action] = Q[observation, action] + learning_rate * (reward + discount_reward * np.max(Q[next_observation, :]) - Q[observation, action])
    rewardAll += reward
    observation = next_observation
    if done:
      break
    step += 1
  early_stop_length += 1
  if reward != 1.:
    early_stop_length = 0
  if early_stop_length >= max_early_stop_length:
    break
  rewardList.append(rewardAll)
  stopped_episode_num = current_episode_num

In [72]:
print("Score over time: " + str(sum(rewardList) / num_episodes))
print("Stopped episode num: " + str(current_episode_num))

Score over time: 0.0048
Stopped episode num: 158


In [73]:
print("Final Q-Table Values")
print(Q)

Final Q-Table Values
[[0.      0.16807 0.      0.     ]
 [0.      0.      0.      0.     ]
 [0.      0.      0.      0.     ]
 [0.      0.      0.      0.     ]
 [0.      0.2401  0.      0.     ]
 [0.      0.      0.      0.     ]
 [0.      0.      0.      0.     ]
 [0.      0.      0.      0.     ]
 [0.      0.      0.343   0.     ]
 [0.      0.      0.49    0.     ]
 [0.      0.7     0.      0.     ]
 [0.      0.      0.      0.     ]
 [0.      0.      0.      0.     ]
 [0.      0.      0.      0.     ]
 [0.      0.      1.      0.     ]
 [0.      0.      0.      0.     ]]


In [74]:
Q_list = list(Q)
for i, line in enumerate(Q):
    Q_list[i] = list(map(float, line))

In [75]:
for line in Q_list:
    print(line)

[0.0, 0.16806999999986247, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.24009999999999176, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.3429999999999996, 0.0]
[0.0, 0.0, 0.48999999999999994, 0.0]
[0.0, 0.7, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 1.0, 0.0]
[0.0, 0.0, 0.0, 0.0]


In [76]:
env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [77]:
macro = {
  'LEFT':0,
  'DOWN':1,
  'RIGHT':2,
  'UP':3
}

newLine_interval = 4
ni = 0
for cell in Q:
  action = np.argmax(cell[:])
  for key, value in macro.items():
    if value == action:
      action = key
      break
  print(action, end=', ')
  ni += 1
  if ni >= 4:
    ni = 0
    print()
    

DOWN, LEFT, LEFT, LEFT, 
DOWN, LEFT, LEFT, LEFT, 
RIGHT, RIGHT, DOWN, LEFT, 
LEFT, LEFT, RIGHT, LEFT, 
