## Frozen Lake with Q-table

In [1]:
import gym,sys,numpy as np
import tensorflow as tf
from gym.envs.registration import register

# fix NumPy's global RNG so every np.random draw made later in the notebook is repeatable
np.random.seed(seed=56776)

In [2]:
# Register a 4x4 FrozenLake variant with is_slippery=False under its own id.
# gym raises if an id is registered twice, so re-running this cell in a live
# kernel used to fail; the guard makes the cell safe to execute repeatedly.
# To change the kwargs below, restart the kernel (or pick a new id): an id that
# is already registered is deliberately left untouched.
if 'FrozenLakeNotSlippery-v0' not in gym.envs.registry.env_specs:
    register(
        id='FrozenLakeNotSlippery-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery': False},
        max_episode_steps=2000,
        # NOTE(review): the previous comment here read "optimum = .8196"; that figure
        # looks like it belongs to the slippery lake. With is_slippery=False a policy
        # that reaches G should score 1.0 every episode -- confirm the intended threshold.
        reward_threshold=0.78,
    )

In [3]:
# build the custom non-slippery environment registered above and seed it
env = gym.make(id='FrozenLakeNotSlippery-v0')
env.seed(seed=0)

[0]

In [5]:
# show the observation space and the action space, one per line
print(env.observation_space, env.action_space, sep='\n')

Discrete(16)
Discrete(4)


In [6]:
# Start from an all-zero Q-table: one row per state, one column per action.
q_learning_table = np.zeros((env.observation_space.n, env.action_space.n))
print(q_learning_table)
# render() draws the board by itself; wrapping it in print() only added a stray
# "None" line after the map, so it is called bare.
env.render()

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

[41mS[0mFFF
FHFH
FFFH
HFFG
None


In [7]:
# -- hyper --
# number of training episodes / maximum steps allowed inside one episode
num_epis, num_iter = 5000, 2000
# weight given to the new estimate in each Q update / weight on the next state's best Q-value
learning_rate, discount = 0.3, 0.8

In [9]:
# -- training the agent ----
# Tabular Q-learning. Each step blends the old estimate with
# reward + discount * max_a Q[next_state, a], weighted by learning_rate.
# Exploration comes from adding standard-normal noise to the current row of
# Q-values and acting greedily on the noisy values.
n_actions = env.action_space.n  # noise width follows the env instead of a hard-coded 4
for epis in range(num_epis):
    state = env.reset()
    # loop variable is `step`, not `iter`: `iter` would shadow the builtin for the rest of the kernel session
    for step in range(num_iter):
        # draws n_actions samples per step, exactly as randn(1, 4) did, so the RNG stream is unchanged
        noisy_q = q_learning_table[state, :] + np.random.randn(n_actions)
        action = np.argmax(noisy_q)
        state_new, reward, done, _ = env.step(action)
        q_learning_table[state, action] = (1 - learning_rate) * q_learning_table[state, action] + \
                                          learning_rate * (reward + discount * np.max(q_learning_table[state_new, :]))
        state = state_new
        if done:
            break

In [10]:
# highest-valued action index for every state
print(q_learning_table.argmax(axis=1))
# the learned Q-values, rounded to 6 decimals for readability
print(q_learning_table.round(decimals=6))
print('-------------------------------')

[1 2 1 0 1 0 1 0 2 1 1 0 0 2 2 0]
[[0.262144 0.32768  0.32768  0.262144]
 [0.262144 0.       0.4096   0.32768 ]
 [0.32768  0.512    0.32768  0.4096  ]
 [0.4096   0.       0.32768  0.32768 ]
 [0.32768  0.4096   0.       0.262144]
 [0.       0.       0.       0.      ]
 [0.       0.64     0.       0.4096  ]
 [0.       0.       0.       0.      ]
 [0.4096   0.       0.512    0.32768 ]
 [0.4096   0.64     0.64     0.      ]
 [0.512    0.8      0.       0.512   ]
 [0.       0.       0.       0.      ]
 [0.       0.       0.       0.      ]
 [0.       0.64     0.8      0.512   ]
 [0.64     0.8      1.       0.64    ]
 [0.       0.       0.       0.      ]]
-------------------------------


In [11]:
# greedy rollout: always take the highest-valued action, no exploration noise
s = env.reset()
done = False
steps_left = 100  # upper bound on rendered steps
while steps_left > 0 and not done:
    action = np.argmax(q_learning_table[s, :])
    state_new, _, done, _ = env.step(action)
    env.render()
    s = state_new
    steps_left -= 1
print('-------------------------------')

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
-------------------------------


In [12]:
# switch to gym's built-in 'FrozenLake-v0' and reset both the env seed and NumPy's seed
env = gym.make(id='FrozenLake-v0')
env.seed(seed=0)
np.random.seed(seed=56776)

In [13]:
# Reset to an all-zero Q-table for the new env: one row per state, one column per action.
q_learning_table = np.zeros((env.observation_space.n, env.action_space.n))
print(q_learning_table)
# render() draws the board by itself; wrapping it in print() only added a stray
# "None" line after the map, so it is called bare.
env.render()

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

[41mS[0mFFF
FHFH
FFFH
HFFG
None


In [14]:
# -- hyper --
# number of training episodes / maximum steps allowed inside one episode
num_epis, num_iter = 500, 200
# weight given to the new estimate in each Q update / weight on the next state's best Q-value
learning_rate, discount = 0.3, 0.8

In [15]:
# -- training the agent ----
# Tabular Q-learning. Each step blends the old estimate with
# reward + discount * max_a Q[next_state, a], weighted by learning_rate.
# Exploration comes from adding standard-normal noise to the current row of
# Q-values and acting greedily on the noisy values.
n_actions = env.action_space.n  # noise width follows the env instead of a hard-coded 4
for epis in range(num_epis):

    state = env.reset()

    # loop variable is `step`, not `iter`: `iter` would shadow the builtin for the rest of the kernel session
    for step in range(num_iter):
        # draws n_actions samples per step, exactly as randn(1, 4) did, so the RNG stream is unchanged
        noisy_q = q_learning_table[state, :] + np.random.randn(n_actions)
        action = np.argmax(noisy_q)
        state_new, reward, done, _ = env.step(action)
        q_learning_table[state, action] = (1 - learning_rate) * q_learning_table[state, action] + \
                                          learning_rate * (reward + discount * np.max(q_learning_table[state_new, :]))
        state = state_new

        if done:
            break

In [16]:
# highest-valued action index for every state
print(q_learning_table.argmax(axis=1))
# the learned Q-values, rounded to 6 decimals for readability
print(q_learning_table.round(decimals=6))
print('-------------------------------')

[1 3 2 3 0 0 2 0 3 0 2 0 0 2 1 0]
[[0.020348 0.026125 0.02331  0.019298]
 [0.016515 0.01085  0.023092 0.035422]
 [0.037097 0.038034 0.0484   0.018565]
 [0.011072 0.007907 0.005415 0.019216]
 [0.028192 0.017752 0.022289 0.016612]
 [0.       0.       0.       0.      ]
 [0.028921 0.005853 0.092386 0.013324]
 [0.       0.       0.       0.      ]
 [0.039444 0.050597 0.056708 0.063208]
 [0.083636 0.071177 0.039744 0.067552]
 [0.130934 0.23156  0.23394  0.050257]
 [0.       0.       0.       0.      ]
 [0.       0.       0.       0.      ]
 [0.086129 0.077684 0.223174 0.052278]
 [0.194623 0.673348 0.387477 0.549202]
 [0.       0.       0.       0.      ]]
-------------------------------


In [17]:
# greedy rollout: always take the highest-valued action, no exploration noise
s = env.reset()
done = False
steps_left = 100  # upper bound on rendered steps
while steps_left > 0 and not done:
    action = np.argmax(q_learning_table[s, :])
    state_new, _, done, _ = env.step(action)
    env.render()
    s = state_new
    steps_left -= 1
# -- end code --

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
