In [33]:
import numpy as np
import gym
import random as r

In [34]:
# Create env
env = gym.make("FrozenLake-v0")

In [35]:
# Create Q-table
action_size = env.action_space.n #cols
state_size = env.observation_space.n #rows

qtable = np.zeros((state_size, action_size))
print(qtable)
print("size: {}".format(qtable.shape))

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
size: (16, 4)


In [44]:
# Model params
total_episodes = 20000 # num training runs
learning_rate = 0.8 # step-size multiplier
max_steps = 200 # max num steps in a training run
gamma = 0.95 # discount for future rewards given last action

# Explore params
eps = 1.0 # exlore probability
max_eps = 1.0 # starting explore probability
min_eps = 0.01 # min explore probability (i.e. 0.99 prob to use Q-table)
decay = 0.005 # discount for exploration vs. exploitation 


In [45]:
# Track rewards
rewards = []

# Training loop
for episode in range(total_episodes):
    
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        
        dont_explore = r.uniform(0, 1)
        
        if dont_explore > eps: # q table
            action = np.argmax(qtable[state, :]) # from set of actions in state, take action w highest q val
            
        else: # random sample
            action = env.action_space.sample()
        
        new_state, reward, done, info = env.step(action) # execute action
        
        # Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :] - qtable[state, action]))
        # q_table[new_state, :] = q vals for all actions possible in new state resulting from action
        
        total_rewards += reward
        
        state = new_state
        
        if done:
            break
            
    eps = min_eps + (max_eps - min_eps) * np.exp(-decay*episode) # increase chance of dont_explore (i.e. use q-table)
    rewards.append(total_rewards)

print(eps)

print("Score over time: "+ str(sum(rewards) / total_episodes))
print(qtable)

0.01
Score over time: 0.01625
[[1.04957896e+00 1.04924743e+00 1.05042661e+00 1.05044916e+00]
 [4.94669854e-02 2.49422478e-01 2.51308112e-01 1.05044916e+00]
 [7.49403133e-01 1.03834732e+00 1.00654492e+00 1.05044916e+00]
 [1.00430335e+00 4.94307254e-02 4.94303389e-02 1.05044916e+00]
 [1.04247102e+00 7.35169443e-01 7.85147784e-01 2.51471972e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.43218102e-03 7.63792524e-04 9.99978326e-01 3.67266053e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.41142850e-01 2.19155230e-01 2.05658706e-01 1.01476726e+00]
 [4.68632302e-02 9.25995901e-01 4.68475533e-02 2.14050281e-01]
 [9.67256370e-01 5.63111052e-02 2.24530349e-02 4.33071151e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.90452702e-02 2.44807356e-01 9.20715060e-01 5.18679904e-02]
 [9.43216111e-01 1.02928913e+00 5.57545951e-01 6.51659298e-01]
 [0.00000000e+00 0.000000

In [46]:
# Play Frozen Lake
env.reset()

for episode in range(5):
    state = env.reset()
    step = 0 
    done = False
    
    print("****************************************************")
    print("EPISODE ", episode)
    
    for step in range(max_steps):
        
        action = np.argmax(qtable[state, :]) # given state, take action with highest q val
        
        new_state, reward, done, info = env.step(action) # execute action
        
        if done:   
            env.render()
            print("Num Steps: {}".format(step))
            break
        
        state = new_state
        


****************************************************
EPISODE  0
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
Num Steps: 99
****************************************************
EPISODE  1
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
Num Steps: 99
****************************************************
EPISODE  2
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
Num Steps: 99
****************************************************
EPISODE  3
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
Num Steps: 99
****************************************************
EPISODE  4
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
Num Steps: 99
