In [1]:
import numpy as np
import gym
import random as r

In [2]:
# Build the Taxi-v2 environment and draw its current grid in the cell output.
env = gym.make("Taxi-v2")
env.render()

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[43mB[0m: |
+---------+



In [3]:
# Inspect the environment's action and observation spaces.
# Despite the names, both variables hold the gym space objects themselves
# (printed as e.g. Discrete(6)), not plain integer counts.
action_size = env.action_space
state_size = env.observation_space
print("Action Size:", action_size)
print("State Size:", state_size)

Action Size: Discrete(6)
State Size: Discrete(500)


In [4]:
# Q-table: one row per state and one column per action, every entry starting at zero.
qtable = np.zeros(shape=(env.observation_space.n, env.action_space.n))
# Show the table followed by its (n_states, n_actions) shape.
print(qtable, qtable.shape, sep="\n")

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
(500, 6)


In [5]:
# --- Training hyperparameters ---
total_episodes = 50000       # episodes played while learning
total_test_episodes = 100    # episodes played when evaluating the learned table
max_steps = 99               # cap on the number of steps within one episode
learning_rate = 0.7          # step size applied to each Q-value update
gamma = 0.618                # discount applied to the best next-state Q-value

# --- Epsilon-greedy exploration schedule ---
max_eps = 1.0                # exploration probability at the start of training
min_eps = 0.01               # floor that the exploration probability decays toward
decay = 0.05                 # exponential decay rate of eps, applied per episode
eps = max_eps                # current exploration probability

In [6]:
# Q-learning with an epsilon-greedy behaviour policy.
for episode in range(total_episodes):

    state = env.reset()
    done = False

    # Play one episode, capped at max_steps transitions.
    for step in range(max_steps):

        # Uniform draw in [0, 1]: at or below eps we explore, above eps we exploit.
        explore_roll = r.uniform(0, 1)

        if explore_roll <= eps:
            action = env.action_space.sample()  # random action
        else:
            action = np.argmax(qtable[state])  # action with the highest Q-value for this state

        new_state, reward, done, info = env.step(action)

        # Temporal-difference update: nudge Q(s, a) toward
        # reward + gamma * max_a' Q(s', a'), scaled by the learning rate.
        td_target = reward + gamma * np.max(qtable[new_state])
        td_error = td_target - qtable[state, action]
        qtable[state, action] += learning_rate * td_error

        state = new_state

        if done:
            break

    # Shrink the exploration probability exponentially with the episode index,
    # never going below min_eps.
    eps = min_eps + (max_eps - min_eps) * np.exp(-decay * episode)

In [7]:
# Show the current contents of the Q-table (one row per state, one column per action).
print(qtable)

[[  0.           0.           0.           0.           0.
    0.        ]
 [ -2.32036714  -2.13660537  -2.32039854  -2.13701391  -1.83910189
  -11.13647852]
 [ -1.83964108  -1.35783327  -1.83911265  -1.35798077  -0.57891593
  -10.35776441]
 ...
 [ -2.06537874  -1.94299292  -1.94631341  -1.35780386  -7.
   -7.        ]
 [ -2.50122595  -2.46929711  -2.50122595  -2.43401294  -9.40282
   -7.        ]
 [ -1.21282     -1.21282     -1.21282     11.35999991  -7.
   -7.        ]]


In [8]:
# Evaluate the learned Q-table by always taking the highest-valued action.
#
# Every episode contributes its return to `rewards`, including episodes that
# run into the max_steps cap without finishing. Only recording finished
# episodes while dividing by total_test_episodes would skew the reported mean.
rewards = []

for episode in range(total_test_episodes):
    state = env.reset()
    total_rewards = 0

    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):

        env.render()  # show the grid before each move

        action = np.argmax(qtable[state, :])  # greedy action for the current state

        new_state, reward, done, info = env.step(action)

        total_rewards += reward

        if done:
            print("Episode {} Score: {}".format(episode, total_rewards))
            break

        state = new_state
    else:
        # The loop ran out of steps without the environment reporting done.
        print("Episode {} Score: {} (step limit reached)".format(episode, total_rewards))

    rewards.append(total_rewards)

env.close()

# Mean return over all evaluated episodes; guard against zero episodes.
if rewards:
    print("Score over time: " + str(sum(rewards) / len(rewards)))
else:
    print("Score over time: n/a (no test episodes were run)")

****************************************************
EPISODE  0
+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[43mY[0m| : |[35mB[0m: |
+---------+

+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[34;1mG[0m|
| :[43m [0m: : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : :[43m [0m: : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[34;1mG[0m|
| : : :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | :[43m [0m:[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |

Episode 78 Score: 9
****************************************************
EPISODE  79
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : : :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : :[43m [0m: : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| :[43m [0m: : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :[35mG[0m|
|[43m [0m: : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)
+---------+
|[34;1m[43mR[0m[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|[42mR[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Pickup)
+---------+
|R:[42m_[0m| : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|R: | : :[35mG[0m|
| :[42m_[0m: : : |
| : : : : |
| | : | : |
|Y| : |B: 

+---------+
|R: | :[42m_[0m:[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
Episode 88 Score: 9
****************************************************
EPISODE  89
+---------+
|[35mR[0m: |[43m [0m: :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+

+---------+
|[35mR[0m: | :[43m [0m:[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :[34;1m[43mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :[42mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Pickup)
+---------+
|[35mR[0m: | :[42m_[0m:G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: |[42m_[0m: :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)