In [1]:
import numpy as np
import gym
import random

In [22]:
# Setup environment
env = gym.make("Taxi-v3")
env.render()

action_size = env.action_space.n
state_size = env.observation_space.n

qtable = np.zeros((state_size, action_size))
print(qtable)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m|[43m [0m: |[34;1mB[0m: |
+---------+

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [23]:
# Create hyperparameters
total_episodes = 100000        # Total episodes
learning_rate = 0.7           # Learning rate
max_steps = 500                # Max steps per episode
gamma = 0.6                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.001            # Minimum exploration probability 
decay_rate = 0.005             # Exponential decay rate for exploration prob

In [24]:
# Implement q-learning algorithm

"""
1. Implement Q-values (Q(s, a)) arbitrarily for all state-action pairs (done)
2. For life or until learning is stopped...
3. Choose an action (a) in the current world state (s) based on current Q-value estimates 
4. Take the action (a) and observe the outcome state (s') and reward (r)
5. Update Q(s, a) := Q(s, a) + learning_rate(r + \gamma )
"""
rewards = []

def exploit():
    """Returns True if we are to take an action."""
    # Select a random number
    n = random.uniform(0, 1)
    return n > epsilon # If n > epislon, we exploit.

def update_qtable(state, action, new_state):
    qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])

for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for s in range(max_steps):
        # Choose an action.
        if exploit():
            # Exploit.
            action = np.argmax(qtable[state, :])
        else:
            # Explore; choose and take a random action.
            action = env.action_space.sample()
            
        new_state, reward, done, info = env.step(action)
            
        # Update the qtable.
        update_qtable(state, action, new_state)
            
        # Update other variables.
        total_rewards += reward
        state = new_state
            
        # Check if we're dead.
        if done == True:
            break

    # Reduce epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)
    
    # Print score every few steps to make sure that we're learning.
    if episode % 10000 == 0:
        average_score = round(sum(rewards)/total_episodes, 2)
        print("Episode {episode}: {average_score}".format(episode=episode, average_score=average_score))

print(qtable)

Episode 0: -0.01
Episode 10000: -0.16
Episode 20000: 0.63
Episode 30000: 1.42
Episode 40000: 2.2
Episode 50000: 2.99
Episode 60000: 3.78
Episode 70000: 4.56
Episode 80000: 5.35
Episode 90000: 6.14
[[  0.           0.           0.           0.           0.
    0.        ]
 [ -2.41803474  -2.37171865  -2.41831196  -2.36411402  -2.27325184
  -11.34330637]
 [ -2.05340436  -1.46008636  -1.90165451  -1.50835036  -0.7504
  -10.45146355]
 ...
 [ -1.64437      0.22957941  -1.675744    -1.6414972  -10.213
  -10.213     ]
 [ -2.21605084  -2.12141245  -2.1767709   -2.22236765 -11.05009856
  -10.484656  ]
 [  5.59998815   1.43960008   3.71        11.          -3.40044686
   -3.40277028]]


In [38]:
# Play Taxi.

env.reset()

n_trials = 10000
rewards = []

for episode in range(n_trials):
    state = env.reset()
    step = 0
    done = False
    #print("****************************************************")
    #print("EPISODE ", episode)
    total_reward = 0
    
    for step in range(max_steps):
        
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])
        
        new_state, reward, done, info = env.step(action)
        total_reward += reward
        
        # env.render()
        if done:
            # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            # env.render()
            
            #print("Score", reward)
            rewards.append(total_reward)
            break
        state = new_state
        
print("Score over {n} trials: ".format(n=n_trials) + str(sum(rewards)/n_trials))
env.close()

78613
Score over 10000 trials: 7.8613
