In [1]:
import numpy as np
import gym
import random

In [2]:
# Fix every source of randomness in one place so that Restart & Run All
# reproduces the same training run and the same evaluation episode.
SEED = 42
random.seed(SEED)  # drives the epsilon-greedy coin flip (random.random())

env = gym.make('Taxi-v3', render_mode="ansi")
env.reset(seed=SEED)         # seeds the environment's RNG; later reset() calls keep using it
env.action_space.seed(SEED)  # seeds env.action_space.sample() (exploratory actions)

# Number of training episodes (despite the name, this is not a count of
# individual environment steps: each iteration plays one full episode).
num_steps = 15000

# Learning rate: how far each update moves the old Q-value toward the new target.
alpha = 0.15
# Discount factor: how much estimated future reward counts relative to the
# immediate reward.
gamma = 0.6
# Exploration rate: probability of taking a random action instead of the
# greedy one.
epsilon = 0.1

In [3]:
# Q-table: one row per discrete state, one column per action, all zeros to start.
q_table = np.zeros(
    shape=(env.observation_space.n, env.action_space.n)
)

In [4]:
# Tabular Q-learning: play `num_steps` episodes and update q_table after
# every single environment step.
for i in range(num_steps):
    state, info = env.reset()
    done = False
    while not done:
        # Epsilon-greedy action selection: explore with probability epsilon,
        # otherwise take the action with the highest current Q-value.
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state, :])

        # step() reports two distinct end-of-episode flags. `terminated` means
        # the task itself ended; `truncated` means the episode was cut off
        # (e.g. by a time limit). Either one must end the inner loop, otherwise
        # an episode keeps running past its cut-off.
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # A terminal state has no future return, so do not bootstrap from it.
        # A merely truncated episode still bootstraps from next_state.
        if terminated:
            future_value = 0.0
        else:
            future_value = np.max(q_table[next_state, :])

        # Q-learning update: blend the old estimate with the one-step target.
        target = reward + gamma * future_value
        q_table[state, action] = (1 - alpha) * q_table[state, action] + alpha * target

        # Continue from the state we just reached.
        state = next_state

print("Training finished")

Training finished


In [6]:
import time

from IPython.display import clear_output

# Play one episode with the purely greedy policy read from q_table, redrawing
# the rendered board after every step.
done = False
state, info = env.reset()
total_penalty = 0  # number of steps that returned a reward of -10
steps = 0

try:
    while not done:
        # Replace the previous frame and pause so the animation is watchable.
        clear_output(wait=True)
        time.sleep(0.5)

        steps += 1
        action = np.argmax(q_table[state, :])

        # Stop on either flag. With a deterministic greedy policy, ignoring
        # `truncated` would loop forever if the policy ever got stuck in a cycle.
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        if reward == -10:
            total_penalty += 1
        state = next_state
        print(env.render())

    print(f"Total penalty {total_penalty} and no. of steps is {steps}")
finally:
    # Release the environment even if the animation is interrupted.
    env.close()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)

Total penalty 0 and no. of steps is 13
