# Taxi-v3

## About the Environment

    MAP:

        +---------+
        |R: | : :G|
        | : | : : |
        | : : : : |
        | | : | : |
        |Y| : |B: |
        +---------+

    Actions:
    There are 6 discrete deterministic actions:
    - 0: move south
    - 1: move north
    - 2: move east
    - 3: move west
    - 4: pickup passenger
    - 5: drop off passenger

    Rendering:
    - blue: passenger
    - magenta: destination
    - yellow: empty taxi
    - green: full taxi
    - other letters (R, G, Y and B): locations for passengers and destinations

    The state space is represented by the tuple:
    (taxi_row, taxi_col, passenger_location, destination)



    Rewards:
    - -1 per step, unless another reward is triggered.
    - +20 for delivering the passenger.
    - -10 for executing a "pickup" or "drop-off" action illegally.


    Step result (what `env.step(action)` returns):
    (int(s), r, d, {"prob": p})
    i.e. (next state, reward, done flag, info dict)

In [1]:
import gym
import time
import numpy as np

from tqdm import tqdm

from IPython.display import clear_output

In [2]:
# Build the Taxi-v3 environment once; the cells below use it as the global `env`.
env = gym.make('Taxi-v3')

## Random Action

In [3]:
def play_env(policy=lambda s: env.action_space.sample(), sleep_time=0.1, env_seed=None):
    """Play one episode on the global ``env``, redrawing the board after every step.

    Args:
        policy: Callable that maps the current state to an action. The default
            samples a random action from ``env.action_space``.
        sleep_time: Seconds to pause after each rendered step.
        env_seed: When not ``None``, passed to ``env.seed`` before the reset.

    Returns:
        None. For every step a header line and the rendered environment are
        printed; after the episode a final "Done ..." or "Unsolved" line is
        printed.
    """
    if env_seed is not None:
        env.seed(env_seed)

    state = env.reset()
    max_steps = env.spec.max_episode_steps
    total_reward = 0
    is_done = False
    current_step = 0

    while not is_done:
        # Ask the policy (random by default) for the next action
        action = policy(state)

        state, reward, is_done, info = env.step(action)

        total_reward += reward
        current_step += 1

        # Replace the previous frame instead of appending a new one
        clear_output(wait=True)

        # Print header
        print('Step: {:03d}/{}, Reward: {}\n'.format(
            current_step,
            max_steps,
            total_reward,
        ))
        env.render()

        time.sleep(sleep_time)

    # Decide whether the episode ended by itself or was cut off by the step limit.
    # When the step result carries a 'TimeLimit.truncated' flag, trust it: an
    # episode that succeeds on exactly the last allowed step is still solved.
    # Without the flag, fall back to comparing the step count with the limit
    # (no limit at all means the episode can only have ended by itself).
    if 'TimeLimit.truncated' in info:
        solved = not info['TimeLimit.truncated']
    else:
        solved = max_steps is None or current_step < max_steps

    if solved:
        print('\nResult: Done with {} steps and total reward is {}.'.format(
            current_step,
            total_reward,
        ))
    else:
        print('\nResult: Unsolved')

In [4]:
# Baseline run: default (random) policy on a seeded environment
play_env(
    sleep_time=0.01,
    env_seed=1,
)

Step: 200/200, Reward: -767

+---------+
|R: | : :[34;1mG[0m|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (Dropoff)

Result: Unsolved


## Q-Learning

Credit: https://towardsdatascience.com/reinforcement-learning-teach-a-taxi-cab-to-drive-around-with-q-learning-9913e611028f

In [5]:
# --- Q-learning hyperparameters ---
learning_rate = 0.1          # alpha: weight given to each new estimate
discount_factor = 0.99       # gamma: weight given to the next state's best value
epsilon = 0.1                # probability of taking a random (exploratory) action
training_episodes = 100_000  # number of training episodes (try 1000 for a quick trial run)
env_seed = 1                 # seed constant for the environment

# One row per discrete state, one column per discrete action
q_table_shape = [env.observation_space.n, env.action_space.n]

In [6]:
# Start from an all-zero Q-table (float64, laid out as given by q_table_shape)
q_table = np.zeros(shape=q_table_shape, dtype=np.float64)

In [7]:
# Random generator for the epsilon-greedy exploration draws.
# Seeded so those draws repeat after a kernel restart. This line does not seed
# the environment or env.action_space; their randomness is unaffected.
rng = np.random.default_rng(env_seed)

### Train

In [8]:
# Tabular Q-learning: one epsilon-greedy episode per iteration
for episode in tqdm(range(training_episodes)):
    # Every episode starts from a fresh reset of the environment
    state = env.reset()

    while True:
        # Epsilon-greedy choice: random action with probability epsilon,
        # otherwise the action with the highest learned value for this state
        explore = rng.random() < epsilon
        if explore:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])

        # Take the action and observe the outcome
        next_state, reward, done, info = env.step(action)

        old_value = q_table[state, action]      # current estimate for (state, action)
        best_next = q_table[next_state].max()   # best value reachable from the next state
        td_target = reward + discount_factor * best_next

        # Blend the old estimate with the new target
        q_table[state, action] = (1 - learning_rate) * old_value + learning_rate * td_target

        state = next_state

        if done:
            break

100%|██████████| 100000/100000 [00:42<00:00, 2352.33it/s]


### Eval

In [9]:
# Greedy rollout of the learned Q-table on the seeded environment
play_env(
    env_seed=1,
    sleep_time=0.5,
    policy=lambda state: q_table[state].argmax(),
)

Step: 013/200, Reward: 8

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)

Result: Done with 13 steps and total reward is 8.


In [20]:
# Greedy rollout of the learned Q-table without reseeding the environment
play_env(
    env_seed=None,
    sleep_time=0.5,
    policy=lambda state: q_table[state].argmax(),
)

Step: 017/200, Reward: 4

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Result: Done with 17 steps and total reward is 4.
