In [1]:
import gym
import numpy as np
import random

In [4]:
# Create the Taxi environment and display its starting layout.
env = gym.make('Taxi-v3')
# The Gym API expects reset() to be called before render() or step();
# some Gym releases raise an error when that order is not respected.
env.reset()
env.render()

+---------+
|[35mR[0m: | : :G|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



In [5]:
# Dimensions of the Q-table, read from the environment's discrete spaces:
# one column per action and one row per state.
action_size = env.action_space.n
state_size = env.observation_space.n

print('Action Space: ', action_size)
print('State Size: ', state_size)

Action Space:  6
State Size:  500


In [6]:
# Q-table with one row per state and one column per action; every entry
# starts at zero and is filled in by the training loop.
q_table = np.zeros(shape=(state_size, action_size), dtype=float)
q_table

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [7]:
# Episode budget.
total_ep, total_test_ep = 1500, 100   # episodes used for training / for testing
max_steps = 100                       # cap on steps within a single episode

# Q-learning update coefficients.
lr, gamma = 0.81, 0.96                # learning rate, discount factor

# Epsilon-greedy exploration schedule.
epsilon = 0.9                         # exploration probability for the first episode
max_epsilon, min_epsilon = 1.0, 0.01  # upper / lower bound of the decayed epsilon
decay_rate = 0.01                     # coefficient in the exponential decay

![Q-learning algorithm steps](https://cdn-media-1.freecodecamp.org/images/0*voKUaGu68-cDuncy.)

In [8]:
# Train the Q-table with tabular Q-learning and an epsilon-greedy policy.
# Updates the notebook-level `q_table` in place and decays `epsilon` once
# per episode.

for episode in range(total_ep):

  # Start a fresh episode.
  state = env.reset()

  for step in range(max_steps):

    # Epsilon-greedy selection: draw a uniform number in [0, 1]; exploit the
    # best known action when it exceeds epsilon, otherwise explore at random.
    exp_exp_tradeoff = random.uniform(0, 1)
    if exp_exp_tradeoff > epsilon:
      action = np.argmax(q_table[state, :])
    else:
      action = env.action_space.sample()

    # Take the action (a) and observe the outcome state (s') and the reward (r).
    new_state, reward, done, info = env.step(action)

    # Q(s,a) := Q(s,a) + lr * [r + gamma * max_a' Q(s',a') - Q(s,a)]
    td_target = reward + gamma * np.max(q_table[new_state, :])
    q_table[state, action] += lr * (td_target - q_table[state, action])

    # Move on to the observed state.
    state = new_state

    # Stop as soon as the environment reports the episode is over.
    if done:
      break

  # Reduce epsilon (less and less exploration as training progresses).
  # The exponent uses the number of completed episodes (episode + 1); the
  # loop variable itself is left untouched because reassigning it has no
  # effect on the iteration.
  epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode + 1))

In [9]:
# Evaluate the learned Q-table with a purely greedy policy (no exploration)
# and report the average total reward per test episode.

rewards = []

for episode in range(total_test_ep):
  state = env.reset()
  total_rewards = 0
  print('=========================')
  print('EPISODE: ', episode)

  for step in range(max_steps):

    env.render()

    # Take the action with the highest Q-value for the current state.
    action = np.argmax(q_table[state, :])

    new_state, reward, done, info = env.step(action)

    total_rewards += reward

    # Stop stepping once the episode finishes.
    if done:
      break

    state = new_state

  # Record the score whether or not the episode terminated: an episode cut
  # off at max_steps must count toward the average instead of being dropped
  # from `rewards` while still being counted in the divisor.
  rewards.append(total_rewards)
  print('Score: ', total_rewards)

env.close()
print('Score Over Time: {}'.format(sum(rewards)/total_test_ep))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|Y| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : | : : |
|[42m_[0m: : : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
|[42m_[0m: | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35m[42mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
Score:  8
EPISODE:  54
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|[35mY[0m| : |B: |
+---------+

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
|[43m [0m: : : 