### Implement Q-learning to solve the Taxi problem with optimal policy. The OpenAI Gym environment (https://www.openai.com/)



In [7]:
# Load OpenAI Gym and other necessary packages
import gym
import random
import numpy
import time
import numpy as np


In [8]:
# Make environment
env = gym.make("Taxi-v3")
env.render()
print(f"Action Space: {env.action_space}")
print(f"State Space: {env.observation_space}")

# Training parameters for Q learning
alpha = 0.9 # Learning rate
gamma = 0.9 # Future reward discount factor num_of_episodes = 1000
epsilon = 0.1
num_of_episodes = 1000
num_of_steps = 500 # per each episode

# Q tables for rewards
Q_reward = -10000*np.ones((500,6))
Q_reward


+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|[34;1mY[0m| : |[35mB[0m: |
+---------+

Action Space: Discrete(6)
State Space: Discrete(500)


array([[-10000., -10000., -10000., -10000., -10000., -10000.],
       [-10000., -10000., -10000., -10000., -10000., -10000.],
       [-10000., -10000., -10000., -10000., -10000., -10000.],
       ...,
       [-10000., -10000., -10000., -10000., -10000., -10000.],
       [-10000., -10000., -10000., -10000., -10000., -10000.],
       [-10000., -10000., -10000., -10000., -10000., -10000.]])

In [9]:
# Training function
def train(num_of_episodes, num_of_steps, alpha, gamma, Q_reward):
    
    for ep in range(num_of_episodes):#No. of episodes
        state = env.reset() # set environment
        
        for step in range(num_of_steps):
            '''
            if np.random.random() < epsilon:
                action = np.random.choice(6) # random action
            else:
                action = np.argmax(Q_reward[state]) # greedy action 
            '''
            action = np.random.choice(6) # random action
            new_state, reward, done, info = env.step(action)# observation next step
          
            q_value = Q_reward[state,action] - alpha*(Q_reward[state,action]) + alpha*(reward + gamma * np.max(Q_reward[new_state]))  # new q-value
            Q_reward[state,action] =  q_value  # update q-value
            state = new_state # update stat
            
    return Q_reward



In [10]:
%%time
# Train
Q_reward = train(num_of_episodes, num_of_steps, alpha, gamma, Q_reward)
  

CPU times: user 15.6 s, sys: 165 ms, total: 15.8 s
Wall time: 16 s


In [11]:
# Test 
state = env.reset()
total_reward = 0
total_action = 0
q_table = Q_reward # q-tables
for steps in range(50):
    action = np.argmax(q_table[state]) # action 
    new_state, reward, done, info = env.step(action)# observe
    state = new_state # update new state
    total_reward += reward # reward
    total_action += 1 # count action
    env.render() 
    time.sleep(1)

    # Drop passenger successfully
    if done:
        print(f"Total reward: {total_reward}")
        print(f"Total actions: {total_action}")
        break

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[42mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
|[42m_[0m: | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
|[42

In [12]:

# Test with 10 Episodes for finding the averages of rewards and actions
avg_total_reward = []
avg_total_actions = []
q_table_2 = Q_reward # q-tables

for eps in range(10):
    total_reward = 0
    total_actions = 0
    state = env.reset() 
    
    for steps in range(50):
        action = np.argmax(q_table_2[state]) # action 
        new_state, reward, done, info = env.step(action)# observe
        state = new_state # update new state
        total_reward += reward # reward
        total_actions += 1 # count action
        env.render() 
        time.sleep(1)
        
        # if sucessfuly drop passenger
        if done:
            print(f"Total reward: {total_reward}")
            print()
            break
            
    # append each episode values
    avg_total_reward.append(total_reward)
    avg_total_actions.append(total_actions)
    
print(f"Average total reward: {np.array(avg_total_reward).mean()}")
print(f"Average total actions: {np.array(avg_total_actions).mean()}")


+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[34;1m[43mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[42mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
| : | : :[42m_[0m|
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : :[42m_[0m|
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | :[42m_[0m|
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |

+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[42mY[0m| : |[35mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[42m_[0m| : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
|[42m_[0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| :[42m_[0m: : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : | : : |
| : :[42m_[0m: : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |[35mB[