In [1]:
import gym
import numpy as np
import random
import matplotlib.pyplot as plt

In [2]:
# Create the Taxi-v3 environment and keep the object exposed through the
# `.env` attribute of what gym.make() returns, not the returned object itself.
# NOTE(review): presumably this strips gym's outer wrapper (and with it any
# per-episode step limit), so an episode ends only when the environment itself
# reports done — confirm for the installed gym version.

env = gym.make("Taxi-v3").env

In [3]:
# Display the environment's current state as a text grid.
env.render()

+---------+
|R: | : :[35mG[0m|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [4]:
# Q-table: one row per observation (state), one column per action,
# with every action value starting at zero.
q_table = np.zeros((env.observation_space.n, env.action_space.n))

# Show the table dimensions as the cell output.
q_table.shape

(500, 6)

In [5]:
# Hyperparameters: learning rate, discount rate, and the epsilon rate that
# balances exploration against exploitation.

alpha = 0.1 # Learning rate: weight of the new estimate in each Q-value update
gamma = 0.9 # Discount rate: weight of the best next-state value in the update target
epsilon = 0.1 # Epsilon rate: probability of taking a random (exploratory) action

In [6]:
# Metric histories collected during training for later plotting:
# sampled episode rewards and sampled wrong drop-off counts.
reward_list, dropout_list = [], []

In [7]:
episode_number = 1001

# Episodes are numbered 1 .. episode_number - 1.
for i in range(1, episode_number):

    # Start a fresh episode with zeroed per-episode counters.
    state = env.reset()
    reward_count, dropouts = 0, 0
    done = False

    while not done:
        # Epsilon-greedy policy: with probability epsilon take a random action,
        # otherwise take the index of the highest-valued action for this state.
        action = (
            env.action_space.sample()
            if random.uniform(0, 1) < epsilon
            else np.argmax(q_table[state])
        )

        # Apply the action and observe the successor state (s') and the reward.
        next_state, reward, done, _ = env.step(action)

        # Episode bookkeeping: accumulate the total reward, and tally every
        # -10 reward as a wrong drop-off (just for fun).
        reward_count += reward
        dropouts += int(reward == -10)

        # Q-learning update: blend the old estimate with the bootstrapped
        # target reward + gamma * max_a' Q(s', a').
        old_value = q_table[state, action]
        next_max = q_table[next_state].max()
        next_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = next_value

        # Move on to the successor state.
        state = next_state

    # Every tenth episode, record and print that single episode's totals.
    if i % 10 == 0:
        reward_list.append(reward_count)
        dropout_list.append(dropouts)
        print("Episode: {}, Reward: {}, Dropouts: {}".format(i, reward_count, dropouts))
        

Episode: 10, Reward: -2078, Dropouts: 87
Episode: 20, Reward: -1823, Dropouts: 85
Episode: 30, Reward: -488, Dropouts: 18
Episode: 40, Reward: -357, Dropouts: 10
Episode: 50, Reward: -353, Dropouts: 6
Episode: 60, Reward: -187, Dropouts: 7
Episode: 70, Reward: -234, Dropouts: 6
Episode: 80, Reward: -300, Dropouts: 8
Episode: 90, Reward: -619, Dropouts: 31
Episode: 100, Reward: -189, Dropouts: 7
Episode: 110, Reward: -332, Dropouts: 11
Episode: 120, Reward: -255, Dropouts: 9
Episode: 130, Reward: -141, Dropouts: 4
Episode: 140, Reward: -505, Dropouts: 19
Episode: 150, Reward: -119, Dropouts: 1
Episode: 160, Reward: -100, Dropouts: 3
Episode: 170, Reward: -238, Dropouts: 10
Episode: 180, Reward: -217, Dropouts: 7
Episode: 190, Reward: -233, Dropouts: 10
Episode: 200, Reward: -59, Dropouts: 1
Episode: 210, Reward: -295, Dropouts: 13
Episode: 220, Reward: 10, Dropouts: 0
Episode: 230, Reward: -194, Dropouts: 5
Episode: 240, Reward: -102, Dropouts: 3
Episode: 250, Reward: -364, Dropouts: 16

In [8]:
# Inspect the learned Q-values for state 246 (one entry per action).
q_table[246]

array([-4.78071684, -4.74384053, -4.74453871, -4.79151472, -5.78053726,
       -6.2296297 ])

In [9]:
# Play one episode and render every step.
# Note: this loop still explores with probability epsilon and keeps updating
# q_table, so it is one more rendered training episode, not a purely greedy
# evaluation of the learned policy.
state = env.reset()
env.render()

done = False
while not done:
    # Epsilon-greedy policy: with probability epsilon take a random action,
    # otherwise take the index of the highest-valued action for this state.
    action = (
        env.action_space.sample()
        if random.uniform(0, 1) < epsilon
        else np.argmax(q_table[state])
    )

    # Apply the action and observe the successor state (s') and the reward.
    next_state, reward, done, _ = env.step(action)

    # Q-learning update: blend the old estimate with the bootstrapped
    # target reward + gamma * max_a' Q(s', a').
    old_value = q_table[state, action]
    next_max = q_table[next_state].max()
    next_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
    q_table[state, action] = next_value

    # Move on to the successor state and draw it.
    state = next_state
    env.render()


+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m:[43m [0m|
+---------+

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m:[43m [0m|
+---------+
  (East)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m:[43m [0m|
+---------+
  (Dropoff)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1m[43mB[0m[0m: |
+---------+
  (West)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : |[42m_[0m: |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :G|
| : | : : |
| : :[42m_[0m: : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: | : :G|
| : | : : |
| :[42m_[0m: : : |
| | : | : |
