In [1]:
import gym
import random 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Taxi environment
# Passenger location : 0(Red), 1(Green), 2(Yellow), 3(Blue), 4(in taxi)  --> where the passenger currently is
# Destinations : 0(Red), 1(Green), 2(Yellow), 3(Blue) --> where the passenger must be dropped off
# Action Space : 
#                 0: Move south (down)
#                 1: Move north (up)
#                 2: Move east (right)
#                 3: Move west (left)
#                 4: Pickup passenger
#                 5: Drop off passenger

In [2]:
# Create the Taxi-v3 environment with the text ("ansi") render mode, reset it
# to get a starting state, and print the rendered grid.
env = gym.make("Taxi-v3", render_mode = "ansi")
env.reset()
print(env.render())

+---------+
|R: |[43m [0m: :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+




In [3]:
# Number of discrete actions the environment exposes; the bare name on the
# last line makes the notebook display the value.
action_space = env.action_space.n
action_space

6

In [4]:
# Number of discrete states (observations) the environment can be in; the
# bare name on the last line makes the notebook display the value.
state_space = env.observation_space.n
state_space

500

In [5]:
# Q-table: one row per state and one column per action, initialised to zero.
# The bare name on the last line makes the notebook display the array.
q_table = np.zeros((state_space,action_space))
q_table

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [None]:
# training : Lets the agent explore the environment. The agent moves through the environment according to its actions and
# tries to reach its goal. The Q-table is built from the states the agent ends up in as a result of those actions.

# test : The agent moves through the environment using the q_table and reaches the reward. The average number of timesteps
# needed to solve an episode and the average number of penalties received per episode are printed.

In [6]:
# Q-learning setup: a fresh environment and an all-zero Q-table with one row
# per state and one column per action.
env = gym.make("Taxi-v3", render_mode = "ansi")
env.reset()

action_space = env.action_space.n
state_space = env.observation_space.n
q_table = np.zeros((state_space, action_space))

alpha = 0.1    # learning rate
gamma = 0.6    # discount rate
epsilon = 0.1  # probability of taking a random (exploratory) action

train_episodes = 100000  # number of training episodes

# training
for i in tqdm(range(1, train_episodes + 1)):

    state, _ = env.reset()
    done = False

    while not done:

        # Epsilon-greedy action selection.
        if random.uniform(0, 1) < epsilon:       # explore
            action = env.action_space.sample()
        else:                                    # exploit
            action = np.argmax(q_table[state])

        # step() returns (observation, reward, terminated, truncated, info).
        # The previous code unpacked the 4th value as `info`, which silently
        # discarded the truncation flag, so episodes never stopped at the
        # environment's time limit.
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Q-learning update:
        # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        best_next_value = np.max(q_table[new_state])
        td_error = reward + gamma * best_next_value - q_table[state, action]
        q_table[state, action] = q_table[state, action] + alpha * td_error

        state = new_state

print("Training Finished")


# test
# Run the greedy policy (always the highest-valued action in q_table) for a
# fixed number of episodes and report the average episode length and the
# average number of -10 penalties.
total_epoch , total_penalties = 0,0
episodes = 100

for i in tqdm(range(episodes)):

    state, _ = env.reset()
    epochs, penalties = 0, 0
    done = False

    while not done:

        action = np.argmax(q_table[state])

        # step() returns (observation, reward, terminated, truncated, info).
        # Honouring `truncated` is essential here: a greedy policy that gets
        # stuck in a cycle never terminates on its own, and ignoring the
        # truncation flag would leave this loop spinning forever.
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        state = new_state

        # A reward of -10 is counted as a penalty.
        if reward == -10:
            penalties += 1

        epochs += 1

    total_epoch += epochs
    total_penalties += penalties

print("Result after {} episodes".format(episodes))
print("Average timesteps per episode: ",total_epoch/episodes)
print("Average penalties per episode: ",total_penalties/episodes)

100%|██████████| 100000/100000 [02:24<00:00, 694.35it/s]


Training Finished


100%|██████████| 100/100 [00:00<00:00, 1280.77it/s]

Result after 100 episodes
Average timesteps per episode:  12.64
Average penalties per episode:  0.0



