# Import dependencies

In [99]:
import numpy as np
import random
import gym

# create environment 

In [100]:
env = gym.make("Taxi-v2")
env.render()

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |[34;1mB[0m: |
+---------+



# Create Q table
 
For creating Q table we need to find state_size and action_size, because in Q table no of rows shown by no of states and no of column shown by action_size. for getting the knowledge of action_size we use env.action_space.n and for getting the size of state space we use env.observation_space.n

In [101]:
states_size = env.observation_space.n
actions_size = env.action_space.n
print(states_size)
print(actions_size)

500
6


In [102]:
qtable = np.zeros((states_size, actions_size))
print(qtable)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


# Create the hyperparameters

In [103]:
total_episodes = 10000
total_test_episodes = 100
epsilon = 1.0 #exploration rate
alpha = 0.45 #learning rate
gamma = 0.95 #discount factor
max_step = 99 # maximum no of steps during an episode an agent can move

max_epsilon = 1.0 #intially we don't know about the environment
min_epsilon = 0.01 #minimum exploration rate probability
decay_rate = 0.001 #exponential decay rate for exploration probability


# The Q learning algorithm
1- Initialize Q table arbitraraly 

2- for life or end of the episodes

    3- choose an action a based on maximum Q table value
    
    4- Take action a and observe the new state s' and reward r
    
    5- Update Q table using Q(s,a) := Q(s,a) + alpha*(r + gamma*max(Q(s',a')) - Q(s,a)
    
6- state = new_state

In [104]:
for episode in range(total_episodes):
    #reset the environment
    state = env.reset()
    step = 0
    done = False
    
    for step in range(max_step):
        #choose the action a in the current world state s
        #first we randomize a number
        exp_exp_tradeoff = random.uniform(0,1)
        
        #if this number is greater than epsilon than we done exploitation else take action by using concept of exploration
        if exp_exp_tradeoff > epsilon:
            #exploitation
            action = np.argmax(qtable[state, :])
            
        else:
            #exploration
            action = env.action_space.sample()
            
        #take action and observe the outcome
        new_state, reward, done, info = env.step(action)
        
        #update the q table
        q_target = reward + gamma* np.max(qtable[new_state, :])
        q_delta = q_target - qtable[state, action]
        qtable[state, action] += alpha * q_delta
        
        state = new_state
        
        if done == True:
            break
            
    #reduce epsilon because we need less and less exploration 
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate * episode)
    
print(qtable)
            

[[  0.           0.           0.           0.           0.
    0.        ]
 [244.41904802 257.77479268 243.43947133 258.59530667 273.30166436
  248.72296194]
 [273.12843201 287.21159863 273.16329847 287.98915232 304.98799375
  279.03089986]
 ...
 [147.31525241  40.03030701  82.75514159 283.3584898  -11.24851106
   34.48337828]
 [186.70001766 169.35470283  90.49393782 258.00835969 147.97640007
  111.35561865]
 [232.76800674 181.62905773 125.36826291 378.99998991 120.57875013
  274.30121198]]


# Use our q table to play taxi game

In [105]:
env.reset()
rewards = []
for episode in range(total_test_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0

    for step in range(max_step):
        #uncomment if you want to see our agent as playing
        env.render()
        action = np.argmax(qtable[state,:])
        new_state, reward, done, info = env.step(action)
        total_rewards += reward
        
        if done:
            rewards.append(total_rewards)
            break
        state = new_state

env.close()
print("score over time " + str(sum(rewards)/total_test_episodes))

+---------+
|R: | : :G|
| : :[43m [0m: : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+

+---------+
|R: | : :G|
| : : :[43m [0m: |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : |[42m_[0m: |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :G|
| : : : : |
| : : :[42m_[0m: |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :G|
| : : : : |
| : :[42m_[0m: : |
|

  (East)
+---------+
|R: | : :[35mG[0m|
| : : : :[42m_[0m|
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| :[43m [0m|[34;1mB[0m: |
+---------+

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | :[43m [0m| : |
|Y| : |[34;1mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : :