In [1]:
import numpy as np
import random
import time
import gym

In [2]:
env = gym.make("Taxi-v3")
state = env.reset()
env.render()
print("Current state is  :", state)

+---------+
|R: | : :[35mG[0m|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

Current state is  : 213


In [3]:
5*5*4*5

500

In [3]:
state_size = env.observation_space.n
print("State Space :", state_size)

State Space : 500


In [4]:
action_size = env.action_space.n
print("Action Space :", action_size)

Action Space : 6


# Algorithms:
- Q Learning
- SARSA

### 1. Q Learning

In [6]:
q_table = np.zeros((state_size, action_size))
episodes = 100000
learning_rate = 0.1
gamma = 0.7
epsilon = 0.1

def greedy_policy(state, table):
    z = np.random.random()
    if z > epsilon:
        action = np.argmax(table[state])
    else:
        action = env.action_space.sample()
    return action

In [7]:
deltas = []
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    step = 0
    change_t = 0
    
    if episode % 5000 == 0:
        print("Episode: {}".format(episode))
    while not done:
        #env.render()
        action = greedy_policy(state, q_table)
        new_state, reward, done, info = env.step(action)
        old_q = q_table[state, action]
        
        #Update
        q_table[state, action] += learning_rate * (reward + gamma * np.max(q_table[new_state, :]) - q_table[state, action])
        change_t = max(change_t, np.abs(q_table[state][action] - old_q))
        state = new_state
    deltas.append(change_t)
    if deltas[-1] < 0.000000001:
        break
    episode += 1
print("Maximum Difference is :", deltas[-1])

Maximum Difference is : 4.567040079450635e-10


In [20]:
state = env.reset()
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [21]:
from IPython.display import clear_output
import time

In [22]:
done = False
cumulative_reward = 0

while(done==False):
    best_action = np.argmax(q_table[state, :])
    
    state, reward, done, _ = env.step(best_action)
    
    cumulative_reward += reward
    time.sleep(0.5)
    clear_output(wait=True)
    env.render()
    print('Episode Reward :', cumulative_reward)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)
Episode Reward : 11


### 2. SARSA

In [31]:
state = env.reset()
env.render()

+---------+
|[35mR[0m: | : :G|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



In [32]:
s_table = np.zeros((state_size, action_size))
episodes = 100000
deltas = []
for episode in range(1, episodes+1):
    state = env.reset()
    done =False
    step = 0
    change_t = 0
    if episode % 5000 == 0:
        print("Episode: {}".format(episode))
        
    while not done:
        action = greedy_policy(state, s_table)
        next_state, reward, done, info = env.step(action)
        next_action = greedy_policy(state, s_table)
        old_t = s_table[state, action]
        
        s_table[state, action] += learning_rate * (reward+gamma* s_table[new_state, next_action] - s_table[state, action])
        change_t = max(change_t, np.abs(s_table[state][action] - old_t))
        
        state = new_state
    deltas.append(change_t)
    if deltas[-1] < 0.000000001:
        break
    episode += 1
print("Maximum Difference is :", deltas[-1])

Episode: 5000
Episode: 10000
Episode: 15000
Episode: 20000
Episode: 25000
Episode: 30000
Episode: 35000
Episode: 40000
Episode: 45000
Episode: 50000
Episode: 55000
Episode: 60000
Episode: 65000
Episode: 70000
Episode: 75000
Episode: 80000
Episode: 85000
Episode: 90000
Episode: 95000
Episode: 100000
Maximum Difference is : 0.790652232747


In [33]:
#test

In [40]:
state = env.reset()
env.render()

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |B: |
+---------+



In [41]:
done = False
cumulative_reward = 0

while(done==False):
    best_action = np.argmax(s_table[state, :])
    
    state, reward, done, _ = env.step(best_action)
    
    cumulative_reward += reward
    time.sleep(0.5)
    clear_output(wait=True)
    env.render()
    print('Episode Reward :', cumulative_reward)

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
Episode Reward : -87


KeyboardInterrupt: 

In [11]:
q_table = np.zeros((state_size, action_size))
episodes = 100000
learning_rate = 0.1
gamma = 0.9
epsilon = 0.1

def greedy_policy(state, table):
    z = np.random.random()
    if z > epsilon:
        action = np.argmax(table[state])
    else:
        action = env.action_space.sample()
    return action

In [12]:
state = env.reset()
env.render()

+---------+
|[35mR[0m: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [13]:
s_table = np.zeros((state_size, action_size))
episodes = 100000
deltas = []
for episode in range(1, episodes+1):
    state = env.reset()
    done =False
    step = 0
    change_t = 0
    if episode % 5000 == 0:
        print("Episode: {}".format(episode))
        
    while not done:
        action = greedy_policy(state, s_table)
        next_state, reward, done, info = env.step(action)
        next_action = greedy_policy(state, s_table)
        old_t = s_table[state, action]
        
        s_table[state, action] += learning_rate * (reward+gamma* s_table[next_state, next_action] - s_table[state, action])
        change_t = max(change_t, np.abs(s_table[state][action] - old_t))
        
        state = next_state
    deltas.append(change_t)
    if deltas[-1] < 0.000000001:
        break
    episode += 1
print("Maximum Difference is :", deltas[-1])

Episode: 5000
Episode: 10000
Episode: 15000
Episode: 20000
Episode: 25000
Episode: 30000
Episode: 35000
Episode: 40000
Episode: 45000
Episode: 50000
Episode: 55000
Episode: 60000
Episode: 65000
Episode: 70000
Episode: 75000
Episode: 80000
Episode: 85000
Episode: 90000
Episode: 95000
Episode: 100000
Maximum Difference is : 0.22613673298220327


In [18]:
state = env.reset()
env.render()

from IPython.display import clear_output
import time

+---------+
|[35mR[0m: | : :G|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [16]:
done = False
cumulative_reward = 0

while(done==False):
    best_action = np.argmax(s_table[state, :])
    
    state, reward, done, _ = env.step(best_action)
    
    cumulative_reward += reward
    time.sleep(0.5)
    clear_output(wait=True)
    env.render()
    print('Episode Reward :', cumulative_reward)

+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Episode Reward : 8


In [19]:
done = False
cumulative_reward = 0

while(done==False):
    best_action = np.argmax(s_table[state, :])
    
    state, reward, done, _ = env.step(best_action)
    
    cumulative_reward += reward
    time.sleep(0.5)
    clear_output(wait=True)
    env.render()
    print('Episode Reward :', cumulative_reward)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Episode Reward : 8
