<a href="https://colab.research.google.com/github/OverGeek/Reinforcement-Learning/blob/master/Q_Learning_with_OpenAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Libraries**

In [0]:
import numpy as np
import gym
import random
from tqdm import tqdm

**Rendering the Game**

In [34]:
# Create the "Taxi-v3" environment through Gym's registry; `env` is the single
# environment instance shared by the training and testing cells below.
env = gym.make("Taxi-v3")
# Draw the environment's current state as a text grid in the cell output.
# NOTE(review): render() is called here before any reset() — this worked for the
# Gym version that produced the saved output; confirm on the installed version.
env.render()

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |B: |
+---------+



In [35]:
# Read the sizes of the action and observation spaces via their `.n` attribute.
# These two numbers give the Q-table its shape: one row per state, one column per action.
action_size = env.action_space.n
state_size = env.observation_space.n

print("Action Size", action_size)
print("State Size", state_size)

Action Size 6
State Size 500


In [36]:
# Q-table: a (state_size x action_size) float array with every entry starting at zero.
qtable = np.zeros(shape=(state_size, action_size), dtype=np.float64)
qtable  # bare last expression, so the notebook displays the array

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

**Hyperparameters**

In [0]:
# --- Run lengths -------------------------------------------------------------
total_episodes = 50000       # number of training episodes (games)
total_test_episodes = 100    # number of evaluation episodes
max_steps = 100              # upper bound on moves within a single episode

# --- Q-learning update -------------------------------------------------------
learning_rate = 0.7          # weight of each temporal-difference correction
gamma = 0.618                # discount rate applied to the next state's best value

# --- Epsilon-greedy exploration schedule -------------------------------------
max_epsilon = 1.0            # maximum exploration rate
min_epsilon = 0.01           # minimum exploration rate
decay_rate = 0.01            # factor for exponential decay of the exploration rate
epsilon = max_epsilon        # current exploration rate; updated during training

**Training**

In [38]:
# Training Loop
# Training loop: tabular Q-learning with an epsilon-greedy behaviour policy.
#   Reads : env, qtable, total_episodes, max_steps, learning_rate, gamma,
#           max_epsilon, min_epsilon, decay_rate
#   Writes: qtable (updated in place) and epsilon (decayed once per episode)

# Restart the exploration schedule so that re-running this cell does not begin
# with the already-decayed epsilon left over from a previous run. On a fresh
# kernel this is a no-op because epsilon is initialised to the same value.
epsilon = max_epsilon

for episode in tqdm(range(total_episodes)):
  state = env.reset()

  # Play one game, capped at max_steps moves
  for step in range(max_steps):
    # Draw a number in [0, 1]: above epsilon -> exploit, otherwise explore
    if random.uniform(0, 1) > epsilon:
      # Exploitation: best known action for this state according to the Q-table
      action = np.argmax(qtable[state, :])
    else:
      # Exploration: random action sampled from the action space
      action = env.action_space.sample()

    # Apply the chosen action and observe the outcome
    new_state, reward, done, info = env.step(action)

    # Q-learning update: move Q(s, a) towards
    # reward + gamma * max_a' Q(s', a') by a fraction learning_rate
    td_target = reward + gamma * np.max(qtable[new_state, :])
    qtable[state, action] += learning_rate * (td_target - qtable[state, action])

    state = new_state

    # Stop this episode as soon as the environment reports it has ended
    if done:
      break

  # Exponentially decay the exploration rate based on the number of episodes
  # completed so far (episode + 1), without mutating the loop variable.
  epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode + 1))

100%|██████████| 50000/50000 [00:21<00:00, 2281.50it/s]


**Testing**

In [41]:
# Evaluation: play total_test_episodes games acting greedily on the learned
# Q-table (no exploration, no learning) and report the mean episode reward.
#   Reads : env, qtable, total_test_episodes, max_steps
#   Writes: rewards (one total per test episode)
rewards = []

for episode in range(total_test_episodes):
  state = env.reset()  # fresh game for every test episode
  total_rewards = 0

  for step in range(max_steps):
    # NOTE: rendering every step of every episode produces a very large output.
    env.render()

    # Greedy action: highest Q-value for the current state
    action = np.argmax(qtable[state, :])

    new_state, reward, done, info = env.step(action)
    total_rewards += reward

    if done:
      break

    state = new_state

  # Record the episode whether or not it finished within max_steps. Appending
  # only on `done` would drop unfinished episodes from the sum while the average
  # below still divides by total_test_episodes, overstating the score.
  rewards.append(total_rewards)

env.close()
print("Average Score: ", sum(rewards)/ total_test_episodes)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
+---------+
|R: | : :[42mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
| : | : :[42m_[0m|
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | :[42m_[0m: |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: | : :G|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : :[42m_[0m: : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: | : :G|
| : | : : |
| :[42m_[0m: : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: | : :G|
| : | : : |
|[42m_[0m: : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[42m_[0m| : | : |
|[35mY