<a href="https://colab.research.google.com/github/erdoganege/Reinforcement-Learning/blob/main/Off_Policy_Q_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import numpy as np
import random
import matplotlib.pyplot as plt

# Rendering environment in Colab

In [1]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1



In [2]:
import gym
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

# Create and start a pyvirtualdisplay `Display` of size 1400x900 with visible=0.
# NOTE(review): presumably this gives env.render() an off-screen display to
# draw to on a headless Colab VM - confirm.
# IPython's display module is imported as `ipythondisplay` above, so the name
# `display` used here does not clash with that import.
display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of gym environment 
and displaying it.
To enable video, just do "env = wrap_env(env)".
"""

def show_video():
  """Embed the first recorded mp4 from the ``video`` directory in the notebook.

  Searches ``video/*.mp4`` (relative to the current working directory),
  base64-encodes the first match and displays it inline as an HTML
  ``<video>`` element. Prints "Could not find video" when there is no match.
  """
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode is all that is needed here, and the context
    # manager closes the handle as soon as the bytes have been read.
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    # The whole file is inlined into the page as a base64 data URI.
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else:
    print("Could not find video")
    

def wrap_env(env):
  """Return ``env`` wrapped in a ``Monitor`` that writes to ``./video``.

  The wrapper is created with ``force=True``.
  """
  return Monitor(env, './video', force=True)

# First try of the libraries above

In [3]:
# Smoke test: record up to 10 random CartPole steps, then play the video back.
env = wrap_env(gym.make("CartPole-v0"))
env.reset()
for i in range(10):
  env.render()
  _obs, _reward, done, _info = env.step(env.action_space.sample())
  if done:
    # Random actions can end the episode before all 10 steps have run.
    # Stepping a finished episode without reset() is invalid (the Monitor
    # wrapper rejects it), so stop here.
    break
env.close()
show_video()

# Taxi Example


In [37]:
# Build the Taxi environment and look up its registry spec.
env = gym.make("Taxi-v3")
spec = gym.spec("Taxi-v3")

# Report the environment's properties first, then the spec's, one per line.
for template, value in (
    ("State Space: {}", env.observation_space),
    ("Action Space: {}", env.action_space),
    ("Reward Range: {}", env.reward_range),
    ("Max Episode Steps: {}", spec.max_episode_steps),
    ("Is it Nondeterministic: {}", spec.nondeterministic),
    ("Reward Threshold: {}", spec.reward_threshold),
):
  print(template.format(value))

State Space: Discrete(500)
Action Space: Discrete(6)
Reward Range: (-inf, inf)
Max Episode Steps: 200
Is it Nondeterministic: False
Reward Threshold: 8


In [38]:
# Colour legend for the rendered grid:
#   blue    - passenger
#   magenta - destination
#   yellow  - empty taxi
#   green   - full taxi
taxi_env = gym.make("Taxi-v3")
env = wrap_env(taxi_env)
env.reset()
# Render the grid, then take one random action; repeat three times.
for _ in range(3):
  env.render()
  random_action = env.action_space.sample()
  env.step(random_action)
env.close()

+---------+
|[35mR[0m: |[43m [0m: :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+

+---------+
|[35mR[0m: | : :G|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (Dropoff)


In [39]:
# Q-learning hyperparameters.
DISCOUNT_RATE = 0.8        # weight on the best next-state value in the update
LEARNING_RATE = 0.2        # step size used when blending old and new estimates
EXPLORATION_RATE = 0.1     # probability of taking a random action
NUMBER_OF_EPISODES = 10000

# One row per state and one column per action, all initialised to zero.
n_states = env.observation_space.n
n_actions = env.action_space.n
Qtable = np.zeros((n_states, n_actions))
print("Shape of Q-table: ", Qtable.shape)

Shape of Q-table:  (500, 6)


In [40]:
# Q-learning: behave epsilon-greedily, but update towards the greedy
# (max-valued) action of the next state.
for episode in range(NUMBER_OF_EPISODES):
  state = env.reset()  # state index
  episode_reward = 0
  done = False
  while not done:
    explore = random.uniform(0, 1) < EXPLORATION_RATE
    if not explore:
      # Exploit: highest Q-valued action for the current state.
      action = np.argmax(Qtable[state])
    else:
      # Explore: uniformly random action.
      action = env.action_space.sample()
    new_state, reward, done, step_info = env.step(action)

    # Blend the old estimate with the bootstrapped target.
    best_next_value = np.max(Qtable[new_state])
    td_target = reward + DISCOUNT_RATE * best_next_value
    Qtable[state, action] = ((1 - LEARNING_RATE) * Qtable[state, action]) + (LEARNING_RATE * td_target)

    episode_reward += reward
    state = new_state
  # Progress report every 50th episode.
  if episode % 50 == 0:
    print("We are in {}th episode and current episode's total reward is {}.".format(episode, episode_reward))

We are in 0th episode and current episode's total reward is -605.
We are in 50th episode and current episode's total reward is -98.
We are in 100th episode and current episode's total reward is -362.
We are in 150th episode and current episode's total reward is -299.
We are in 200th episode and current episode's total reward is -205.
We are in 250th episode and current episode's total reward is -68.
We are in 300th episode and current episode's total reward is -28.
We are in 350th episode and current episode's total reward is 1.
We are in 400th episode and current episode's total reward is 5.
We are in 450th episode and current episode's total reward is -5.
We are in 500th episode and current episode's total reward is -23.
We are in 550th episode and current episode's total reward is -137.
We are in 600th episode and current episode's total reward is -91.
We are in 650th episode and current episode's total reward is 5.
We are in 700th episode and current episode's total reward is 9.
We

In [41]:
#!pip install tabulate
from tabulate import tabulate

# Print the learned Q-table with one row per state and one column per action.
# NOTE(review): header order is assumed to match the environment's action
# numbering (0=South ... 5=Drop off) - confirm.
# showindex=True asks tabulate to prefix each row with its index; the state
# count that was passed before only worked because it is a truthy value.
print(tabulate(Qtable, headers=["South", "North", "East", "West", "Pick Up", "Drop off"], tablefmt='fancy_grid', showindex=True))

╒═════╤════════════╤════════════╤════════════╤════════════╤════════════╤════════════╕
│     │      South │      North │       East │       West │    Pick Up │   Drop off │
╞═════╪════════════╪════════════╪════════════╪════════════╪════════════╪════════════╡
│   0 │  0         │  0         │  0         │  0         │   0        │   0        │
├─────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┤
│   1 │ -2.91194   │ -2.35446   │ -2.86919   │ -2.47174   │  -1.64456  │ -11.1117   │
├─────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┤
│   2 │  0.176292  │  1.35612   │  0.140817  │  0.0588942 │   3.192    │  -7.13093  │
├─────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┤
│   3 │ -2.32007   │ -1.69249   │ -2.3381    │ -1.68105   │  -0.805696 │ -10.631    │
├─────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┤
│   4 │ -4.43252   │ -4.44375   │ -4.43321   │ -4.4393

# Test the learned Q-table

In [42]:
# Greedy evaluation of the learned Q-table over 100 episodes.
total_steps = total_penalty = total_rewards = 0
for _ in range(100):
  state = env.reset()
  done = False
  episode_step = episode_penalty = episode_reward = 0
  while not done:
    # Always exploit: pick the highest Q-valued action, no exploration.
    action = np.argmax(Qtable[state])
    state, reward, done, info = env.step(action)
    episode_step += 1
    episode_reward += reward
    if reward == -10:  # wrong pick up or drop off of the passenger
      episode_penalty += 1
  # Fold this episode's counters into the running totals.
  total_steps += episode_step
  total_penalty += episode_penalty
  total_rewards += episode_reward

In [43]:
# Per-episode averages over the 100 evaluation episodes.
average_steps = total_steps/100
average_reward = total_rewards/100
average_penalty = total_penalty/100
print("Average number of steps in an episode =", average_steps)
print("Average reward for an episode =", average_reward)
print("Average number of penalty in an episode =", average_penalty)

Average number of steps in an episode = 16.89
Average reward for an episode = 3.69
Average number of penalty in an episode = 0.0
