<a href="https://colab.research.google.com/github/gamante91/reinforcement-learning-gym/blob/master/Taxi_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from collections import defaultdict
import gym
import math
import matplotlib.pyplot as plt
import numpy as np
import sys
from time import sleep

In [0]:
# constants and free functions

# bound used by Agent.train when it computes the per-episode epsilon
epsilon_max = 0.95

def clear():
  '''
  flushes stdout and hands back whatever flush() returns;
  despite the name, text that was already printed is not erased
  '''
  return sys.stdout.flush()

def select_epsilon_greedy_action(Q_s, nA, epsilon):
  '''
  picks an action for a single state from its action values Q_s

  convention used here: epsilon is the probability of taking the greedy
  (argmax) action; in the remaining cases the action comes from
  np.random.choice(nA)
  '''
  # the threshold draw always happens first, the random action is only drawn when exploring
  exploit = np.random.sample() < epsilon
  if exploit:
    return np.argmax(Q_s)
  return np.random.choice(nA)

def select_greedy_action(Q_s):
  '''
  returns the index of the largest entry of Q_s (the first one on ties)
  '''
  best_action = np.argmax(Q_s)
  return best_action

def render(env, info):
    '''
    draws the environment, then prints whichever of the 'state', 'action'
    and 'reward' entries are present in info, and finally pauses for 0.1s

    the end-of-game lines are printed right after the reward, and only when
    info holds a 'reward' together with a truthy 'done'; a positive reward
    is reported as a win, anything else as a loss
    '''
    clear()
    env.render()

    labelled_keys = (('state', "state:"), ('action', "action:"), ('reward', "reward:"))
    for key, label in labelled_keys:
        if key not in info:
            continue
        print(label, info[key])

    # the outcome depends on the reward, so it is skipped when no reward was given
    finished = 'reward' in info and 'done' in info and info['done']
    if finished:
        print("game ended")
        if info['reward'] > 0:
            print("you won! :)")
        else:
            print("you lost! :(")

    sleep(0.1)

In [0]:
class Agent:
  '''
  tabular Q-learning agent: keeps one row of action values per visited state
  '''

  def __init__(self, nA):
    '''
    agent's constructor

    nA: number of actions, i.e. the length of every row of the Q table
    '''
    self.nA = nA
    # a state looked up for the first time gets an all-zero row
    self.Q = defaultdict(lambda: np.zeros(self.nA))
    # total reward of each episode of the latest train() call
    self.episode_rewards = []

  def train(self, env, num_episodes, alpha, gamma, epsilon):
    '''
    trains the agent over a fixed number of episodes using Q-learning

    env: environment whose reset() returns the initial state and whose
         step(action) returns a (next_state, reward, done, info) tuple
    num_episodes: number of episodes to run
    alpha: step size of the Q update
    gamma: discount applied to the best value of the next state
    epsilon: per-episode increment of the probability handed to
             select_epsilon_greedy_action; episode i uses
             min(epsilon * i, epsilon_max)

    returns nothing, so a notebook cell ending with this call stays quiet;
    the per-episode total rewards are kept in self.episode_rewards
    '''
    rewards = []
    # same list object: results gathered so far stay readable if the run is interrupted
    self.episode_rewards = rewards
    max_reward = -math.inf

    for i_episode in range(1, num_episodes+1):
      state = env.reset()
      done = False
      tot_reward = 0
      # ramp up with the episode number, capped at epsilon_max
      # (min, not max: max would start at the cap and then grow past 1)
      eps = min(epsilon * i_episode, epsilon_max)

      while not done:
        action = select_epsilon_greedy_action(self.Q[state], self.nA, eps)
        next_state, reward, done, _ = env.step(action)
        tot_reward += reward

        # Q-learning update rule: move Q(s, a) towards
        # reward + gamma * max over a' of Q(s', a')
        td_target = reward + gamma * np.max(self.Q[next_state])
        self.Q[state][action] += alpha * (td_target - self.Q[state][action])
        state = next_state

      if (i_episode % 100 == 0):
        # max_reward is updated below, so it does not yet include this episode
        print("episode:", i_episode, "reward:", tot_reward, ", best reward so far:", max_reward)
      rewards.append(tot_reward)
      max_reward = max(max_reward, tot_reward)
      clear()

  def play(self, env, showDisplay):
    '''
    plays one episode, always taking the greedy action of the current Q table

    env: environment with the same reset()/step() contract used by train()
    showDisplay: when truthy, every step is shown through render()

    returns the total reward collected over the episode
    '''
    state = env.reset()
    done = False
    tot_reward = 0

    while not done:
      action = select_greedy_action(self.Q[state])
      state, reward, done, _ = env.step(action)
      tot_reward += reward

      # state is already the state reached by the action
      info = {'state': state, 'action': action, 'reward': reward, 'done': done}

      if (showDisplay):
        render(env, info)

    return tot_reward

In [38]:
# build the environment and an agent with one Q-table column per action
# NOTE(review): newer gym releases may only register this task as 'Taxi-v3' - confirm against the installed version
env = gym.make('Taxi-v2')
agent = Agent(nA=env.action_space.n)

# progress is printed every 100 episodes
agent.train(env, num_episodes=20000, alpha=0.1, gamma=0.9, epsilon=0.1)

episode: 100 reward: -317 , best reward so far: 1
episode: 200 reward: -85 , best reward so far: 11
episode: 300 reward: -107 , best reward so far: 12
episode: 400 reward: 9 , best reward so far: 12
episode: 500 reward: -55 , best reward so far: 15
episode: 600 reward: -60 , best reward so far: 15
episode: 700 reward: 5 , best reward so far: 15
episode: 800 reward: 9 , best reward so far: 15
episode: 900 reward: 11 , best reward so far: 15
episode: 1000 reward: -118 , best reward so far: 15
episode: 1100 reward: 12 , best reward so far: 15
episode: 1200 reward: 6 , best reward so far: 15
episode: 1300 reward: 7 , best reward so far: 15
episode: 1400 reward: 12 , best reward so far: 15
episode: 1500 reward: 14 , best reward so far: 15
episode: 1600 reward: 8 , best reward so far: 15
episode: 1700 reward: 3 , best reward so far: 15
episode: 1800 reward: 12 , best reward so far: 15
episode: 1900 reward: 7 , best reward so far: 15
episode: 2000 reward: 8 , best reward so far: 15
episode: 2

In [0]:
# play one greedy episode, rendering every step
agent.play(env, showDisplay=True)

+---------+
|[34;1m[43mR[0m[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
state: 1
action: 1
reward: -1
+---------+
|[42mR[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Pickup)
state: 17
action: 4
reward: -1
+---------+
|R: | : :[35mG[0m|
|[42m_[0m: : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (South)
state: 117
action: 0
reward: -1
+---------+
|R: | : :[35mG[0m|
| :[42m_[0m: : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
state: 137
action: 2
reward: -1
+---------+
|R: | : :[35mG[0m|
| : :[42m_[0m: : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
state: 157
action: 2
reward: -1
+---------+
|R: | : :[35mG[0m|
| : : :[42m_[0m: |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
state: 177
action: 2
reward: -1
+---------+
|R: | :[42m_[0m:[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
state: 77
action: 