# Q* Learning with Gym Taxi-v2 Environment

## Import the dependencies

* NumPy - a library for numerical arrays and linear algebra
* Gym - a library of reinforcement-learning environments
* random - a standard-library module for generating random numbers
* Matplotlib - a plotting library

In [1]:
import numpy as np
import gym
import random
import matplotlib.pyplot as plt
%matplotlib inline
from collections import deque

## Create our Agent

In [2]:
class Agent():
    """Tabular Q-learning agent with epsilon-greedy exploration.

    The agent keeps one Q-value per (state, action) pair, explores with
    probability ``epsilon`` (decayed after every update), and uses a
    learning rate that depends on the current episode number.
    """

    def __init__(self, env):
        """Build the Q-table and set the learning hyper-parameters.

        Args:
            env: Environment exposing ``observation_space.n``,
                ``action_space.n`` and ``action_space.sample()``.
        """
        # Remember the action space so exploration samples from the
        # environment this agent was built for rather than a global ``env``.
        self.action_space = env.action_space

        # Create the Q-table and initialize it with zeros
        self.qtable = np.zeros((env.observation_space.n, env.action_space.n))

        # Learning-rate schedule indexed by episode: rises linearly from
        # 0.1 to 0.8 over 200000 entries.
        self.lr_interval = np.linspace(0.1, 0.8, 200000)

        # Discount factor: shrinks after every non-terminal step and is
        # restored to its starting value when an episode ends.
        self.gamma_start = 0.99
        self.gamma = self.gamma_start
        self.gamma_decay = 0.998

        # Exploration probability and its per-update multiplicative decay.
        self.epsilon = 1
        self.epsilon_decay = 0.99995

    def select_action(self, state):
        """Choose an action for ``state`` with an epsilon-greedy rule.

        Args:
            state: Row index of the current state in the Q-table.

        Returns:
            The greedy action (arg-max of the state's Q-values) when a
            uniform draw exceeds ``epsilon``; otherwise a random action
            sampled from the environment's action space.
        """
        if random.uniform(0, 1) > self.epsilon:
            return np.argmax(self.qtable[state, :])
        return self.action_space.sample()

    def fit(self, episode, action, state, new_state, reward, done):
        """Apply one Q-learning update and decay the exploration settings.

        Args:
            episode: Zero-based episode number; selects the learning rate.
                Values past the end of the schedule use its final entry.
            action: Action taken in ``state``.
            state: State index before the step.
            new_state: State index after the step.
            reward: Reward received for the step.
            done: Whether the step ended the episode.
        """
        # Clamp so training longer than the schedule keeps the final
        # learning rate instead of raising IndexError.
        last_index = len(self.lr_interval) - 1
        self.lr = self.lr_interval[min(episode, last_index)]

        # Move Q(s, a) towards reward + gamma * max_a' Q(s', a').
        td_target = reward + self.gamma * np.max(self.qtable[new_state, :])
        self.qtable[state, action] = self.qtable[state, action] + self.lr * (
            td_target - self.qtable[state, action])

        self.epsilon *= self.epsilon_decay

        if done:
            self.gamma = self.gamma_start
        else:
            self.gamma *= self.gamma_decay
        

        
        

## Create the training function

In [3]:
def training(env, agent, number_episodes):
    """Run the agent in the environment and track its best average reward.

    Each episode is played until the environment reports ``done``; after
    every step the agent is updated through ``agent.fit``. The best mean
    reward over a sliding window of the last 100 episodes is tracked, and a
    progress line is printed whenever the episode index is a positive
    multiple of 20000.

    Args:
        env: Environment whose ``reset()`` returns the initial state and
            whose ``step(action)`` returns
            ``(new_state, reward, done, info)``.
        agent: Object providing ``select_action(state)`` and
            ``fit(episode, action, state, new_state, reward, done)``.
        number_episodes: Number of episodes to run.

    Returns:
        The best 100-episode average reward seen, or ``-inf`` when fewer
        than 100 episodes were run.
    """
    rewards = deque(maxlen=100)
    best_avg_reward = float("-inf")

    for episode in range(number_episodes):
        state = env.reset()
        episode_reward = 0
        done = False

        while not done:
            action = agent.select_action(state)
            new_state, reward, done, _ = env.step(action)
            agent.fit(episode, action, state, new_state, reward, done)

            state = new_state
            episode_reward += reward

        rewards.append(episode_reward)

        # Only score complete 100-episode windows so early averages over a
        # handful of episodes cannot become the "best".
        if len(rewards) == rewards.maxlen:
            best_avg_reward = max(best_avg_reward, float(np.mean(rewards)))

        if episode > 0 and episode % 20000 == 0:
            print("Best average reward is ", best_avg_reward)

    return best_avg_reward
            
        
        
    
    

## Create the Taxi environment

* env.action_space.n - the number of actions our environment provides
* env.observation_space.n - the number of states our environment provides

In [4]:
# Build the environment registered in Gym under the id 'Taxi-v2'.
# NOTE(review): newer Gym releases may no longer register this id — confirm
# against the installed version.
env = gym.make('Taxi-v2')

In [5]:
# Sizes of the discrete state and action spaces reported by the environment.
state_size = env.observation_space.n
action_size = env.action_space.n

print(f"Action size = {action_size}")
print(f"State size = {state_size}")

Action size = 6
State size = 500


## Train the agent on the selected environment

In [6]:
# 20001 episodes so that episode index 20000 is reached and the progress
# line is printed once.
agent = Agent(env=env)
training(env=env, agent=agent, number_episodes=20001)


Best average reward is  9.61
