[Credit to kaggle user palaksood97](https://www.kaggle.com/palaksood97/cartpole-dqn#Define-agent)

My first go at using the OpenAI Gym and writing a DQN

## Initialize gym environment

In [129]:
import random
import os
from collections import deque

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

import gym

In [46]:
def query_environment(name):
  env = gym.make(name)
  spec = gym.spec(name)
  print(f"Action Space: {env.action_space}")
  print(f"Observation Space: {env.observation_space}")
  print(f"Max Episode Steps: {spec.max_episode_steps}")
  print(f"Nondeterministic: {spec.nondeterministic}")
  print(f"Reward Range: {env.reward_range}")
  print(f"Reward Threshold: {spec.reward_threshold}")

In [47]:
query_environment('Taxi-v3')

Action Space: Discrete(6)
Observation Space: Discrete(500)
Max Episode Steps: 200
Nondeterministic: False
Reward Range: (-inf, inf)
Reward Threshold: 8


In [23]:
env = gym.make('Taxi-v3')

## Random agent to practice interacting with the environment

In [38]:
print("*****Random Agent*****")
for i_episode in range (2):
  state = env.reset()
  total_reward = 0
  for t in range(5):
    # env.render()
    action = env.action_space.sample() # choose random action: left or right (0 or 1)
    state, reward, done, info = env.step(action)
    print(state, reward, action)
    total_reward += reward 
    if done:
      print(f"Episode finished after {t} timesteps with total reward {total_reward}")
      break
  env.close()

*****Random Agent*****
303 -1 1
403 -1 0
403 -10 5
403 -1 3
403 -10 4
488 -1 2
388 -1 1
288 -1 1
288 -10 5
188 -1 1


## DQN Agent

In [172]:
class DQNAgent:

  """
    action_space:     action space of the gym environment
    gamma:            discount/decay/gamme
    epsilon:          exploration rate
    epsilon_decay:    exploration rate decay
    epsilon_min:      minimum exploration rate
    history_len:      how much past data to store for re-training
    batch_size:       batch size for training NN
    model:            a keras model defining the NN
  """
  def __init__(self, model, action_space, gamma=.95, epsilon=1, epsilon_decay=.995, epsilon_min=.1, history_len=2000, batch_size=32):
    self.action_space = action_space
    self.gamma = gamma
    self.epsilon = epsilon
    self.epsilon_decay = epsilon_decay
    self.epsilon_min = epsilon_min
    self.model = model
    self.memory = deque(maxlen=history_len)
    self.batch_size = batch_size

  ## remember a state for later re-training
  def remember(self, state, action, reward, next_state, done):
    self.memory.append((state, action, reward, next_state, done))

  # get next action
  def get_action(self, state):
    # epsilon greedy exploration
    if np.random.rand() <= self.epsilon:
      # explore
      return action_space.sample()
    actions = self.model.predict((state,1))
    return np.argmax(actions[0])

  # training loop
  def train(self):
    batch = random.sample(self.memory, self.batch_size)
    for state, action, reward, next_state, done in batch:
      target = reward
      if not done:
        target = (reward + self.gamma * np.amax(self.model.predict((next_state,1))[0]))
      target_f = self.model.predict((state,1))
      target_f[0][action] = target
      self.model.fit(np.array([state, 1]), target_f, epochs=1, verbose=0)
      if self.epsilon > self.epsilon_min:
        self.epsilon *= self.epsilon_decay

  # load model weights
  def load(self, name):
    self.model.load_weights(name)

  # save model weights
  def save(self, name):
    self.model.save_weights(name)

In [122]:
def basic_model(action_space, observation_space, learning_rate):
  model = Sequential()
  model.add(Dense(24, input_shape=(1,), activation='relu'))
  model.add(Dense(24, activation='relu'))
  model.add(Dense(action_space.n, activation='linear'))
  model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
  return model

## Train the agent(s)

In [123]:
observation_space = env.observation_space
action_space = env.action_space

In [124]:
model = basic_model(action_space, observation_space, .001)

In [131]:
model.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_36 (Dense)             (None, 24)                48        
_________________________________________________________________
dense_37 (Dense)             (None, 24)                600       
_________________________________________________________________
dense_38 (Dense)             (None, 6)                 150       
Total params: 798
Trainable params: 798
Non-trainable params: 0
_________________________________________________________________


In [173]:
agent = DQNAgent(model, action_space)

In [127]:
num_episodes = 1000

In [None]:
done = False
for episode in range(num_episodes):
  state = env.reset()
  total_reward = 0
  for time in range(5000):
    action = agent.get_action(state)
    next_state, reward, done, _ = env.step(action)
    total_reward += reward
    agent.remember(state, action, reward, next_state, done)
    state = next_state
    if done:
      print(f"episode {episode}/{num_episodes} finished with score {reward}")
      break;
    if len(agent.memory) >= agent.batch_size:
      agent.train()

episode 0/1000 finished with score -10
episode 1/1000 finished with score -1
episode 2/1000 finished with score -1
episode 3/1000 finished with score -1
episode 4/1000 finished with score -1
episode 5/1000 finished with score -1
episode 6/1000 finished with score -1
episode 7/1000 finished with score -1
episode 8/1000 finished with score -1
episode 9/1000 finished with score -1
episode 10/1000 finished with score -1
