## Tutorial: Displaying gym Game in Jupyter Notebook

ref: http://mckinziebrandon.me/TensorflowNotebooks/2016/12/21/openai.html

I

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display

In [3]:
def display_frames_as_gif(frames, interval=50):
    """
    Displays a list of frames as a gif, with controls

    Parameters
    ----------
    frames : sequence of images
        Frames accepted by ``plt.imshow`` (for example the arrays collected
        from ``env.render(mode='rgb_array')``). They are shown in order.
    interval : int, optional
        Value forwarded to ``FuncAnimation(interval=...)``, i.e. the delay
        between frames. Defaults to 50, the value previously hard-coded.

    Returns
    -------
    None
        The animation is rendered through ``IPython.display.display``.
        Nothing is drawn when ``frames`` is empty.
    """
    # ``len(...) == 0`` instead of ``not frames`` so that a numpy array of
    # frames does not trigger an ambiguous truth-value error.
    if len(frames) == 0:
        return

    # The first frame creates the image artist; later frames only swap its data.
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(
        plt.gcf(), animate, frames=len(frames), interval=interval
    )
    display(display_animation(anim, default_mode='loop'))

In [4]:
env = gym.make('CartPole-v0')

# Per-episode bookkeeping. NOTE(review): cum_reward is initialised here but is
# never updated anywhere in this cell.
frames = []
cum_reward = 0
observation = env.reset()

# Play at most 50 steps with uniformly sampled actions, recording one rendered
# frame per step *before* the action is applied.
for t in range(50):
    frames.append(env.render(mode='rgb_array'))
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    if not done:
        continue
    print("Finished at step %d" % t)
    break

# presumably closes the render window -- confirm against the installed gym version
env.render(close=True)
# display_frames_as_gif(frames)

[2017-11-14 23:20:32,122] Making new env: CartPole-v0


Finished at step 12


## Training a Q Learning Network

ref: https://keon.io/deep-q-learning/

We define the loss function as follows:

$$loss=\left(r+\gamma \max_{a'}\hat{Q}(s',a') - Q(s,a)\right)^2$$

Where $r$ is the reward, $\gamma$ is the discount (decay) rate, and $s'$ is the next state reached after taking action $a$ in state $s$. The term $r+\gamma \max_{a'}\hat{Q}(s',a')$ is the **target**, and the term $Q(s,a)$ is the **prediction**. In this simple problem, the Q function is a mapping:

`Q: (state, action) => reward`

In python, we calculate the target as:

```python
target = reward + gamma * np.max(model.predict(next_state))
```

### The Model

The following model is copied from the blog post. In my view this is **not the best** approach, because it models the Q function as:

`Q: state => [reward(action1), reward(action2)]`

It is sub-optimal because only one of `reward(action1)` or `reward(action2)` can be observed at one time. Nonetheless, it provides us a good starting point.


### How the Agent Decides to Act

At first the agent selects its action randomly with a certain probability, called the **exploration rate** or **epsilon**. This is because, early on, it is better for the agent to try all kinds of things before it starts to see the patterns. When it is not choosing the action randomly, the agent predicts the reward value of each action from the current state and picks the action with the highest predicted reward.

In [3]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

import random
from collections import deque

class DQNAgent:
    """Q-learning agent backed by a small fully-connected Keras network.

    The network maps a state to one predicted value per action. Transitions
    are stored in a bounded replay memory and re-sampled by ``replay`` to fit
    the network. Actions are chosen epsilon-greedily by ``act``.
    """

    def __init__(self, state_size, action_size, memory_size=200, gamma=0.95,
                 epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995,
                 learning_rate=0.001):
        """Store the hyper-parameters and build the network.

        Args:
            state_size: number of inputs of the network.
            action_size: number of outputs of the network (one per action).
            memory_size: maximum number of transitions kept in memory; the
                oldest ones are dropped first.
            gamma: discount rate applied to the best next-state value.
            epsilon: initial exploration rate.
            epsilon_min: once epsilon is at or below this value it stops
                being decayed.
            epsilon_decay: factor epsilon is multiplied by after each replay.
            learning_rate: learning rate handed to the Adam optimizer.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma  # discount rate
        self.epsilon = epsilon  # exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.model = self._build_model()

    def _build_model(self):
        """Return a compiled network: two 24-unit ReLU layers, linear output."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest predicted value for
        ``state`` is returned.
        """
        # Strict '<' so that epsilon == 0.0 really disables exploration:
        # np.random.rand() samples from [0, 1) and may return exactly 0.0.
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Fit the network on a random sample of remembered transitions.

        At most ``batch_size`` transitions are drawn; if fewer are stored,
        all of them are used. When nothing can be sampled the call is a
        no-op (epsilon is left untouched). After a replay epsilon is decayed
        once while it is above ``epsilon_min``.
        """
        # Clamp so random.sample never raises on a memory smaller than the batch.
        sample_size = min(batch_size, len(self.memory))
        if sample_size <= 0:
            return
        minibatch = random.sample(self.memory, sample_size)
        for state, action, reward, next_state, done in minibatch:
            # Terminal transitions have no future value to bootstrap from.
            target = reward
            if not done:
                next_q = self.model.predict(next_state)[0]
                target = reward + self.gamma * np.amax(next_q)
            # Only the taken action's output is moved towards the target;
            # the other outputs keep the network's own current prediction.
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)

In [None]:
# Do not run this cell in the notebook: the model is trained from the terminal
# and the trained weights are read back from the saved 'CartPole.model' file.
batch_size = 32
episodes = 5000
done = False

env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]  # 4
action_size = env.action_space.n  # 2
agent = DQNAgent(state_size, action_size)

for e in range(1, episodes + 1):
    # The agent works on states shaped (1, state_size).
    state = np.reshape(env.reset(), [1, state_size])
    for time in range(500):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        # A step that ends the episode is stored with a reward of -10.
        if done:
            reward = -10
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, episodes, time, agent.epsilon))
            break
    # One replay pass per episode, once more than a batch has been collected.
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)

agent.save('CartPole.model')

Now, let's play the game using the trained model.

In [None]:
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]  # 4
action_size = env.action_space.n  # 2
agent = DQNAgent(state_size, action_size)

# Restore the weights written by the training run.
agent.load('CartPole.model')

# agent.epsilon = 1.0  # use random actions
agent.epsilon = 0.0  # disable random actions

# Reset the environment exactly once. The original cell reset it twice in a
# row and discarded the first result; both names stay bound for later cells.
state = env.reset()
observation = state
frames = []
for t in range(5000):
    # Record the frame before acting.
    frames.append(env.render(mode='rgb_array'))
    # The agent expects a (1, state_size) input, as in the training loop.
    action = agent.act(np.reshape(state, [1, state_size]))
    print(action)
    state, reward, done, info = env.step(action)
    if done:
        print("Finished at step %d" % t)
        break
# presumably closes the render window -- confirm against the installed gym version
env.render(close=True)