**Human-level control through Deep Reinforcement Learning**

https://deepmind.com/research/dqn/

https://deepmind.com/blog/deep-reinforcement-learning/

**Playing Atari with Deep Reinforcement Learning**

https://arxiv.org/abs/1312.5602
    
**Demystifying Deep Reinforcement Learning**

https://www.nervanasys.com/demystifying-deep-reinforcement-learning/

**Let’s make a DQN**

https://jaromiru.com/category/dqn/

**CartPole-v0**

https://gym.openai.com/envs/CartPole-v0

In [1]:
from bokeh.io import output_notebook, push_notebook, show
from bokeh.charts import HeatMap
from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource, DataRange1d, HoverTool
from bokeh.palettes import Set1
from bokeh.plotting import figure

output_notebook()

The bokeh.charts API has moved to a separate 'bkcharts' package.

This compatibility shim will remain until Bokeh 1.0 is released.
After that, if you want to use this API you will have to install
the bkcharts package explicitly.

  warn(message)


In [2]:
from collections import deque, namedtuple
from datetime import timedelta
from time import time

import math
import random

import numpy as np
import tensorflow as tf

import gym

env = gym.make('CartPole-v0')

state_size = env.observation_space.shape[0] # 4
action_size = env.action_space.n # 2

[2017-06-16 12:24:21,970] Making new env: CartPole-v0


## Agent execution + Random Agent

In [3]:
RunState = namedtuple('RunState', ['i', 't', 'mean_t', 'min_t', 'max_t', 'R'])
Memento = namedtuple('Memento', ['state', 'action', 'reward', 'state_after'])

class NoMemory:
    def add(self, data):
        pass
    def sample(self, n):
        return []
    
def gym_run(agent_func,
            memory=NoMemory(),
            num_episodes=20,
            max_steps=500,
            report_buffer=1,
            report_func=lambda r: None):
    results = []
    start = time()
    n_t, mean_t, min_t, max_t = 0, 0, max_steps, 0
    terminal_state = None
    for i in range(1, num_episodes+1):
        done = False
        t = 0
        R = 0
        s = env.reset()
        while not done and t < max_steps:
            a = agent_func(s, memory)
            s_, r, done, _ = env.step(a)
            if done: s_ = terminal_state
            memory.add(Memento(s, a, r, s_))
            t += 1
            R += r
            s = s_
        
        n_t += 1
        mean_t = (n_t - 1) / n_t * mean_t + t / n_t
        min_t = min(min_t, t)
        max_t = max(max_t, t)
        results.append(RunState(i, t, mean_t, min_t, max_t, R))
        
        if report_buffer > 0 and i % report_buffer == 0:
            report_func(results)
            results.clear()

            duration = timedelta(seconds=time() - start)
            start = time()
            print('Episode {} last for {} steps, mean steps {:.2f} [{}, {}], time {}' \
                  .format(i, t, mean_t, min_t, max_t, duration))
            if report_buffer != 1:
                n_t, mean_t, min_t, max_t = 0, 0, max_steps, 0

def random_agent(s, memory=None):
    return random.randint(0, action_size-1)

gym_run(random_agent)

Episode 1 last for 31 steps, mean steps 31.00 [31, 31], time 0:00:00.000757
Episode 2 last for 40 steps, mean steps 35.50 [31, 40], time 0:00:00.001075
Episode 3 last for 18 steps, mean steps 29.67 [18, 40], time 0:00:00.000577
Episode 4 last for 9 steps, mean steps 24.50 [9, 40], time 0:00:00.000328
Episode 5 last for 27 steps, mean steps 25.00 [9, 40], time 0:00:00.000675
Episode 6 last for 24 steps, mean steps 24.83 [9, 40], time 0:00:00.000639
Episode 7 last for 20 steps, mean steps 24.14 [9, 40], time 0:00:00.000531
Episode 8 last for 21 steps, mean steps 23.75 [9, 40], time 0:00:00.000675
Episode 9 last for 11 steps, mean steps 22.33 [9, 40], time 0:00:00.000351
Episode 10 last for 21 steps, mean steps 22.20 [9, 40], time 0:00:00.000565
Episode 11 last for 37 steps, mean steps 23.55 [9, 40], time 0:00:00.000927
Episode 12 last for 65 steps, mean steps 27.00 [9, 65], time 0:00:00.001588
Episode 13 last for 31 steps, mean steps 27.31 [9, 65], time 0:00:00.001119
Episode 14 last for

In [4]:
RunPlot = namedtuple('RunPlot', ['handle', 'steps_source', 'rewards_source'])

def plot_run():
    steps_data = ColumnDataSource(data=dict(episodes=[], t=[], mean_t=[], min_t=[], max_t=[]))
    rewards_data = ColumnDataSource(data=dict(episodes=[], r=[]))

    steps_hover = HoverTool(
        tooltips=[
            ('episode', '@episodes'),
            ('steps', '@t'),
            ('mean', '@mean_t'),
            ('min', '@min_t'),
            ('max', '@max_t'),
        ]
    )
    
    plot_steps = figure(width=475, height=300, tools=[steps_hover], logo=None, toolbar_location=None, 
                        title='Steps', x_axis_label='episodes', y_axis_label='T',
                        x_range=DataRange1d(follow='end', follow_interval=100, range_padding=0))
    plot_steps.line('episodes', 't', color='yellow', source=steps_data)
    plot_steps.line('episodes', 'mean_t', color='green', source=steps_data)
    plot_steps.line('episodes', 'min_t', color='orange', source=steps_data)
    plot_steps.line('episodes', 'max_t', color='red', source=steps_data)

    rewards_hover = HoverTool(
        tooltips=[
            ('episode', '@episodes'),
            ('reward', '@r'),
        ]
    )
    
    plot_rewards = figure(width=475, height=300, tools=[rewards_hover], logo=None, toolbar_location=None,
                          title='Rewards', x_axis_label='episodes', y_axis_label='R',
                          x_range=DataRange1d(range_padding=0))
    plot_rewards.line('episodes', 'r', color='blue', source=rewards_data)
    handle = show(row(plot_steps, plot_rewards), notebook_handle=True)
    return RunPlot(handle, steps_data, rewards_data)

plot = plot_run()

In [5]:
def report_plot(plot, results):
    episodes, t, mean_t, min_t, max_t, r = zip(*results)
    plot.steps_source.stream(dict(episodes=episodes, t=t, mean_t=mean_t, min_t=min_t, max_t=max_t), rollover=1000)
    plot.rewards_source.stream(dict(episodes=episodes, r=r))
    push_notebook(handle=plot.handle)

def plot_adapter(plot):
    return lambda r: report_plot(plot, r)

gym_run(random_agent, num_episodes=200, report_buffer=10,report_func=plot_adapter(plot))

Episode 10 last for 16 steps, mean steps 21.80 [12, 53], time 0:00:00.016969
Episode 20 last for 22 steps, mean steps 20.40 [10, 40], time 0:00:00.016956
Episode 30 last for 13 steps, mean steps 19.20 [12, 44], time 0:00:00.016571
Episode 40 last for 35 steps, mean steps 23.30 [14, 35], time 0:00:00.018060
Episode 50 last for 12 steps, mean steps 17.00 [12, 25], time 0:00:00.017321
Episode 60 last for 17 steps, mean steps 22.50 [9, 50], time 0:00:00.021365
Episode 70 last for 37 steps, mean steps 20.60 [10, 37], time 0:00:00.020692
Episode 80 last for 10 steps, mean steps 17.80 [9, 33], time 0:00:00.017838
Episode 90 last for 45 steps, mean steps 23.10 [15, 45], time 0:00:00.022105
Episode 100 last for 26 steps, mean steps 18.60 [11, 28], time 0:00:00.019969
Episode 110 last for 33 steps, mean steps 20.90 [11, 33], time 0:00:00.024781
Episode 120 last for 18 steps, mean steps 19.80 [11, 36], time 0:00:00.018147
Episode 130 last for 18 steps, mean steps 18.10 [11, 31], time 0:00:00.0173

In [6]:
def run_agent(*args, **kargs):
    plot = plot_run()
    gym_run(*args, **kargs, report_func=plot_adapter(plot))

In [7]:
run_agent(random_agent)

Episode 1 last for 18 steps, mean steps 18.00 [18, 18], time 0:00:00.029387
Episode 2 last for 22 steps, mean steps 20.00 [18, 22], time 0:00:00.026224
Episode 3 last for 22 steps, mean steps 20.67 [18, 22], time 0:00:00.028620
Episode 4 last for 10 steps, mean steps 18.00 [10, 22], time 0:00:00.028119
Episode 5 last for 13 steps, mean steps 17.00 [10, 22], time 0:00:00.023974
Episode 6 last for 19 steps, mean steps 17.33 [10, 22], time 0:00:00.021445
Episode 7 last for 41 steps, mean steps 20.71 [10, 41], time 0:00:00.021427
Episode 8 last for 27 steps, mean steps 21.50 [10, 41], time 0:00:00.023700
Episode 9 last for 20 steps, mean steps 21.33 [10, 41], time 0:00:00.031049
Episode 10 last for 25 steps, mean steps 21.70 [10, 41], time 0:00:00.022274
Episode 11 last for 12 steps, mean steps 20.82 [10, 41], time 0:00:00.026047
Episode 12 last for 12 steps, mean steps 20.08 [10, 41], time 0:00:00.026582
Episode 13 last for 21 steps, mean steps 20.15 [10, 41], time 0:00:00.024104
Episode 

## Simple DQN Agent

* $\varepsilon$-greedy
* Uniform Experience Replay
* Single Q Network

(missing target network, error clipping)

**This algorithm is Unstable**

### $\varepsilon$-greedy

$\varepsilon = \varepsilon_{min} + (\varepsilon_{max} - \varepsilon_{min}) e^{-\lambda t}$

The $\lambda$ parameter controls the speed of decay. This way we start with a policy that explores greatly and behaves more and more greedily over time.

In [8]:
class EpsilonGreedy:
    
    def __init__(self, eps_min=0.01, eps_max=1.0, decay=0.001):
        self.eps_min = eps_min
        self.eps_max = eps_max
        self.decay = decay
        self.steps = 0

    def epsilon(self):
        return self.eps_min \
        + (self.eps_max - self.eps_min) \
        * math.exp(-self.decay * self.steps)
    
    def explore(self):
        return random.random() < self.epsilon()
    
    def __iter__(self):
        return self
    
    def __next__(self):
        self.steps += 1
        return self.explore()

eps = EpsilonGreedy()
for _ in range(5):
    explore = next(eps)
    print('{:.6f}, explore={}'.format(eps.epsilon(), explore))

0.999010, explore=True
0.998022, explore=True
0.997034, explore=True
0.996048, explore=True
0.995062, explore=True


In [9]:
steps = list(range(1, 10000))
eps = []
exp = []
eps_ = EpsilonGreedy()
exp_values = ['exploit', 'explore']
for _ in steps:
    next(eps_)
    eps.append(eps_.epsilon())
    exp.append(exp_values[int(eps_.explore())])

eps_v = figure(width=900, height=300, tools='hover', logo=None,
              title='Epsilon', x_axis_label='steps', y_axis_label='eps')
eps_v.line(steps, eps)
hmap = HeatMap(dict(steps=steps, exp=exp), x='steps', y='exp',
               width=900, height=200, tools=None, legend=False, toolbar_location=None)
hmap.x_range = eps_v.x_range

show(column(eps_v, hmap))

### Uniform Memory (Experience Replay)

In [10]:
memory_size = 100000

class UniformMemory:
    
    def __init__(self, size=memory_size):
        self.data = deque(maxlen=size)

    def __del__(self):
        self.data.clear()

    def add(self, data):
        self.data.append(data)

    def sample(self, n):
        n = min(n, len(self.data))
        return random.sample(self.data, n)

memory = UniformMemory()
memory.add(Memento(1,2,3,4))
memory.add(Memento(5,6,7,8))
memory.add(Memento(9,10,11,12))
memory.add(Memento(13,14,15,16))

print(memory.sample(2))
print(memory.sample(2))

del memory

[Memento(state=5, action=6, reward=7, state_after=8), Memento(state=13, action=14, reward=15, state_after=16)]
[Memento(state=1, action=2, reward=3, state_after=4), Memento(state=5, action=6, reward=7, state_after=8)]


### Q Network

The *Q Network* class encapsulates the neural network. Our problem is simple enough so we will use only one hidden layer of 64 neurons, with ReLU activation function. The final layer will consist of only two neurons, one for each available action. Their activation function will be linear. Remember that we are trying to approximate the Q function, which in essence can be of any real value. Therefore we can’t restrict the output from the network and the linear activation works well.

In [11]:
class QNetwork(namedtuple('QNetwork', ['x', 'y', 'y_hat', 'v_hat', 'pi_hat', 'loss', 'train_op'])):
    __slots__ = ()
    
    def train(self, session, X, Y):
        return session.run([self.train_op, self.loss], feed_dict={self.x: X, self.y: Y})[1]
    
    def predict_Q(self, session, X):
        return session.run(self.y_hat, feed_dict={self.x: X})
    
    def predict_V(self, session, X):
        return session.run(self.v_hat, feed_dict={self.x: X})
    
    def predict_action(self, session, s):
        X = s.reshape((1, state_size))
        return session.run(self.pi_hat, feed_dict={self.x: X})[0]

def simple_q(input_size=state_size, output_size=action_size, hidden_size=64, learning_rate=0.00025):
    x = tf.placeholder(tf.float32, shape=[None, input_size])
    y = tf.placeholder(tf.float32, shape=[None, output_size])

    W_h = tf.Variable(tf.random_uniform([input_size, hidden_size], -1.0, 1.0))
    b_h = tf.Variable(tf.zeros([hidden_size]))

    hidden = tf.nn.relu(tf.matmul(x, W_h) + b_h)

    W_o = tf.Variable(tf.random_uniform([hidden_size, output_size], -1.0, 1.0))
    b_o = tf.Variable(tf.zeros([output_size]))

    y_hat = tf.matmul(hidden, W_o) + b_o

    v_hat = tf.reduce_max(y_hat, 1)
    pi_hat = tf.argmax(y_hat, 1)

    loss = tf.reduce_mean(tf.square(y_hat - y))
    opt = tf.train.RMSPropOptimizer(learning_rate)
    train_op = opt.minimize(loss)

    return QNetwork(x, y, y_hat, v_hat, pi_hat, loss, train_op)

graph = tf.Graph()

with graph.as_default():
    Q_net = simple_q()
    init_op = tf.global_variables_initializer()

session = tf.Session(graph=graph)
session.run(init_op)

In [12]:
s = np.array([-0.04288668, -0.0078879 , -0.01835355, -0.03561641])
s

array([-0.04288668, -0.0078879 , -0.01835355, -0.03561641])

In [13]:
Q_net.predict_Q(session, s.reshape((1, state_size)))

array([[ 0.28950709,  0.04591805]], dtype=float32)

In [14]:
Q_net.predict_V(session, s.reshape((1, state_size)))

array([ 0.28950709], dtype=float32)

In [15]:
Q_net.predict_action(session, s)

0

In [16]:
session.run(init_op)
Q_net.predict_Q(session, s.reshape((1, state_size)))

array([[-0.05019628,  0.03127588]], dtype=float32)

### Training

$Q(s, a) \xrightarrow{} r + \gamma max_{a'}{Q(s', a')}$

This formula means that for a sample $(s, r, a, s')$ we will update the network’s weights so that its output is closer to the target.

In [17]:
gamma = 0.99
batch_size = 5

memory = UniformMemory()
gym_run(random_agent, memory, report_buffer=0)
replay_sample = memory.sample(batch_size)
replay_sample

[Memento(state=array([ 0.00463897, -0.15833604,  0.01647166,  0.38461789]), action=1, reward=1.0, state_after=array([ 0.00147225,  0.03654823,  0.02416402,  0.09717358])),
 Memento(state=array([-0.05571164,  0.0290116 ,  0.12100379,  0.29174747]), action=1, reward=1.0, state_after=array([-0.05513141,  0.22221911,  0.12683874,  0.03954368])),
 Memento(state=array([-0.00213451, -0.38078348,  0.05107367,  0.62041654]), action=1, reward=1.0, state_after=array([-0.00975018, -0.18641063,  0.063482  ,  0.34424657])),
 Memento(state=array([ 0.01852824, -0.36374395, -0.12527527,  0.28793194]), action=1, reward=1.0, state_after=array([ 0.01125336, -0.16707887, -0.11951663, -0.04148854])),
 Memento(state=array([ 0.01300753,  0.15043982, -0.02719874, -0.31710095]), action=0, reward=1.0, state_after=array([ 0.01601633, -0.04428439, -0.03354075, -0.03311812]))]

In [18]:
# Q shape (batch_size, action_size)
Q = Q_net.predict_Q(session, np.array([t.state for t in replay_sample]))

print(Q.shape)
print(Q)

(5, 2)
[[ 0.56078172  1.46936822]
 [ 0.81535673  0.96419829]
 [ 0.67394304  2.70820951]
 [ 0.07511951  1.69816005]
 [-0.06407934  0.43001533]]


In [19]:
# max_Q (batch_size, )
terminal_state = np.zeros(state_size)
def make_state(s): return terminal_state if s is None else s

max_Q = Q_net.predict_V(session, np.array([make_state(t.state_after) for t in replay_sample]))

print(max_Q.shape)
print(max_Q)

(5,)
[ 0.36021847  0.21214449  1.55512583  0.61762357  0.18363477]


In [20]:
# X shape [batch_size, state_size]
# Y shape [batch_size, action_size]
# Q(s, a) -> r + gamma * max_a' Q(s', a')
n = len(replay_sample)
X = np.zeros((n, state_size))
Y = np.zeros((n, action_size))
for i, t in enumerate(replay_sample):
    s, a, r, s_ = t
    X[i] = s
    q = Q[i]
    if s_ is None:
        q[a] = r
    else:
        q[a] = r + gamma * max_Q[i]
    Y[i] = q

print(X.shape)
print(X)
print(Y.shape)
print(Y)

(5, 4)
[[ 0.00463897 -0.15833604  0.01647166  0.38461789]
 [-0.05571164  0.0290116   0.12100379  0.29174747]
 [-0.00213451 -0.38078348  0.05107367  0.62041654]
 [ 0.01852824 -0.36374395 -0.12527527  0.28793194]
 [ 0.01300753  0.15043982 -0.02719874 -0.31710095]]
(5, 2)
[[ 0.56078172  1.35661626]
 [ 0.81535673  1.21002305]
 [ 0.67394304  2.53957462]
 [ 0.07511951  1.61144733]
 [ 1.18179846  0.43001533]]


In [21]:
Q_net.train(session, X, Y)

0.16613111

In [22]:
session.close()
del graph

In [23]:
class DQNSimpleGraph:
    
    # self.graph
    # self.Q_net
    # self.init_op
    # self.input_size
    # self.output_size
    
    def __init__(self,
                 input_size=state_size,
                 output_size=action_size,
                 hidden_size=64,
                 learning_rate=0.00025):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.Q_net = self.q_network(input_size,
                                        output_size,
                                        hidden_size,
                                        learning_rate)
            self.init_op = tf.global_variables_initializer()
        
        self.input_size = input_size
        self.output_size = output_size
    
    def q_network(self, input_size, output_size, hidden_size, learning_rate):
        x = tf.placeholder(tf.float32, shape=[None, input_size])
        y = tf.placeholder(tf.float32, shape=[None, output_size])

        W_h = tf.Variable(tf.random_uniform([input_size, hidden_size], -1.0, 1.0))
        b_h = tf.Variable(tf.zeros([hidden_size]))

        hidden = tf.nn.relu(tf.matmul(x, W_h) + b_h)

        W_o = tf.Variable(tf.random_uniform([hidden_size, output_size], -1.0, 1.0))
        b_o = tf.Variable(tf.zeros([output_size]))

        y_hat = tf.matmul(hidden, W_o) + b_o

        v_hat = tf.reduce_max(y_hat, 1)
        pi_hat = tf.argmax(y_hat, 1)

        loss = tf.reduce_mean(tf.square(y_hat - y))
        opt = tf.train.RMSPropOptimizer(learning_rate)
        train_op = opt.minimize(loss)

        return QNetwork(x, y, y_hat, v_hat, pi_hat, loss, train_op)

class DQNSimple:
    
    def __init__(self,
                 q_graph = DQNSimpleGraph(),
                 batch_size=64,
                 gamma=0.99,
                 explore_policy=EpsilonGreedy()):
        self.q_graph = q_graph
        self.batch_size = batch_size
        self.gamma = gamma
        self.explore_policy = explore_policy

        self.session = tf.Session(graph=q_graph.graph)
        self.session.run(q_graph.init_op)
    
    def __del__(self):
        self.session.close()
    
    # Q(s, a) -> r + gamma * max_a' Q(s', a')
    def update_q(self, memory):
        replay_sample = memory.sample(self.batch_size)
        if not replay_sample:
            return
        session = self.session
        Q_net = self.q_graph.Q_net
        gamma = self.gamma
        
        states = np.array([t.state for t in replay_sample])
        states_ = np.array([make_state(t.state_after) for t in replay_sample])
        
        Q = Q_net.predict_Q(session, states)
        max_Q = Q_net.predict_V(session, states_)
        
        n = len(replay_sample)
        X = np.zeros((n, self.q_graph.input_size))
        Y = np.zeros((n, self.q_graph.output_size))
        for i, t in enumerate(replay_sample):
            s, a, r, s_ = t
            X[i] = s
            q = Q[i]
            if s_ is None:
                q[a] = r
            else:
                q[a] = r + gamma * max_Q[i]
            Y[i] = q
        loss_value = Q_net.train(session, X, Y)
        #print(loss_value)
    
    def Q(self, X):
        return self.q_graph.Q_net.predict_Q(self.session, X)
    
    def explore(self):
        return next(self.explore_policy)
    
    def action(self, s):
        return self.q_graph.Q_net.predict_action(self.session, s)
    
    def __call__(self, s, memory):
        self.update_q(memory)
        if self.explore():
            return random_agent(s)
        return self.action(s)

In [24]:
%%script false

dqn_simple = DQNSimple()
memory = UniformMemory()

run_agent(dqn_simple, memory)

del memory
del dqn_simple

In [25]:
%%time

dqn_simple = DQNSimple()
memory = UniformMemory()

run_agent(dqn_simple, memory, num_episodes=2000, report_buffer=100)

Episode 100 last for 12 steps, mean steps 12.93 [8, 45], time 0:00:06.057437
Episode 200 last for 10 steps, mean steps 10.29 [8, 14], time 0:00:05.064312
Episode 300 last for 32 steps, mean steps 13.06 [8, 32], time 0:00:06.526746
Episode 400 last for 120 steps, mean steps 90.09 [10, 200], time 0:00:45.690379
Episode 500 last for 200 steps, mean steps 160.01 [55, 200], time 0:01:21.767704
Episode 600 last for 200 steps, mean steps 191.13 [67, 200], time 0:01:39.880707
Episode 700 last for 200 steps, mean steps 198.91 [135, 200], time 0:01:46.424693
Episode 800 last for 200 steps, mean steps 200.00 [200, 200], time 0:01:49.712110
Episode 900 last for 200 steps, mean steps 200.00 [200, 200], time 0:01:52.098831
Episode 1000 last for 200 steps, mean steps 200.00 [200, 200], time 0:01:53.085813
Episode 1100 last for 200 steps, mean steps 191.02 [128, 200], time 0:01:47.655489
Episode 1200 last for 158 steps, mean steps 160.27 [87, 200], time 0:01:30.174050
Episode 1300 last for 63 steps, m

### Q values sampling

In [26]:
Q_params = []

def q_collector(s, memory=None):
    a = dqn_simple.action(s)
    Q_params.append((s, a))
    return a

gym_run(q_collector, num_episodes=1)

Episode 1 last for 200 steps, mean steps 200.00 [200, 200], time 0:00:00.094545


In [27]:
q_index = [0, len(Q_params) // 4 - 1, len(Q_params) // 2 - 1, 3 * len(Q_params) // 4 - 1, -1]
s_sample, a_sample = zip(*[Q_params[i] for i in q_index])
print(s_sample)
print(a_sample)

(array([ 0.02195775, -0.04812855, -0.01645927, -0.03783925]), array([-0.52179689, -0.96334508, -0.01011338,  0.09365299]), array([-1.40749506, -0.58997817,  0.05557231, -0.11961461]), array([-1.80784305, -0.22762392,  0.00525624, -0.08955601]), array([ -2.10102857e+00,  -2.25124399e-01,  -1.98869434e-03,
        -1.44693601e-01]))
(1, 1, 0, 0, 1)


In [28]:
X_sample = np.array(s_sample).reshape(len(s_sample), state_size)
X_sample

array([[  2.19577455e-02,  -4.81285500e-02,  -1.64592732e-02,
         -3.78392457e-02],
       [ -5.21796885e-01,  -9.63345080e-01,  -1.01133800e-02,
          9.36529893e-02],
       [ -1.40749506e+00,  -5.89978169e-01,   5.55723143e-02,
         -1.19614609e-01],
       [ -1.80784305e+00,  -2.27623916e-01,   5.25623935e-03,
         -8.95560143e-02],
       [ -2.10102857e+00,  -2.25124399e-01,  -1.98869434e-03,
         -1.44693601e-01]])

In [29]:
a_index = [(i,a) for i, a in enumerate(a_sample)]
q_sample = np.random.rand(len(a_sample), action_size)
print(q_sample)
[q_sample[i] for i in a_index]

[[ 0.21755458  0.596331  ]
 [ 0.83078409  0.83036207]
 [ 0.72204554  0.95417692]
 [ 0.48746045  0.02567296]
 [ 0.84641581  0.86500996]]


[0.59633100362236646,
 0.83036206776371757,
 0.7220455359524901,
 0.48746045491974332,
 0.86500996048144774]

In [30]:
Q_values = []
step_count = 0
step_sample = 100

def q_sampler(s, memory):
    a = dqn_simple(s, memory)
    global step_count
    if step_count % step_sample == 0:
        q_sample = dqn_simple.Q(X_sample)
        q_sample = [q_sample[k] for k in a_index]
        Q_values.append(q_sample)
    step_count += 1
    return a

In [31]:
%%time

run_agent(q_sampler, memory, num_episodes=1000, report_buffer=100)

Episode 100 last for 11 steps, mean steps 46.67 [9, 200], time 0:00:25.919990
Episode 200 last for 9 steps, mean steps 10.22 [9, 12], time 0:00:05.861102
Episode 300 last for 13 steps, mean steps 10.80 [9, 14], time 0:00:06.187288
Episode 400 last for 10 steps, mean steps 23.88 [8, 200], time 0:00:13.456658
Episode 500 last for 14 steps, mean steps 9.92 [8, 14], time 0:00:05.722039
Episode 600 last for 200 steps, mean steps 173.96 [12, 200], time 0:01:36.818823
Episode 700 last for 200 steps, mean steps 200.00 [200, 200], time 0:01:51.576833
Episode 800 last for 200 steps, mean steps 200.00 [200, 200], time 0:01:51.244876
Episode 900 last for 200 steps, mean steps 200.00 [200, 200], time 0:01:51.299717
Episode 1000 last for 200 steps, mean steps 200.00 [200, 200], time 0:01:51.975602
CPU times: user 13min 31s, sys: 46.3 s, total: 14min 18s
Wall time: 10min


In [32]:
len(Q_values)

1076

In [33]:
q_samples = tuple(zip(*Q_values))
len(q_samples)

5

In [34]:
steps = range(1, len(Q_values) + 1)

plot = figure(width=900, tools='hover', logo=None,
              title='Q samples', x_axis_label='steps', y_axis_label='Q(s,a)')

for i, q_sample in enumerate(q_samples):
    plot.line(steps, q_sample, legend='Sample {}'.format(i+1), color=Set1[len(q_samples)][i])

show(plot)

In [35]:
del memory
del dqn_simple

### Error Clipping

**Huber Loss**

https://en.wikipedia.org/wiki/Huber_loss

In [36]:
def _mse(err): return err ** 2

def _abs(err): return np.abs(err)

def _huber(err): return np.sqrt(err ** 2 + 1) - 1

plot = figure(width=900, height=400, tools='hover', logo=None,
              title='Error', x_axis_label='error', y_axis_label='loss')

err = np.linspace(-5, 5, 1000)
e_mse = [_mse(i) for i in err]
e_abs = [_abs(i) for i in err]
e_huber = [_huber(i) for i in err]

plot.line(err, e_mse, color='green', legend='mse')
plot.line(err, e_abs, color='blue', legend='abs')
plot.line(err, e_huber, color='red', legend='huber')

show(plot)

**Huber loss with TensorFlow**

In [37]:
graph = tf.Graph()

with graph.as_default():
    y = tf.placeholder(tf.float32, shape=[5,2])
    y_hat = tf.placeholder(tf.float32, shape=[5,2])
    loss = tf.reduce_mean(tf.sqrt(tf.square(y_hat - y) + 1) - 1)

with tf.Session(graph=graph) as session:
    Y = np.random.rand(5, 2)
    Y_hat = np.random.rand(5, 2)
    loss_value = session.run(loss, feed_dict={y: Y, y_hat: Y_hat})
    print(loss_value)

del graph

0.10632


**Huber loss with Numpy**

In [38]:
print(Y_hat)
print(Y)

[[ 0.41836987  0.02270375]
 [ 0.98863746  0.31383677]
 [ 0.2731227   0.7053818 ]
 [ 0.93503922  0.63234313]
 [ 0.5950769   0.48778855]]
[[ 0.47127343  0.99442756]
 [ 0.37784481  0.30671782]
 [ 0.58044985  0.98020849]
 [ 0.16158782  0.55265852]
 [ 0.10640178  0.23279328]]


In [39]:
Y_hat - Y

array([[-0.05290356, -0.97172381],
       [ 0.61079264,  0.00711896],
       [-0.30732715, -0.2748267 ],
       [ 0.7734514 ,  0.07968462],
       [ 0.48867511,  0.25499527]])

In [40]:
(Y_hat - Y) * (Y_hat - Y)

array([[  2.79878640e-03,   9.44247154e-01],
       [  3.73067655e-01,   5.06795506e-05],
       [  9.44499759e-02,   7.55297124e-02],
       [  5.98227068e-01,   6.34963830e-03],
       [  2.38803365e-01,   6.50225868e-02]])

In [41]:
(Y_hat - Y) * (Y_hat - Y) + 1

array([[ 1.00279879,  1.94424715],
       [ 1.37306766,  1.00005068],
       [ 1.09444998,  1.07552971],
       [ 1.59822707,  1.00634964],
       [ 1.23880337,  1.06502259]])

In [42]:
np.sqrt((Y_hat - Y) * (Y_hat - Y) + 1)

array([[ 1.00139842,  1.39436263],
       [ 1.1717797 ,  1.00002534],
       [ 1.04615963,  1.03707749],
       [ 1.26421006,  1.0031698 ],
       [ 1.11301544,  1.03199932]])

In [43]:
np.sqrt((Y_hat - Y) * (Y_hat - Y) + 1) - 1

array([[  1.39841542e-03,   3.94362633e-01],
       [  1.71779696e-01,   2.53394542e-05],
       [  4.61596321e-02,   3.70774862e-02],
       [  2.64210057e-01,   3.16979535e-03],
       [  1.13015438e-01,   3.19993153e-02]])

In [44]:
np.mean(np.sqrt((Y_hat - Y) * (Y_hat - Y) + 1) - 1)

0.10631978079055886

### Copying Weights with TensorFlow

In [45]:
graph = tf.Graph()

with graph.as_default():
    with tf.variable_scope('q1'):
        q1_w = tf.Variable(tf.random_uniform([2, 4], -1.0, 1.0), name='w')
        print(q1_w.name)
    with tf.variable_scope('q2'):
        q2_w = tf.Variable(tf.random_uniform([2, 4], -1.0, 1.0), name='w')
        print(q2_w.name)
    assign_w1_w2 = tf.assign(q1_w, q2_w)
    init_op = tf.global_variables_initializer()

print()

with tf.Session(graph=graph) as session:
    session.run(init_op)
    w1 = graph.get_tensor_by_name('q1/w:0')
    w2 = graph.get_tensor_by_name('q2/w:0')
    w1_, w2_ = session.run([w1, w2])
    print('w1 =', w1_)
    print('w2 =', w2_)
    print('\nw1 <- w2\n')
    session.run(assign_w1_w2)
    w1_, w2_ = session.run([w1, w2])
    print('w1 =', w1_)
    print('w2 =', w2_)
    print('\nfeeding w2\n')
    w1_, w2_ = session.run([w1, w2], feed_dict={w2: np.zeros((2, 4))})
    print('w1 =', w1_)
    print('w2 =', w2_)

del graph

q1/w:0
q2/w:0

w1 = [[-0.28734374 -0.49261642  0.87060857 -0.47738433]
 [ 0.09548998 -0.49507046 -0.14668584  0.07943463]]
w2 = [[ 0.12418556 -0.70336151  0.7957139   0.789994  ]
 [-0.32224298  0.53229809 -0.09923434 -0.05655694]]

w1 <- w2

w1 = [[ 0.12418556 -0.70336151  0.7957139   0.789994  ]
 [-0.32224298  0.53229809 -0.09923434 -0.05655694]]
w2 = [[ 0.12418556 -0.70336151  0.7957139   0.789994  ]
 [-0.32224298  0.53229809 -0.09923434 -0.05655694]]

feeding w2

w1 = [[ 0.12418556 -0.70336151  0.7957139   0.789994  ]
 [-0.32224298  0.53229809 -0.09923434 -0.05655694]]
w2 = [[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]


## DQN

* Target Network
* Error clipping

**Target Network**

$Q(s, a) \xrightarrow{} r + \gamma max_{a'}{\tilde{Q}(s', a')}$

After severals steps, the target network $\tilde{Q}$ is updated, just by copying the weights from the current network $Q$.

**Error clipping**

The loss function is directly used in the backward propagation algorithm and large errors cause large changes to the network.

Huber loss.

In [46]:
def q_network(input_size=state_size,
              output_size=action_size,
              hidden_size=64,
              learning_rate=0.00025,
              trainable=True):
    x = tf.placeholder(tf.float32, shape=[None, input_size])
    y = tf.placeholder(tf.float32, shape=[None, action_size])

    W_h = tf.Variable(tf.random_uniform([input_size, hidden_size], -1.0, 1.0), name='W_h')
    b_h = tf.Variable(tf.zeros([hidden_size]), name='b_h')

    hidden = tf.nn.relu(tf.matmul(x, W_h) + b_h)

    W_o = tf.Variable(tf.random_uniform([hidden_size, output_size], -1.0, 1.0), name='W_o')
    b_o = tf.Variable(tf.zeros([output_size]), name='b_o')

    y_hat = tf.matmul(hidden, W_o) + b_o

    v_hat = tf.reduce_max(y_hat, 1)
    pi_hat = tf.argmax(y_hat, 1)

    loss, train_op = None, None
    if trainable:
        loss = tf.reduce_mean(tf.sqrt(tf.square(y_hat - y) + 1) - 1)
        opt = tf.train.RMSPropOptimizer(learning_rate)
        train_op = opt.minimize(loss)

    return QNetwork(x, y, y_hat, v_hat, pi_hat, loss, train_op)

graph = tf.Graph()

with graph.as_default():
    common_variables = ['W_h:0', 'b_h:0', 'W_o:0', 'b_o:0']
    with tf.variable_scope('q_network'):
        Q_net = q_network()
    with tf.variable_scope('q_target'):
        Q_target = q_network(trainable=False)
    
    source_variables = [graph.get_tensor_by_name('q_network/' + name) for name in common_variables]
    target_variables = [graph.get_tensor_by_name('q_target/' + name) for name in common_variables]
    assign_target = [tf.assign(v_target, v_source)
                     for v_target, v_source in zip(target_variables, source_variables)] 
    update_target_op = tf.group(*assign_target)
    
    init_op = tf.global_variables_initializer()

session = tf.Session(graph=graph)
session.run(init_op)

In [47]:
Q_net.predict_action(session, s)

0

In [48]:
Q_target.predict_action(session, s)

0

In [49]:
session.run(update_target_op)

In [50]:
Q_target.predict_action(session, s)

0

In [51]:
session.close()
del graph

In [52]:
class DQNGraph:
    
    # self.graph
    # self.Q_net
    # self.Q_target
    # self.update_target_op
    # self.init_op
    # self.input_size
    # self.output_size
    
    def __init__(self,
                 input_size=state_size,
                 output_size=action_size,
                 hidden_size=64,
                 learning_rate=0.00025):
        self.graph = tf.Graph()
        with self.graph.as_default():
            common_variables = ['W_h:0', 'b_h:0', 'W_o:0', 'b_o:0']
            with tf.variable_scope('q_network'):
                self.Q_net = self.q_network(input_size, output_size, hidden_size, learning_rate)
            with tf.variable_scope('q_target'):
                self.Q_target = self.q_network(input_size, output_size, hidden_size, learning_rate, False)

            source_variables = [self.graph.get_tensor_by_name('q_network/' + name) for name in common_variables]
            target_variables = [self.graph.get_tensor_by_name('q_target/' + name) for name in common_variables]
            assign_target = [tf.assign(v_target, v_source)
                             for v_target, v_source in zip(target_variables, source_variables)] 
            self.update_target_op = tf.group(*assign_target)

            self.init_op = tf.global_variables_initializer()

        self.input_size = input_size
        self.output_size = output_size
    
    def q_network(self, input_size, output_size, hidden_size, learning_rate, trainable=True):
        x = tf.placeholder(tf.float32, shape=[None, input_size])
        y = tf.placeholder(tf.float32, shape=[None, action_size])

        W_h = tf.Variable(tf.random_uniform([input_size, hidden_size], -1.0, 1.0), name='W_h')
        b_h = tf.Variable(tf.zeros([hidden_size]), name='b_h')

        hidden = tf.nn.relu(tf.matmul(x, W_h) + b_h)

        W_o = tf.Variable(tf.random_uniform([hidden_size, output_size], -1.0, 1.0), name='W_o')
        b_o = tf.Variable(tf.zeros([output_size]), name='b_o')

        y_hat = tf.matmul(hidden, W_o) + b_o

        v_hat = tf.reduce_max(y_hat, 1)
        pi_hat = tf.argmax(y_hat, 1)

        loss, train_op = None, None
        if trainable:
            loss = tf.reduce_mean(tf.sqrt(tf.square(y_hat - y) + 1) - 1)
            opt = tf.train.RMSPropOptimizer(learning_rate)
            train_op = opt.minimize(loss)

        return QNetwork(x, y, y_hat, v_hat, pi_hat, loss, train_op)

class DQN:
    
    def __init__(self,
                 q_graph=DQNGraph(),
                 batch_size=64,
                 gamma=0.99,
                 explore_policy=EpsilonGreedy()):
        self.q_graph = q_graph
        self.batch_size = batch_size
        self.gamma = gamma
        self.explore_policy = explore_policy

        self.session = tf.Session(graph=q_graph.graph)
        self.session.run(q_graph.init_op)
        self.update_target()

        
        self.steps_count = 0
    
    def __del__(self):
        self.session.close()

    def update_q(self, memory):
        replay_sample = memory.sample(self.batch_size)
        if not replay_sample:
            return
        
        session = self.session
        Q_net = self.q_graph.Q_net
        Q_target = self.q_graph.Q_target
        gamma = self.gamma
        
        states = np.array([t.state for t in replay_sample])
        states_ = np.array([make_state(t.state_after) for t in replay_sample])

        Q = Q_net.predict_Q(session, states)
        max_Q = Q_target.predict_V(session, states_)
        
        n = len(replay_sample)
        X = np.zeros((n, self.q_graph.input_size))
        Y = np.zeros((n, self.q_graph.output_size))
        for i, t in enumerate(replay_sample):
            s, a, r, s_ = t
            X[i] = s
            q = Q[i]
            if s_ is None:
                q[a] = r
            else:
                q[a] = r + gamma * max_Q[i]
            Y[i] = q
        loss_value = Q_net.train(session, X, Y)
        #print(loss_value)

    def Q(self, X):
        return self.q_graph.Q_net.predict_Q(self.session, X)
    
    def explore(self):
        return next(self.explore_policy)
    
    def action(self, s):
        return self.q_graph.Q_net.predict_action(self.session, s)
    
    def steps_completed(self, n=1000):
        self.steps_count += 1
        return self.steps_count % n == 0

    def update_target(self):
        self.session.run(self.q_graph.update_target_op)

    def __call__(self, s, memory):
        if self.steps_completed(1000):
            self.update_target()
        self.update_q(memory)
        if self.explore():
            return random_agent(s)
        return self.action(s)

In [53]:
%%script false

dqn_agent = DQN()
memory = UniformMemory()

run_agent(dqn_agent, memory)

del memory
del dqn_agent

In [54]:
%%time
#%%script false

memory = UniformMemory()

print('Fill Memory with Random\n')
gym_run(random_agent, memory, num_episodes=10000, report_buffer=1000)

dqn_agent = DQN()

print('\nDQN')
run_agent(dqn_agent, memory, num_episodes=2000, report_buffer=100)

del memory
del dqn_agent

Fill Memory with Random

Episode 1000 last for 20 steps, mean steps 21.77 [8, 77], time 0:00:00.597412
Episode 2000 last for 17 steps, mean steps 22.24 [8, 108], time 0:00:00.555485
Episode 3000 last for 33 steps, mean steps 21.62 [8, 91], time 0:00:00.515341
Episode 4000 last for 8 steps, mean steps 21.95 [8, 97], time 0:00:00.511414
Episode 5000 last for 14 steps, mean steps 21.69 [8, 86], time 0:00:00.538407
Episode 6000 last for 11 steps, mean steps 21.86 [8, 87], time 0:00:00.540359
Episode 7000 last for 18 steps, mean steps 22.72 [8, 90], time 0:00:00.521909
Episode 8000 last for 10 steps, mean steps 22.09 [8, 103], time 0:00:00.517365
Episode 9000 last for 33 steps, mean steps 22.49 [8, 98], time 0:00:00.527812
Episode 10000 last for 11 steps, mean steps 21.97 [8, 80], time 0:00:00.512029

DQN


Episode 100 last for 9 steps, mean steps 14.39 [8, 48], time 0:00:07.870899
Episode 200 last for 9 steps, mean steps 10.36 [8, 16], time 0:00:05.845378
Episode 300 last for 9 steps, mean steps 9.68 [8, 12], time 0:00:05.668502
Episode 400 last for 12 steps, mean steps 9.63 [8, 12], time 0:00:05.664591
Episode 500 last for 10 steps, mean steps 9.53 [8, 12], time 0:00:05.638911
Episode 600 last for 10 steps, mean steps 9.60 [8, 11], time 0:00:05.650201
Episode 700 last for 10 steps, mean steps 9.56 [8, 12], time 0:00:05.604360
Episode 800 last for 8 steps, mean steps 9.44 [8, 13], time 0:00:05.549086
Episode 900 last for 10 steps, mean steps 9.47 [8, 12], time 0:00:05.567588
Episode 1000 last for 10 steps, mean steps 9.67 [8, 16], time 0:00:05.680924
Episode 1100 last for 165 steps, mean steps 68.76 [8, 200], time 0:00:38.706176
Episode 1200 last for 137 steps, mean steps 125.86 [108, 173], time 0:01:05.770983
Episode 1300 last for 187 steps, mean steps 165.12 [116, 200], time 0:01:26.19

## DDQN

$Q(s, a) \xrightarrow{} r + \gamma \tilde{Q}(s', argmax_{a'}{Q(s', a')})$

In [55]:
Q = np.random.rand(5,2)
a_max = np.argmax(Q, axis=1)

print(Q)
print(a_max)

[[ 0.77961014  0.11675672]
 [ 0.10265127  0.68690219]
 [ 0.5319123   0.16421541]
 [ 0.16468436  0.62897323]
 [ 0.48049501  0.87588718]]
[0 1 0 1 1]


In [56]:
class DDQN(DQN):
    
    def update_q(self, memory):
        replay_sample = memory.sample(self.batch_size)
        if not replay_sample:
            return

        session = self.session
        Q_net = self.q_graph.Q_net
        Q_target = self.q_graph.Q_target
        gamma = self.gamma
        
        states = np.array([t.state for t in replay_sample])
        states_ = np.array([make_state(t.state_after) for t in replay_sample])
        
        Q1 = Q_net.predict_Q(session, states)
        Q1_ = Q_net.predict_Q(session, states_) 
        max_a = np.argmax(Q1_, axis=1)
        Q2 = Q_target.predict_Q(session, states_)
        
        n = len(replay_sample)
        X = np.zeros((n, self.q_graph.input_size))
        Y = np.zeros((n, self.q_graph.output_size))
        for i, t in enumerate(replay_sample):
            s, a, r, s_ = t
            X[i] = s
            q = Q1[i]
            if s_ is None:
                q[a] = r
            else:
                q[a] = r + gamma * Q2[i][max_a[i]]
            Y[i] = q
        loss_value = Q_net.train(session, X, Y)
        #print(loss_value)

In [57]:
%%time
#%%script false

memory = UniformMemory()

print('Fill Memory with Random\n')
gym_run(random_agent, memory, num_episodes=10000, report_buffer=1000)

ddqn_agent = DDQN()

print('\nDDQN')
run_agent(ddqn_agent, memory, num_episodes=2000, report_buffer=100)

del memory
del ddqn_agent

Fill Memory with Random

Episode 1000 last for 16 steps, mean steps 22.39 [8, 84], time 0:00:00.562163
Episode 2000 last for 26 steps, mean steps 22.09 [8, 103], time 0:00:00.507309
Episode 3000 last for 13 steps, mean steps 22.30 [8, 82], time 0:00:00.511277
Episode 4000 last for 14 steps, mean steps 22.55 [8, 109], time 0:00:00.519137
Episode 5000 last for 50 steps, mean steps 22.19 [8, 116], time 0:00:00.508193
Episode 6000 last for 20 steps, mean steps 22.44 [8, 134], time 0:00:00.530512
Episode 7000 last for 25 steps, mean steps 22.15 [8, 90], time 0:00:00.531438
Episode 8000 last for 39 steps, mean steps 22.17 [8, 88], time 0:00:00.573438
Episode 9000 last for 12 steps, mean steps 22.11 [8, 125], time 0:00:00.518561
Episode 10000 last for 20 steps, mean steps 23.00 [8, 111], time 0:00:00.527818

DDQN


Episode 100 last for 46 steps, mean steps 18.21 [8, 200], time 0:00:11.073061
Episode 200 last for 170 steps, mean steps 115.51 [12, 200], time 0:01:07.470602
Episode 300 last for 179 steps, mean steps 179.90 [105, 200], time 0:01:44.976621
Episode 400 last for 200 steps, mean steps 190.21 [145, 200], time 0:01:51.570002
Episode 500 last for 200 steps, mean steps 195.97 [151, 200], time 0:01:54.107719
Episode 600 last for 83 steps, mean steps 164.21 [51, 200], time 0:01:37.188499
Episode 700 last for 78 steps, mean steps 80.14 [62, 135], time 0:00:47.456318
Episode 800 last for 76 steps, mean steps 77.25 [68, 91], time 0:00:45.188283
Episode 900 last for 71 steps, mean steps 76.64 [12, 88], time 0:00:44.253540
Episode 1000 last for 68 steps, mean steps 73.84 [16, 89], time 0:00:42.405106
Episode 1100 last for 80 steps, mean steps 76.90 [65, 95], time 0:00:45.860875
Episode 1200 last for 81 steps, mean steps 76.69 [54, 95], time 0:00:44.252467
Episode 1300 last for 87 steps, mean steps 

## Priorized Experience Replay

**Priority**

$p = (error + \epsilon)^\alpha$

* $\epsilon$ small positive constant
* $\alpha \in [0, 1]$ difference between high an low error

**Error**

$error = |Q(s, a) - T(s')|$

$T(s') = r + \gamma \tilde{Q}(s', argmax_{a'}{Q(s', a')})$

In [58]:
# Efficient sampling using Sum Tree

class WeightedTree:
    
    def __init__(self, min_size):
        data_size = 2 ** math.ceil(np.log2(min_size))
        self.data_size = data_size
        self.data_level = data_size - 1
        self.data_index = 0 # 0 .. data_size-1
        self.tree = np.zeros(2 * data_size - 1)
        self.data = [None] * data_size
    
    def __str__(self):
        return str(self.tree)
    
    def max_range(self):
        return self.tree[0]

    def add(self, data, weight):
        self.set_weight(self.data_index, weight)
        self.data[self.data_index] = data
        self.data_index = (self.data_index + 1) % self.data_size

    def set_weight(self, data_index, weight):
        i = self.data_level + data_index
        delta = weight - self.tree[i]
        self.tree[i] = weight
        while i:
            i = (i - 1) // 2
            self.tree[i] += delta

    def _select(self, i, range_value):
        while i < self.data_level:
            left = 2 * i + 1
            right = left + 1
            if range_value <= self.tree[left]:
                i = left
            else:
                i, range_value = right, range_value - self.tree[left]
        return i

    def select(self, range_value):
        i = self._select(0, range_value)
        data_index = i - self.data_level
        return (data_index, self.data[data_index], self.tree[i])

t = WeightedTree(8)
print(t)
t.add("a", 1)
print(t)
t.add("b", 2)
print(t)
t.add("c", 3)
print(t)
t.add("d", 4)
print(t)
t.add("e", 5)
print(t)
t.add("f", 6)
print(t)
t.add("g", 7)
print(t)
t.add("h", 8)
print(t)

for i in range(int(t.max_range())):
    m = t.select(i+1)
    print(i+1, m)

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 1.  1.  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
[ 3.  3.  0.  3.  0.  0.  0.  1.  2.  0.  0.  0.  0.  0.  0.]
[ 6.  6.  0.  3.  3.  0.  0.  1.  2.  3.  0.  0.  0.  0.  0.]
[ 10.  10.   0.   3.   7.   0.   0.   1.   2.   3.   4.   0.   0.   0.   0.]
[ 15.  10.   5.   3.   7.   5.   0.   1.   2.   3.   4.   5.   0.   0.   0.]
[ 21.  10.  11.   3.   7.  11.   0.   1.   2.   3.   4.   5.   6.   0.   0.]
[ 28.  10.  18.   3.   7.  11.   7.   1.   2.   3.   4.   5.   6.   7.   0.]
[ 36.  10.  26.   3.   7.  11.  15.   1.   2.   3.   4.   5.   6.   7.   8.]
1 (0, 'a', 1.0)
2 (1, 'b', 2.0)
3 (1, 'b', 2.0)
4 (2, 'c', 3.0)
5 (2, 'c', 3.0)
6 (2, 'c', 3.0)
7 (3, 'd', 4.0)
8 (3, 'd', 4.0)
9 (3, 'd', 4.0)
10 (3, 'd', 4.0)
11 (4, 'e', 5.0)
12 (4, 'e', 5.0)
13 (4, 'e', 5.0)
14 (4, 'e', 5.0)
15 (4, 'e', 5.0)
16 (5, 'f', 6.0)
17 (5, 'f', 6.0)
18 (5, 'f', 6.0)
19 (5, 'f', 6.0)
20 (5, 'f', 6.0)
21 (5, 'f', 6.0)
22 (6, 'g', 7.0)
23

In [59]:
priority_eps = 0.01
priority_alpha = 0.6

class PriorizedMemory:
    
    def __init__(self,
                 error_func=lambda m: 0,
                 min_size=memory_size,
                 priority_eps=priority_eps,
                 priority_alpha=priority_alpha):
        self.data = WeightedTree(min_size)
        self.priority_eps = priority_eps
        self.priority_alpha = priority_alpha
        self._error = error_func
    
    def _priority(self, error):
        return (error + self.priority_eps) ** self.priority_alpha
    
    def add(self, memento):
        err = self._error(memento)
        p = self._priority(err)
        self.data.add(memento, p)
    
    def sample(self, n):
        sample = []

        range_size = self.data.max_range() / n
        for i in range(n):
            start = i * range_size
            stop = (i + 1) * range_size

            range_value = random.uniform(start, stop)
            k, memento, _ = self.data.select(range_value)
            if memento is not None:
                sample.append((k, memento))

        return sample

    def update(self, k, error):
        p = self._priority(error)
        self.data.set_weight(k, p)

In [60]:
memory = UniformMemory()
gym_run(random_agent, memory, num_episodes=1, report_buffer=0)
s, a, r, s_ = memory.sample(1)[0]

s_ = make_state(s_)
s = s.reshape((1, state_size))
s_ = s_.reshape((1, state_size))

s, a, r, s_

(array([[-0.00360538,  0.00935829,  0.05024814, -0.00322131]]),
 0,
 1.0,
 array([[-0.00341821, -0.18644694,  0.05018372,  0.30488256]]))

In [61]:
graph = tf.Graph()

with graph.as_default():
    common_variables = ['W_h:0', 'b_h:0', 'W_o:0', 'b_o:0']
    with tf.variable_scope('q_network'):
        Q_net = q_network()
    with tf.variable_scope('q_target'):
        Q_target = q_network(trainable=False)
    
    source_variables = [graph.get_tensor_by_name('q_network/' + name) for name in common_variables]
    target_variables = [graph.get_tensor_by_name('q_target/' + name) for name in common_variables]
    assign_target = [tf.assign(v_target, v_source)
                     for v_target, v_source in zip(target_variables, source_variables)] 
    update_target_op = tf.group(*assign_target)
    
    init_op = tf.global_variables_initializer()

session = tf.Session(graph=graph)
session.run(init_op)

In [62]:
q1 = Q_net.predict_Q(session, s)[0][a]
print(q1)

-0.129706


In [63]:
q1_ = Q_net.predict_Q(session, s_)[0]
a_ = np.argmax(q1_)
print(q1_)
print(a_)

[-0.33111036  0.8463589 ]
1


In [64]:
q2_ = Q_target.predict_Q(session, s_)[0][a_]
print(q2_)

2.15977


In [65]:
t = r + gamma * q2_
print(t)

3.13817632437


In [66]:
error = abs(q1 - t)
print(error)

3.26788205147


In [67]:
def ddqn_error(memento):
    s, a, r, s_ = memento

    s_ = make_state(s_)
    s = s.reshape((1, state_size))
    s_ = s_.reshape((1, state_size))
    
    q1 = Q_net.predict_Q(session, s)[0][a]
    q1_ = Q_net.predict_Q(session, s_)[0]
    a_ = np.argmax(q1_)
    
    q2_ = Q_target.predict_Q(session, s_)[0][a_]
    t = r + gamma * q2_
    
    return abs(q1 - t)

err = ddqn_error(memory.sample(1)[0])
print(err)

0.83857799232


In [68]:
session.close()
del graph

In [69]:
class DDQN_PER(DQN):
    
    def error(self, memento):
        _, _, err = self.train_data([(-1, memento)])
        return err[0][1]
    
    def train_data(self, sample):
        states = np.array([t.state for _, t in sample])
        states_ = np.array([make_state(t.state_after) for _, t in sample])
        
        session = self.session
        Q_net = self.q_graph.Q_net
        Q_target = self.q_graph.Q_target
        gamma = self.gamma

        Q1 = Q_net.predict_Q(session, states)
        Q1_ = Q_net.predict_Q(session, states_) 
        max_a = np.argmax(Q1_, axis=1)
        Q2 = Q_target.predict_Q(session, states_)
        
        n = len(sample)
        X = np.zeros((n, self.q_graph.input_size))
        Y = np.zeros((n, self.q_graph.output_size))
        errors = [None] * n
        for i, (k, t) in enumerate(sample):
            s, a, r, s_ = t
            X[i] = s
            q = Q1[i]
            q_a = q[a]
            if s_ is None:
                q[a] = r
            else:
                q[a] = r + gamma * Q2[i][max_a[i]]
            Y[i] = q
            errors[i] = (k, abs(q_a - q[a]))
        
        return X, Y, errors
    
    def update_q(self, memory):
        replay_sample = memory.sample(self.batch_size)
        if not replay_sample:
            return

        X, Y, errors = self.train_data(replay_sample)
        
        for k, err in errors:
            memory.update(k, err)

        session = self.session
        Q_net = self.q_graph.Q_net

        loss_value = Q_net.train(session, X, Y)
        #print(loss_value)

In [70]:
%%time
#%%script false

ddqn_per_agent = DDQN_PER()

memory = PriorizedMemory(ddqn_per_agent.error)

print('\nDDQN+PER')
run_agent(ddqn_per_agent, memory, num_episodes=2000, report_buffer=100)

del memory
del ddqn_per_agent


DDQN+PER


Episode 100 last for 25 steps, mean steps 17.90 [8, 50], time 0:00:19.529878
Episode 200 last for 73 steps, mean steps 37.91 [13, 162], time 0:00:42.535979
Episode 300 last for 177 steps, mean steps 148.18 [31, 200], time 0:02:47.247540
Episode 400 last for 177 steps, mean steps 174.23 [157, 200], time 0:03:17.164638
Episode 500 last for 166 steps, mean steps 169.00 [155, 186], time 0:03:14.451255
Episode 600 last for 167 steps, mean steps 170.55 [158, 183], time 0:03:13.085338
Episode 700 last for 155 steps, mean steps 161.78 [145, 175], time 0:03:06.417266
Episode 800 last for 160 steps, mean steps 156.44 [145, 166], time 0:03:11.236603
Episode 900 last for 169 steps, mean steps 158.62 [146, 171], time 0:03:16.016536
Episode 1000 last for 186 steps, mean steps 170.00 [159, 200], time 0:03:26.934858
Episode 1100 last for 200 steps, mean steps 194.33 [139, 200], time 0:03:59.079000
Episode 1200 last for 200 steps, mean steps 196.35 [110, 200], time 0:04:02.042312
Episode 1300 last for 