**Human-level control through Deep Reinforcement Learning**

https://deepmind.com/research/dqn/

https://deepmind.com/blog/deep-reinforcement-learning/

**Playing Atari with Deep Reinforcement Learning**

https://arxiv.org/abs/1312.5602
    
**Demystifying Deep Reinforcement Learning**

https://www.nervanasys.com/demystifying-deep-reinforcement-learning/

**Let’s make a DQN**

https://jaromiru.com/category/dqn/

**CartPole-v0**

https://gym.openai.com/envs/CartPole-v0

In [1]:
from bokeh.io import output_notebook, push_notebook, show
from bokeh.charts import HeatMap
from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource, DataRange1d, HoverTool
from bokeh.palettes import Set1
from bokeh.plotting import figure

output_notebook()

In [2]:
from collections import deque, namedtuple
from datetime import timedelta
from time import time

import math
import random

import numpy as np
import tensorflow as tf

import gym

env = gym.make('CartPole-v0')

state_size = env.observation_space.shape[0] # 4
action_size = env.action_space.n # 2

[2017-05-24 17:39:42,797] Making new env: CartPole-v0


## Agent execution + Random Agent

In [3]:
RunState = namedtuple('RunState', ['i', 't', 'mean_t', 'min_t', 'max_t', 'R'])
Memento = namedtuple('Memento', ['state', 'action', 'reward', 'state_after'])

class NoMemory:
    def add(self, data):
        pass
    def sample(self, n):
        return []
    
def gym_run(agent_func,
            memory=NoMemory(),
            num_episodes=20,
            max_steps=500,
            report_buffer=1,
            report_func=lambda r: None):
    results = []
    start = time()
    n_t, mean_t, min_t, max_t = 0, 0, max_steps, 0
    terminal_state = None
    for i in range(1, num_episodes+1):
        done = False
        t = 0
        R = 0
        s = env.reset()
        while not done and t < max_steps:
            a = agent_func(s, memory)
            s_, r, done, _ = env.step(a)
            if done: s_ = terminal_state
            memory.add(Memento(s, a, r, s_))
            t += 1
            R += r
            s = s_
        
        n_t += 1
        mean_t = (n_t - 1) / n_t * mean_t + t / n_t
        min_t = min(min_t, t)
        max_t = max(max_t, t)
        results.append(RunState(i, t, mean_t, min_t, max_t, R))
        
        if report_buffer > 0 and i % report_buffer == 0:
            report_func(results)
            results.clear()

            duration = timedelta(seconds=time() - start)
            start = time()
            print('Episode {} last for {} steps, mean steps {:.2f} [{}, {}], time {}' \
                  .format(i, t, mean_t, min_t, max_t, duration))
            if report_buffer != 1:
                n_t, mean_t, min_t, max_t = 0, 0, max_steps, 0

def random_agent(s, memory=None):
    return random.randint(0, action_size-1)

gym_run(random_agent)

Episode 1 last for 23 steps, mean steps 23.00 [23, 23], time 0:00:00.000389
Episode 2 last for 18 steps, mean steps 20.50 [18, 23], time 0:00:00.000756
Episode 3 last for 14 steps, mean steps 18.33 [14, 23], time 0:00:00.000318
Episode 4 last for 12 steps, mean steps 16.75 [12, 23], time 0:00:00.000264
Episode 5 last for 14 steps, mean steps 16.20 [12, 23], time 0:00:00.000537
Episode 6 last for 25 steps, mean steps 17.67 [12, 25], time 0:00:00.000433
Episode 7 last for 16 steps, mean steps 17.43 [12, 25], time 0:00:00.000275
Episode 8 last for 25 steps, mean steps 18.38 [12, 25], time 0:00:00.000681
Episode 9 last for 19 steps, mean steps 18.44 [12, 25], time 0:00:00.000476
Episode 10 last for 23 steps, mean steps 18.90 [12, 25], time 0:00:00.000412
Episode 11 last for 14 steps, mean steps 18.45 [12, 25], time 0:00:00.000335
Episode 12 last for 18 steps, mean steps 18.42 [12, 25], time 0:00:00.000329
Episode 13 last for 17 steps, mean steps 18.31 [12, 25], time 0:00:00.000285
Episode 

In [4]:
RunPlot = namedtuple('RunPlot', ['handle', 'steps_source', 'rewards_source'])

def plot_run():
    steps_data = ColumnDataSource(data=dict(episodes=[], t=[], mean_t=[], min_t=[], max_t=[]))
    rewards_data = ColumnDataSource(data=dict(episodes=[], r=[]))

    steps_hover = HoverTool(
        tooltips=[
            ('episode', '@episodes'),
            ('steps', '@t'),
            ('mean', '@mean_t'),
            ('min', '@min_t'),
            ('max', '@max_t'),
        ]
    )
    
    plot_steps = figure(width=475, height=300, tools=[steps_hover], logo=None, toolbar_location=None, 
                        title='Steps', x_axis_label='episodes', y_axis_label='T',
                        x_range=DataRange1d(follow='end', follow_interval=100, range_padding=0))
    plot_steps.line('episodes', 't', color='yellow', source=steps_data)
    plot_steps.line('episodes', 'mean_t', color='green', source=steps_data)
    plot_steps.line('episodes', 'min_t', color='orange', source=steps_data)
    plot_steps.line('episodes', 'max_t', color='red', source=steps_data)

    rewards_hover = HoverTool(
        tooltips=[
            ('episode', '@episodes'),
            ('reward', '@r'),
        ]
    )
    
    plot_rewards = figure(width=475, height=300, tools=[rewards_hover], logo=None, toolbar_location=None,
                          title='Rewards', x_axis_label='episodes', y_axis_label='R',
                          x_range=DataRange1d(range_padding=0))
    plot_rewards.line('episodes', 'r', color='blue', source=rewards_data)
    handle = show(row(plot_steps, plot_rewards), notebook_handle=True)
    return RunPlot(handle, steps_data, rewards_data)

plot = plot_run()

In [5]:
def report_plot(plot, results):
    episodes, t, mean_t, min_t, max_t, r = zip(*results)
    plot.steps_source.stream(dict(episodes=episodes, t=t, mean_t=mean_t, min_t=min_t, max_t=max_t), rollover=1000)
    plot.rewards_source.stream(dict(episodes=episodes, r=r))
    push_notebook(handle=plot.handle)

def plot_adapter(plot):
    return lambda r: report_plot(plot, r)

gym_run(random_agent, num_episodes=200, report_buffer=10,report_func=plot_adapter(plot))

Episode 10 last for 28 steps, mean steps 25.10 [11, 40], time 0:00:00.012177
Episode 20 last for 16 steps, mean steps 20.20 [12, 29], time 0:00:00.014312
Episode 30 last for 29 steps, mean steps 25.20 [9, 43], time 0:00:00.013966
Episode 40 last for 26 steps, mean steps 24.40 [11, 48], time 0:00:00.023752
Episode 50 last for 22 steps, mean steps 32.20 [17, 69], time 0:00:00.025893
Episode 60 last for 59 steps, mean steps 26.80 [11, 59], time 0:00:00.028797
Episode 70 last for 20 steps, mean steps 22.20 [12, 38], time 0:00:00.016939
Episode 80 last for 12 steps, mean steps 19.40 [11, 37], time 0:00:00.015138
Episode 90 last for 25 steps, mean steps 22.00 [11, 45], time 0:00:00.013975
Episode 100 last for 27 steps, mean steps 33.30 [13, 90], time 0:00:00.015477
Episode 110 last for 17 steps, mean steps 23.40 [8, 61], time 0:00:00.015286
Episode 120 last for 20 steps, mean steps 21.60 [9, 31], time 0:00:00.016052
Episode 130 last for 15 steps, mean steps 21.60 [11, 48], time 0:00:00.01645

In [6]:
def run_agent(*args, **kargs):
    plot = plot_run()
    gym_run(*args, **kargs, report_func=plot_adapter(plot))

In [7]:
run_agent(random_agent)

Episode 1 last for 26 steps, mean steps 26.00 [26, 26], time 0:00:00.028507
Episode 2 last for 17 steps, mean steps 21.50 [17, 26], time 0:00:00.028110
Episode 3 last for 13 steps, mean steps 18.67 [13, 26], time 0:00:00.017272
Episode 4 last for 25 steps, mean steps 20.25 [13, 26], time 0:00:00.017404
Episode 5 last for 11 steps, mean steps 18.40 [11, 26], time 0:00:00.017219
Episode 6 last for 17 steps, mean steps 18.17 [11, 26], time 0:00:00.019324
Episode 7 last for 15 steps, mean steps 17.71 [11, 26], time 0:00:00.021368
Episode 8 last for 20 steps, mean steps 18.00 [11, 26], time 0:00:00.017066
Episode 9 last for 14 steps, mean steps 17.56 [11, 26], time 0:00:00.016532
Episode 10 last for 50 steps, mean steps 20.80 [11, 50], time 0:00:00.023598
Episode 11 last for 22 steps, mean steps 20.91 [11, 50], time 0:00:00.022272
Episode 12 last for 25 steps, mean steps 21.25 [11, 50], time 0:00:00.016738
Episode 13 last for 26 steps, mean steps 21.62 [11, 50], time 0:00:00.035353
Episode 

## Simple DQN Agent

* $\varepsilon$-greedy
* Uniform Experience Replay
* Single Q Network

(missing target network, error clipping)

**This algorithm is Unstable**

### $\varepsilon$-greedy

$\varepsilon = \varepsilon_{min} + (\varepsilon_{max} - \varepsilon_{min}) e^{-\lambda t}$

The $\lambda$ parameter controls the speed of decay. This way we start with a policy that explores greatly and behaves more and more greedily over time.

In [8]:
class EpsilonGreedy:
    
    def __init__(self, eps_min=0.01, eps_max=1.0, decay=0.001):
        self.eps_min = eps_min
        self.eps_max = eps_max
        self.decay = decay
        self.steps = 0

    def epsilon(self):
        return self.eps_min \
        + (self.eps_max - self.eps_min) \
        * math.exp(-self.decay * self.steps)
    
    def explore(self):
        return random.random() < self.epsilon()
    
    def __iter__(self):
        return self
    
    def __next__(self):
        self.steps += 1
        return self.explore()

eps = EpsilonGreedy()
for _ in range(5):
    explore = next(eps)
    print('{:.6f}, explore={}'.format(eps.epsilon(), explore))

0.999010, explore=True
0.998022, explore=True
0.997034, explore=True
0.996048, explore=True
0.995062, explore=True


In [9]:
steps = list(range(1, 10000))
eps = []
exp = []
eps_ = EpsilonGreedy()
exp_values = ['exploit', 'explore']
for _ in steps:
    next(eps_)
    eps.append(eps_.epsilon())
    exp.append(exp_values[int(eps_.explore())])

eps_v = figure(width=900, height=300, tools='hover', logo=None,
              title='Epsilon', x_axis_label='steps', y_axis_label='eps')
eps_v.line(steps, eps)
hmap = HeatMap(dict(steps=steps, exp=exp), x='steps', y='exp',
               width=900, height=200, tools=None, legend=False, toolbar_location=None)
hmap.x_range = eps_v.x_range

show(column(eps_v, hmap))

### Uniform Memory (Experience Replay)

In [10]:
memory_size = 100000

class UniformMemory:
    
    def __init__(self, size=memory_size):
        self.data = deque(maxlen=size)

    def __del__(self):
        self.data.clear()

    def add(self, data):
        self.data.append(data)

    def sample(self, n):
        n = min(n, len(self.data))
        return random.sample(self.data, n)

memory = UniformMemory()
memory.add(Memento(1,2,3,4))
memory.add(Memento(5,6,7,8))
memory.add(Memento(9,10,11,12))
memory.add(Memento(13,14,15,16))

print(memory.sample(2))
print(memory.sample(2))

del memory

[Memento(state=1, action=2, reward=3, state_after=4), Memento(state=13, action=14, reward=15, state_after=16)]
[Memento(state=13, action=14, reward=15, state_after=16), Memento(state=5, action=6, reward=7, state_after=8)]


### Q Network

The *Q Network* class encapsulates the neural network. Our problem is simple enough so we will use only one hidden layer of 64 neurons, with ReLU activation function. The final layer will consist of only two neurons, one for each available action. Their activation function will be linear. Remember that we are trying to approximate the Q function, which in essence can be of any real value. Therefore we can’t restrict the output from the network and the linear activation works well.

In [11]:
class QNetwork(namedtuple('QNetwork', ['x', 'y', 'y_hat', 'v_hat', 'pi_hat', 'loss', 'train_op'])):
    __slots__ = ()
    
    def train(self, session, X, Y):
        return session.run([self.train_op, self.loss], feed_dict={self.x: X, self.y: Y})[1]
    
    def predict_Q(self, session, X):
        return session.run(self.y_hat, feed_dict={self.x: X})
    
    def predict_V(self, session, X):
        return session.run(self.v_hat, feed_dict={self.x: X})
    
    def predict_action(self, session, s):
        X = s.reshape((1, state_size))
        return session.run(self.pi_hat, feed_dict={self.x: X})[0]

def simple_q(input_size=state_size, output_size=action_size, hidden_size=64, learning_rate=0.00025):
    x = tf.placeholder(tf.float32, shape=[None, input_size])
    y = tf.placeholder(tf.float32, shape=[None, output_size])

    W_h = tf.Variable(tf.random_uniform([input_size, hidden_size], -1.0, 1.0))
    b_h = tf.Variable(tf.zeros([hidden_size]))

    hidden = tf.nn.relu(tf.matmul(x, W_h) + b_h)

    W_o = tf.Variable(tf.random_uniform([hidden_size, output_size], -1.0, 1.0))
    b_o = tf.Variable(tf.zeros([output_size]))

    y_hat = tf.matmul(hidden, W_o) + b_o

    v_hat = tf.reduce_max(y_hat, 1)
    pi_hat = tf.argmax(y_hat, 1)

    loss = tf.reduce_mean(tf.square(y_hat - y))
    opt = tf.train.RMSPropOptimizer(learning_rate)
    train_op = opt.minimize(loss)

    return QNetwork(x, y, y_hat, v_hat, pi_hat, loss, train_op)

graph = tf.Graph()

with graph.as_default():
    Q_net = simple_q()
    init_op = tf.global_variables_initializer()

session = tf.Session(graph=graph)
session.run(init_op)

In [12]:
s = np.array([-0.04288668, -0.0078879 , -0.01835355, -0.03561641])
s

array([-0.04288668, -0.0078879 , -0.01835355, -0.03561641])

In [13]:
Q_net.predict_Q(session, s.reshape((1, state_size)))

array([[ 0.12616175,  0.02846579]], dtype=float32)

In [14]:
Q_net.predict_V(session, s.reshape((1, state_size)))

array([ 0.12616175], dtype=float32)

In [15]:
Q_net.predict_action(session, s)

0

In [16]:
session.run(init_op)
Q_net.predict_Q(session, s.reshape((1, state_size)))

array([[-0.01517322, -0.18473908]], dtype=float32)

### Training

$Q(s, a) \xrightarrow{} r + \gamma max_{a'}{Q(s', a')}$

This formula means that for a sample $(s, r, a, s')$ we will update the network’s weights so that its output is closer to the target.

In [17]:
gamma = 0.99
batch_size = 5

memory = UniformMemory()
gym_run(random_agent, memory, report_buffer=0)
replay_sample = memory.sample(batch_size)
replay_sample

[Memento(state=array([ 0.02853973, -0.160745  , -0.01603637,  0.28532042]), action=1, reward=1.0, state_after=array([ 0.02532483,  0.03460194, -0.01032996, -0.01237681])),
 Memento(state=array([ 0.01212832,  0.97858345, -0.11165295, -1.60528402]), action=1, reward=1.0, state_after=array([ 0.03169999,  1.17483499, -0.14375863, -1.93058615])),
 Memento(state=array([ 0.03823931,  0.43513105,  0.01481132, -0.52872147]), action=0, reward=1.0, state_after=array([ 0.04694193,  0.23980389,  0.00423689, -0.2314085 ])),
 Memento(state=array([-0.06650066, -0.05606863,  0.12588118,  0.43025968]), action=0, reward=1.0, state_after=array([-0.06762203, -0.2527274 ,  0.13448638,  0.75982512])),
 Memento(state=array([-0.01525193,  0.58653253, -0.06642928, -0.97419758]), action=1, reward=1.0, state_after=array([-0.00352127,  0.78247974, -0.08591323, -1.28698596]))]

In [18]:
# Q shape (batch_size, action_size)
Q = Q_net.predict_Q(session, np.array([t.state for t in replay_sample]))

print(Q.shape)
print(Q)

(5, 2)
[[ 0.39378229 -1.20794153]
 [ 2.92413354 -3.15798283]
 [ 1.09184313 -1.14461255]
 [ 1.19519281 -1.71272779]
 [ 1.77337039 -1.95831156]]


In [19]:
# max_Q (batch_size, )
terminal_state = np.zeros(state_size)
def make_state(s): return terminal_state if s is None else s

max_Q = Q_net.predict_V(session, np.array([make_state(t.state_after) for t in replay_sample]))

print(max_Q.shape)
print(max_Q)

(5,)
[ 0.00783761  3.4934442   0.51732701  1.53482389  2.35198379]


In [20]:
# X shape [batch_size, state_size]
# Y shape [batch_size, action_size]
# Q(s, a) -> r + gamma * max_a' Q(s', a')
n = len(replay_sample)
X = np.zeros((n, state_size))
Y = np.zeros((n, action_size))
for i, t in enumerate(replay_sample):
    s, a, r, s_ = t
    X[i] = s
    q = Q[i]
    if s_ is None:
        q[a] = r
    else:
        q[a] = r + gamma * max_Q[i]
    Y[i] = q

print(X.shape)
print(X)
print(Y.shape)
print(Y)

(5, 4)
[[ 0.02853973 -0.160745   -0.01603637  0.28532042]
 [ 0.01212832  0.97858345 -0.11165295 -1.60528402]
 [ 0.03823931  0.43513105  0.01481132 -0.52872147]
 [-0.06650066 -0.05606863  0.12588118  0.43025968]
 [-0.01525193  0.58653253 -0.06642928 -0.97419758]]
(5, 2)
[[ 0.39378229  1.00775921]
 [ 2.92413354  4.45850992]
 [ 1.51215374 -1.14461255]
 [ 2.5194757  -1.71272779]
 [ 1.77337039  3.32846403]]


In [21]:
Q_net.train(session, X, Y)

9.2800674

In [22]:
session.close()
del graph

In [23]:
class DQNSimpleGraph:
    
    # self.graph
    # self.Q_net
    # self.init_op
    # self.input_size
    # self.output_size
    
    def __init__(self,
                 input_size=state_size,
                 output_size=action_size,
                 hidden_size=64,
                 learning_rate=0.00025):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.Q_net = self.q_network(input_size,
                                        output_size,
                                        hidden_size,
                                        learning_rate)
            self.init_op = tf.global_variables_initializer()
        
        self.input_size = input_size
        self.output_size = output_size
    
    def q_network(self, input_size, output_size, hidden_size, learning_rate):
        x = tf.placeholder(tf.float32, shape=[None, input_size])
        y = tf.placeholder(tf.float32, shape=[None, output_size])

        W_h = tf.Variable(tf.random_uniform([input_size, hidden_size], -1.0, 1.0))
        b_h = tf.Variable(tf.zeros([hidden_size]))

        hidden = tf.nn.relu(tf.matmul(x, W_h) + b_h)

        W_o = tf.Variable(tf.random_uniform([hidden_size, output_size], -1.0, 1.0))
        b_o = tf.Variable(tf.zeros([output_size]))

        y_hat = tf.matmul(hidden, W_o) + b_o

        v_hat = tf.reduce_max(y_hat, 1)
        pi_hat = tf.argmax(y_hat, 1)

        loss = tf.reduce_mean(tf.square(y_hat - y))
        opt = tf.train.RMSPropOptimizer(learning_rate)
        train_op = opt.minimize(loss)

        return QNetwork(x, y, y_hat, v_hat, pi_hat, loss, train_op)

class DQNSimple:
    
    def __init__(self,
                 q_graph = DQNSimpleGraph(),
                 batch_size=64,
                 gamma=0.99,
                 explore_policy=EpsilonGreedy()):
        self.q_graph = q_graph
        self.batch_size = batch_size
        self.gamma = gamma
        self.explore_policy = explore_policy

        self.session = tf.Session(graph=q_graph.graph)
        self.session.run(q_graph.init_op)
    
    def __del__(self):
        self.session.close()
    
    # Q(s, a) -> r + gamma * max_a' Q(s', a')
    def update_q(self, memory):
        replay_sample = memory.sample(self.batch_size)
        if not replay_sample:
            return
        session = self.session
        Q_net = self.q_graph.Q_net
        gamma = self.gamma
        
        states = np.array([t.state for t in replay_sample])
        states_ = np.array([make_state(t.state_after) for t in replay_sample])
        
        Q = Q_net.predict_Q(session, states)
        max_Q = Q_net.predict_V(session, states_)
        
        n = len(replay_sample)
        X = np.zeros((n, self.q_graph.input_size))
        Y = np.zeros((n, self.q_graph.output_size))
        for i, t in enumerate(replay_sample):
            s, a, r, s_ = t
            X[i] = s
            q = Q[i]
            if s_ is None:
                q[a] = r
            else:
                q[a] = r + gamma * max_Q[i]
            Y[i] = q
        loss_value = Q_net.train(session, X, Y)
        #print(loss_value)
    
    def Q(self, X):
        return self.q_graph.Q_net.predict_Q(self.session, X)
    
    def explore(self):
        return next(self.explore_policy)
    
    def action(self, s):
        return self.q_graph.Q_net.predict_action(self.session, s)
    
    def __call__(self, s, memory):
        self.update_q(memory)
        if self.explore():
            return random_agent(s)
        return self.action(s)

In [24]:
%%script false

dqn_simple = DQNSimple()
memory = UniformMemory()

run_agent(dqn_simple, memory)

del memory
del dqn_simple

In [25]:
%%time

dqn_simple = DQNSimple()
memory = UniformMemory()

run_agent(dqn_simple, memory, num_episodes=2000, report_buffer=100)

Episode 100 last for 18 steps, mean steps 15.42 [8, 56], time 0:00:03.151585
Episode 200 last for 12 steps, mean steps 12.64 [9, 17], time 0:00:02.763862
Episode 300 last for 88 steps, mean steps 69.66 [13, 200], time 0:00:15.151727
Episode 400 last for 112 steps, mean steps 109.95 [43, 200], time 0:00:23.853513
Episode 500 last for 152 steps, mean steps 138.98 [63, 200], time 0:00:30.498407
Episode 600 last for 176 steps, mean steps 163.82 [71, 200], time 0:00:36.665087
Episode 700 last for 191 steps, mean steps 189.53 [102, 200], time 0:00:42.641108
Episode 800 last for 174 steps, mean steps 176.63 [94, 200], time 0:00:41.549302
Episode 900 last for 200 steps, mean steps 172.64 [87, 200], time 0:00:41.893742
Episode 1000 last for 124 steps, mean steps 152.51 [81, 200], time 0:00:36.777917
Episode 1100 last for 200 steps, mean steps 177.49 [78, 200], time 0:00:42.966633
Episode 1200 last for 190 steps, mean steps 189.16 [155, 200], time 0:00:46.319957
Episode 1300 last for 200 steps, 

### Q values sampling

In [26]:
Q_params = []

def q_collector(s, memory=None):
    a = dqn_simple.action(s)
    Q_params.append((s, a))
    return a

gym_run(q_collector, num_episodes=1)

Episode 1 last for 200 steps, mean steps 200.00 [200, 200], time 0:00:00.078975


In [27]:
q_index = [0, len(Q_params) // 4 - 1, len(Q_params) // 2 - 1, 3 * len(Q_params) // 4 - 1, -1]
s_sample, a_sample = zip(*[Q_params[i] for i in q_index])
print(s_sample)
print(a_sample)

(array([ 0.04629129, -0.01727426, -0.03378046, -0.01275839]), array([-0.01616493,  0.1801864 , -0.00473429, -0.35702099]), array([-0.09550726, -0.20623427, -0.01223724,  0.14401652]), array([-0.2773406 , -0.19686801, -0.0053368 , -0.06261411]), array([-0.57444741, -0.18345861, -0.02299285, -0.3584895 ]))
(0, 0, 1, 1, 0)


In [28]:
X_sample = np.array(s_sample).reshape(len(s_sample), state_size)
X_sample

array([[ 0.04629129, -0.01727426, -0.03378046, -0.01275839],
       [-0.01616493,  0.1801864 , -0.00473429, -0.35702099],
       [-0.09550726, -0.20623427, -0.01223724,  0.14401652],
       [-0.2773406 , -0.19686801, -0.0053368 , -0.06261411],
       [-0.57444741, -0.18345861, -0.02299285, -0.3584895 ]])

In [29]:
a_index = [(i,a) for i, a in enumerate(a_sample)]
q_sample = np.random.rand(len(a_sample), action_size)
print(q_sample)
[q_sample[i] for i in a_index]

[[ 0.32665971  0.81505711]
 [ 0.64669858  0.94275141]
 [ 0.57960861  0.70773988]
 [ 0.52941932  0.7397141 ]
 [ 0.77029219  0.95726354]]


[0.32665970503931052,
 0.6466985820411485,
 0.70773988151390166,
 0.73971409677092048,
 0.77029218777122388]

In [30]:
Q_values = []
step_count = 0
step_sample = 100

def q_sampler(s, memory):
    a = dqn_simple(s, memory)
    global step_count
    if step_count % step_sample == 0:
        q_sample = dqn_simple.Q(X_sample)
        q_sample = [q_sample[k] for k in a_index]
        Q_values.append(q_sample)
    step_count += 1
    return a

In [31]:
%%time

run_agent(q_sampler, memory, num_episodes=1000, report_buffer=100)

Episode 100 last for 13 steps, mean steps 197.10 [13, 200], time 0:00:48.116774
Episode 200 last for 200 steps, mean steps 131.64 [9, 200], time 0:00:32.081211
Episode 300 last for 200 steps, mean steps 200.00 [200, 200], time 0:00:48.896754
Episode 400 last for 200 steps, mean steps 200.00 [200, 200], time 0:00:48.873317
Episode 500 last for 200 steps, mean steps 200.00 [200, 200], time 0:00:49.409687
Episode 600 last for 200 steps, mean steps 200.00 [200, 200], time 0:00:48.888530
Episode 700 last for 200 steps, mean steps 200.00 [200, 200], time 0:00:49.027434
Episode 800 last for 200 steps, mean steps 200.00 [200, 200], time 0:00:49.450074
Episode 900 last for 200 steps, mean steps 200.00 [200, 200], time 0:00:48.956857
Episode 1000 last for 200 steps, mean steps 200.00 [200, 200], time 0:00:49.091821
CPU times: user 10min 26s, sys: 37.1 s, total: 11min 4s
Wall time: 7min 53s


In [32]:
len(Q_values)

1929

In [33]:
q_samples = tuple(zip(*Q_values))
len(q_samples)

5

In [34]:
steps = range(1, len(Q_values) + 1)

plot = figure(width=900, tools='hover', logo=None,
              title='Q samples', x_axis_label='steps', y_axis_label='Q(s,a)')

for i, q_sample in enumerate(q_samples):
    plot.line(steps, q_sample, legend='Sample {}'.format(i+1), color=Set1[len(q_samples)][i])

show(plot)

In [35]:
del memory
del dqn_simple

### Error Clipping

**Huber Loss**

https://en.wikipedia.org/wiki/Huber_loss

In [36]:
def _mse(err): return err ** 2

def _abs(err): return np.abs(err)

def _huber(err): return np.sqrt(err ** 2 + 1) - 1

plot = figure(width=900, height=400, tools='hover', logo=None,
              title='Error', x_axis_label='error', y_axis_label='loss')

err = np.linspace(-5, 5, 1000)
e_mse = [_mse(i) for i in err]
e_abs = [_abs(i) for i in err]
e_huber = [_huber(i) for i in err]

plot.line(err, e_mse, color='green', legend='mse')
plot.line(err, e_abs, color='blue', legend='abs')
plot.line(err, e_huber, color='red', legend='huber')

show(plot)

**Huber loss with TensorFlow**

In [37]:
graph = tf.Graph()

with graph.as_default():
    y = tf.placeholder(tf.float32, shape=[5,2])
    y_hat = tf.placeholder(tf.float32, shape=[5,2])
    loss = tf.reduce_mean(tf.sqrt(tf.square(y_hat - y) + 1) - 1)

with tf.Session(graph=graph) as session:
    Y = np.random.rand(5, 2)
    Y_hat = np.random.rand(5, 2)
    loss_value = session.run(loss, feed_dict={y: Y, y_hat: Y_hat})
    print(loss_value)

del graph

0.0585029


**Huber loss with Numpy**

In [38]:
print(Y_hat)
print(Y)

[[ 0.02595563  0.07457896]
 [ 0.39653749  0.20581913]
 [ 0.91674566  0.0949264 ]
 [ 0.50639303  0.67502549]
 [ 0.11050242  0.55128651]]
[[ 0.7474048   0.11104955]
 [ 0.33724186  0.81065257]
 [ 0.43418947  0.02848677]
 [ 0.83121369  0.55712057]
 [ 0.00472184  0.45782529]]


In [39]:
Y_hat - Y

array([[-0.72144916, -0.03647059],
       [ 0.05929563, -0.60483344],
       [ 0.4825562 ,  0.06643964],
       [-0.32482066,  0.11790491],
       [ 0.10578058,  0.09346122]])

In [40]:
(Y_hat - Y) * (Y_hat - Y)

array([[ 0.5204889 ,  0.0013301 ],
       [ 0.00351597,  0.36582349],
       [ 0.23286048,  0.00441423],
       [ 0.10550846,  0.01390157],
       [ 0.01118953,  0.008735  ]])

In [41]:
(Y_hat - Y) * (Y_hat - Y) + 1

array([[ 1.5204889 ,  1.0013301 ],
       [ 1.00351597,  1.36582349],
       [ 1.23286048,  1.00441423],
       [ 1.10550846,  1.01390157],
       [ 1.01118953,  1.008735  ]])

In [42]:
np.sqrt((Y_hat - Y) * (Y_hat - Y) + 1)

array([[ 1.23308106,  1.00066483],
       [ 1.00175644,  1.16868451],
       [ 1.11034251,  1.00220468],
       [ 1.05143162,  1.00692679],
       [ 1.0055792 ,  1.004358  ]])

In [43]:
np.sqrt((Y_hat - Y) * (Y_hat - Y) + 1) - 1

array([[ 0.23308106,  0.00066483],
       [ 0.00175644,  0.16868451],
       [ 0.11034251,  0.00220468],
       [ 0.05143162,  0.00692679],
       [ 0.0055792 ,  0.004358  ]])

In [44]:
np.mean(np.sqrt((Y_hat - Y) * (Y_hat - Y) + 1) - 1)

0.058502965682089722

### Copying Weights with TensorFlow

In [45]:
graph = tf.Graph()

with graph.as_default():
    with tf.variable_scope('q1'):
        q1_w = tf.Variable(tf.random_uniform([2, 4], -1.0, 1.0), name='w')
        print(q1_w.name)
    with tf.variable_scope('q2'):
        q2_w = tf.Variable(tf.random_uniform([2, 4], -1.0, 1.0), name='w')
        print(q2_w.name)
    assign_w1_w2 = tf.assign(q1_w, q2_w)
    init_op = tf.global_variables_initializer()

print()

with tf.Session(graph=graph) as session:
    session.run(init_op)
    w1 = graph.get_tensor_by_name('q1/w:0')
    w2 = graph.get_tensor_by_name('q2/w:0')
    w1_, w2_ = session.run([w1, w2])
    print('w1 =', w1_)
    print('w2 =', w2_)
    print('\nw1 <- w2\n')
    session.run(assign_w1_w2)
    w1_, w2_ = session.run([w1, w2])
    print('w1 =', w1_)
    print('w2 =', w2_)
    print('\nfeeding w2\n')
    w1_, w2_ = session.run([w1, w2], feed_dict={w2: np.zeros((2, 4))})
    print('w1 =', w1_)
    print('w2 =', w2_)

del graph

q1/w:0
q2/w:0

w1 = [[ 0.152107   -0.42608047 -0.54790759  0.01789832]
 [ 0.44629812  0.39322686 -0.5048604  -0.4018681 ]]
w2 = [[ 0.55456233 -0.88316011  0.64940739  0.62729669]
 [ 0.53516746 -0.69544077  0.62369895 -0.54851151]]

w1 <- w2

w1 = [[ 0.55456233 -0.88316011  0.64940739  0.62729669]
 [ 0.53516746 -0.69544077  0.62369895 -0.54851151]]
w2 = [[ 0.55456233 -0.88316011  0.64940739  0.62729669]
 [ 0.53516746 -0.69544077  0.62369895 -0.54851151]]

feeding w2

w1 = [[ 0.55456233 -0.88316011  0.64940739  0.62729669]
 [ 0.53516746 -0.69544077  0.62369895 -0.54851151]]
w2 = [[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]


## DQN

* Target Network
* Error clipping

**Target Network**

$Q(s, a) \xrightarrow{} r + \gamma max_{a'}{\tilde{Q}(s', a')}$

After severals steps, the target network $\tilde{Q}$ is updated, just by copying the weights from the current network $Q$.

**Error clipping**

The loss function is directly used in the backward propagation algorithm and large errors cause large changes to the network.

Huber loss.

In [46]:
def q_network(input_size=state_size,
              output_size=action_size,
              hidden_size=64,
              learning_rate=0.00025,
              trainable=True):
    x = tf.placeholder(tf.float32, shape=[None, input_size])
    y = tf.placeholder(tf.float32, shape=[None, action_size])

    W_h = tf.Variable(tf.random_uniform([input_size, hidden_size], -1.0, 1.0), name='W_h')
    b_h = tf.Variable(tf.zeros([hidden_size]), name='b_h')

    hidden = tf.nn.relu(tf.matmul(x, W_h) + b_h)

    W_o = tf.Variable(tf.random_uniform([hidden_size, output_size], -1.0, 1.0), name='W_o')
    b_o = tf.Variable(tf.zeros([output_size]), name='b_o')

    y_hat = tf.matmul(hidden, W_o) + b_o

    v_hat = tf.reduce_max(y_hat, 1)
    pi_hat = tf.argmax(y_hat, 1)

    loss, train_op = None, None
    if trainable:
        loss = tf.reduce_mean(tf.sqrt(tf.square(y_hat - y) + 1) - 1)
        opt = tf.train.RMSPropOptimizer(learning_rate)
        train_op = opt.minimize(loss)

    return QNetwork(x, y, y_hat, v_hat, pi_hat, loss, train_op)

graph = tf.Graph()

with graph.as_default():
    common_variables = ['W_h:0', 'b_h:0', 'W_o:0', 'b_o:0']
    with tf.variable_scope('q_network'):
        Q_net = q_network()
    with tf.variable_scope('q_target'):
        Q_target = q_network(trainable=False)
    
    source_variables = [graph.get_tensor_by_name('q_network/' + name) for name in common_variables]
    target_variables = [graph.get_tensor_by_name('q_target/' + name) for name in common_variables]
    assign_target = [tf.assign(v_target, v_source)
                     for v_target, v_source in zip(target_variables, source_variables)] 
    update_target_op = tf.group(*assign_target)
    
    init_op = tf.global_variables_initializer()

session = tf.Session(graph=graph)
session.run(init_op)

In [47]:
Q_net.predict_action(session, s)

0

In [48]:
Q_target.predict_action(session, s)

0

In [49]:
session.run(update_target_op)

In [50]:
Q_target.predict_action(session, s)

0

In [51]:
session.close()
del graph

In [52]:
class DQNGraph:
    
    # self.graph
    # self.Q_net
    # self.Q_target
    # self.update_target_op
    # self.init_op
    # self.input_size
    # self.output_size
    
    def __init__(self,
                 input_size=state_size,
                 output_size=action_size,
                 hidden_size=64,
                 learning_rate=0.00025):
        self.graph = tf.Graph()
        with self.graph.as_default():
            common_variables = ['W_h:0', 'b_h:0', 'W_o:0', 'b_o:0']
            with tf.variable_scope('q_network'):
                self.Q_net = self.q_network(input_size, output_size, hidden_size, learning_rate)
            with tf.variable_scope('q_target'):
                self.Q_target = self.q_network(input_size, output_size, hidden_size, learning_rate, False)

            source_variables = [self.graph.get_tensor_by_name('q_network/' + name) for name in common_variables]
            target_variables = [self.graph.get_tensor_by_name('q_target/' + name) for name in common_variables]
            assign_target = [tf.assign(v_target, v_source)
                             for v_target, v_source in zip(target_variables, source_variables)] 
            self.update_target_op = tf.group(*assign_target)

            self.init_op = tf.global_variables_initializer()

        self.input_size = input_size
        self.output_size = output_size
    
    def q_network(self, input_size, output_size, hidden_size, learning_rate, trainable=True):
        x = tf.placeholder(tf.float32, shape=[None, input_size])
        y = tf.placeholder(tf.float32, shape=[None, action_size])

        W_h = tf.Variable(tf.random_uniform([input_size, hidden_size], -1.0, 1.0), name='W_h')
        b_h = tf.Variable(tf.zeros([hidden_size]), name='b_h')

        hidden = tf.nn.relu(tf.matmul(x, W_h) + b_h)

        W_o = tf.Variable(tf.random_uniform([hidden_size, output_size], -1.0, 1.0), name='W_o')
        b_o = tf.Variable(tf.zeros([output_size]), name='b_o')

        y_hat = tf.matmul(hidden, W_o) + b_o

        v_hat = tf.reduce_max(y_hat, 1)
        pi_hat = tf.argmax(y_hat, 1)

        loss, train_op = None, None
        if trainable:
            loss = tf.reduce_mean(tf.sqrt(tf.square(y_hat - y) + 1) - 1)
            opt = tf.train.RMSPropOptimizer(learning_rate)
            train_op = opt.minimize(loss)

        return QNetwork(x, y, y_hat, v_hat, pi_hat, loss, train_op)

class DQN:
    
    def __init__(self,
                 q_graph=DQNGraph(),
                 batch_size=64,
                 gamma=0.99,
                 explore_policy=EpsilonGreedy()):
        self.q_graph = q_graph
        self.batch_size = batch_size
        self.gamma = gamma
        self.explore_policy = explore_policy

        self.session = tf.Session(graph=q_graph.graph)
        self.session.run(q_graph.init_op)
        self.update_target()

        
        self.steps_count = 0
    
    def __del__(self):
        self.session.close()

    def update_q(self, memory):
        replay_sample = memory.sample(self.batch_size)
        if not replay_sample:
            return
        
        session = self.session
        Q_net = self.q_graph.Q_net
        Q_target = self.q_graph.Q_target
        gamma = self.gamma
        
        states = np.array([t.state for t in replay_sample])
        states_ = np.array([make_state(t.state_after) for t in replay_sample])

        Q = Q_net.predict_Q(session, states)
        max_Q = Q_target.predict_V(session, states_)
        
        n = len(replay_sample)
        X = np.zeros((n, self.q_graph.input_size))
        Y = np.zeros((n, self.q_graph.output_size))
        for i, t in enumerate(replay_sample):
            s, a, r, s_ = t
            X[i] = s
            q = Q[i]
            if s_ is None:
                q[a] = r
            else:
                q[a] = r + gamma * max_Q[i]
            Y[i] = q
        loss_value = Q_net.train(session, X, Y)
        #print(loss_value)

    def Q(self, X):
        return self.q_graph.Q_net.predict_Q(self.session, X)
    
    def explore(self):
        return next(self.explore_policy)
    
    def action(self, s):
        return self.q_graph.Q_net.predict_action(self.session, s)
    
    def steps_completed(self, n=1000):
        self.steps_count += 1
        return self.steps_count % n == 0

    def update_target(self):
        self.session.run(self.q_graph.update_target_op)

    def __call__(self, s, memory):
        if self.steps_completed(1000):
            self.update_target()
        self.update_q(memory)
        if self.explore():
            return random_agent(s)
        return self.action(s)

In [53]:
%%script false

dqn_agent = DQN()
memory = UniformMemory()

run_agent(dqn_agent, memory)

del memory
del dqn_agent

In [54]:
%%time
#%%script false

memory = UniformMemory()

print('Fill Memory with Random\n')
gym_run(random_agent, memory, num_episodes=10000, report_buffer=1000)

dqn_agent = DQN()

print('\nDQN')
run_agent(dqn_agent, memory, num_episodes=2000, report_buffer=100)

del memory
del dqn_agent

Fill Memory with Random

Episode 1000 last for 23 steps, mean steps 23.13 [8, 117], time 0:00:00.329134
Episode 2000 last for 20 steps, mean steps 22.33 [8, 84], time 0:00:00.292335
Episode 3000 last for 12 steps, mean steps 22.48 [8, 102], time 0:00:00.369667
Episode 4000 last for 19 steps, mean steps 22.16 [8, 89], time 0:00:00.295186
Episode 5000 last for 13 steps, mean steps 22.24 [8, 101], time 0:00:00.291901
Episode 6000 last for 21 steps, mean steps 21.93 [8, 105], time 0:00:00.282845
Episode 7000 last for 18 steps, mean steps 22.25 [8, 99], time 0:00:00.308845
Episode 8000 last for 23 steps, mean steps 22.40 [9, 79], time 0:00:00.299804
Episode 9000 last for 15 steps, mean steps 22.46 [8, 114], time 0:00:00.293422
Episode 10000 last for 24 steps, mean steps 22.19 [8, 102], time 0:00:00.299876

DQN


Episode 100 last for 9 steps, mean steps 13.81 [8, 44], time 0:00:03.290421
Episode 200 last for 11 steps, mean steps 9.83 [8, 14], time 0:00:02.631790
Episode 300 last for 9 steps, mean steps 9.55 [8, 12], time 0:00:02.645751
Episode 400 last for 8 steps, mean steps 9.36 [8, 12], time 0:00:02.524166
Episode 500 last for 9 steps, mean steps 9.59 [8, 12], time 0:00:02.590215
Episode 600 last for 10 steps, mean steps 9.33 [8, 11], time 0:00:02.738682
Episode 700 last for 11 steps, mean steps 9.35 [8, 11], time 0:00:02.565848
Episode 800 last for 9 steps, mean steps 9.39 [8, 11], time 0:00:02.554197
Episode 900 last for 10 steps, mean steps 9.41 [8, 11], time 0:00:02.596771
Episode 1000 last for 9 steps, mean steps 9.41 [8, 12], time 0:00:02.611900
Episode 1100 last for 9 steps, mean steps 9.70 [8, 13], time 0:00:02.640550
Episode 1200 last for 10 steps, mean steps 15.36 [8, 34], time 0:00:04.008642
Episode 1300 last for 9 steps, mean steps 12.15 [8, 21], time 0:00:03.241183
Episode 1400 

## DDQN

$Q(s, a) \xrightarrow{} r + \gamma \tilde{Q}(s', argmax_{a'}{Q(s', a')})$

In [55]:
Q = np.random.rand(5,2)
a_max = np.argmax(Q, axis=1)

print(Q)
print(a_max)

[[ 0.33383221  0.78305821]
 [ 0.51584218  0.86033497]
 [ 0.87480848  0.81155146]
 [ 0.01165582  0.12032662]
 [ 0.94101879  0.72528019]]
[1 1 0 1 0]


In [56]:
class DDQN(DQN):
    
    def update_q(self, memory):
        replay_sample = memory.sample(self.batch_size)
        if not replay_sample:
            return

        session = self.session
        Q_net = self.q_graph.Q_net
        Q_target = self.q_graph.Q_target
        gamma = self.gamma
        
        states = np.array([t.state for t in replay_sample])
        states_ = np.array([make_state(t.state_after) for t in replay_sample])
        
        Q1 = Q_net.predict_Q(session, states)
        Q1_ = Q_net.predict_Q(session, states_) 
        max_a = np.argmax(Q1_, axis=1)
        Q2 = Q_target.predict_Q(session, states_)
        
        n = len(replay_sample)
        X = np.zeros((n, self.q_graph.input_size))
        Y = np.zeros((n, self.q_graph.output_size))
        for i, t in enumerate(replay_sample):
            s, a, r, s_ = t
            X[i] = s
            q = Q1[i]
            if s_ is None:
                q[a] = r
            else:
                q[a] = r + gamma * Q2[i][max_a[i]]
            Y[i] = q
        loss_value = Q_net.train(session, X, Y)
        #print(loss_value)

In [57]:
%%time
#%%script false

memory = UniformMemory()

print('Fill Memory with Random\n')
gym_run(random_agent, memory, num_episodes=10000, report_buffer=1000)

ddqn_agent = DDQN()

print('\nDDQN')
run_agent(ddqn_agent, memory, num_episodes=2000, report_buffer=100)

del memory
del ddqn_agent

Fill Memory with Random

Episode 1000 last for 15 steps, mean steps 22.12 [8, 83], time 0:00:00.357315
Episode 2000 last for 31 steps, mean steps 22.32 [8, 80], time 0:00:00.293653
Episode 3000 last for 34 steps, mean steps 22.33 [8, 95], time 0:00:00.291127
Episode 4000 last for 20 steps, mean steps 21.70 [8, 103], time 0:00:00.364328
Episode 5000 last for 18 steps, mean steps 22.52 [8, 131], time 0:00:00.311522
Episode 6000 last for 27 steps, mean steps 22.46 [8, 99], time 0:00:00.293593
Episode 7000 last for 12 steps, mean steps 22.09 [9, 115], time 0:00:00.288201
Episode 8000 last for 35 steps, mean steps 22.41 [8, 131], time 0:00:00.301552
Episode 9000 last for 10 steps, mean steps 22.11 [8, 97], time 0:00:00.294681
Episode 10000 last for 23 steps, mean steps 22.04 [8, 129], time 0:00:00.277158

DDQN


Episode 100 last for 9 steps, mean steps 9.49 [8, 12], time 0:00:02.917065
Episode 200 last for 9 steps, mean steps 9.55 [8, 11], time 0:00:02.961491
Episode 300 last for 10 steps, mean steps 9.33 [8, 12], time 0:00:02.916018
Episode 400 last for 10 steps, mean steps 9.48 [8, 12], time 0:00:03.037117
Episode 500 last for 8 steps, mean steps 9.47 [8, 12], time 0:00:02.983466
Episode 600 last for 9 steps, mean steps 9.37 [8, 12], time 0:00:03.124681
Episode 700 last for 10 steps, mean steps 9.42 [8, 11], time 0:00:02.989656
Episode 800 last for 9 steps, mean steps 9.39 [8, 12], time 0:00:02.916817
Episode 900 last for 9 steps, mean steps 9.29 [8, 11], time 0:00:02.968359
Episode 1000 last for 9 steps, mean steps 9.29 [8, 11], time 0:00:02.912266
Episode 1100 last for 10 steps, mean steps 9.52 [8, 12], time 0:00:03.028264
Episode 1200 last for 10 steps, mean steps 9.37 [8, 12], time 0:00:02.909065
Episode 1300 last for 9 steps, mean steps 10.36 [8, 24], time 0:00:03.272312
Episode 1400 la

## Priorized Experience Replay

**Priority**

$p = (error + \epsilon)^\alpha$

* $\epsilon$ small positive constant
* $\alpha \in [0, 1]$ difference between high an low error

**Error**

$error = |Q(s, a) - T(s')|$

$T(s') = r + \gamma \tilde{Q}(s', argmax_{a'}{Q(s', a')})$

In [58]:
# Efficient sampling using Sum Tree

class WeightedTree:
    
    def __init__(self, min_size):
        data_size = 2 ** math.ceil(np.log2(min_size))
        self.data_size = data_size
        self.data_level = data_size - 1
        self.data_index = 0 # 0 .. data_size-1
        self.tree = np.zeros(2 * data_size - 1)
        self.data = [None] * data_size
    
    def __str__(self):
        return str(self.tree)
    
    def max_range(self):
        return self.tree[0]

    def add(self, data, weight):
        self.set_weight(self.data_index, weight)
        self.data[self.data_index] = data
        self.data_index = (self.data_index + 1) % self.data_size

    def set_weight(self, data_index, weight):
        i = self.data_level + data_index
        delta = weight - self.tree[i]
        self.tree[i] = weight
        while i:
            i = (i - 1) // 2
            self.tree[i] += delta

    def _select(self, i, range_value):
        while i < self.data_level:
            left = 2 * i + 1
            right = left + 1
            if range_value <= self.tree[left]:
                i = left
            else:
                i, range_value = right, range_value - self.tree[left]
        return i

    def select(self, range_value):
        i = self._select(0, range_value)
        data_index = i - self.data_level
        return (data_index, self.data[data_index], self.tree[i])

t = WeightedTree(8)
print(t)
t.add("a", 1)
print(t)
t.add("b", 2)
print(t)
t.add("c", 3)
print(t)
t.add("d", 4)
print(t)
t.add("e", 5)
print(t)
t.add("f", 6)
print(t)
t.add("g", 7)
print(t)
t.add("h", 8)
print(t)

for i in range(int(t.max_range())):
    m = t.select(i+1)
    print(i+1, m)

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 1.  1.  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
[ 3.  3.  0.  3.  0.  0.  0.  1.  2.  0.  0.  0.  0.  0.  0.]
[ 6.  6.  0.  3.  3.  0.  0.  1.  2.  3.  0.  0.  0.  0.  0.]
[ 10.  10.   0.   3.   7.   0.   0.   1.   2.   3.   4.   0.   0.   0.   0.]
[ 15.  10.   5.   3.   7.   5.   0.   1.   2.   3.   4.   5.   0.   0.   0.]
[ 21.  10.  11.   3.   7.  11.   0.   1.   2.   3.   4.   5.   6.   0.   0.]
[ 28.  10.  18.   3.   7.  11.   7.   1.   2.   3.   4.   5.   6.   7.   0.]
[ 36.  10.  26.   3.   7.  11.  15.   1.   2.   3.   4.   5.   6.   7.   8.]
1 (0, 'a', 1.0)
2 (1, 'b', 2.0)
3 (1, 'b', 2.0)
4 (2, 'c', 3.0)
5 (2, 'c', 3.0)
6 (2, 'c', 3.0)
7 (3, 'd', 4.0)
8 (3, 'd', 4.0)
9 (3, 'd', 4.0)
10 (3, 'd', 4.0)
11 (4, 'e', 5.0)
12 (4, 'e', 5.0)
13 (4, 'e', 5.0)
14 (4, 'e', 5.0)
15 (4, 'e', 5.0)
16 (5, 'f', 6.0)
17 (5, 'f', 6.0)
18 (5, 'f', 6.0)
19 (5, 'f', 6.0)
20 (5, 'f', 6.0)
21 (5, 'f', 6.0)
22 (6, 'g', 7.0)
23

In [59]:
priority_eps = 0.01
priority_alpha = 0.6

class PriorizedMemory:
    
    def __init__(self,
                 error_func=lambda m: 0,
                 min_size=memory_size,
                 priority_eps=priority_eps,
                 priority_alpha=priority_alpha):
        self.data = WeightedTree(min_size)
        self.priority_eps = priority_eps
        self.priority_alpha = priority_alpha
        self._error = error_func
    
    def _priority(self, error):
        return (error + self.priority_eps) ** self.priority_alpha
    
    def add(self, memento):
        err = self._error(memento)
        p = self._priority(err)
        self.data.add(memento, p)
    
    def sample(self, n):
        sample = []

        range_size = self.data.max_range() / n
        for i in range(n):
            start = i * range_size
            stop = (i + 1) * range_size

            range_value = random.uniform(start, stop)
            k, memento, _ = self.data.select(range_value)
            if memento is not None:
                sample.append((k, memento))

        return sample

    def update(self, k, error):
        p = self._priority(error)
        self.data.set_weight(k, p)

In [60]:
memory = UniformMemory()
gym_run(random_agent, memory, num_episodes=1, report_buffer=0)
s, a, r, s_ = memory.sample(1)[0]

s_ = make_state(s_)
s = s.reshape((1, state_size))
s_ = s_.reshape((1, state_size))

s, a, r, s_

(array([[ 0.00353868, -0.43856987, -0.01818327,  0.60223067]]),
 0,
 1.0,
 array([[-0.00523272, -0.63343283, -0.00613865,  0.8891312 ]]))

In [61]:
graph = tf.Graph()

with graph.as_default():
    common_variables = ['W_h:0', 'b_h:0', 'W_o:0', 'b_o:0']
    with tf.variable_scope('q_network'):
        Q_net = q_network()
    with tf.variable_scope('q_target'):
        Q_target = q_network(trainable=False)
    
    source_variables = [graph.get_tensor_by_name('q_network/' + name) for name in common_variables]
    target_variables = [graph.get_tensor_by_name('q_target/' + name) for name in common_variables]
    assign_target = [tf.assign(v_target, v_source)
                     for v_target, v_source in zip(target_variables, source_variables)] 
    update_target_op = tf.group(*assign_target)
    
    init_op = tf.global_variables_initializer()

session = tf.Session(graph=graph)
session.run(init_op)

In [62]:
q1 = Q_net.predict_Q(session, s)[0][a]
print(q1)

0.722408


In [63]:
q1_ = Q_net.predict_Q(session, s_)[0]
a_ = np.argmax(q1_)
print(q1_)
print(a_)

[ 1.05110538  1.41670191]
1


In [64]:
q2_ = Q_target.predict_Q(session, s_)[0][a_]
print(q2_)

-1.4806


In [65]:
t = r + gamma * q2_
print(t)

-0.465795651674


In [66]:
error = abs(q1 - t)
print(error)

1.18820335031


In [67]:
def ddqn_error(memento):
    s, a, r, s_ = memento

    s_ = make_state(s_)
    s = s.reshape((1, state_size))
    s_ = s_.reshape((1, state_size))
    
    q1 = Q_net.predict_Q(session, s)[0][a]
    q1_ = Q_net.predict_Q(session, s_)[0]
    a_ = np.argmax(q1_)
    
    q2_ = Q_target.predict_Q(session, s_)[0][a_]
    t = r + gamma * q2_
    
    return abs(q1 - t)

err = ddqn_error(memory.sample(1)[0])
print(err)

0.445749735609


In [68]:
session.close()
del graph

In [69]:
class DDQN_PER(DQN):
    
    def error(self, memento):
        _, _, err = self.train_data([(-1, memento)])
        return err[0][1]
    
    def train_data(self, sample):
        states = np.array([t.state for _, t in sample])
        states_ = np.array([make_state(t.state_after) for _, t in sample])
        
        session = self.session
        Q_net = self.q_graph.Q_net
        Q_target = self.q_graph.Q_target
        gamma = self.gamma

        Q1 = Q_net.predict_Q(session, states)
        Q1_ = Q_net.predict_Q(session, states_) 
        max_a = np.argmax(Q1_, axis=1)
        Q2 = Q_target.predict_Q(session, states_)
        
        n = len(sample)
        X = np.zeros((n, self.q_graph.input_size))
        Y = np.zeros((n, self.q_graph.output_size))
        errors = [None] * n
        for i, (k, t) in enumerate(sample):
            s, a, r, s_ = t
            X[i] = s
            q = Q1[i]
            q_a = q[a]
            if s_ is None:
                q[a] = r
            else:
                q[a] = r + gamma * Q2[i][max_a[i]]
            Y[i] = q
            errors[i] = (k, abs(q_a - q[a]))
        
        return X, Y, errors
    
    def update_q(self, memory):
        replay_sample = memory.sample(self.batch_size)
        if not replay_sample:
            return

        X, Y, errors = self.train_data(replay_sample)
        
        for k, err in errors:
            memory.update(k, err)

        session = self.session
        Q_net = self.q_graph.Q_net

        loss_value = Q_net.train(session, X, Y)
        #print(loss_value)

In [70]:
%%time
#%%script false

ddqn_per_agent = DDQN_PER()

memory = PriorizedMemory(ddqn_per_agent.error)

print('\nDDQN+PER')
run_agent(ddqn_per_agent, memory, num_episodes=2000, report_buffer=100)

del memory
del ddqn_per_agent


DDQN+PER


Episode 100 last for 9 steps, mean steps 9.31 [8, 12], time 0:00:04.735059
Episode 200 last for 9 steps, mean steps 9.39 [8, 12], time 0:00:04.798128
Episode 300 last for 9 steps, mean steps 9.29 [8, 13], time 0:00:04.757299
Episode 400 last for 11 steps, mean steps 9.43 [8, 12], time 0:00:04.849615
Episode 500 last for 9 steps, mean steps 9.39 [8, 11], time 0:00:04.938023
Episode 600 last for 11 steps, mean steps 9.82 [8, 12], time 0:00:05.061600
Episode 700 last for 104 steps, mean steps 29.00 [9, 105], time 0:00:14.304862
Episode 800 last for 200 steps, mean steps 161.03 [58, 200], time 0:01:18.478816
Episode 900 last for 200 steps, mean steps 199.83 [190, 200], time 0:01:37.921211
Episode 1000 last for 184 steps, mean steps 194.05 [164, 200], time 0:01:46.376399
Episode 1100 last for 193 steps, mean steps 192.44 [169, 200], time 0:01:36.796337
Episode 1200 last for 200 steps, mean steps 195.84 [175, 200], time 0:01:35.102344
Episode 1300 last for 200 steps, mean steps 198.61 [179, 