# Double DQNの解説

![image.png](attachment:image.png)

画像引用:  
https://qiita.com/sugulu/items/3c7d6cbe600d455e853b

### DQNの特徴
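- Q学習における状態行動価値テーブル (Qテーブル) を, ニューラルネットワークによる関数近似で置き換えたもの.
- 離散的な行動を扱うことができる.
- Double DQNでは, 行動の選択と行動価値の評価に別々のネットワーク (学習中のネットワークとターゲットネットワーク) を用いることで, DQNで生じやすいQ値の過大評価を抑える.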

参考:  
http://blog.syundo.org/post/20171208-reinforcement-learning-dqn-and-impl/

### OpenAI gymのインストール

GitHubのリポジトリを参考に, gymモジュールをインストールしてください. 本ノートブックではAtariの環境 (`BreakoutNoFrameskip-v4`) を使用するため, Atari用の依存パッケージも合わせてインストールしてください.  
https://github.com/openai/gym

In [1]:
import gym
import numpy as np
import renom as rm
import matplotlib.pyplot as plt
from renom.utility.initializer import Gaussian
from renom.cuda import set_cuda_active
from renom_rl.double_dqn import DoubleDQN
from renom_rl.env import BaseEnv
from gym.core import Env
from PIL import Image

# Turn on ReNom's CUDA backend before any network is built or trained.
# NOTE(review): presumably requires a CUDA-capable GPU and a CUDA-enabled
# ReNom install -- confirm, or pass False to run on the CPU.
set_cuda_active(True)
# Atari Breakout, "NoFrameskip" variant; frame skipping is performed by
# CustomEnv.step below rather than by the gym environment itself.
env = gym.make('BreakoutNoFrameskip-v4')

class CustomEnv(BaseEnv):
    """Adapter that exposes a gym Atari environment through ``BaseEnv``.

    Every call to :meth:`step` repeats the chosen action for up to
    ``FRAME_SKIP`` emulator frames, keeps only the last frame (grayscale,
    84x84, scaled to [0, 1]) and returns the stack of the most recent
    ``FRAME_STACK`` kept frames as the state.
    """

    # Number of emulator frames the same action is repeated for in one step.
    FRAME_SKIP = 4
    # Number of most recent preprocessed frames stacked into one state.
    FRAME_STACK = 4

    def __init__(self, env):
        """Wrap ``env``.

        Args:
            env: gym environment whose ``step`` returns the 4-tuple
                ``(observation, reward, done, info)``.
        """
        self.env = env
        # NOTE(review): hard-coded number of discrete actions; presumably
        # equals env.action_space.n for Breakout -- confirm when swapping
        # in a different game.
        self.action_shape = 4
        # (stacked frames, height, width) of the state returned by step().
        self.state_shape = (4, 84, 84)
        # Most recent preprocessed frames, oldest first.
        self.previous_frames = []
        super(CustomEnv, self).__init__()

    def reset(self):
        """Start a new episode and return its first stacked state.

        After resetting the wrapped environment, between 4 and 32 (inclusive)
        random-action steps are taken. At least 4 steps are always taken, so
        the frame stack is completely refilled with frames from the new
        episode before the state is returned.
        """
        self.env.reset()
        # Drop frames left over from the previous episode so that no state
        # produced after a reset can mix frames from two episodes.
        self.previous_frames = []
        n_step = np.random.randint(4, 32 + 1)  # upper bound is exclusive
        for _ in range(n_step):
            state, _, _ = self.step(self.sample())
        return state

    def sample(self):
        """Return a random action drawn from the wrapped env's action space."""
        return self.env.action_space.sample()

    def render(self):
        """Render the wrapped environment."""
        self.env.render()

    def _preprocess(self, state):
        """Convert one raw frame to an 84x84 grayscale array in [0, 1].

        Args:
            state: raw frame array accepted by ``PIL.Image.fromarray``.

        Returns:
            ``numpy.ndarray`` of shape (84, 84).
        """
        # PIL sizes are (width, height): resize to 84 wide x 110 high.
        resized_image = Image.fromarray(state).resize((84, 110)).convert('L')
        image_array = np.asarray(resized_image) / 255.
        # Keep the bottom 84 rows, giving a square 84x84 image.
        # To inspect the result while debugging, scale it back to 0-255,
        # cast to uint8 and save it with Image.fromarray(...).save(...).
        return image_array[26:110]

    def step(self, action):
        """Repeat ``action`` for up to ``FRAME_SKIP`` frames.

        Only the last frame seen is preprocessed and pushed onto the frame
        stack; the intermediate frames are skipped.

        Args:
            action: action forwarded unchanged to the wrapped environment.

        Returns:
            Tuple ``(state, reward, terminal)``: ``state`` is the stack of
            the most recent frames (at most ``FRAME_STACK`` of them),
            ``reward`` is the sum of the rewards of the repeated frames, and
            ``terminal`` tells whether the episode ended during this step.
        """
        reward_list = []
        terminal = False
        for _ in range(self.FRAME_SKIP):
            raw_frame, reward, done, _ = self.env.step(action)
            reward_list.append(reward)
            if done:
                # Stop here: stepping a finished gym episode is undefined.
                terminal = True
                break

        # Only the last frame is used, so preprocess just that one.
        frame = self._preprocess(raw_frame)
        # Append the new frame and keep only the newest FRAME_STACK frames.
        self.previous_frames = (self.previous_frames + [frame])[-self.FRAME_STACK:]
        state = np.stack(self.previous_frames)
        return state, np.sum(reward_list), terminal
    
custom_env = CustomEnv(env)

# Q-network: three bias-free convolution + ReLU stages, then a 512-unit
# hidden layer and one output unit per action.
# Each entry is (output channels, filter size, stride) of one convolution.
conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

layers = []
for n_channel, n_filter, n_stride in conv_specs:
    layers.append(rm.Conv2d(n_channel, filter=n_filter, stride=n_stride,
                            ignore_bias=True))
    layers.append(rm.Relu())
layers.append(rm.Flatten())
layers.append(rm.Dense(512, ignore_bias=True))
layers.append(rm.Relu())
layers.append(rm.Dense(custom_env.action_shape, ignore_bias=True))

q_network = rm.Sequential(layers)

In [None]:
# Build the Double DQN agent from the wrapped environment and the Q-network.
model = DoubleDQN(custom_env, q_network)

In [None]:
# Train the agent with rendering switched on.
# NOTE(review): greedy_step is presumably the number of steps over which the
# e-greedy value is annealed (the log below shows it rising slowly from 0) --
# confirm against the renom_rl documentation.
model.fit(render=True, greedy_step=1000000)

Run random 5000 step for storing experiences


episode 001 avg_loss: 0.009 total_reward [train:2.000 test:-] e-greedy:0.000: : 226it [00:06, 35.27it/s]
episode 002 avg_loss: 0.009 total_reward [train:0.000 test:-] e-greedy:0.000: : 105it [00:02, 36.68it/s]
episode 003 avg_loss: 0.005 total_reward [train:2.000 test:-] e-greedy:0.000: : 184it [00:05, 35.70it/s]
episode 004 avg_loss: 0.008 total_reward [train:0.000 test:-] e-greedy:0.001: : 121it [00:03, 35.71it/s]
episode 005 avg_loss: 0.008 total_reward [train:1.000 test:-] e-greedy:0.001: : 160it [00:04, 35.89it/s]
episode 006 avg_loss: 0.008 total_reward [train:1.000 test:-] e-greedy:0.001: : 172it [00:04, 36.85it/s]
episode 007 avg_loss: 0.006 total_reward [train:1.000 test:-] e-greedy:0.001: : 126it [00:03, 35.44it/s]
episode 008 avg_loss: 0.006 total_reward [train:1.000 test:-] e-greedy:0.001: : 154it [00:04, 35.32it/s]
episode 009 avg_loss: 0.010 total_reward [train:1.000 test:-] e-greedy:0.001: : 171it [00:04, 39.11it/s]
episode 010 avg_loss: 0.009 total_reward [train:0.000 t

episode 156 avg_loss: 0.001 total_reward [train:0.000 test:-] e-greedy:0.022: : 129it [00:03, 37.66it/s]
episode 157 avg_loss: 0.003 total_reward [train:0.000 test:-] e-greedy:0.022: : 122it [00:03, 38.17it/s]
episode 158 avg_loss: 0.002 total_reward [train:0.000 test:-] e-greedy:0.022: : 112it [00:02, 38.26it/s]
episode 159 avg_loss: 0.002 total_reward [train:2.000 test:-] e-greedy:0.022: : 174it [00:04, 38.32it/s]
episode 160 avg_loss: 0.004 total_reward [train:0.000 test:-] e-greedy:0.022: : 136it [00:03, 38.79it/s]
episode 161 avg_loss: 0.002 total_reward [train:2.000 test:-] e-greedy:0.023: : 194it [00:05, 38.50it/s]
episode 162 avg_loss: 0.001 total_reward [train:1.000 test:-] e-greedy:0.023: : 148it [00:03, 38.43it/s]
episode 163 avg_loss: 0.004 total_reward [train:0.000 test:-] e-greedy:0.023: : 141it [00:03, 38.41it/s]
episode 164 avg_loss: 0.002 total_reward [train:2.000 test:-] e-greedy:0.023: : 165it [00:04, 37.95it/s]
episode 165 avg_loss: 0.002 total_reward [train:1.000 t

episode 310 avg_loss: 0.001 total_reward [train:1.000 test:-] e-greedy:0.043: : 132it [00:03, 37.07it/s]
episode 311 avg_loss: 0.002 total_reward [train:1.000 test:-] e-greedy:0.044: : 150it [00:03, 37.83it/s]
episode 312 avg_loss: 0.002 total_reward [train:0.000 test:-] e-greedy:0.044: : 118it [00:03, 38.70it/s]
episode 313 avg_loss: 0.002 total_reward [train:2.000 test:-] e-greedy:0.044: : 188it [00:04, 38.01it/s]
episode 314 avg_loss: 0.001 total_reward [train:0.000 test:-] e-greedy:0.044: : 116it [00:03, 37.66it/s]
episode 315 avg_loss: 0.001 total_reward [train:1.000 test:-] e-greedy:0.044: : 134it [00:03, 38.12it/s]
episode 316 avg_loss: 0.002 total_reward [train:0.000 test:-] e-greedy:0.044: : 130it [00:03, 39.80it/s]
episode 317 avg_loss: 0.001 total_reward [train:0.000 test:-] e-greedy:0.044: : 132it [00:03, 37.76it/s]
episode 318 avg_loss: 0.002 total_reward [train:1.000 test:-] e-greedy:0.044: : 142it [00:03, 38.98it/s]
episode 319 avg_loss: 0.001 total_reward [train:1.000 t

episode 464 avg_loss: 0.002 total_reward [train:3.000 test:-] e-greedy:0.066: : 253it [00:07, 36.11it/s]
episode 465 avg_loss: 0.001 total_reward [train:1.000 test:-] e-greedy:0.066: : 161it [00:04, 36.75it/s]
episode 466 avg_loss: 0.003 total_reward [train:0.000 test:-] e-greedy:0.066: : 115it [00:03, 37.82it/s]
episode 467 avg_loss: 0.001 total_reward [train:1.000 test:-] e-greedy:0.067: : 156it [00:04, 37.55it/s]
episode 468 avg_loss: 0.002 total_reward [train:2.000 test:-] e-greedy:0.067: : 199it [00:05, 36.38it/s]
episode 469 avg_loss: 0.002 total_reward [train:0.000 test:-] e-greedy:0.067: : 113it [00:03, 36.91it/s]
episode 470 avg_loss: 0.002 total_reward [train:2.000 test:-] e-greedy:0.067: : 218it [00:06, 36.34it/s]
episode 471 avg_loss: 0.002 total_reward [train:1.000 test:-] e-greedy:0.067: : 142it [00:03, 37.35it/s]
episode 472 avg_loss: 0.001 total_reward [train:3.000 test:-] e-greedy:0.067: : 211it [00:05, 35.17it/s]
episode 473 avg_loss: 0.001 total_reward [train:0.000 t

episode 618 avg_loss: 0.002 total_reward [train:2.000 test:-] e-greedy:0.088: : 213it [00:05, 38.13it/s]
episode 619 avg_loss: 0.002 total_reward [train:2.000 test:-] e-greedy:0.089: : 207it [00:05, 37.09it/s]
episode 620 avg_loss: 0.001 total_reward [train:4.000 test:-] e-greedy:0.089: : 269it [00:07, 36.60it/s]
episode 621 avg_loss: 0.001 total_reward [train:3.000 test:-] e-greedy:0.089: : 267it [00:07, 37.82it/s]
episode 622 avg_loss: 0.001 total_reward [train:2.000 test:-] e-greedy:0.089: : 223it [00:05, 37.97it/s]
episode 623 avg_loss: 0.001 total_reward [train:0.000 test:-] e-greedy:0.089: : 99it [00:02, 37.32it/s]
episode 624 avg_loss: 0.002 total_reward [train:1.000 test:-] e-greedy:0.090: : 166it [00:04, 37.03it/s]
episode 625 avg_loss: 0.002 total_reward [train:7.000 test:-] e-greedy:0.090: : 251it [00:06, 38.17it/s]
episode 626 avg_loss: 0.002 total_reward [train:1.000 test:-] e-greedy:0.090: : 145it [00:03, 37.86it/s]
episode 627 avg_loss: 0.003 total_reward [train:0.000 te

episode 696 avg_loss: 0.002 total_reward [train:5.000 test:-] e-greedy:0.101: : 358it [00:09, 36.16it/s]
episode 697 avg_loss: 0.002 total_reward [train:2.000 test:-] e-greedy:0.101: : 207it [00:05, 35.37it/s]
episode 698 avg_loss: 0.002 total_reward [train:0.000 test:-] e-greedy:0.101: : 123it [00:03, 35.25it/s]
episode 699 avg_loss: 0.001 total_reward [train:2.000 test:-] e-greedy:0.101: : 213it [00:05, 35.72it/s]
episode 700 avg_loss: 0.001 total_reward [train:0.000 test:1.000] e-greedy:0.101: : 109it [00:05, 37.05it/s]
episode 701 avg_loss: 0.002 total_reward [train:0.000 test:-] e-greedy:0.101: : 130it [00:03, 36.74it/s]
episode 702 avg_loss: 0.002 total_reward [train:4.000 test:-] e-greedy:0.102: : 291it [00:07, 37.00it/s]
episode 703 avg_loss: 0.001 total_reward [train:3.000 test:-] e-greedy:0.102: : 232it [00:06, 36.78it/s]
episode 704 avg_loss: 0.001 total_reward [train:3.000 test:-] e-greedy:0.102: : 209it [00:05, 37.37it/s]
episode 705 avg_loss: 0.002 total_reward [train:0.0

episode 850 avg_loss: 0.001 total_reward [train:0.000 test:1.000] e-greedy:0.122: : 114it [00:04, 27.70it/s]
episode 851 avg_loss: 0.002 total_reward [train:3.000 test:-] e-greedy:0.123: : 230it [00:05, 38.42it/s]
episode 852 avg_loss: 0.001 total_reward [train:0.000 test:-] e-greedy:0.123: : 106it [00:02, 36.72it/s]
episode 853 avg_loss: 0.001 total_reward [train:1.000 test:-] e-greedy:0.123: : 147it [00:03, 37.80it/s]
episode 854 avg_loss: 0.001 total_reward [train:1.000 test:-] e-greedy:0.123: : 150it [00:03, 37.57it/s]
episode 855 avg_loss: 0.001 total_reward [train:1.000 test:-] e-greedy:0.123: : 201it [00:05, 37.83it/s]
episode 856 avg_loss: 0.001 total_reward [train:0.000 test:-] e-greedy:0.123: : 116it [00:03, 37.85it/s]
episode 857 avg_loss: 0.002 total_reward [train:1.000 test:-] e-greedy:0.124: : 169it [00:04, 38.82it/s]
episode 858 avg_loss: 0.001 total_reward [train:3.000 test:-] e-greedy:0.124: : 222it [00:05, 38.10it/s]
episode 859 avg_loss: 0.002 total_reward [train:1.0

episode 1004 avg_loss: 0.002 total_reward [train:0.000 test:-] e-greedy:0.143: : 121it [00:03, 36.85it/s]
episode 1005 avg_loss: 0.001 total_reward [train:1.000 test:-] e-greedy:0.143: : 144it [00:03, 37.82it/s]
episode 1006 avg_loss: 0.001 total_reward [train:0.000 test:-] e-greedy:0.143: : 123it [00:03, 38.07it/s]
episode 1007 avg_loss: 0.002 total_reward [train:2.000 test:-] e-greedy:0.144: : 193it [00:05, 37.42it/s]
episode 1008 avg_loss: 0.001 total_reward [train:1.000 test:-] e-greedy:0.144: : 138it [00:03, 37.25it/s]
episode 1009 avg_loss: 0.002 total_reward [train:1.000 test:-] e-greedy:0.144: : 163it [00:04, 37.93it/s]
episode 1010 avg_loss: 0.004 total_reward [train:0.000 test:-] e-greedy:0.144: : 110it [00:03, 36.58it/s]
episode 1011 avg_loss: 0.003 total_reward [train:2.000 test:-] e-greedy:0.144: : 191it [00:05, 38.50it/s]
episode 1012 avg_loss: 0.002 total_reward [train:1.000 test:-] e-greedy:0.144: : 133it [00:03, 38.00it/s]
episode 1013 avg_loss: 0.002 total_reward [tra

episode 1158 avg_loss: 0.002 total_reward [train:2.000 test:-] e-greedy:0.166: : 181it [00:05, 36.39it/s]
episode 1159 avg_loss: 0.001 total_reward [train:1.000 test:-] e-greedy:0.167: : 185it [00:05, 36.24it/s]
episode 1160 avg_loss: 0.002 total_reward [train:1.000 test:-] e-greedy:0.167: : 143it [00:04, 35.13it/s]
episode 1161 avg_loss: 0.002 total_reward [train:2.000 test:-] e-greedy:0.167: : 197it [00:05, 35.54it/s]
episode 1162 avg_loss: 0.003 total_reward [train:1.000 test:-] e-greedy:0.167: : 149it [00:03, 39.60it/s]
episode 1163 avg_loss: 0.002 total_reward [train:2.000 test:-] e-greedy:0.167: : 191it [00:05, 35.13it/s]
episode 1164 avg_loss: 0.002 total_reward [train:1.000 test:-] e-greedy:0.167: : 167it [00:04, 35.74it/s]
episode 1165 avg_loss: 0.002 total_reward [train:1.000 test:-] e-greedy:0.167: : 180it [00:05, 35.76it/s]
episode 1166 avg_loss: 0.002 total_reward [train:7.000 test:-] e-greedy:0.168: : 367it [00:10, 36.94it/s]
episode 1167 avg_loss: 0.002 total_reward [tra

episode 1312 avg_loss: 0.003 total_reward [train:3.000 test:-] e-greedy:0.192: : 256it [00:06, 35.04it/s]
episode 1313 avg_loss: 0.002 total_reward [train:3.000 test:-] e-greedy:0.192: : 209it [00:05, 39.07it/s]
episode 1314 avg_loss: 0.002 total_reward [train:0.000 test:-] e-greedy:0.192: : 126it [00:03, 37.11it/s]
episode 1315 avg_loss: 0.002 total_reward [train:1.000 test:-] e-greedy:0.192: : 167it [00:04, 36.85it/s]
episode 1316 avg_loss: 0.002 total_reward [train:4.000 test:-] e-greedy:0.192: : 254it [00:06, 37.74it/s]
episode 1317 avg_loss: 0.002 total_reward [train:3.000 test:-] e-greedy:0.193: : 247it [00:06, 35.81it/s]
episode 1318 avg_loss: 0.002 total_reward [train:0.000 test:-] e-greedy:0.193: : 142it [00:04, 35.02it/s]
episode 1319 avg_loss: 0.004 total_reward [train:0.000 test:-] e-greedy:0.193: : 98it [00:02, 33.66it/s]
episode 1320 avg_loss: 0.003 total_reward [train:2.000 test:-] e-greedy:0.193: : 163it [00:04, 37.82it/s]
episode 1321 avg_loss: 0.002 total_reward [trai

In [None]:
# Save the Q-network to "dqn_exp5.h5" in the current working directory.
q_network.save("dqn_exp5.h5")
# NOTE(review): leftover alternative that wraps the network in a plain DQN
# agent; DQN is not imported in the cells shown here, so import it before
# re-enabling this line.
# model = DQN(custom_env, q_network)

In [None]:
# Run the agent in test mode with rendering switched on.
model.test(render=True)

In [None]:
import time

# Scratch micro-benchmark: how long does NumPy take to build a random
# permutation of 10 indices?
# perf_counter() is used instead of time() because it is monotonic and has
# the highest available resolution, which matters for an interval this short.
start_t = time.perf_counter()
a = np.random.permutation(int(1e1))
print(time.perf_counter() - start_t)  # elapsed time in seconds

# 