In [1]:
import matplotlib.pyplot as plt
import gym

# Preview the first frame of the Breakout environment that is trained on below.
env = gym.make("BreakoutDeterministic-v4")
try:
    env.reset()
    first_frame = env.render(mode="rgb_array")
finally:
    # Always release the environment, even if reset()/render() raises.
    env.close()

plt.imshow(first_frame)
plt.axis("off");

## 

## Required Libraries

- ReNomDL 2.6
- ReNomRL 0.2
- NumPy 1.14
- matplotlib 2.2.3
- OpenAI Gym (Atari 環境 `BreakoutDeterministic-v4` を使用)
- Pillow (フレームの前処理に使用)


In [2]:
import gym
import numpy as np
import renom as rm
from renom.cuda import set_cuda_active
from renom_rl.discrete.dqn import DQN
from renom_rl.environ.env import BaseEnv
from renom_rl.utility import Animation
from renom_rl.utility.logger import DQNLogger
import matplotlib.pyplot as plt
from renom.utility.initializer import GlorotUniform
from PIL import Image
# Switch ReNom to GPU computation.
# NOTE(review): presumably this needs a CUDA-capable setup — confirm before
# running the notebook on a CPU-only machine.
set_cuda_active(True)

## 環境の定義

ReNomRLでは、環境は ``BaseEnv`` クラスを継承して定義する必要があります。

In [3]:
class CustomEnv(BaseEnv):
    """Gym environment wrapper that feeds stacked, preprocessed frames to DQN.

    Every raw frame is resized, converted to grayscale and scaled to [0, 1].
    A state is the stack of the most recent frames, shaped ``state_shape``
    (4 frames of 84x84 pixels).

    Args:
        env: Gym environment; must provide ``action_space.n``,
            ``action_space.sample()``, ``reset()``, ``step()``,
            ``render()`` and ``close()``.
    """

    def __init__(self, env):
        self.action_shape = (env.action_space.n, )
        self.state_shape = (4, 84, 84)
        # Number of stacked frames. Derived from state_shape so the two
        # values cannot drift apart.
        self._length = self.state_shape[0]
        self.env = env
        self.reward = 0
        # Sliding window of the latest preprocessed frames (oldest first).
        self.resent_state_list = []
        self.animation = Animation()
        self.zero_image = np.zeros(self.state_shape, dtype=np.float32)
        # Defined up front so the flag is readable before test_start() runs.
        self.test_mode = False

    def _stacked_state(self):
        """Return the current frame window as one ``state_shape`` array."""
        return np.array(self.resent_state_list).reshape(self.state_shape)

    def reset(self):
        """Reset the wrapped environment and return the initial state.

        The window starts with the reset frame and is filled up with the
        frames produced by randomly sampled actions.
        """
        self.resent_state_list = [self._preprocess(self.env.reset())]
        for _ in range(self._length - 1):
            frame = self.env.step(self.env.action_space.sample())[0]
            self.resent_state_list.append(self._preprocess(frame))
        return self._stacked_state()

    def _preprocess(self, s):
        """Convert a raw frame into a float32 grayscale image in [0, 1]."""
        _, height, width = self.state_shape
        # PIL's resize() takes (width, height).
        img = Image.fromarray(s).resize((width, height)).convert('L')
        return np.asanyarray(img).astype(np.float32) / 255.

    def sample(self):
        """Return a randomly sampled action from the wrapped environment."""
        return self.env.action_space.sample()

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, terminal)``.

        The newest frame is pushed into the window and the oldest one is
        dropped. The reward is clipped to the range [-1, 1].
        """
        s, r, t, _ = self.env.step(action)
        self.resent_state_list.append(self._preprocess(s))
        # Keep only the most recent `_length` frames.
        self.resent_state_list = self.resent_state_list[-self._length:]
        return self._stacked_state(), np.clip(r, -1, 1), t

    def terminate(self):
        """Always return False.

        NOTE(review): presumably a hook ReNomRL polls to stop training
        early — confirm against the BaseEnv documentation.
        """
        return False

    def test_start(self):
        """Clear the recorded animation and raise the test-mode flag."""
        self.animation.reset()
        self.test_mode = True

    def test_step(self):
        """Store the currently rendered RGB frame in the animation."""
        self.animation.store(self.env.render(mode="rgb_array"))

    def test_close(self):
        """Close the wrapped environment and lower the test-mode flag."""
        self.env.close()
        self.env.viewer = None
        self.test_mode = False

    def reset_anime(self):
        """Discard the frames recorded in the animation."""
        self.animation.reset()

class MyLogger(DQNLogger):
    """DQN logger that tracks per-episode statistics and weight norms.

    Q value, loss and reward are recorded for every episode. On every
    10th episode the squared L2 norm of each weighted layer is recorded
    as well and all curves are written to ``result.png``.
    """

    # (log key, position of the layer inside the Sequential model).
    _WEIGHT_LAYERS = (
        ('Conv1-l2', 0),
        ('Conv2-l2', 2),
        ('Conv3-l2', 4),
        ('Dense1-l2', 7),
        ('Dense2-l2', 9),
    )
    # (y-axis label, log key) of the curves that are recorded every episode.
    _EPISODE_PANELS = (
        ('Q', 'q'),
        ('Loss', 'loss'),
        ('Reward', 'reward'),
    )
    # Weight norms are sampled and the figure redrawn every this many episodes.
    _PLOT_EVERY = 10

    def __init__(self, *args, **kwargs):
        # Run DQNLogger's own initializer. The previous
        # `super(DQNLogger, self)` call started the MRO lookup *after*
        # DQNLogger and therefore skipped it.
        super(MyLogger, self).__init__(*args, **kwargs)
        self._log = {
            'nth': [],
            'skip-nth': [],
            'q': [],
            'loss': [],
            'reward': [],
            'Conv1-l2': [],
            'Conv2-l2': [],
            'Conv3-l2': [],
            'Dense1-l2': [],
            'Dense2-l2': [],
        }

    def logger_episode(self, nth, mean_q, mean_loss, cum_reward, model):
        """Record one set of statistics and periodically redraw the figure.

        Args:
            nth: Counter value stored on the x axis (presumably the episode
                number — confirm against the DQNLogger documentation).
            mean_q: Value appended to the 'q' log.
            mean_loss: Value appended to the 'loss' log.
            cum_reward: Value appended to the 'reward' log.
            model: Network indexable by layer position; the indexed layers
                must expose ``params['w'].as_ndarray()``.
        """
        self._log['nth'].append(nth)
        self._log['q'].append(mean_q)
        self._log['loss'].append(mean_loss)
        self._log['reward'].append(cum_reward)
        # Skip nth == 0 so nothing is sampled or drawn on the very first call.
        if nth and nth % self._PLOT_EVERY == 0:
            self._log['skip-nth'].append(nth)
            for key, layer_index in self._WEIGHT_LAYERS:
                weights = model[layer_index].params['w'].as_ndarray()
                self._log[key].append(np.sum(weights ** 2))
            self.show_graph()

    def show_graph(self):
        """Plot every logged series and save the figure to ``result.png``.

        The figure is closed after saving. This method is called repeatedly
        during training, and figures left open would otherwise accumulate
        in memory.
        """
        n_episode_panels = len(self._EPISODE_PANELS)
        n_panels = n_episode_panels + len(self._WEIGHT_LAYERS)
        fig, ax = plt.subplots(n_panels, 1, figsize=(8, 12))
        try:
            for axis, (label, key) in zip(ax, self._EPISODE_PANELS):
                axis.set_ylabel(label)
                axis.plot(self._log['nth'], self._log[key])

            weight_axes = ax[n_episode_panels:]
            for axis, (key, _) in zip(weight_axes, self._WEIGHT_LAYERS):
                axis.set_ylabel(key)
                axis.scatter(self._log['skip-nth'], self._log[key])

            fig.savefig('result.png')
        finally:
            plt.close(fig)
        
        
# Wrap the raw Gym environment so that it yields stacked, preprocessed frames.
env = gym.make("BreakoutDeterministic-v4")
custom_env = CustomEnv(env)

# Convolutional front end: (channel, filter, stride) per layer, each followed
# by a ReLU. The layer order matters because MyLogger looks layers up by
# their position in the Sequential.
conv_layers = [
    layer
    for channels, kernel, step in ((32, 8, 4), (64, 2, 2), (64, 3, 1))
    for layer in (
        rm.Conv2d(channel=channels, filter=kernel, stride=step,
                  initializer=GlorotUniform()),
        rm.Relu(),
    )
]

# Fully connected head: one output unit per available action.
dense_layers = [
    rm.Flatten(),
    rm.Dense(512, initializer=GlorotUniform()),
    rm.Relu(),
    rm.Dense(custom_env.action_shape[0], initializer=GlorotUniform()),
]

q_network = rm.Sequential(conv_layers + dense_layers)

In [4]:
# Assemble the DQN agent from the environment, the Q-network, the optimizer,
# the clipped loss and the custom logger.
model = DQN(
    custom_env,
    q_network,
    optimizer=rm.Adam(0.00025),
    loss_func=rm.ClippedMeanSquaredError((-1, 1)),
    logger=MyLogger(),
)

# Sanity check: number of frames stacked into one state.
print(custom_env.state_shape[0])

4


In [None]:
from renom_rl.utility import EpsilonSLFilter

# Epsilon schedule handed to model.fit() as its action filter.
# The training log shows epsilon starting at 0.9 and decreasing as training
# proceeds.
# NOTE(review): test_epsilon=0.95 is larger than `max` — confirm how ReNomRL
# interprets this value during test runs.
obj = EpsilonSLFilter(
    min=0.0,
    max=0.9,
    initial=0.9,
    test_epsilon=0.95,
    epsilon_step=1000000,
)

In [None]:
# Training settings, collected in one place so they are easy to tweak.
fit_options = dict(
    epoch=175,            # number of epochs
    epoch_step=10000,     # steps per epoch (matches the 10000-step progress bar)
    batch_size=32,
    random_step=50,       # random steps run up front to store experiences
    test_step=None,
    update_period=10000,  # presumably the target-network sync interval — TODO confirm
    train_frequency=4,    # presumably one update every 4 steps — TODO confirm
    action_filter=obj,
)
result = model.fit(**fit_options)

epoch 0001 epsilon 0.9000 loss 0.0006 rewards in epoch 0.000 episode 0000 rewards in episode 0.000.:   0%|          | 18/10000 [00:00<01:07, 147.49it/s]

Run random 50 step for storing experiences


epoch 001 avg_loss:0.0010 total reward in epoch: [train:68.000 test: 0.0] avg train reward in episode:1.214 epsilon :0.891: 100%|██████████| 10000/10000 [01:08<00:00, 145.48it/s]
epoch 002 avg_loss:0.0004 total reward in epoch: [train:68.000 test: 4.0] avg train reward in episode:1.259 epsilon :0.882: 100%|██████████| 10000/10000 [01:09<00:00, 143.60it/s]
epoch 0003 epsilon 0.8811 loss 0.0000 rewards in epoch 6.000 episode 0005 rewards in episode 1.000.:  10%|█         | 1028/10000 [00:07<01:00, 149.22it/s]

In [None]:
# Evaluate the trained agent (presumably one test episode without rendering —
# confirm against the ReNomRL DQN.test documentation).
model.test()

In [None]:
# Run the animation built from the frames stored by CustomEnv.test_step()
# (presumably this plays them back inline — confirm), then clear the stored
# frames so the next test run starts from an empty animation.
custom_env.animation.run()
custom_env.reset_anime()

In [None]:
# Save the Q-network to an HDF5 file.
q_network.save("dqn_exp5.h5")
# Left disabled: rebuilds the agent around the existing q_network with
# default DQN settings.
# model = DQN(custom_env, q_network)

In [None]:
# Evaluate again with rendering enabled (presumably this drives the
# CustomEnv.test_* hooks so the frames get recorded — confirm).
model.test(render=True)

In [None]:
import time

# Scratch benchmark: time how long a 10-element random permutation takes.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() follows the wall clock and can jump when the
# system time is adjusted.
start_t = time.perf_counter()
a = np.random.permutation(int(1e1))
print(time.perf_counter() - start_t)


# 