## Required Libraries

- ReNomDL 2.7.1
- ReNomRL 0.4

In [1]:
import gym
import numpy as np
import renom as rm
from renom.cuda import set_cuda_active
from renom_rl.discrete.dqn import DQN
from renom_rl.environ.env import BaseEnv
from renom_rl.utility import Animation
from renom_rl.utility.logger import DQNLogger
from renom_rl.utility import EpsilonSLFilter
import matplotlib.pyplot as plt
from renom.utility.initializer import GlorotUniform
from PIL import Image
set_cuda_active(True)

In [2]:
class CustomEnv(BaseEnv):
    """Adapter exposing a gym environment as a stack of grayscale frames.

    Every raw observation is resized to 84x84, converted to 8-bit
    grayscale and divided by 255. The state returned by ``reset`` and
    ``step`` is the four most recent processed frames reshaped to
    ``(4, 84, 84)``.
    """

    def __init__(self, env):
        """Wrap *env*, which must expose ``action_space.n``."""
        self.env = env
        self.action_shape = (env.action_space.n, )
        self.state_shape = (4, 84, 84)
        # Number of frames kept in the rolling window.
        self._length = 4
        self.reward = 0
        self.resent_state_list = []
        self.zero_image = np.zeros(self.state_shape, dtype=np.float32)
        self.animation = Animation()

    def _stacked_state(self):
        """Return the current frame window as a ``(4, 84, 84)`` array."""
        return np.array(self.resent_state_list).reshape(4, 84, 84)

    def reset(self):
        """Reset the wrapped env and return the initial stacked state.

        The window is filled with the reset observation followed by the
        observations of ``self._length - 1`` randomly sampled actions.
        """
        first_frame = self._preprocess(self.env.reset())
        self.resent_state_list = [first_frame]
        for _ in range(self._length - 1):
            random_action = self.env.action_space.sample()
            observation, _reward, _done, _info = self.env.step(random_action)
            self.resent_state_list.append(self._preprocess(observation))
        return self._stacked_state()

    def _preprocess(self, s):
        """Turn raw observation *s* into an 84x84 float32 grayscale frame."""
        resized = Image.fromarray(s).resize((84, 84))
        gray = resized.convert('L')
        return np.asanyarray(gray).astype(np.float32) / 255.

    def sample(self):
        """Return a random action drawn from the wrapped env's action space."""
        return self.env.action_space.sample()

    def step(self, action):
        """Apply *action* and return ``(state, clipped_reward, terminal)``.

        The newest processed frame is pushed into the window and the
        oldest one is dropped; the reward is clipped to ``[-1, 1]``.
        """
        observation, reward, done, _info = self.env.step(action)
        newest = self._preprocess(observation)
        window = self.resent_state_list + [newest]
        self.resent_state_list = window[1:]
        return self._stacked_state(), np.clip(reward, -1, 1), done

    def terminate(self):
        """Always return ``False``.

        NOTE(review): presumably the trainer polls this to stop training
        early -- confirm against ReNomRL's ``BaseEnv`` documentation.
        """
        return False

    def test_start(self):
        """Clear the animation buffer and flag the env as being in test mode."""
        self.animation.reset()
        self.test_mode = True

    def test_step(self):
        """Store the env's current ``rgb_array`` rendering in the animation."""
        frame = self.env.render(mode="rgb_array")
        self.animation.store(frame)

    def test_close(self):
        """Close the wrapped env, drop its viewer and leave test mode."""
        self.env.close()
        self.env.viewer = None
        self.test_mode = False

    def reset_anime(self):
        """Discard every frame stored in the animation buffer."""
        self.animation.reset()
        
        
# Wrap the Atari Breakout environment with the frame-stacking adapter.
env = gym.make("BreakoutDeterministic-v4")
custom_env = CustomEnv(env)

# Q-network: three convolution+ReLU stages, then a 512-unit dense layer
# and one linear output per action. All weighted layers use GlorotUniform.
n_actions = custom_env.action_shape[0]
conv_specs = [
    # (channel, filter, stride)
    (32, 8, 4),
    (64, 4, 2),
    (64, 3, 1),
]

layers = []
for n_channel, filter_size, stride in conv_specs:
    layers.append(rm.Conv2d(channel=n_channel, filter=filter_size,
                            stride=stride, initializer=GlorotUniform()))
    layers.append(rm.Relu())
layers.extend([
    rm.Flatten(),
    rm.Dense(512, initializer=GlorotUniform()),
    rm.Relu(),
    rm.Dense(n_actions, initializer=GlorotUniform()),
])

q_network = rm.Sequential(layers)

In [3]:
# Training schedule; these values are passed to ``model.fit``
# (``train_freq`` goes in as ``train_frequency``). ``batch_size`` is also
# read by the loss function given to ``DQN``.
epoch, epoch_step = 175, 10000
random_step, update_period = 50000, 10000
train_freq, batch_size = 4, 32

# Action filter passed to ``model.fit`` as ``action_filter``.
# NOTE(review): presumably epsilon is annealed from ``initial`` towards
# ``min`` over ``epsilon_step`` steps and ``test_epsilon`` applies during
# testing -- confirm against the ReNomRL ``EpsilonSLFilter`` docs.
action_filter = EpsilonSLFilter(
    initial=1.0,
    max=1.0,
    min=0.1,
    epsilon_step=1000000,
    test_epsilon=0.05,
)

In [None]:
def _summed_clipped_mse(x, y, **kwargs):
    """Return the clipped mean squared error of *x* and *y* times ``batch_size``.

    Extra keyword arguments are accepted and ignored. ``batch_size`` is the
    module-level value, looked up each time the loss is evaluated.

    NOTE(review): the multiplication presumably turns a batch-averaged loss
    back into a batch sum -- confirm against the ReNomDL docs.
    """
    clipped = rm.clipped_mean_squared_error(x, y)
    return rm.sum(clipped * batch_size)


# DQN agent: Adam optimizer with learning rate 0.00025 and the loss above.
model = DQN(
    custom_env,
    q_network,
    optimizer=rm.Adam(0.00025),
    loss_func=_summed_clipped_mse,
)

In [None]:
# Gather the training arguments in one mapping, then start training.
fit_options = {
    "epoch": epoch,
    "epoch_step": epoch_step,
    "batch_size": batch_size,
    "random_step": random_step,
    "test_step": None,
    "update_period": update_period,
    "train_frequency": train_freq,
    "action_filter": action_filter,
}
result = model.fit(**fit_options)

Run random 50000 step for storing experiences


In [None]:
# Run the model's test routine on the trained agent.
# NOTE(review): presumably this drives CustomEnv.test_start / test_step /
# test_close, so rendered frames end up in custom_env.animation -- confirm
# against the ReNomRL DQN documentation.
model.test()