In [1]:
import gym
import numpy as np
import renom as rm
from renom.cuda import set_cuda_active
from renom_rl.dqn import DQN
from renom_rl.env import BaseEnv
from gym.core import Env
from PIL import Image

# Turn on ReNom's CUDA backend; this runs before any network is constructed below.
set_cuda_active(True)
# Raw gym Breakout environment. NOTE(review): the 'NoFrameskip' variant presumably
# applies no frame skipping of its own (CustomEnv.step repeats each action itself) — confirm.
env = gym.make('BreakoutNoFrameskip-v4')

class CustomEnv(BaseEnv):
    """Gym environment wrapper that produces stacked, preprocessed frames.

    Every call to ``step`` repeats the chosen action for up to four emulator
    frames, converts each returned frame to an 84x84 grayscale array scaled
    to [-1, 1], and stacks them into a ``(4, 84, 84)`` state.
    """

    # Number of emulator frames executed (and stacked) per agent step.
    _FRAMES_PER_STEP = 4
    # Number of randomly-chosen agent steps taken right after a reset.
    _WARMUP_STEPS = 32

    def __init__(self, env):
        """Store the wrapped gym environment and declare action/state shapes.

        Args:
            env: The gym environment to wrap.
        """
        self.env = env
        self.action_shape = 4
        self.state_shape = (self._FRAMES_PER_STEP, 84, 84)
        # The shape attributes are set before BaseEnv.__init__ runs
        # (presumably BaseEnv reads them — confirm against renom_rl).
        super(CustomEnv, self).__init__()

    def reset(self):
        """Reset the wrapped environment and advance it with random actions.

        Returns:
            The stacked state produced by the last of the warm-up steps.
        """
        self.env.reset()
        for _ in range(self._WARMUP_STEPS):
            state, _reward, _terminal = self.step(self.env.action_space.sample())
        return state

    def sample(self):
        """Return a uniformly random action index in ``[0, action_shape)``."""
        # Derived from action_shape instead of a hard-coded 4 so the two
        # cannot drift apart.
        return int(np.random.rand() * self.action_shape)

    def render(self):
        """Render the wrapped environment."""
        self.env.render()

    def _preprocess(self, state):
        """Convert one raw frame to an 84x84 grayscale array scaled to [-1, 1].

        Args:
            state: A frame as returned by ``env.step`` (an array accepted by
                ``PIL.Image.fromarray``).

        Returns:
            A ``(84, 84)`` float array.
        """
        # PIL sizes are (width, height): resize to 84 wide by 110 high, then
        # convert to single-channel grayscale.
        resized_image = Image.fromarray(state).resize((84, 110)).convert('L')
        # Converting the image directly yields the (110, 84) pixel grid
        # without materialising a flat per-pixel sequence first.
        image_array = np.asarray(resized_image, dtype=np.float64) / 255. * 2 - 1
        # Keep rows 26..109, i.e. drop the top 26 rows to get a square frame.
        return image_array[26:110, :]

    def step(self, action):
        """Repeat ``action`` for up to four frames and return the stacked result.

        Args:
            action: Action passed unchanged to the wrapped environment.

        Returns:
            A tuple ``(state, reward, terminal)`` where ``state`` has shape
            ``(4, 84, 84)``, ``reward`` is 1 if the summed raw reward over the
            executed frames is positive and 0 otherwise, and ``terminal`` is
            True if any executed frame ended the episode.
        """
        frames = []
        rewards = []
        terminal = False
        for _ in range(self._FRAMES_PER_STEP):
            # Expects the classic 4-tuple gym step result.
            frame, reward, done, _info = self.env.step(action)
            frames.append(self._preprocess(frame))
            rewards.append(reward)
            if done:
                terminal = True
                # Do not keep stepping an environment that already finished.
                break
        # If the episode ended early, repeat the final frame so the state
        # always has the declared number of channels.
        while len(frames) < self._FRAMES_PER_STEP:
            frames.append(frames[-1])
        state = np.stack(frames)
        # Builtin int replaces the removed np.int alias.
        clipped_reward = (np.sum(rewards) > 0).astype(int)
        return state, clipped_reward, terminal

# Wrap the raw gym environment so the agent receives stacked, preprocessed frames.
custom_env = CustomEnv(env)

# Q-network: three convolution + ReLU stages, a 512-unit hidden layer,
# and one linear output per action.
q_network = rm.Sequential([
    rm.Conv2d(32, filter=8, stride=4), rm.Relu(),
    rm.Conv2d(64, filter=4, stride=2), rm.Relu(),
    rm.Conv2d(64, filter=3, stride=1), rm.Relu(),
    rm.Flatten(),
    rm.Dense(512), rm.Relu(),
    rm.Dense(custom_env.action_shape),
])

In [2]:
# Build the DQN agent from the wrapped environment and the Q-network defined above.
model = DQN(custom_env, q_network)

In [4]:
# Start training with rendering switched on.
# NOTE(review): greedy_step presumably sets the number of steps over which the
# epsilon-greedy schedule is annealed — confirm against the renom_rl DQN.fit docs.
model.fit(render=True, greedy_step=100000)

Run random 5000 step for storing experiences


episode 001 avg_loss: 0.007 total_reward [train:3.000 test:-] e-greedy:0.002: : 222it [00:05, 39.41it/s]
episode 002 avg_loss: 0.006 total_reward [train:1.000 test:-] e-greedy:0.003: : 121it [00:03, 37.89it/s]
episode 003 avg_loss: 0.006 total_reward [train:2.000 test:-] e-greedy:0.005: : 166it [00:04, 38.40it/s]
episode 004 avg_loss: 0.006 total_reward [train:0.000 test:-] e-greedy:0.005: : 98it [00:02, 37.78it/s]
episode 005 avg_loss: 0.005 total_reward [train:0.000 test:-] e-greedy:0.006: : 99it [00:02, 37.79it/s]
episode 006 each step reward:1.000: : 61it [00:01, 39.70it/s]

KeyboardInterrupt: 

In [None]:
import time

# Time a small NumPy permutation. perf_counter() is a monotonic,
# high-resolution clock, so it is the right tool for very short intervals;
# time.time() is wall-clock time and may jump or lack resolution.
start_t = time.perf_counter()
a = np.random.permutation(10)
elapsed = time.perf_counter() - start_t
print(elapsed)

# 