# Test Proof of A2C

# Library

In [1]:
import numpy as np
import gym
import copy
import renom as rm
from renom.optimizer import Adam
from renom_rl.discrete.a2c import A2C
from renom_rl.environ.env import BaseEnv
from tqdm import tqdm
from renom_rl.utility import Animation
import matplotlib.pyplot as plt
from PIL import Image
from renom_rl.utility.additional_modules import Orthogonal
from renom.cuda import set_cuda_active
from renom_rl.utility.logger import Logger


# Model

In [2]:
# Switch ReNom's CUDA backend on before any layer is created.
# NOTE(review): presumably this requires a GPU-enabled ReNom build — confirm before running on a CPU-only host.
set_cuda_active(True)

class ActorCritic(rm.Model):
    """Actor-critic network with a shared trunk and two output heads.

    The trunk is three convolutions followed by one 512-unit dense layer,
    each followed by a ReLU.  ``l3`` (4 units, softmax) and ``l4`` (1 unit,
    linear) both read the trunk output.
    NOTE(review): the 4 units of ``l3`` presumably match the Breakout action
    count -- confirm against ``env.action_space.n``.
    """

    def __init__(self):
        trunk_gain = 1.412  # gain used by every trunk layer's orthogonal initializer

        # Convolutional feature extractor.
        self.c1 = rm.Conv2d(32, filter=8, padding=0, stride=4,
                            initializer=Orthogonal(trunk_gain))
        self.c2 = rm.Conv2d(64, filter=4, padding=0, stride=2,
                            initializer=Orthogonal(trunk_gain))
        self.c3 = rm.Conv2d(64, filter=3, padding=0, stride=1,
                            initializer=Orthogonal(trunk_gain))

        # Shared dense layer.
        self.l1 = rm.Dense(512, initializer=Orthogonal(trunk_gain))

        # Output heads: l3 feeds the softmax, l4 is returned as-is.
        self.l3 = rm.Dense(4, initializer=Orthogonal(gain=0.01))
        self.l4 = rm.Dense(1, initializer=Orthogonal(1))

    def forward(self, x):
        """Run the network on ``x`` and return ``(act, val)``.

        ``act`` is the softmax of the ``l3`` head, ``val`` the raw output of
        the ``l4`` head.
        """
        features = x
        for conv in (self.c1, self.c2, self.c3):
            features = rm.relu(conv(features))

        hidden = rm.relu(self.l1(rm.flatten(features)))

        policy = rm.softmax(self.l3(hidden))
        value = self.l4(hidden)
        return policy, value
    
# The single network instance that is handed to A2C further down.
model=ActorCritic()

# Environment

In [3]:
import cv2
from gym import spaces
from gym.spaces.box import Box
# Turn off OpenCV's OpenCL code path for the image operations used by the wrappers below.
cv2.ocl.setUseOpenCL(False)


class NoopResetEnv(gym.Wrapper):
    """Trick 1: No-Operation reset.

    After each reset the wrapper idles for a random number of steps so that
    episodes start from varied states, which keeps the agent from learning on
    one fixed start state only.
    """

    def __init__(self, env, noop_max=30):
        """Wrap ``env``; ``noop_max`` is the upper bound on idle steps per reset.

        Setting ``override_num_noops`` afterwards replaces the random draw
        with a fixed count.
        """
        super().__init__(env)
        self.noop_max = noop_max
        self.override_num_noops = None
        self.noop_action = 0
        # Action index 0 must be the environment's no-op.
        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

    def reset(self, **kwargs):
        """Reset, send action 1 once, then idle for a number of steps in [1, noop_max].

        Returns the observation from the last idle step (or from the inner
        reset when an idle step ended the episode).
        """
        self.env.reset(**kwargs)

        if self.override_num_noops is None:
            noops = self.unwrapped.np_random.randint(
                1, self.noop_max + 1)  # pylint: disable=E1101
        else:
            noops = self.override_num_noops
        assert noops > 0

        # One step with action index 1 before idling; its result is discarded.
        # NOTE(review): presumably FIRE, which launches the ball in Breakout -- confirm.
        self.env.step(1)

        obs = None
        for _idle in range(noops):
            obs, _reward, done, _info = self.env.step(self.noop_action)
            if done:
                obs = self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        """Forward ``ac`` to the wrapped environment unchanged."""
        return self.env.step(ac)


class EpisodicLifeEnv(gym.Wrapper):
    """Trick 2: Episodic Life.

    Losing a life is reported as the end of an episode, and the next episode
    continues from the state in which the life was lost.
    """

    def __init__(self, env):
        super().__init__(env)
        self.lives = 0
        self.was_real_done = True

    def step(self, action):
        """Step the wrapped env and report ``done=True`` whenever a life is lost."""
        obs, reward, done, info = self.env.step(action)
        self.was_real_done = done

        # Compare the current life count with the stored one.  The ``> 0``
        # bound matters for games such as Qbert, which can sit at lives == 0
        # for a few frames: the real game over is left to the environment's
        # own ``done`` so that only one reset happens.
        remaining = self.env.unwrapped.ale.lives()
        life_lost = 0 < remaining < self.lives

        # Store the new count afterwards so bonus lives are tracked as well.
        self.lives = remaining
        return obs, reward, (True if life_lost else done), info

    def reset(self, **kwargs):
        """Reset the wrapped env only after a real game over (all lives lost).

        Otherwise a single step with action 0 moves the game on from the
        lost-life state and its observation is returned.
        """
        if not self.was_real_done:
            obs, _reward, _done, _info = self.env.step(0)
        else:
            obs = self.env.reset(**kwargs)
        self.lives = self.env.unwrapped.ale.lives()
        return obs


class MaxAndSkipEnv(gym.Wrapper):
    """Trick 3: Max and Skip.

    The same action is repeated for ``skip`` consecutive frames (4 by
    default) and the observation is the element-wise maximum of the last two
    of those frames.
    """

    def __init__(self, env, skip=4):
        super().__init__(env)
        self._skip = skip
        # Holds the two most recent raw frames for the max over time steps.
        self._obs_buffer = np.zeros(
            (2, *env.observation_space.shape), dtype=np.uint8)

    def step(self, action):
        """Repeat ``action``, add up the rewards and max-pool the last two frames."""
        reward_sum = 0.0
        done = None
        for frame_no in range(self._skip):
            obs, reward, done, info = self.env.step(action)

            # Second-to-last frame goes to slot 0, last frame to slot 1.
            slot = frame_no - (self._skip - 2)
            if slot in (0, 1):
                self._obs_buffer[slot] = obs

            reward_sum += reward
            if done:
                break

        # When the loop stops early the buffer may hold older frames; the
        # observation returned together with done=True does not matter.
        pooled = self._obs_buffer.max(axis=0)
        return pooled, reward_sum, done, info

    def reset(self, **kwargs):
        """Reset the wrapped environment and return its observation."""
        return self.env.reset(**kwargs)


class WarpFrame(gym.ObservationWrapper):
    """Trick 4: Warp frame.

    Converts every frame to 84x84 grayscale, the size used in the Nature DQN
    paper.
    """

    def __init__(self, env):
        super().__init__(env)
        self.width = 84
        self.height = 84
        self.observation_space = spaces.Box(
            low=0,
            high=255,
            shape=(self.height, self.width, 1),
            dtype=np.uint8,
        )

    def observation(self, frame):
        """Return ``frame`` as a grayscale image of shape (height, width, 1)."""
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        target_size = (self.width, self.height)
        resized = cv2.resize(gray, target_size, interpolation=cv2.INTER_AREA)
        # Append a trailing channel axis.
        return resized[:, :, np.newaxis]


class WrapPyTorch(gym.ObservationWrapper):
    """Wrapper that reorders observations into PyTorch mini-batch index order.

    Observations are transposed with ``transpose(2, 0, 1)``, so the last axis
    of the wrapped observation becomes the first one.
    """

    def __init__(self, env=None):
        super(WrapPyTorch, self).__init__(env)
        obs_shape = self.observation_space.shape
        # The advertised shape must match what observation() produces:
        # transpose(2, 0, 1) maps (d0, d1, d2) to (d2, d0, d1).  The previous
        # [d2, d1, d0] ordering was only correct for square frames.
        self.observation_space = Box(
            self.observation_space.low[0, 0, 0],
            self.observation_space.high[0, 0, 0],
            [obs_shape[2], obs_shape[0], obs_shape[1]],
            dtype=self.observation_space.dtype)

    def observation(self, observation):
        """Move the last axis of ``observation`` to the front."""
        return observation.transpose(2, 0, 1)

In [4]:
def make_env(env_id, seed, rank):
    """Create the gym environment ``env_id`` with the full wrapper stack.

    Wrapper order, innermost first: NoopResetEnv, MaxAndSkipEnv,
    EpisodicLifeEnv, WarpFrame, WrapPyTorch.  The environment is seeded with
    ``seed + rank`` once the first two wrappers are in place.
    """
    skipping = MaxAndSkipEnv(NoopResetEnv(gym.make(env_id), noop_max=30), skip=4)
    skipping.seed(seed + rank)
    return WrapPyTorch(WarpFrame(EpisodicLifeEnv(skipping)))

In [5]:
class Breakout(BaseEnv):
    """renom_rl environment built on the wrapped 'BreakoutNoFrameskip-v4' gym env.

    States handed out by ``reset`` and ``step`` are the four most recent
    frames, each squeezed and divided by 255, stacked along a new first axis.
    """

    def __init__(self, seed=1, i=1):
        """Build the wrapped gym env; ``seed`` and ``i`` are passed on to ``make_env``."""
        self.env = make_env('BreakoutNoFrameskip-v4', seed, i)
        self.action_shape = (self.env.action_space.n,)
        self.action_interval = 4
        self.state_shape = (4, 84, 84)
        self.test = False
        self.lives = 5
        self.true_terminal = True
        self.animation = Animation(ratio=36.0)
        self.test_mode = False
        # Frame history, starting as four references to one all-zero frame.
        self.previous_frames = 4 * [np.zeros((84, 84))]

    def reset(self):
        """Clear the frame history, reset the gym env and return the stacked frames."""
        self.previous_frames = 4 * [np.zeros((84, 84))]
        first_obs = self.env.reset()
        self.append_and_get(first_obs)
        return np.stack(self.previous_frames)

    def sample(self):
        """Return an action sampled from the gym action space."""
        return self.env.action_space.sample()

    def append_and_get(self, state):
        """Scale ``state`` by 1/255 and push it onto the frame history.

        The history is capped at four frames: once it is full the oldest
        frame is dropped.  Despite the name, nothing is returned.
        """
        frame = state.squeeze().copy() / 255
        history = self.previous_frames

        if len(history) <= 3:
            history.append(frame)
        else:
            self.previous_frames = history[1:] + [frame]

    def step(self, action):
        """Apply ``action`` and return ``(stacked_frames, reward, terminal)``."""
        obs, reward, terminal, _info = self.env.step(action)
        self.append_and_get(obs)
        return np.stack(self.previous_frames), reward, terminal

    def epoch(self):
        """Set ``true_terminal`` back to True."""
        self.true_terminal = True

    def test_start(self):
        """Set ``true_terminal``; when ``test`` is set, also clear the animation and reset the gym env."""
        self.true_terminal = True
        if not self.test:
            return
        self.animation.reset()
        self.env.reset()

    def test_epoch_step(self):
        """When ``test`` is set, store the current rendered RGB frame in the animation."""
        if not self.test:
            return
        rendered = self.env.render(mode="rgb_array")
        self.animation.store(rendered)

    def test_close(self):
        """When ``test`` is set, drop the env's viewer reference (the env itself is not closed)."""
        if self.test:
            self.env.viewer = None
    
# Smoke test: build one environment and reset it once; the returned state is discarded.
env = Breakout()
_=env.reset()



# Logger

In [6]:

class OriginalLogger(Logger):
    """Logger that groups life-loss terminals into episodes and tracks rewards.

    Every ``episode_lifes_constant`` terminals (5) are counted as one
    finished episode.  The reward sums recorded at those terminals are added
    up per episode and kept in ``total_reward``; ``max`` holds the largest
    episode reward seen in the current epoch.  The ``*_list`` attributes
    collect one value per ``logger`` call for plotting.
    """

    def __init__(self):
        super().__init__(record = False)

        # Per-epoch counters; cleared again at the first step of every epoch.
        self.episodes = 0
        self.total_reward = [0]
        self.tick = 0              # terminals seen in this epoch
        self.episode_lifes = 0
        self.episode_lifes_constant = 5
        self.terminal_list = []    # one [sum_reward] entry per terminal
        self.episode_rewards = 0
        self.died = 0              # terminals not yet assigned to an episode

        self.max = 0
        self.entropy = 0

        # One entry per logger() call.
        self.total_loss_list = []
        self.loss_list = []
        self.entropy_list = []
        self.reward_list = []
        self.max_reward_list = []

    def logger(self, **log):
        """Update the statistics from one training update and return the progress message.

        Reads the keys ``epoch_step``, ``epoch``, ``loss``, ``terminal``,
        ``sum_reward``, ``entropy`` and ``total_loss`` from ``log``.
        NOTE(review): ``terminal`` and ``sum_reward`` are assumed to be NumPy
        arrays with one entry per worker -- confirm against renom_rl.
        """
        lives = self.episode_lifes_constant

        # First step of an epoch: clear the per-epoch counters.
        if not log["epoch_step"][0]:
            self.episodes = 0
            self.total_reward = [0]
            self.tick = 0
            self.episode_lifes = lives
            self.terminal_list = []
            self.episode_rewards = 0
            self.died = 0

            self.max = 0

        e = log["epoch"][-1]
        loss = log["loss"][-1]
        terminal_index = np.where(log["terminal"].squeeze())

        # Record every terminal reported by this update.
        if log["terminal"].any():
            n_terminal = len(log["terminal"][terminal_index])
            self.tick += n_terminal
            self.died += n_terminal
            self.terminal_list += log["sum_reward"][terminal_index].reshape(-1, 1).tolist()

        # Close every episode that is now complete.  The count is taken
        # before ``died`` is reduced, so ``episode_lifes`` really advances.
        completed = self.died // lives
        if completed > 0:
            self.episodes += completed

            # ``died == tick - lives * episodes`` holds between calls, so the
            # newly completed groups end at the last multiple of ``lives``
            # and span ``completed`` groups back.  This stays correct when
            # several terminals arrive at once and ``tick`` skips a multiple.
            window_end = lives * (self.tick // lives)
            window_start = window_end - lives * completed
            for lo in range(window_start, window_end, lives):
                group = self.terminal_list[lo:lo + lives]
                self.total_reward.append(float(np.sum(np.array(group))))

            self.died -= lives * completed
            self.episode_lifes += lives * completed

        entropy = log["entropy"][-1]
        self.entropy = entropy

        if self.total_reward[-1] > self.max:
            self.max = self.total_reward[-1]

        max_val = self.max

        msg = "agent[0], entropy:{:0=+1.4f}, epoch {:04d}, loss {:5.4f}, epoch reward total {:4.3f},  episode:{:04.1f}, rewards: {:4.3f}, max reward: {}"\
            .format(entropy, e, loss, float(np.sum(self.total_reward)), self.episodes, self.total_reward[-1], max_val)

        self.total_loss_list.append(log["total_loss"][-1])
        self.loss_list.append(loss)
        self.entropy_list.append(entropy)
        # ``.item()`` pulls the scalar out explicitly; calling int() on a
        # one-element array directly is deprecated in recent NumPy.
        last_reward = int(np.asarray(self.terminal_list[-1]).item()) if self.terminal_list else 0
        self.reward_list.append(last_reward)
        self.max_reward_list.append(max_val)

        return msg

    def logger_epoch(self, **log):
        """Return the end-of-epoch summary message.

        Reads the keys ``epoch`` and ``test_reward`` from ``log``.
        """
        e = log["epoch"]
        entropy = self.entropy
        # NOTE(review): the divisor is the terminal count + 1, not the episode
        # count, although the message says "per episode" -- confirm the intent.
        avg_train_reward = np.sum(self.total_reward)/(self.tick + 1)
        train_reward = np.sum(self.total_reward)
        test_reward = log["test_reward"]

        # The test reward uses a fixed-point spec like the other fields; the
        # former "{:4.3}" failed on int rewards and switched to exponent
        # notation for large floats.
        msg = "epoch {:03d} total reward in epoch: [train:{:4.3f} test:{:4.3f}] " + \
            "avg train reward per episode:{:4.3f}, max reward: {}, entropy: {:0=+1.4f}"
        msg = msg.format(e, train_reward,
                         test_reward, avg_train_reward, self.max, entropy)

        return msg
    
# Logger instance passed to A2C below and read by the plotting cell at the end.
logger = OriginalLogger()

# Confirmation

In [7]:
# blank_list=[]
# for i in np.random.randint(0,4,10):
#     s,r,w=env.step(i)
#     blank_list.append(np.concatenate(s,1))

# res=np.stack(blank_list)
# print(res.shape)
# res=np.concatenate(res,axis=0)
# import matplotlib.pyplot as plt

# plt.figure(figsize=(20,20))

# plt.imshow(res)
# plt.show()
    

# Set RL

In [8]:
# Build the training environment and the A2C agent.
Env=Breakout()
# NOTE(review): `opt` is created but never handed to A2C, and ProbNodeChooser is
# imported but unused, so whatever defaults A2C has are what apply -- confirm this is intended.
opt=Adam(lr=0.01)
from renom_rl.utility import ProbNodeChooser
from renom_rl.utility import GradientClipping

a2c=A2C(Env,
        model,
        gradient_clipping=GradientClipping(),
        num_worker=16,
        logger=logger,
        )

# Run

In [10]:
# Train for 3 epochs with epoch_step=30000 and test_step=100.
a2c.fit(
    epoch=3,
    epoch_step=30000,
    test_step=100)

In [None]:
# Raise the `test` flag so the Breakout test hooks store rendered frames in
# Env.animation during this run, then lower it again.
Env.test=True
a2c.test(1000)
Env.test=False

# Animation

In [None]:
# Play back the frames stored during the test run, then clear them.
Env.animation.run()
Env.animation.reset()

In [None]:
# Show the largest episode reward the logger recorded in the last epoch.
logger.max

In [None]:
# Plot every series the logger recorded, one subplot per series.
# NOTE(review): the next line rescales logger.loss_list in place, so running this
# cell again doubles the values a second time.  The factor 2 presumably undoes a
# 0.5 value-loss coefficient -- confirm.
logger.loss_list=(np.array(logger.loss_list)*2).tolist()
# Coerce every recorded reward to a plain int.
logger.reward_list=np.array([int(np.array(i)) for i in logger.reward_list]).squeeze().tolist()
import matplotlib.pyplot as plt

# Names of the logger attributes to plot ("<name>_list").
params=["total_loss","entropy","loss","reward","max_reward"]
params=[i+"_list" for i in params]
# fig=plt.figure()
plt.figure(figsize=(20,20))

# Subplot indices start at 1.
for i , p in enumerate(params,1):
    plt.subplot(len(params),1, i)
    logger.graph_attribute(plt,getattr(logger,p),y_label=p,x_label="update")

plt.tight_layout()
plt.show()