<a href="https://colab.research.google.com/github/eisbetterthanpi/pytorch/blob/main/jcwleo_curiosity_driven_exploration_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### setup

In [None]:
# # https://github.com/jcwleo/curiosity-driven-exploration-pytorch
!pip install gym_super_mario_bros nes_py
!pip install tensorboardX
# https://stackoverflow.com/questions/67808779/running-gym-atari-in-google-colab
%pip install -U gym>=0.21.0
%pip install -U gym[atari,accept-rom-license]


#### config

In [None]:
# config
# Hyper-parameters shared by every later cell, adapted from
# https://github.com/jcwleo/curiosity-driven-exploration-pytorch/blob/master/config.py
# The training cell branches on env_type ('atari' or 'mario'); leaving it
# undefined makes that cell fail with a NameError.
env_type = "atari"
env_id = "BreakoutNoFrameskip-v4"
# AtariEnvironment.run forces an episode to end once its step counter exceeds this.
max_step_per_episode = 4500
learning_rate = 1e-4
num_worker = 16  # number of environment worker processes
num_step = 128  # rollout length collected from every worker per update
gamma = 0.99
lam = 0.95
use_gae = True
use_cuda = True
use_noisy_net = False
clip_grad_norm = 0.5
entropy_coef = 0.001
# Both spellings are bound to the same value; later cells read the lowercase one.
epoch = Epoch = 3
# The training cell uses batch_size = num_step * num_worker / mini_batch.
mini_batch = 8
ppo_eps = 0.1
life_done = False
# Number of random-action frames used to initialise observation statistics.
pre_obs_norm_step = 10000
eta = ETA = 1.


#### utils

In [None]:
# utils
# https://github.com/jcwleo/curiosity-driven-exploration-pytorch/blob/master/utils.py
import numpy as np

def make_train_data(reward, done, value, gamma, num_step, num_worker,
                    gae_lambda=None, with_gae=None):
    """Build value targets and advantages from one rollout.

    Args:
        reward: array indexed as ``reward[:, t]``, one row per worker.
        done: array with the same layout as ``reward``; non-zero entries cut
            the bootstrap at that step.
        value: array indexed as ``value[:, t]`` holding ``num_step + 1``
            columns (the last column is the bootstrap value).
        gamma: discount factor.
        num_step: number of rollout steps per worker.
        num_worker: number of workers (rows).
        gae_lambda: GAE lambda. ``None`` falls back to the module-level ``lam``.
        with_gae: whether to use GAE. ``None`` falls back to the module-level
            ``use_gae``.

    Returns:
        ``(target, advantage)``, both flattened to one dimension in
        worker-major order.
    """
    if with_gae is None:
        with_gae = use_gae
    if gae_lambda is None:
        gae_lambda = lam

    not_done = 1 - done
    discounted_return = np.empty([num_worker, num_step])

    if with_gae:
        # One accumulator per worker (the original relied on broadcasting a
        # length-1 integer array).
        gae = np.zeros(num_worker)
        for t in reversed(range(num_step)):
            delta = reward[:, t] + gamma * value[:, t + 1] * not_done[:, t] - value[:, t]
            gae = delta + gamma * gae_lambda * not_done[:, t] * gae
            discounted_return[:, t] = gae + value[:, t]
    else:
        # Plain discounted return, bootstrapped from the last value estimate.
        running_add = value[:, -1]
        for t in reversed(range(num_step)):
            running_add = reward[:, t] + gamma * running_add * not_done[:, t]
            discounted_return[:, t] = running_add

    # Advantage for the actor: target minus the value baseline.
    adv = discounted_return - value[:, :-1]
    return discounted_return.reshape([-1]), adv.reshape([-1])


class RunningMeanStd(object):
    """Running mean and variance, merged batch by batch.

    Uses the parallel-variance combination described at
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    """

    def __init__(self, epsilon=1e-4, shape=()):
        # Start from mean 0 / variance 1 carrying a tiny pseudo-count.
        self.mean = np.zeros(shape, dtype='float64')
        self.var = np.ones(shape, dtype='float64')
        self.count = epsilon

    def update(self, x):
        """Fold in a batch whose first axis indexes the samples."""
        self.update_from_moments(np.mean(x, axis=0), np.var(x, axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Merge pre-computed batch moments into the running statistics."""
        prev_count = self.count
        total = prev_count + batch_count
        shift = batch_mean - self.mean
        # Sum of squared deviations of the combined population.
        combined_m2 = (
            self.var * prev_count
            + batch_var * batch_count
            + np.square(shift) * prev_count * batch_count / total
        )
        self.mean = self.mean + shift * batch_count / total
        self.var = combined_m2 / total
        self.count = total


class RewardForwardFilter(object):
    """Discounted running sum of rewards: ``rewems <- rewems * gamma + rews``."""

    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None  # no reward seen yet

    def update(self, rews):
        """Fold in the newest rewards and return the updated running sum."""
        previous = self.rewems
        self.rewems = rews if previous is None else previous * self.gamma + rews
        return self.rewems


def softmax(z):
    """Row-wise softmax of a 2-D array."""
    assert len(z.shape) == 2
    # Subtract each row's maximum before exponentiating; keepdims keeps the
    # reduced axis so the result broadcasts against z.
    row_max = np.max(z, axis=1, keepdims=True)
    exps = np.exp(z - row_max)
    return exps / np.sum(exps, axis=1, keepdims=True)


#### model

In [None]:
# model
# https://github.com/jcwleo/curiosity-driven-exploration-pytorch/blob/master/model.py
import torch.nn.functional as F
import torch.nn as nn
import torch
import torch.optim as optim
import numpy as np
import math
from torch.nn import init

class NoisyLinear(nn.Module):
    """Linear layer with factorised Gaussian parameter noise (NoisyNet).

    The output is ``linear(x, weight, bias)`` plus a second linear map whose
    weights are ``noisy_weight * noise`` and whose bias is
    ``noisy_bias * out_noise``. ``noise`` is the outer product of the
    ``out_noise`` and ``in_noise`` vectors.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        sigma0: noise scale; the sampling std is ``sigma0 / sqrt(in_features)``.
    """

    def __init__(self, in_features, out_features, sigma0=0.5):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Allocated uninitialised here; reset_parameters() fills them below.
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        self.bias = nn.Parameter(torch.empty(out_features))
        self.noisy_weight = nn.Parameter(torch.empty(out_features, in_features))
        self.noisy_bias = nn.Parameter(torch.empty(out_features))
        self.noise_std = sigma0 / math.sqrt(self.in_features)
        self.reset_parameters()
        self.register_noise()

    def register_noise(self):
        """Register the noise buffers, initialised to zero.

        Zero (rather than uninitialised memory) makes a forward pass in eval
        mode, before any noise has been sampled, equal to the plain linear
        layer instead of depending on whatever the allocator returned.
        """
        self.register_buffer('in_noise', torch.zeros(self.in_features))
        self.register_buffer('out_noise', torch.zeros(self.out_features))
        self.register_buffer('noise', torch.zeros(self.out_features, self.in_features))

    def sample_noise(self):
        """Draw fresh factorised noise in place."""
        self.in_noise.normal_(0, self.noise_std)
        self.out_noise.normal_(0, self.noise_std)
        # Outer product out_noise x in_noise, shape (out_features, in_features).
        self.noise = torch.mm(
            self.out_noise.view(-1, 1), self.in_noise.view(1, -1))

    def reset_parameters(self):
        """Initialise all four parameter tensors uniformly in +-1/sqrt(in_features)."""
        stdv = 1. / math.sqrt(self.weight.size(1))
        for param in (self.weight, self.noisy_weight, self.bias, self.noisy_bias):
            param.data.uniform_(-stdv, stdv)

    def forward(self, x):
        """Apply the layer; noise is resampled on every call in training mode.

        In eval mode the most recently sampled noise (zero if none was ever
        sampled) is reused.
        """
        normal_y = nn.functional.linear(x, self.weight, self.bias)
        if self.training:
            self.sample_noise()
        noisy_weight = self.noisy_weight * self.noise
        noisy_bias = self.noisy_bias * self.out_noise
        noisy_y = nn.functional.linear(x, noisy_weight, noisy_bias)
        return noisy_y + normal_y

    def __repr__(self):
        return self.__class__.__name__ + '(' \
               + 'in_features=' + str(self.in_features) \
               + ', out_features=' + str(self.out_features) + ')'


class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one."""

    def forward(self, input):
        leading = input.size(0)
        return input.view(leading, -1)


class CnnActorCriticNetwork(nn.Module):
    """Convolutional trunk shared by a policy head and a value head.

    Args:
        input_size: accepted for interface symmetry; not read here (the trunk
            is hard-wired to 4 input channels and a 7x7x64 conv output).
        output_size: number of policy logits.
        use_noisy_net: build the fully connected layers from ``NoisyLinear``
            instead of ``nn.Linear``.
    """

    def __init__(self, input_size, output_size, use_noisy_net=False):
        super().__init__()
        if use_noisy_net:
            print('use NoisyNet')
        linear = NoisyLinear if use_noisy_net else nn.Linear

        conv_stack = [
            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
            nn.LeakyReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.LeakyReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.LeakyReLU(),
        ]
        self.feature = nn.Sequential(
            *conv_stack,
            Flatten(),
            linear(7 * 7 * 64, 512),
            nn.LeakyReLU()
        )
        self.actor = nn.Sequential(
            linear(512, 512),
            nn.LeakyReLU(),
            linear(512, output_size)
        )
        self.critic = nn.Sequential(
            linear(512, 512),
            nn.LeakyReLU(),
            linear(512, 1)
        )

        # First pass: orthogonal init with gain sqrt(2) for every Conv2d and
        # nn.Linear module (NoisyLinear is not an nn.Linear, so it is skipped).
        base_gain = np.sqrt(2)
        for module in self.modules():
            if isinstance(module, (nn.Conv2d, nn.Linear)):
                init.orthogonal_(module.weight, base_gain)
                module.bias.data.zero_()

        # Second pass: re-initialise the nn.Linear layers of both heads with a
        # small gain, actor first and then critic.
        for head in (self.actor, self.critic):
            for layer in head:
                if type(layer) is nn.Linear:
                    init.orthogonal_(layer.weight, 0.01)
                    layer.bias.data.zero_()

    def forward(self, state):
        """Return ``(policy_logits, value)`` for a batch of states."""
        features = self.feature(state)
        return self.actor(features), self.critic(features)


class ICMModel(nn.Module):
    """Intrinsic Curiosity Module: state encoder, inverse model, forward model.

    Args:
        input_size: stored on the instance; not otherwise read here (the
            encoder is hard-wired to 4 input channels and a 7x7x64 conv output).
        output_size: number of discrete actions (width of the one-hot action).
        use_cuda: only selects the value stored in ``self.device``. The module
            does not move itself; callers move it with ``.to(device)``.

    Note:
        The eight residual blocks are held in an ``nn.ModuleList`` so that they
        are registered sub-modules: their parameters show up in
        ``parameters()`` / ``state_dict()``, follow ``.to()``, and each block
        has its own weights. A plain Python list built with ``[block] * 8``
        would reference one shared, unregistered block. Checkpoints written
        without the ``residual.*`` keys will not load with ``strict=True``.
    """

    def __init__(self, input_size, output_size, use_cuda=True):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        feature_output = 7 * 7 * 64
        self.feature = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
            nn.LeakyReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.LeakyReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.LeakyReLU(),
            nn.Flatten(),
            nn.Linear(feature_output, 512)
        )

        # Inverse model: (phi(s), phi(s')) -> action logits.
        self.inverse_net = nn.Sequential(
            nn.Linear(512 * 2, 512),
            nn.ReLU(),
            nn.Linear(512, output_size)
        )
        # Forward model: (phi(s), a) -> predicted phi(s'), with 4 residual
        # stages of two blocks each (see forward()).
        self.residual = nn.ModuleList([
            nn.Sequential(
                nn.Linear(output_size + 512, 512),
                nn.LeakyReLU(),
                nn.Linear(512, 512),
            )
            for _ in range(8)
        ])
        self.forward_net_1 = nn.Sequential(
            nn.Linear(output_size + 512, 512),
            nn.LeakyReLU()
        )
        self.forward_net_2 = nn.Sequential(
            nn.Linear(output_size + 512, 512),
        )
        for p in self.modules():
            if isinstance(p, nn.Conv2d):
                init.kaiming_uniform_(p.weight)
                p.bias.data.zero_()
            if isinstance(p, nn.Linear):
                init.kaiming_uniform_(p.weight, a=1.0)
                p.bias.data.zero_()

    def forward(self, inputs):
        """Run the inverse and forward models.

        Args:
            inputs: sequence ``(state, next_state, action)`` where ``action``
                is concatenated to 512-wide features along dim 1 (the agent
                passes a one-hot float tensor).

        Returns:
            ``(real_next_state_feature, pred_next_state_feature, pred_action)``.
        """
        state, next_state, action = inputs
        encode_state = self.feature(state)
        encode_next_state = self.feature(next_state)

        # Inverse model: predict the action from both encodings.
        pred_action = self.inverse_net(torch.cat((encode_state, encode_next_state), 1))

        # Forward model: predict the next-state encoding; the action is
        # re-injected before every block.
        hidden = self.forward_net_1(torch.cat((encode_state, action), 1))
        for i in range(4):
            inner = self.residual[i * 2](torch.cat((hidden, action), 1))
            hidden = self.residual[i * 2 + 1](torch.cat((inner, action), 1)) + hidden
        pred_next_state_feature = self.forward_net_2(torch.cat((hidden, action), 1))

        real_next_state_feature = encode_next_state
        return real_next_state_feature, pred_next_state_feature, pred_action



#### agents

In [None]:
# agents
# https://github.com/jcwleo/curiosity-driven-exploration-pytorch/blob/master/agents.py
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
import torch
import torch.optim as optim
from torch.distributions.categorical import Categorical

class ICMAgent(object):
    """PPO actor-critic agent trained jointly with an ICM curiosity module.

    Args:
        input_size: forwarded to the two networks.
        output_size: number of discrete actions.
        num_env: number of parallel environments (stored only).
        num_step: rollout length (stored only).
        gamma: discount factor (stored only).
        lam: GAE lambda (stored only).
        learning_rate: Adam learning rate shared by both networks.
        ent_coef: weight of the entropy bonus in the PPO loss.
        clip_grad_norm: stored; gradient clipping is not applied in
            ``train_model``.
        epoch: optimisation passes over each rollout.
        batch_size: mini-batch size inside ``train_model``.
        ppo_eps: PPO ratio clipping range.
        eta: scale of the intrinsic reward.
        use_gae: stored only.
        use_cuda: run on ``cuda`` when true, otherwise ``cpu``.
        use_noisy_net: forwarded to ``CnnActorCriticNetwork``.
    """

    def __init__(
            self,
            input_size,
            output_size,
            num_env,
            num_step,
            gamma,
            lam=0.95,
            learning_rate=1e-4,
            ent_coef=0.01,
            clip_grad_norm=0.5,
            epoch=3,
            batch_size=128,
            ppo_eps=0.1,
            eta=0.01,
            use_gae=True,
            use_cuda=False,
            use_noisy_net=False):
        self.num_env = num_env
        self.output_size = output_size
        self.input_size = input_size
        self.num_step = num_step
        self.gamma = gamma
        self.lam = lam
        self.epoch = epoch
        self.batch_size = batch_size
        self.use_gae = use_gae
        self.ent_coef = ent_coef
        self.eta = eta
        self.ppo_eps = ppo_eps
        self.clip_grad_norm = clip_grad_norm
        self.device = torch.device('cuda' if use_cuda else 'cpu')

        # Move both networks to the target device before handing their
        # parameters to the optimizer.
        self.model = CnnActorCriticNetwork(input_size, output_size, use_noisy_net).to(self.device)
        self.icm = ICMModel(input_size, output_size, use_cuda).to(self.device)
        self.optimizer = optim.Adam(
            list(self.model.parameters()) + list(self.icm.parameters()),
            lr=learning_rate)

    def _float_tensor(self, array):
        """Convert array-like data to a float32 tensor on the agent's device."""
        return torch.as_tensor(array, dtype=torch.float32, device=self.device)

    def _one_hot(self, action):
        """One-hot encode a 1-D LongTensor of action indices as float."""
        return F.one_hot(action.reshape(-1), num_classes=self.output_size).float()

    def get_action(self, state):
        """Sample one action per row of ``state`` from the current policy.

        Returns:
            ``(action, value, policy)``: sampled action indices (numpy), the
            squeezed value estimates (numpy) and the detached policy logits
            (tensor on the agent's device).
        """
        state = self._float_tensor(state)
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            policy, value = self.model(state)
            action_prob = F.softmax(policy, dim=-1).cpu().numpy()
        action = self.random_choice_prob_index(action_prob)
        return action, value.cpu().numpy().squeeze(), policy.detach()

    @staticmethod
    def random_choice_prob_index(p, axis=1):
        """Sample an index along ``axis`` for each distribution in ``p``.

        Draws one uniform number per distribution and returns the first index
        whose cumulative probability exceeds it.
        """
        r = np.expand_dims(np.random.rand(p.shape[1 - axis]), axis=axis)
        return (p.cumsum(axis=axis) > r).argmax(axis=axis)

    def compute_intrinsic_reward(self, state, next_state, action):
        """Return ``eta`` times the per-sample forward-model error as numpy.

        The error is the squared difference between the real and the predicted
        next-state feature, averaged over the last (feature) dimension.
        """
        state = self._float_tensor(state)
        next_state = self._float_tensor(next_state)
        action = torch.as_tensor(action, dtype=torch.long, device=self.device)
        with torch.no_grad():
            real_next_state_feature, pred_next_state_feature, _ = self.icm(
                [state, next_state, self._one_hot(action)])
            intrinsic_reward = self.eta * F.mse_loss(
                real_next_state_feature, pred_next_state_feature, reduction='none').mean(-1)
        return intrinsic_reward.cpu().numpy()

    def train_model(self, s_batch, next_s_batch, target_batch, y_batch, adv_batch, old_policy):
        """Run ``epoch`` passes of PPO + ICM updates over one rollout.

        Args:
            s_batch, next_s_batch: states and successor states.
            target_batch: value targets.
            y_batch: actions taken (integer indices).
            adv_batch: advantages.
            old_policy: list of logit tensors recorded during the rollout, one
                per step; stacked and permuted here to match the batch order.
        """
        s_batch = self._float_tensor(s_batch)
        next_s_batch = self._float_tensor(next_s_batch)
        target_batch = self._float_tensor(target_batch)
        y_batch = torch.as_tensor(y_batch, dtype=torch.long, device=self.device)
        adv_batch = self._float_tensor(adv_batch)

        sample_range = np.arange(len(s_batch))
        ce = nn.CrossEntropyLoss()
        forward_mse = nn.MSELoss()

        # Log-probabilities of the taken actions under the rollout policy.
        with torch.no_grad():
            policy_old_list = torch.stack(old_policy).permute(1, 0, 2).contiguous().view(
                -1, self.output_size).to(self.device)
            m_old = Categorical(F.softmax(policy_old_list, dim=-1))
            log_prob_old = m_old.log_prob(y_batch)

        for i in range(self.epoch):
            print("agent train i ",i)
            np.random.shuffle(sample_range)
            # A trailing partial mini-batch is dropped.
            for j in range(len(s_batch) // self.batch_size):
                sample_idx = sample_range[self.batch_size * j:self.batch_size * (j + 1)]
                actions = y_batch[sample_idx]

                # Curiosity losses: inverse model (action prediction) and
                # forward model (next-feature prediction).
                real_next_state_feature, pred_next_state_feature, pred_action = self.icm(
                    [s_batch[sample_idx], next_s_batch[sample_idx], self._one_hot(actions)])
                inverse_loss = ce(pred_action, actions)
                forward_loss = forward_mse(pred_next_state_feature, real_next_state_feature.detach())

                # PPO clipped surrogate objective.
                policy, value = self.model(s_batch[sample_idx])
                m = Categorical(F.softmax(policy, dim=-1))
                log_prob = m.log_prob(actions)
                ratio = torch.exp(log_prob - log_prob_old[sample_idx])
                surr1 = ratio * adv_batch[sample_idx]
                surr2 = torch.clamp(
                    ratio,
                    1.0 - self.ppo_eps,
                    1.0 + self.ppo_eps) * adv_batch[sample_idx]
                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = F.mse_loss(value.sum(1), target_batch[sample_idx])
                entropy = m.entropy().mean()

                self.optimizer.zero_grad()
                # The entropy weight comes from the constructor argument
                # instead of a hard-coded 0.001.
                loss = (actor_loss + 0.5 * critic_loss - self.ent_coef * entropy) \
                    + forward_loss + inverse_loss
                print("agent train loss ",loss)
                loss.backward()
                # NOTE: self.clip_grad_norm is stored but gradient clipping is
                # intentionally left disabled here.
                self.optimizer.step()




#### envs

In [None]:
# envs
# https://github.com/jcwleo/curiosity-driven-exploration-pytorch/blob/master/envs.py
import cv2
import numpy as np
from abc import abstractmethod
from collections import deque
from copy import copy
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
from torch.multiprocessing import Pipe, Process
from PIL import Image

class Environment(Process):
    """Interface shared by the environment worker processes.

    The methods are tagged with ``@abstractmethod`` as a marker for
    subclasses; each body is empty and returns ``None``.
    """

    @abstractmethod
    def run(self):
        """Process entry point; subclasses implement the worker loop."""

    @abstractmethod
    def reset(self):
        """Start a new episode."""

    @abstractmethod
    def pre_proc(self, x):
        """Preprocess one raw observation ``x``."""

    @abstractmethod
    def get_init_state(self, x):
        """Initialise the worker's state from the observation ``x``."""


def unwrap(env):
    """Return the innermost environment behind a chain of wrappers.

    An object exposing ``unwrapped`` short-circuits to that attribute;
    otherwise the ``env`` attribute, then the ``leg_env`` attribute, is
    followed until none of the three is present.
    """
    current = env
    while True:
        if hasattr(current, "unwrapped"):
            return current.unwrapped
        if hasattr(current, "env"):
            current = current.env
        elif hasattr(current, "leg_env"):
            current = current.leg_env
        else:
            return current

class NoopResetEnv(gym.Wrapper):
    """Randomise the initial state by taking no-op steps after every reset."""

    def __init__(self, env, noop_max=30):
        """Wrap ``env``; action 0 must be the environment's 'NOOP' action.

        Args:
            env: environment to wrap.
            noop_max: upper bound (inclusive) for the number of no-op steps.
        """
        super().__init__(env)
        self.noop_max = noop_max
        # When set, this fixed count replaces the random draw in reset().
        self.override_num_noops = None
        self.noop_action = 0
        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

    def reset(self, **kwargs):
        """Reset, then take a number of no-op steps in [1, noop_max].

        Returns the observation produced by the last step, or by a further
        reset if that step ended the episode.
        """
        self.env.reset(**kwargs)
        noops = self.override_num_noops
        if noops is None:
            noops = self.unwrapped.np_random.integers(1, self.noop_max + 1)  # pylint: disable=E1101
        assert noops > 0
        obs = None
        for _ in range(noops):
            obs, _reward, done, _info = self.env.step(self.noop_action)
            if done:
                obs = self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        """Forward the action to the wrapped environment unchanged."""
        return self.env.step(ac)

class MaxAndSkipEnv(gym.Wrapper):
    """Repeat each action ``skip`` times and max-pool the last two frames."""

    def __init__(self, env, is_render, skip=4):
        """Return only every `skip`-th frame.

        Args:
            env: environment to wrap.
            is_render: call ``render()`` after every inner step when true.
            skip: number of times each action is repeated.
        """
        super().__init__(env)
        # Two most recent raw observations, used for the element-wise maximum.
        buffer_shape = (2,) + env.observation_space.shape
        self._obs_buffer = np.zeros(buffer_shape, dtype=np.uint8)
        self._skip = skip
        self.is_render = is_render

    def step(self, action):
        """Repeat ``action``, sum the rewards, and max over the last observations."""
        total_reward = 0.0
        done = None
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if self.is_render:
                self.env.render()
            # Slot 0 takes the second-to-last repeat, slot 1 the last one.
            slot = i - self._skip + 2
            if 0 <= slot <= 1:
                self._obs_buffer[slot] = obs
            if done:
                break
        # When the loop stops early the buffer may hold older frames; the
        # observation returned together with done=True is not relied upon.
        max_frame = self._obs_buffer.max(axis=0)
        return max_frame, total_reward, done, info

    def reset(self, **kwargs):
        """Reset the wrapped environment and return its result."""
        return self.env.reset(**kwargs)


class MontezumaInfoWrapper(gym.Wrapper):
    """Track which rooms were visited during an episode and report them.

    Args:
        env: environment to wrap; its unwrapped form must expose
            ``ale.getRAM()``.
        room_address: index into the 128-byte RAM holding the current room id.
    """

    def __init__(self, env, room_address):
        super(MontezumaInfoWrapper, self).__init__(env)
        self.room_address = room_address
        self.visited_rooms = set()

    def get_current_room(self):
        """Read the current room id from the emulator RAM."""
        ram = unwrap(self.env).ale.getRAM()
        assert len(ram) == 128
        return int(ram[self.room_address])

    def step(self, action):
        """Step the environment; on episode end attach the visited rooms.

        When ``done`` is true, ``info['episode']['visited_rooms']`` receives a
        snapshot of the rooms seen in this episode and the tracker is cleared.
        """
        obs, rew, done, info = self.env.step(action)
        self.visited_rooms.add(self.get_current_room())
        if done:
            # set(...) takes the snapshot without depending on the module-level
            # name `copy`, which a later `import copy` in this notebook rebinds
            # from the function to the module.
            info.setdefault('episode', {}).update(visited_rooms=set(self.visited_rooms))
            self.visited_rooms.clear()
        return obs, rew, done, info

    def reset(self, **kwargs):
        """Reset the wrapped environment, forwarding any keyword arguments."""
        return self.env.reset(**kwargs)


class AtariEnvironment(Environment):
    """Worker process that runs one Atari environment behind a pipe.

    The process receives an action on ``child_conn``, steps the environment,
    and sends back ``[history, reward, force_done, done, log_reward]`` where
    ``history`` is the stack of the last ``history_size`` preprocessed frames.

    Args:
        env_id: gym environment id.
        is_render: forwarded to ``MaxAndSkipEnv``.
        env_idx: worker index, used in the episode log line.
        child_conn: worker end of the pipe.
        history_size: number of stacked frames.
        h, w: height and width of a preprocessed frame.
        life_done: stored on the instance; not otherwise used by this class.
    """

    def __init__(
            self,
            env_id,
            is_render,
            env_idx,
            child_conn,
            history_size=4,
            h=84,
            w=84,
            life_done=True):
        super(AtariEnvironment, self).__init__()
        self.daemon = True
        self.env = MaxAndSkipEnv(NoopResetEnv(gym.make(env_id)), is_render)
        if 'Montezuma' in env_id:
            self.env = MontezumaInfoWrapper(self.env, room_address=3)
        self.env_id = env_id
        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn
        self.life_done = life_done
        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w
        self.reset()

    def run(self):
        """Serve actions from the pipe until the process is terminated."""
        super(AtariEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if 'Breakout' in self.env_id:
                # The agent's action indices are shifted up by one for Breakout.
                action += 1
            s, reward, done, info = self.env.step(action)
            # Cut the episode once its step counter exceeds the configured cap.
            if max_step_per_episode < self.steps:
                done = True
            log_reward = reward
            force_done = done
            # Drop the oldest frame and append the newest, for any stack size.
            self.history[:-1] = self.history[1:]
            self.history[-1] = self.pre_proc(s)
            self.rall += reward
            self.steps += 1
            if done:
                self.recent_rlist.append(self.rall)
                print("[Episode {}({})] Step: {}  Reward: {}  Recent Reward: {}  Visited Room: [{}]".format(
                    self.episode, self.env_idx, self.steps, self.rall, np.mean(self.recent_rlist),
                    info.get('episode', {}).get('visited_rooms', {})))
                self.reset()
            self.child_conn.send([self.history[:, :, :], reward, force_done, done, log_reward])

    def reset(self):
        """Start a new episode and return the refilled frame stack."""
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        s = self.env.reset()
        # get_init_state() preprocesses the frame itself, so pass the raw one;
        # preprocessing twice would re-quantise the resized image.
        self.get_init_state(s)
        return self.history[:, :, :]

    def pre_proc(self, X):
        """Convert a frame to grayscale float32 and resize it to (h, w)."""
        X = np.array(Image.fromarray(X).convert('L')).astype('float32')
        # cv2.resize takes dsize as (width, height).
        x = cv2.resize(X, (self.w, self.h))
        return x

    def get_init_state(self, s):
        """Fill every slot of the frame stack with the preprocessed frame ``s``."""
        frame = self.pre_proc(s)
        for i in range(self.history_size):
            self.history[i, :, :] = frame


class MarioEnvironment(Process):
    """Worker process that runs one Super Mario Bros environment behind a pipe.

    The process receives an action on ``child_conn``, steps the environment,
    and sends back ``[history, reward, force_done, done, log_reward]`` where
    ``history`` is the stack of the last ``history_size`` preprocessed frames
    and the reward is the raw reward divided by 15.

    Args:
        env_id: gym_super_mario_bros environment id.
        is_render: render before every step when true.
        env_idx: worker index, used in the episode log line.
        child_conn: worker end of the pipe.
        history_size: number of stacked frames.
        life_done: report ``force_done`` when a life is lost (and lives remain).
        h, w: height and width of a preprocessed frame.
        movement: action list handed to ``JoypadSpace``.
        sticky_action, p: accepted for interface compatibility; not used here.
    """

    def __init__(
            self,
            env_id,
            is_render,
            env_idx,
            child_conn,
            history_size=4,
            life_done=True,
            h=84,
            w=84, movement=COMPLEX_MOVEMENT, sticky_action=True,
            p=0.25):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        # Honour the caller's action set instead of always using COMPLEX_MOVEMENT.
        self.env = JoypadSpace(gym_super_mario_bros.make(env_id), movement)
        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn
        self.life_done = life_done
        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w
        self.reset()

    def run(self):
        """Serve actions from the pipe until the process is terminated."""
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()
            obs, reward, done, info = self.env.step(action)
            # When Mario loses a life (and still has lives left), report a
            # terminal state to the learner without resetting the game.
            if self.life_done:
                lost_life = self.lives > info['life'] and info['life'] > 0
                force_done = True if lost_life else done
                self.lives = info['life']
            else:
                force_done = done
            # reward range -15 ~ 15
            log_reward = reward / 15
            self.rall += log_reward
            r = log_reward
            # Track the furthest x position so the log line reports it.
            self.max_pos = max(self.max_pos, info['x_pos'])
            # Drop the oldest frame and append the newest, for any stack size.
            self.history[:-1] = self.history[1:]
            self.history[-1] = self.pre_proc(obs)
            self.steps += 1
            if done:
                self.recent_rlist.append(self.rall)
                print("[Episode {}({})] Step: {}  Reward: {}  Recent Reward: {}  Stage: {} current x:{}   max x:{}".format(self.episode, self.env_idx, self.steps, self.rall, np.mean(self.recent_rlist), info['stage'], info['x_pos'], self.max_pos))
                self.reset()
            self.child_conn.send([self.history[:, :, :], r, force_done, done, log_reward])

    def reset(self):
        """Start a new episode and return the refilled frame stack."""
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        """Convert an RGB frame to grayscale and resize it to (h, w)."""
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # cv2.resize takes dsize as (width, height).
        x = cv2.resize(x, (self.w, self.h))
        return x

    def get_init_state(self, s):
        """Fill every slot of the frame stack with the preprocessed frame ``s``."""
        frame = self.pre_proc(s)
        for i in range(self.history_size):
            self.history[i, :, :] = frame



In [None]:
# @title train
# https://github.com/jcwleo/curiosity-driven-exploration-pytorch/blob/master/train.py
from torch.multiprocessing import Pipe
from tensorboardX import SummaryWriter
import numpy as np
import copy


# def main():
# Build a throwaway environment only to read the observation shape and the
# number of discrete actions, then close it again.
if env_type == 'atari':
    env = gym.make(env_id)
elif env_type == 'mario':
    env = JoypadSpace(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT)
else:
    raise NotImplementedError
input_size = env.observation_space.shape
output_size = env.action_space.n

# AtariEnvironment.run adds 1 to every action for Breakout, so the agent
# chooses among one action fewer.
if 'Breakout' in env_id:
    output_size = output_size - 1

env.close()

is_load_model = False
is_render = False
# Checkpoints are written to the working directory, one file per network.
model_path = '{}.model'.format(env_id)
icm_path = '{}.icm'.format(env_id)
writer = SummaryWriter()
# Samples per optimisation mini-batch: one full rollout split into
# `mini_batch` parts.
batch_size = int(num_step * num_worker / mini_batch)
reward_rms = RunningMeanStd()
obs_rms = RunningMeanStd(shape=(1, 4, 84, 84))
discounted_reward = RewardForwardFilter(gamma)

# Worker process class matching the configured environment family.
if env_type == 'mario':
    env_t = MarioEnvironment
elif env_type == 'atari':
    env_t = AtariEnvironment
else:
    raise NotImplementedError

agent = ICMAgent(
    input_size,
    output_size,
    num_worker,
    num_step,
    gamma,
    lam=lam,
    learning_rate=learning_rate,
    ent_coef=entropy_coef,
    clip_grad_norm=clip_grad_norm,
    epoch=epoch,
    batch_size=batch_size,
    ppo_eps=ppo_eps,
    eta=eta,
    use_cuda=use_cuda,
    use_gae=use_gae,
    use_noisy_net=use_noisy_net
)

if is_load_model:
    # Only the actor-critic weights are restored. map_location=None is
    # torch.load's default, i.e. tensors stay on the device they were saved from.
    agent.model.load_state_dict(
        torch.load(model_path, map_location=None if use_cuda else 'cpu'))

# One worker process per environment, each connected through its own pipe.
works = []
parent_conns = []
child_conns = []
for idx in range(num_worker):
    parent_conn, child_conn = Pipe()
    # Pass the configured life_done through; without it the workers silently
    # fall back to the class default (True) and ignore the config value.
    work = env_t(env_id, is_render, idx, child_conn, life_done=life_done)
    work.start()
    works.append(work)
    parent_conns.append(parent_conn)
    child_conns.append(child_conn)
# Current frame stack of every worker.
states = np.zeros([num_worker, 4, 84, 84])

# Logging counters; the sample_* values follow the single worker at
# index sample_env_idx.
sample_episode = 0
sample_rall = 0
sample_step = 0
sample_env_idx = 0
sample_i_rall = 0
global_update = 0
global_step = 0

# normalize obs
# Drive every worker with uniformly random actions and fold the returned frame
# stacks into obs_rms one step at a time. RunningMeanStd merges batches
# incrementally, so nothing has to be buffered; stacking all
# pre_obs_norm_step frame stacks first would hold them all in memory at once
# (and np.stack would copy them again).
print('Start to initailize observation normalization parameter.....')
steps = 0
while steps < pre_obs_norm_step:
    steps += num_worker
    actions = np.random.randint(0, output_size, size=(num_worker,))
    for parent_conn, action in zip(parent_conns, actions):
        parent_conn.send(action)
    # Each reply is [history, reward, force_done, done, log_reward]; only the
    # frame stack is needed here.
    next_obs = np.stack([parent_conn.recv()[0] for parent_conn in parent_conns])
    obs_rms.update(next_obs)
print('End to initalize...')

# Main training loop. Each iteration collects num_step transitions from every
# worker, turns the intrinsic (curiosity) rewards into targets and advantages,
# and runs one agent.train_model() update. The loop has no exit condition.
while True:
    total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_values, total_policy = \
        [], [], [], [], [], [], [], [], []
    global_step += (num_worker * num_step)
    global_update += 1
    # Step 1. n-step rollout
    for _ in range(num_step):
        # Observations are normalised with the running statistics before
        # every network call.
        actions, value, policy = agent.get_action((states - obs_rms.mean) / np.sqrt(obs_rms.var))
        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)
        next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
        for parent_conn in parent_conns:
            # Worker reply: [history, reward, force_done, done, log_reward].
            s, r, d, rd, lr = parent_conn.recv()
            next_states.append(s)
            rewards.append(r)
            dones.append(d)
            real_dones.append(rd)
            log_rewards.append(lr)
        next_states = np.stack(next_states)
        rewards = np.hstack(rewards)
        dones = np.hstack(dones)
        real_dones = np.hstack(real_dones)
        # total reward = int reward
        # (the extrinsic rewards and dones are recorded below, but only the
        # intrinsic reward is passed to make_train_data in Step 3)
        intrinsic_reward = agent.compute_intrinsic_reward(
            (states - obs_rms.mean) / np.sqrt(obs_rms.var),
            (next_states - obs_rms.mean) / np.sqrt(obs_rms.var),
            actions)
        sample_i_rall += intrinsic_reward[sample_env_idx]
        total_int_reward.append(intrinsic_reward)
        total_state.append(states)
        total_next_state.append(next_states)
        total_reward.append(rewards)
        total_done.append(dones)
        total_action.append(actions)
        total_values.append(value)
        total_policy.append(policy)
        states = next_states[:, :, :, :]
        sample_rall += log_rewards[sample_env_idx]

        sample_step += 1
        # Log episode statistics for the tracked worker when its episode ends.
        if real_dones[sample_env_idx]:
            sample_episode += 1
            writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
            writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
            writer.add_scalar('data/step', sample_step, sample_episode)
            sample_rall = 0
            sample_step = 0
            sample_i_rall = 0

    # calculate last next value
    # (value of the final state, appended so total_values has num_step + 1 entries)
    _, value, _ = agent.get_action((states - obs_rms.mean) / np.sqrt(obs_rms.var))
    total_values.append(value)
    # --------------------------------------------------
    # Reorder from step-major to worker-major and flatten the two leading axes.
    total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
    total_next_state = np.stack(total_next_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
    total_action = np.stack(total_action).transpose().reshape([-1])
    total_done = np.stack(total_done).transpose()
    total_values = np.stack(total_values).transpose()
    total_logging_policy = torch.stack(total_policy).view(-1, output_size).cpu().numpy()

    # Step 2. calculate intrinsic reward
    # running mean intrinsic reward
    total_int_reward = np.stack(total_int_reward).transpose()
    # Feed the rewards step by step through the discounted running sum and
    # update reward_rms with the moments of the result.
    total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in
                                        total_int_reward.T])
    mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
    reward_rms.update_from_moments(mean, std ** 2, count)

    # normalize intrinsic reward
    # (divides by the running standard deviation only; the mean is not subtracted)
    total_int_reward /= np.sqrt(reward_rms.var)
    writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
    writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)
    # -------------------------------------------------------------------------------------------
    # logging Max action probability
    writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode)

    # Step 3. make target and advantage
    # The `done` argument is all zeros, so returns are never cut at episode ends.
    target, adv = make_train_data(total_int_reward,
                                    np.zeros_like(total_int_reward),
                                    total_values,
                                    gamma,
                                    num_step,
                                    num_worker)

    # Standardise the advantages; the epsilon guards against a zero std.
    adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-8)
    # -----------------------------------------------

    # Step 4. Training!
    agent.train_model((total_state - obs_rms.mean) / np.sqrt(obs_rms.var),
                        (total_next_state - obs_rms.mean) / np.sqrt(obs_rms.var),
                        target, total_action,
                        adv,
                        total_policy)

    # Checkpoint both networks every 100 updates (global_step grows by
    # num_worker * num_step per update).
    if global_step % (num_worker * num_step * 100) == 0:
        print('Now Global Step :{}'.format(global_step))
        torch.save(agent.model.state_dict(), model_path)
        torch.save(agent.icm.state_dict(), icm_path)



#### train

In [None]:
# train
# https://github.com/jcwleo/curiosity-driven-exploration-pytorch/blob/master/train.py
from torch.multiprocessing import Pipe
from tensorboardX import SummaryWriter
import numpy as np
import copy

# Build one throwaway environment only to read the observation shape and the
# size of the discrete action space, then release it.
#
# Fix: the original created a gym env and immediately rebound `env` to a
# JoypadSpace/Mario wrapper. The first env was never closed, and the action
# count was taken from the wrapper built with COMPLEX_MOVEMENT rather than
# from the game named by `env_id`.
if 'SuperMarioBros' in env_id:
    env = JoypadSpace(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT)
else:
    env = gym.make(env_id)
input_size = env.observation_space.shape
output_size = env.action_space.n
# Breakout exposes one action fewer to the agent.
# NOTE(review): presumably the worker shifts the action index back by one
# before stepping the game -- confirm against AtariEnvironment.
if 'Breakout' in env_id:
    output_size -= 1
env.close()

# Run switches.
is_render = False
is_load_model = False

# Checkpoint locations, keyed by environment id.
model_path = 'models/{}.model'.format(env_id)
icm_path = 'models/{}.icm'.format(env_id)

# TensorBoard writer and the PPO mini-batch size derived from the rollout size.
writer = SummaryWriter()
batch_size = int(num_step * num_worker / mini_batch)

# Running statistics used to normalise intrinsic rewards and observations.
reward_rms = RunningMeanStd()
obs_rms = RunningMeanStd(shape=(1, 4, 84, 84))
discounted_reward = RewardForwardFilter(gamma)

# Worker class used further down to spawn the environments (`env(...)`).
env = AtariEnvironment
# env = MarioEnvironment

# Keyword settings forwarded to the agent; values come from the config cell.
agent_options = dict(
    lam=lam,
    learning_rate=learning_rate,
    ent_coef=entropy_coef,
    clip_grad_norm=clip_grad_norm,
    epoch=epoch,
    batch_size=batch_size,
    ppo_eps=ppo_eps,
    eta=eta,
    use_cuda=use_cuda,
    use_gae=use_gae,
    use_noisy_net=use_noisy_net,
)
agent = ICMAgent(input_size, output_size, num_worker, num_step, gamma, **agent_options)

# Optionally resume from the checkpoints written by the training loop.
#
# Fix: checkpoints are saved for both `agent.model` and `agent.icm`, but only
# the policy was restored, so a resumed run paired trained policy weights with
# a freshly initialised curiosity module.
if is_load_model:
    import os

    # `map_location=None` is torch.load's default (restore to the saved
    # device); 'cpu' remaps the tensors for CPU-only runs.
    map_location = None if use_cuda else 'cpu'
    agent.model.load_state_dict(torch.load(model_path, map_location=map_location))
    if os.path.exists(icm_path):
        agent.icm.load_state_dict(torch.load(icm_path, map_location=map_location))
    else:
        # Older checkpoints may lack the ICM file; keep going with fresh ICM weights.
        print('ICM checkpoint not found at {}; ICM starts from scratch.'.format(icm_path))

# Spawn one worker per environment. Each worker receives the child end of a
# fresh pipe; the parent keeps both ends in lists indexed by worker id.
works, parent_conns, child_conns = [], [], []
for idx in range(num_worker):
    parent_conn, child_conn = Pipe()
    work = env(env_id, is_render, idx, child_conn)
    work.start()
    works += [work]
    parent_conns += [parent_conn]
    child_conns += [child_conn]

# Current observation of every worker; all zeros until the first step returns.
states = np.zeros((num_worker, 4, 84, 84))

# Logging / progress counters, all starting from zero.
sample_episode = sample_rall = sample_step = sample_env_idx = 0
sample_i_rall = global_update = global_step = 0

# normalize obs
# Warm-up: drive every worker with uniformly random actions and seed the
# observation statistics from everything that comes back.
print('Start to initailize observation normalization parameter.....')
steps, next_obs = 0, []
while steps < pre_obs_norm_step:
    # Each pass advances every worker by one step.
    steps += num_worker
    actions = np.random.randint(output_size, size=num_worker)
    for parent_conn, action in zip(parent_conns, actions):
        parent_conn.send(action)
    for parent_conn in parent_conns:
        # Only the observation is used here; the other fields are ignored.
        s, r, d, rd, lr = parent_conn.recv()
        next_obs.append(s[:])
# All collected observations are stacked and fed to the statistics in one go.
next_obs = np.stack(next_obs, axis=0)
obs_rms.update(next_obs)
print('End to initalize...')

import os  # cell-local: used below to create the checkpoint directory

# Each pass of this loop is one policy update: roll every worker forward
# `num_step` steps, turn the curiosity (intrinsic) reward into targets and
# advantages, call `agent.train_model`, and checkpoint every 100 updates.
# while True:
for x in range(1):  # NOTE(review): performs a single update; the commented `while True` above looks like the intended training loop
    total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_values, total_policy = \
        [], [], [], [], [], [], [], []
    global_step += (num_worker * num_step)
    global_update += 1
    # Step 1. n-step rollout
    for _ in range(num_step):
        # Observations are standardised with the running statistics before
        # they are handed to the agent.
        actions, value, policy = agent.get_action((states - obs_rms.mean) / np.sqrt(obs_rms.var))
        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)
        next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            next_states.append(s)
            rewards.append(r)
            dones.append(d)
            real_dones.append(rd)
            log_rewards.append(lr)
        next_states = np.stack(next_states)
        rewards = np.hstack(rewards)
        dones = np.hstack(dones)
        real_dones = np.hstack(real_dones)
        # total reward = int reward
        # The extrinsic `rewards` are recorded but never reach the targets;
        # only the intrinsic reward computed here is used for training.
        intrinsic_reward = agent.compute_intrinsic_reward((states - obs_rms.mean) / np.sqrt(obs_rms.var), (next_states - obs_rms.mean) / np.sqrt(obs_rms.var), actions)
        sample_i_rall += intrinsic_reward[sample_env_idx]
        total_int_reward.append(intrinsic_reward)
        total_state.append(states)
        total_next_state.append(next_states)
        total_reward.append(rewards)
        total_done.append(dones)
        total_action.append(actions)
        total_values.append(value)
        total_policy.append(policy)
        states = next_states[:, :, :, :]
        # Episode statistics are tracked for one sampled worker only.
        sample_rall += log_rewards[sample_env_idx]
        sample_step += 1
        if real_dones[sample_env_idx]:
            sample_episode += 1
            writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
            writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
            writer.add_scalar('data/step', sample_step, sample_episode)
            sample_rall = 0
            sample_step = 0
            sample_i_rall = 0

    # calculate last next value
    # One extra value estimate for the state after the final step, so
    # `total_values` holds num_step + 1 entries per worker.
    _, value, _ = agent.get_action((states - obs_rms.mean) / np.sqrt(obs_rms.var))
    total_values.append(value)
    # --------------------------------------------------
    # Reorder the per-step stacks to worker-major and flatten the state
    # batches to (-1, 4, 84, 84).
    total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
    total_next_state = np.stack(total_next_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
    total_action = np.stack(total_action).transpose().reshape([-1])
    total_done = np.stack(total_done).transpose()
    total_values = np.stack(total_values).transpose()
    total_logging_policy = torch.stack(total_policy).view(-1, output_size).cpu().numpy()

    # Step 2. calculate intrinsic reward
    # running mean intrinsic reward
    total_int_reward = np.stack(total_int_reward).transpose()
    # Iterating the transpose walks the rollout one time step at a time.
    # NOTE(review): `discounted_reward.update` presumably keeps a discounted
    # running sum per worker -- confirm against RewardForwardFilter.
    total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T])
    mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
    reward_rms.update_from_moments(mean, std ** 2, count)

    # normalize intrinsic reward
    total_int_reward /= np.sqrt(reward_rms.var)
    writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
    writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)
    # -------------------------------------------------------------------------------------------
    # logging Max action probability
    writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode)

    # Step 3. make target and advantage
    # The `done` argument is all zeros, so returns are not cut at episode
    # boundaries.
    target, adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward),
                                    total_values, gamma, num_step, num_worker)

    # Standardise the advantages; the epsilon guards against a zero std.
    adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-8)
    # -----------------------------------------------

    # Step 4. Training!
    agent.train_model((total_state - obs_rms.mean) / np.sqrt(obs_rms.var),
                        (total_next_state - obs_rms.mean) / np.sqrt(obs_rms.var),
                        target, total_action, adv, total_policy)

    if global_step % (num_worker * num_step * 100) == 0:
        print('Now Global Step :{}'.format(global_step))
        # torch.save does not create missing parent directories, so make sure
        # the checkpoint folder exists before writing into it.
        for ckpt_path in (model_path, icm_path):
            os.makedirs(os.path.dirname(ckpt_path) or '.', exist_ok=True)
        torch.save(agent.model.state_dict(), model_path)
        torch.save(agent.icm.state_dict(), icm_path)



  "We recommend you to use a symmetric and normalized Box action space (range=[-1, 1]) "
Process AtariEnvironment-18:
Traceback (most recent call last):
Process AtariEnvironment-19:
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "<ipython-input-6-b0698bc31eff>", line 168, in run
    s, reward, done, info = self.env.step(action)
  File "<ipython-input-6-b0698bc31eff>", line 88, in step
    obs, reward, done, info = self.env.step(action)
  File "<ipython-input-6-b0698bc31eff>", line 72, in step
    return self.env.step(ac)
  File "/usr/local/lib/python3.7/dist-packages/gym/wrappers/time_limit.py", line 49, in step
    observation, reward, done, info = self.env.step(action)
  File "/usr/local/lib/python3.7/dist-packages/gym/wrappers/order_enforcing.py", line 37, in step
    return self.env.step(action)
  File "/usr/local/lib/python3.7/dist-packages/gym/wrappers/env_checker.py", line 41, in step
    return self.env.step(action)
Process

Start to initailize observation normalization parameter.....


  File "<ipython-input-6-b0698bc31eff>", line 168, in run
    s, reward, done, info = self.env.step(action)
  File "<ipython-input-6-b0698bc31eff>", line 72, in step
    return self.env.step(ac)
  File "<ipython-input-6-b0698bc31eff>", line 88, in step
    obs, reward, done, info = self.env.step(action)
  File "<ipython-input-6-b0698bc31eff>", line 72, in step
    return self.env.step(ac)
  File "<ipython-input-6-b0698bc31eff>", line 72, in step
    return self.env.step(ac)
  File "/usr/local/lib/python3.7/dist-packages/gym/wrappers/order_enforcing.py", line 37, in step
    return self.env.step(action)
  File "<ipython-input-6-b0698bc31eff>", line 88, in step
    obs, reward, done, info = self.env.step(action)
  File "<ipython-input-6-b0698bc31eff>", line 88, in step
    obs, reward, done, info = self.env.step(action)
  File "/usr/local/lib/python3.7/dist-packages/gym/wrappers/time_limit.py", line 49, in step
    observation, reward, done, info = self.env.step(action)
  File "<ipython-

KeyboardInterrupt: ignored

  File "/usr/lib/python3.7/multiprocessing/connection.py", line 206, in send
    self._send_bytes(_ForkingPickler.dumps(obj))
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 206, in send
    self._send_bytes(_ForkingPickler.dumps(obj))
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 206, in send
    self._send_bytes(_ForkingPickler.dumps(obj))
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 398, in _send_bytes
    self._send(buf)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 398, in _send_bytes
    self._send(buf)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 398, in _send_bytes
    self._send(buf)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 398, in _send_bytes
    self._send(buf)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(sel

#### eval


In [None]:
# eval
# https://github.com/jcwleo/curiosity-driven-exploration-pytorch/blob/master/eval.py
from torch.multiprocessing import Pipe
from tensorboardX import SummaryWriter
import numpy as np
import pickle

# Evaluation: play one episode with the checkpoints written by the training
# cell and dump the standardised intrinsic rewards to the file `int_reward`.
#
# Fixes relative to the previous version of this cell, which was written for a
# different agent:
#   * it built `RNDAgent` and loaded `.pred` / `.target` files, whereas the
#     training cell builds `ICMAgent` and only ever writes `.model` and `.icm`;
#   * it unpacked four values from `get_action` and passed a single argument
#     to `compute_intrinsic_reward`; the calls below follow the training loop;
#   * it scaled observations by 1/255, whereas the policy is trained on
#     observations standardised with `obs_rms`;
#   * the environment wrapper was hard-coded to Mario regardless of `env_id`;
#   * the worker was left running after the episode finished.
#
# NOTE(review): `obs_rms` is not stored in the checkpoints, so this cell must
# run in the same session as the training cell (or after `obs_rms` has been
# rebuilt by the warm-up step).
# NOTE(review): this cell rebinds the globals `use_cuda` and `num_worker`;
# re-run the config cell before training again.

# Probe the environment once for the observation shape and action count.
is_mario = 'SuperMarioBros' in env_id
if is_mario:
    env = JoypadSpace(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT)
else:
    env = gym.make(env_id)
input_size = env.observation_space.shape
output_size = env.action_space.n
if 'Breakout' in env_id: output_size -= 1

env.close()
is_render = True
model_path = 'models/{}.model'.format(env_id)
icm_path = 'models/{}.icm'.format(env_id)
use_cuda = False
num_worker = 1
batch_size = int(num_step * num_worker / mini_batch)
sticky_action = False

# Worker class matching the environment id.
env = MarioEnvironment if is_mario else AtariEnvironment

agent = ICMAgent(
    input_size,
    output_size,
    num_worker,
    num_step,
    gamma,
    lam=lam,
    learning_rate=learning_rate,
    ent_coef=entropy_coef,
    clip_grad_norm=clip_grad_norm,
    epoch=epoch,
    batch_size=batch_size,
    ppo_eps=ppo_eps,
    eta=eta,
    use_cuda=use_cuda,
    use_gae=use_gae,
    use_noisy_net=use_noisy_net
)

print('Loading Pre-trained model....')
# `map_location=None` is torch.load's default; 'cpu' remaps tensors that were
# saved from a GPU run.
map_location = None if use_cuda else 'cpu'
agent.model.load_state_dict(torch.load(model_path, map_location=map_location))
agent.icm.load_state_dict(torch.load(icm_path, map_location=map_location))
print('End load...')

works = []
parent_conns = []
child_conns = []
for idx in range(num_worker):
    parent_conn, child_conn = Pipe()
    work = env(env_id, is_render, idx, child_conn)
    work.start()
    works.append(work)
    parent_conns.append(parent_conn)
    child_conns.append(child_conn)

states = np.zeros([num_worker, 4, 84, 84])
steps = 0
rall = 0
rd = False
intrinsic_reward_list = []
while not rd:
    steps += 1
    # Same observation standardisation as in the training rollout.
    norm_states = (states - obs_rms.mean) / np.sqrt(obs_rms.var)
    actions, value, policy = agent.get_action(norm_states)
    for parent_conn, action in zip(parent_conns, actions):
        parent_conn.send(action)
    for parent_conn in parent_conns:
        s, r, d, rd, lr = parent_conn.recv()
        rall += r
        next_states = s.reshape([1, 4, 84, 84])

    # Curiosity signal for the transition that was just taken.
    norm_next_states = (next_states - obs_rms.mean) / np.sqrt(obs_rms.var)
    intrinsic_reward = agent.compute_intrinsic_reward(norm_states, norm_next_states, actions)
    intrinsic_reward_list.append(intrinsic_reward)
    states = next_states[:, :, :, :]

    if rd:
        # Standardise the episode's intrinsic rewards; the epsilon (same value
        # as the advantage normalisation in training) avoids dividing by a
        # zero std on very short episodes.
        intrinsic_reward_list = np.asarray(intrinsic_reward_list)
        intrinsic_reward_list = (intrinsic_reward_list - np.mean(intrinsic_reward_list)) / (
            np.std(intrinsic_reward_list) + 1e-8)
        with open('int_reward', 'wb') as f:
            pickle.dump(intrinsic_reward_list, f)
        steps = 0
        rall = 0

# The worker is still waiting for another action at this point; stop it so
# the process does not outlive the evaluation.
for work in works:
    work.terminate()
    work.join()

