In [1]:
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

import random
import cv2
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.regularizers import l2
import tensorflow_hub as hub
from collections import deque
from tqdm import tqdm

In [2]:
class FReLU(tf.keras.layers.Layer):
    """FReLU-style activation: element-wise max of the input and a conv branch.

    The branch is a depthwise convolution (stride 1, 'same' padding) followed
    by batch normalization, so its output has the same shape as the input and
    the two tensors can be compared element by element.

    Args:
        kernel_size: Window size of the depthwise convolution.
        **kwargs: Standard `tf.keras.layers.Layer` arguments (e.g. `name`).
    """

    def __init__(self, kernel_size=3, **kwargs):
        super().__init__(**kwargs)
        self.f_cond = tf.keras.layers.DepthwiseConv2D(kernel_size, strides=(1, 1), padding='same')
        self.bn = tf.keras.layers.BatchNormalization()

    def call(self, inputs):
        """Return `max(inputs, bn(depthwise_conv(inputs)))`, element-wise."""
        x = self.f_cond(inputs)
        x = self.bn(x)
        # Apply the max directly: wrapping it in a Lambda layer would construct
        # a brand-new Keras layer object on every forward pass for no benefit.
        return self.max_unit([inputs, x])

    def max_unit(self, args):
        """Element-wise maximum of an `[inputs, depthconv_output]` pair."""
        inputs, depthconv_output = args
        return tf.maximum(inputs, depthconv_output)
    
class Mish(tf.keras.layers.Activation):
    """`Activation` layer that applies the given callable and is labelled 'Mish'.

    Args:
        activation: Activation callable (or name) handed to
            `tf.keras.layers.Activation`.
        **kwargs: Forwarded unchanged to `tf.keras.layers.Activation`.
    """

    def __init__(self, activation, **kwargs):
        super().__init__(activation, **kwargs)
        # Give the instance a readable label.
        self.__name__ = 'Mish'

def mish(inputs):
    """Return `inputs * tanh(softplus(inputs))`, computed element-wise."""
    gate = tf.math.tanh(tf.math.softplus(inputs))
    return inputs * gate

class ActorCritic(tf.keras.Model):
    """Two-branch actor-critic network with a shared recurrent trunk.

    The image branch runs three separable convolutions (each followed by
    `FReLU`), flattens, and feeds a single-step LSTM. The action-history
    branch runs a GRU, a dense layer and `Mish`. Both branches are
    concatenated and fed to a softmax actor head and a scalar critic head.

    Args:
        act_spec: Number of units of the softmax actor head.
        shape: Sequence whose first two entries are the height and width the
            input frames are resized to.
        hidden_sizes: Accepted but not read by this class.
    """

    def __init__(self, act_spec, shape, hidden_sizes):
        super().__init__()
        # NOTE: attribute names and creation order are kept stable on purpose;
        # saved weights are matched against them.
        self.layer1 = tf.keras.layers.SeparableConv2D(8, 8, strides=4)
        self.activation1 = FReLU()
        self.layer2 = tf.keras.layers.SeparableConv2D(16, 4, strides=2)
        self.activation2 = FReLU()
        self.layer3 = tf.keras.layers.SeparableConv2D(32, 3, strides=1)
        self.activation3 = FReLU()
        self.layer4 = tf.keras.layers.Flatten()
        self.lstm = tf.keras.layers.LSTM(256, return_sequences=True, return_state=True)
        self.actor = tf.keras.layers.Dense(act_spec, activation='softmax', name='actor')
        self.shape = shape

        self.act_rnn = tf.keras.layers.GRU(64)
        self.act_dense = tf.keras.layers.Dense(32)
        self.act_activation = Mish(mish)
        self.critic = tf.keras.layers.Dense(1, name='critic')

    def call(self, frames, action_seq, state):
        """Run one forward pass.

        Args:
            frames: Batch of image frames; divided by 255 and resized to
                `shape[0] x shape[1]` before the convolutions.
            action_seq: Input sequence for the action-history GRU.
            state: `None` to let the LSTM use its default initial state, or
                the state list returned by a previous call.

        Returns:
            `(action_probabilities, value, [lstm_state_1, lstm_state_2])`.
        """
        pixels = tf.image.resize(frames / 255, [self.shape[0], self.shape[1]])

        # Action-history branch.
        history_feat = self.act_activation(self.act_dense(self.act_rnn(action_seq)))

        # Image branch: conv -> FReLU, three times.
        conv_stack = (
            (self.layer1, self.activation1),
            (self.layer2, self.activation2),
            (self.layer3, self.activation3),
        )
        features = pixels
        for conv, activation in conv_stack:
            features = activation(conv(features))

        # The LSTM expects a time axis; every call is a one-step sequence.
        features = tf.expand_dims(self.layer4(features), axis=1)
        if state is None:
            lstm_out = self.lstm(features)
        else:
            lstm_out = self.lstm(features, initial_state=state)
        seq_out, m_state, c_state = lstm_out

        merged = tf.concat([tf.squeeze(seq_out, axis=1), history_feat], 1)
        return self.actor(merged), self.critic(merged), [m_state, c_state]

In [3]:
class Agent():
    """Actor-critic agent that trains an `ActorCritic` model on Super Mario Bros.

    Args:
        act_spec: Number of discrete actions the policy chooses between.
        obs_spec: Accepted for interface compatibility; not read.
    """

    def __init__(self, act_spec, obs_spec):
        self.h, self.w = 84, 84          # frame size the model resizes to
        self.gamma = 0.99                # discount factor
        self.act_spec = act_spec
        self.batch_size = 20             # environment steps per gradient update
        self.gym_steps = 1000            # environment steps per `train_step`
        self.hidden_size = (256, 128)

        self.huber_loss = tf.keras.losses.Huber()
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
        self.model = ActorCritic(act_spec, (self.h, self.w, 3), self.hidden_size)

    def load_weights(self, path):
        """Load model weights from `path`."""
        self.model.load_weights(path)

    def save_weights(self, path):
        """Save model weights to `path`."""
        self.model.save_weights(path)

    def compute_loss(self, actions, action_probs, values, returns):
        """Compute the actor and critic losses for one rollout segment.

        Args:
            actions: Sequence of `n` chosen action indices.
            action_probs: Sequence of `n` probability vectors, one per step.
            values: Sequence of `n` critic outputs; each is flattened to a
                scalar (assumes one value per step).
            returns: Sequence of `n` return targets.

        Returns:
            `(actor_loss, critic_loss)` as scalar tensors.
        """
        # Bring everything to matching 1-D shapes. Subtracting an (n,) array
        # from the (n, 1, 1) stack of critic outputs would broadcast to
        # (n, 1, n) and pair every return with every value.
        returns = tf.reshape(tf.convert_to_tensor(np.asarray(returns, dtype=np.float32)), [-1])
        values = tf.reshape(tf.cast(tf.stack(values), tf.float32), [-1])
        action_probs = tf.stack(action_probs)
        actions = tf.convert_to_tensor(np.asarray(actions, dtype=np.int64))

        # The critic is only a baseline here: no gradient flows through it.
        advantage = returns - tf.stop_gradient(values)

        # sparse_categorical_crossentropy already yields -log pi(a|s), so the
        # policy-gradient loss is its product with the advantage (no extra
        # negation, which would push probability away from good actions).
        neg_log_probs = tf.keras.losses.sparse_categorical_crossentropy(actions, action_probs)
        actor_loss = tf.reduce_mean(neg_log_probs * advantage)

        critic_loss = self.huber_loss(returns[:, tf.newaxis], values[:, tf.newaxis])
        return actor_loss, critic_loss

    def expected_return(self, rewards, stanardize=True):
        """Compute the discounted return-to-go for every step of a segment.

        `G[t] = rewards[t] + gamma * G[t + 1]`, accumulated from the last step
        backwards so that each entry only depends on rewards at or after `t`.

        Args:
            rewards: Sequence of per-step rewards.
            stanardize: When true (default), shift and scale the returns to
                zero mean and unit standard deviation.

        Returns:
            List of floats with the same length as `rewards` (empty list for
            empty input).
        """
        n = len(rewards)
        if n == 0:
            return []

        returns = np.zeros(n, dtype=np.float64)
        discount_sum = 0.0
        for i in reversed(range(n)):
            discount_sum = rewards[i] + self.gamma * discount_sum
            returns[i] = discount_sum

        if stanardize:
            # eps keeps the division finite when all returns are equal.
            eps = np.finfo(np.float32).eps.item()
            returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        return returns.tolist()

    def train_step(self, eps):
        """Play `gym_steps` steps on a random world-1 stage, updating the model.

        The rollout is cut into segments of `batch_size` steps; each segment
        is recorded under a gradient tape and followed by one optimizer step.
        Returns are computed per segment from that segment's rewards only.

        Args:
            eps: Forwarded unchanged to `run_episord`.

        Returns:
            Sum of the shaped rewards collected over the whole call.
        """
        world, stage = 1, random.choice([1, 2, 3, 4])
        env = gym_super_mario_bros.make('SuperMarioBros-%d-%d-v0'%(world, stage))
        env = JoypadSpace(env, SIMPLE_MOVEMENT)
        try:
            state = env.reset()
            _, _, done, preinfo = env.step(0)
            model_state, action_history = None, [0 for _ in range(5)]

            ep_reward = 0
            for _ in range(int(self.gym_steps/self.batch_size)):
                # The forward passes happen inside run_episord, so the rollout
                # itself must run under the tape.
                with tf.GradientTape() as tape:
                    actions, action_probs, values, rewards, next_data = run_episord(
                        self.model, env, state, preinfo, done, eps, model_state, action_history,
                        act_spec=self.act_spec, steps=self.batch_size, is_render=True)
                    # Use this instance, not a notebook-level global.
                    returns = self.expected_return(rewards)
                    actor_loss, critic_loss = self.compute_loss(actions, action_probs, values, returns)
                    loss = actor_loss + critic_loss

                grads = tape.gradient(loss, self.model.trainable_variables)
                self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
                ep_reward += tf.math.reduce_sum(rewards).numpy()

                # Carry emulator and recurrent state over to the next segment.
                env, state, preinfo, done = next_data[0], next_data[1], next_data[2], next_data[3]
                model_state, action_history = next_data[4], next_data[5]
        finally:
            # Release the emulator even if a segment raises.
            env.close()
        return ep_reward

In [4]:
def run_episord(model, env, state, preinfo, done, eps, model_state, action_history, act_spec=7, steps=1000, is_render=False):
    """Roll the policy forward for `steps` environment steps and record them.

    Args:
        model: Callable `model(frames, history, model_state)` returning
            `(action_prob, value, new_model_state)`.
        env: Environment exposing `reset`, `step` and `render`.
        state: Current observation (a batch axis is added before the model call).
        preinfo: `info` dict of the previous step; its `'score'` is used for
            reward shaping.
        done: Whether the previous step ended the episode; if so the
            environment is reset before acting.
        eps: Accepted for interface compatibility; not used.
        model_state: Recurrent state passed to (and replaced by) the model.
        action_history: Most recent action indices; its length fixes the
            history window. The caller's list is not modified.
        act_spec: Number of discrete actions.
        steps: Number of environment steps to take.
        is_render: Render the environment after every reset and step.

    Returns:
        `(actions, action_probs, values, rewards, next_data)` where
        `next_data` is `[env, state, preinfo, done, model_state, action_history]`
        for resuming the rollout.
    """
    actions, values, action_probs, rewards = [], [], [], []
    # Work on a copy so appending below never mutates the caller's list.
    action_history = list(action_history)
    seq_len = len(action_history)

    for _ in range(steps):
        if done:
            state = env.reset()
            _, _, done, preinfo = env.step(0)
            if is_render:
                env.render()

        # One-hot width comes from the act_spec argument rather than from a
        # notebook-level global.
        history = tf.one_hot([action_history], act_spec)
        action_prob, value, model_state = model(state[np.newaxis, :, :, :], history, model_state)
        action = np.random.choice(act_spec, p=np.squeeze(action_prob))
        action_history.append(action)
        action_history = action_history[-seq_len:]

        state, reward, done, info = env.step(action)
        # Reward shaping: score gained this step, a terminal bonus for
        # reaching the flag or a penalty otherwise, then a global rescale.
        reward += (info['score'] - preinfo['score']) / 40.0
        if done:
            if info['flag_get']:
                reward += 350.0
            else:
                reward -= 50.0
        reward = reward / 10.0

        values.append(value)
        actions.append(action)
        action_probs.append(action_prob[0])
        rewards.append(reward)

        preinfo = info
        if is_render:
            env.render()
    return actions, action_probs, values, rewards, [env, state, preinfo, done, model_state, action_history]

In [5]:
# Probe environment: provides an initial observation whose shape is handed to
# the Agent constructor.
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)
state = env.reset()
action = env.action_space.sample()
# Action ids (arrows are d-pad directions; "right x2" presumably means running):
#   0: (none)  1: right  2: right + jump  3: right x2
#   4: right x2 + jump  5: jump  6: left
action_spec = 7

agent = Agent(action_spec, state.shape)
# Resume from the checkpoint only when one exists, so the notebook also runs
# on a fresh checkout where the training cell has not written it yet.
# A TF-format weights checkpoint saved under this prefix has a '.index' file.
checkpoint_path = './agents/checkpoint/model'
if tf.io.gfile.exists(checkpoint_path + '.index'):
    agent.load_weights(checkpoint_path)
else:
    print('No checkpoint found at %s; starting from freshly initialised weights.' % checkpoint_path)

In [None]:
# Number of `agent.train_step` calls to run.
epochs = 1000
# NOTE(review): `batch_size` and `size` are not read anywhere in the visible
# cells (the Agent keeps its own `batch_size`) — confirm before removing.
batch_size = 100
size = 1000

with tqdm(range(epochs)) as pbar:
    for epoch in pbar:

        # Fraction of the run completed (0 at the first epoch, approaching 1);
        # passed through to `train_step`.
        eps = epoch / epochs
        episord_reward = agent.train_step(eps)
        # Show the latest reward total next to the progress bar.
        pbar.set_postfix({'EpisordReward': episord_reward})

        # Checkpoint every 10th epoch, including epoch 0.
        if epoch % 10 == 0:
            agent.save_weights('./agents/checkpoint/model')

  4%|█████▉                                                                                                                                                           | 37/1000 [16:56<7:12:07, 26.92s/it, EpisordReward=147]