In [8]:
from vizdoom import gymnasium_wrapper
import numpy as np
import random
import time
import gymnasium as gym
from collections import deque
import skimage
import keras
import math

In [9]:
environment = gym.make("VizdoomBasic-v0", render_mode = 'rgb_array')



In [10]:
class ReplayBuffer:
    def __init__(self, buffer_size):
        self.elements = []
        self.buffer_limit = buffer_size
        return None
    def insert(self, element):
        if len(self.elements) == self.buffer_limit:
            self.elements.pop(random.randint(0, self.buffer_limit - 1))
            self.elements.append(element)
        else:
            self.elements.append(element)
        return None
    def sample(self, count = 1):
        result = []
        for i in range(count):
            result.append(self.elements[random.randint(0, len(self.elements) - 1)])
        return result

@keras.saving.register_keras_serializable()
class DeepQ(keras.Layer):
    def __init__(self,
                 filters,
                 kernels,
                 strides,
                 dense_units,
                 **kwargs):
        super(DeepQ, self).__init__()
        self.filters, self.kernels, self.strides, self.dense_units = filters, kernels, strides, dense_units
        for i in range(len(filters)):
            conv = keras.layers.Conv2D(filters = filters[i],
                                       kernel_size = kernels[i],
                                       strides = strides[i],
                                       padding = 'same')
            self._layers.append(conv)
        self._layers.append(keras.layers.Flatten())
        for i in range(len(dense_units)):
            dense = keras.layers.Dense(units = dense_units[i])
            self._layers.append(dense)
        return None
    def call(self, inputs):
        x = inputs
        for layer in self._layers:
            x = layer(x)
        return x
    def get_config(self):
        config = {
            'filters': self.filters,
            'kernels': self.kernels,
            'strides': self.strides,
            'dense_units': self.dense_units,
        }
        base_config = super(DeepQ, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
    @classmethod
    def from_config(cls, config):
        filters = config.pop('filters')
        kernels = config.pop('kernels')
        strides = config.pop('strides')
        dense_units = config.pop('dense_units')
        layer = cls(filters = filters,
                    kernels = kernels,
                    strides = strides,
                    dense_units = dense_units,
                    **config)
        return layer

In [11]:
def preprocess(frame):
    reduce_dims = frame[0]
    gs = np.mean(reduce_dims, -1) / 255
    cf = np.array(gs)[30:-10, 30:-30]
    result = skimage.transform.resize(cf, [84, 84])
    result = np.expand_dims(result, 0)
    return result

def stack_state(stack, state, is_new):
    state = preprocess(state)
    if is_new:
        stack = deque([np.zeros((84, 84)) for i in range(4)], maxlen = 4)
        for _ in range(4):
            stack.append(state)
    else:
        stack.append(state)
    tensor = np.stack(stack, -1)
    return tensor, stack

def sample_action(env: gym.Env, function: keras.Model, state, epsilon, verbosity):
    if random.random() < epsilon:
        return env.action_space.sample()
    else:
        return np.argmax(function.predict(state, verbose = verbosity))

def initialize_memory(env: gym.Env, buffer_size):
    buffer = ReplayBuffer(buffer_size)
    stack = deque([np.zeros((84, 84)) for i in range(4)], maxlen = 4)
    state, info = env.reset()
    state = state["screen"]
    state = np.expand_dims(state, 0)
    state, stacked_states = stack_state(stack, state, True)
    for i in range(1, buffer_size):
        action = env.action_space.sample()
        next_state, reward, terminated, truncated, info = env.step(action)
        next_state = next_state["screen"]
        next_state = np.expand_dims(next_state, 0)
        if terminated or truncated:
            next_state = np.zeros((state.shape))
            buffer.insert((state, action, reward, next_state, True))
            state, info = env.reset()
            state = state["screen"]
            state = np.expand_dims(state, 0)
            state, stacked_states = stack_state(stack, state, True)
        else:
            next_state, stacked_states = stack_state(stacked_states, next_state, False)
            buffer.insert((state, action, reward, next_state, False))
            state = next_state
    return buffer, stack

In [12]:
input = keras.layers.Input(shape = (84, 84, 4))
q = DeepQ(filters = [32, 64, 64],
        kernels = [8, 4, 3],
        strides = [4, 3, 1],
        dense_units = [512, 4])
output = q(input)
function = keras.models.Model(input, output)
function.compile(optimizer = keras.optimizers.Adam(0.0005), loss = 'mse')

function.save('test.keras')

r = keras.models.load_model('test.keras')

  trackable.load_own_variables(weights_store.get(inner_path))


In [13]:
def train(env: gym.Env, 
          episodes: int, 
          episode_length: int, 
          input_shape: tuple, 
          conv_filters: list, 
          conv_kernels: list, 
          conv_strides: list, 
          dense_units: list, 
          buffer_size: int, 
          learning_rate: float = 0.0005, 
          epsilon: float = 0.01, 
          epsilon_decay: float | None = None, 
          gamma: float = 0.999, 
          batch_size: int = 32, 
          reset_frequency: int = 16, 
          verbosity = 0, 
          id: int | str = 0) -> keras.Model:

    assert(len(conv_filters) == len(conv_kernels) == len(conv_strides))
    assert(len(dense_units) > 0)
    assert(0 <= epsilon <= 1)

    _w = math.floor(math.log10(episodes)) + 1
    input = keras.layers.Input(shape = input_shape)
    q = DeepQ(filters = conv_filters,
            kernels = conv_kernels,
            strides = conv_strides,
            dense_units = dense_units)
    output = q(input)
    function = keras.models.Model(input, output)

    target_input = keras.layers.Input(shape = input_shape)
    target_q = DeepQ(filters = conv_filters,
                    kernels = conv_kernels,
                    strides = conv_strides,
                    dense_units = dense_units)
    target_output = target_q(target_input)
    target_function = keras.models.Model(target_input, target_output)
    
    function.compile(optimizer = keras.optimizers.Adam(learning_rate), loss = 'mse')
    target_function.compile(optimizer = keras.optimizers.Adam(learning_rate), loss = 'mse')
    
    episode_rewards = []
    
    best_reward = 0.0
    
    buffer, stack = initialize_memory(env, buffer_size)
    
    for episode in range(episodes):
        _st = time.time()
        
        episode_rewards.append(0.0)
        state, info = env.reset()
        state = state["screen"]
        state = np.expand_dims(state, 0)
        state, stacked_states = stack_state(stack, state, True)
        for step in range(episode_length):
            if epsilon_decay is not None:
                epsilon *= math.exp(-epsilon_decay)
            action = sample_action(env, function, state, epsilon, verbosity)
            next_state, reward, terminated, truncated, info = env.step(action)
            next_state = next_state["screen"]
            next_state = np.expand_dims(next_state, 0)
            episode_rewards[-1] += reward
            if terminated or truncated:
                next_state = np.zeros((state.shape))
                buffer.insert((state, action, reward, next_state, True))
                state, info = env.reset()
                state = state["screen"]
                state = np.expand_dims(state, 0)
                state, stacked_states = stack_state(stack, state, True)
                continue
            else:
                next_state, stacked_states = stack_state(stacked_states, next_state, False)
                buffer.insert((state, action, reward, next_state, False))
                state = next_state
            
            # Train
            batch = buffer.sample(batch_size)
            states = np.array([each[0] for each in batch], ndmin = 3)
            actions = np.array([each[1] for each in batch])
            rewards = np.array([each[2] for each in batch]) 
            next_states = np.array([each[3] for each in batch], ndmin = 3)
            dones = np.array([each[4] for each in batch])
            
            states = np.squeeze(states)
            next_states = np.squeeze(next_states)
            if batch_size == 1:
                states = np.expand_dims(states, 0)
                next_states = np.expand_dims(next_states, 0)
            q_states = function.predict(states, verbose = verbosity)
            
            for i in range(batch_size):
                state, action, reward, next_state, done = states[i], actions[i], rewards[i], next_states[i], dones[i]
                q_state = q_states[i]
                q_target = reward
                
                state = np.expand_dims(state, 0)
                next_state = np.expand_dims(next_state, 0)
                
                if not done:
                    q_target += gamma * np.amax(target_function.predict(next_state, verbose = verbosity))
                q_state[action] = q_target
                function.fit(state, np.expand_dims(q_state, 0), verbose = verbosity)
            
            
            _was_done_str = "env terminated or truncated" if terminated or truncated else "env finished               "
            _et = time.time() - _st
            _ec = episode + 1
            _er = episodes - _ec
            _eta = _et / _ec * _er
            print(f"Episode {(episode + 1): {_w}}/{episodes}\t"
                  f"[{'=' * math.floor(step * 25 / episode_length)}>"
                  f"{'-' * (25 - math.floor(step * 25 / episode_length))}]"
                  f"\tFrame {step + 1} of {episode_length}, "
                  f"{int(_et)}s elapsed); {_was_done_str}", end = '\r')

            if (episode * episode_length + step) % reset_frequency == 0:
                target_function.set_weights(function.get_weights())
        
        print(f"Episode {episode + 1: {_w}}/{episodes}\t[{'=' * 26}], "
              f"{int(time.time() - _st)}s elapsed{' ' * 50}")
        if episode_rewards[-1] > best_reward:
            print(f'Reward improved from {best_reward: .3f} to {episode_rewards[-1]: .3f}, '
                  f'saving model to file "policy_{id}.keras"')
            best_reward = episode_rewards[-1]
            function.save(f'policy_{id}.keras')
        else:
            print(f'Reward of {episode_rewards[-1]: .3f} did not improve from {best_reward: .3f}')
    
    return function

In [14]:
policy = train(env = environment,
               episodes = 20,
               episode_length = 100,
               input_shape = (84, 84, 4),
               conv_filters = [32, 64, 64],
               conv_kernels = [8, 4, 3],
               conv_strides = [4, 3, 1],
               dense_units = [512, 4],
               buffer_size = 1000,
               learning_rate = 0.005,
               epsilon = 0.1,
               epsilon_decay = 0.003,
               gamma = 0.995,
               batch_size = 4,
               reset_frequency = 50,
               verbosity = 0,
               id = "thu25apr")

Reward of -24.000 did not improve from  0.000
Reward of -120.000 did not improve from  0.000
Reward of -125.000 did not improve from  0.000
Reward of -24.000 did not improve from  0.000
Reward of -130.000 did not improve from  0.000
Reward of -120.000 did not improve from  0.000
Reward of -120.000 did not improve from  0.000
Reward of -115.000 did not improve from  0.000
Reward of -125.000 did not improve from  0.000
Reward improved from  0.000 to  82.000, saving model to file "policy_thu25apr.keras"
Reward of -125.000 did not improve from  82.000
Reward of -125.000 did not improve from  82.000
Reward of -125.000 did not improve from  82.000
Reward of -24.000 did not improve from  82.000
Reward of -130.000 did not improve from  82.000
Reward of -130.000 did not improve from  82.000
Reward of -120.000 did not improve from  82.000
Reward of -125.000 did not improve from  82.000
Reward of -24.000 did not improve from  82.000
Reward of  82.000 did not improve from  82.000


In [15]:
def evaluate_policy(env: gym.Env, policy, episode_length, video_path, buffer_size, episode_trigger, fps, verbosity):
    r_env = gym.wrappers.RecordVideo(env, video_path, episode_trigger = episode_trigger)
    r_env.metadata['video.frames_per_second'] = fps

    rewards = 0.0
    
    buffer, stack = initialize_memory(r_env, buffer_size)
    state, info = r_env.reset()
    state = state["screen"]
    state = np.expand_dims(state, 0)
    state, stacked_states = stack_state(stack, state, True)
    for step in range(episode_length):
        action = np.argmax(policy.predict(state, verbose = verbosity))
        next_state, reward, terminated, truncated, info = r_env.step(action)
        next_state = next_state["screen"]
        next_state = np.expand_dims(next_state, 0)
        rewards += reward
        if terminated or truncated:
            next_state = np.zeros((state.shape))
            buffer.insert((state, action, reward, next_state, True))
            state, info = env.reset()
            state = state["screen"]
            state = np.expand_dims(state, 0)
            state, stacked_states = stack_state(stack, state, True)
            continue
        else:
            next_state, stacked_states = stack_state(stacked_states, next_state, False)
            buffer.insert((state, action, reward, next_state, False))
            state = next_state
            
    r_env.close_video_recorder()
    r_env.close()
    return rewards

evaluate_policy(env = environment,
                policy = policy,
                episode_length = 500,
                video_path = './',
                buffer_size = 100,
                episode_trigger = lambda x: True,
                fps = 8,
                verbosity = 0)

  logger.warn(


Moviepy - Building video /Users/ericssonlin/code/Python/AIML/RL/rl-video-episode-0.mp4.
Moviepy - Writing video /Users/ericssonlin/code/Python/AIML/RL/rl-video-episode-0.mp4



                                                   

Moviepy - Done !
Moviepy - video ready /Users/ericssonlin/code/Python/AIML/RL/rl-video-episode-0.mp4




Moviepy - Building video /Users/ericssonlin/code/Python/AIML/RL/rl-video-episode-1.mp4.
Moviepy - Writing video /Users/ericssonlin/code/Python/AIML/RL/rl-video-episode-1.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /Users/ericssonlin/code/Python/AIML/RL/rl-video-episode-1.mp4




-500.0