### Space Invaders

In [None]:
import os
from copy import deepcopy

import numpy as np
import supersuit as ss
import torch
from pettingzoo.atari import space_invaders_v2
from tqdm import trange

from agilerl.algorithms.core.registry import HyperparameterConfig, RLParameter
from agilerl.algorithms.maddpg import MADDPG
from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
from agilerl.utils.algo_utils import obs_channels_to_first
from agilerl.utils.utils import create_population, observation_space_channels_to_first
from agilerl.vector.pz_async_vec_env import AsyncPettingZooVecEnv
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Network configuration handed to create_population().
# NOTE(review): key meanings below are inferred from their names and the
# AgileRL MADDPG docs -- confirm against the installed AgileRL version.
NET_CONFIG = {
    # Encoder applied to the image observations.
    "encoder_config": {
        "channel_size": [32, 32],
        "kernel_size": [3, 3],
        "stride_size": [2, 2],
    },
    # Head placed on top of the encoder output.
    "head_config": {"hidden_size": [32, 32]},
}

# Initial hyperparameters handed to create_population(); AGENT_IDS is added
# further down once the environment has been built.
INIT_HP = {
    "POPULATION_SIZE": 1,
    "ALGO": "MADDPG",
    "CHANNELS_LAST": True,  # observations are swapped to channels-first below
    "BATCH_SIZE": 32,
    # Exploration-noise settings (presumably Ornstein-Uhlenbeck parameters).
    "O_U_NOISE": True,
    "EXPL_NOISE": 0.1,
    "MEAN_NOISE": 0.0,
    "THETA": 0.15,
    "DT": 0.01,
    # Optimisation settings.
    "LR_ACTOR": 0.001,
    "LR_CRITIC": 0.001,
    "GAMMA": 0.95,
    "MEMORY_SIZE": 100000,
    "LEARN_STEP": 100,
    "TAU": 0.01,
}

# Build the preprocessed base environment: skip frames, clip rewards to
# [-1, 1], reduce colour to one channel, resize to 84x84 and stack 4 frames.
num_envs = 8
env = space_invaders_v2.parallel_env()
env = ss.frame_skip_v0(env, 4)
env = ss.clip_reward_v0(env, lower_bound=-1, upper_bound=1)
env = ss.color_reduction_v0(env, mode="B")
env = ss.resize_v1(env, x_size=84, y_size=84)
env = ss.frame_stack_v1(env, 4)

# Wrap `num_envs` factories of the base environment in the async vector env.
env_fns = [lambda: env for _ in range(num_envs)]
env = AsyncPettingZooVecEnv(env_fns)
env.reset()

# Per-agent (single-environment) spaces; image spaces are converted to
# channels-first when CHANNELS_LAST is set.
action_spaces = [env.single_action_space(agent) for agent in env.agents]
observation_spaces = [
    (
        observation_space_channels_to_first(space)
        if INIT_HP["CHANNELS_LAST"]
        else space
    )
    for space in [env.single_observation_space(agent) for agent in env.agents]
]
INIT_HP["AGENT_IDS"] = env.agents

# Ranges for the hyperparameters registered with HyperparameterConfig.
hp_config = HyperparameterConfig(
    lr_actor=RLParameter(min=1e-4, max=1e-2),
    lr_critic=RLParameter(min=1e-4, max=1e-2),
    batch_size=RLParameter(min=8, max=512, dtype=int),
    learn_step=RLParameter(
        min=20, max=200, dtype=int, grow_factor=1.5, shrink_factor=0.75
    ),
)

In [3]:
# Create the population (POPULATION_SIZE members) and train its first member.
population = create_population(
    INIT_HP["ALGO"],
    observation_spaces,
    action_spaces,
    NET_CONFIG,
    INIT_HP,
    hp_config,
    population_size=INIT_HP["POPULATION_SIZE"],
    num_envs=num_envs,
    device=device,
)
agent: MADDPG = population[0]

# Replay buffer shared by all agents; one entry per transition field.
field_names = ["obs", "action", "reward", "next_obs", "done"]
memory = MultiAgentReplayBuffer(
    INIT_HP["MEMORY_SIZE"],
    field_names=field_names,
    agent_ids=INIT_HP["AGENT_IDS"],
    device=device,
)

# Independent copy of the agent ids, used for score bookkeeping and reporting.
agent_ids = deepcopy(env.agents)

# Training-loop settings.
max_steps = 20000  # training stops once agent.steps[-1] reaches this
learning_delay = 500  # buffer counter must exceed this before learning starts
training_steps = 10000  # environment steps collected between evaluations
eval_steps = None  # passed to agent.test() as max_steps
eval_loop = 1  # passed to agent.test() as loop
total_steps = 0

In [None]:
# Train the MADDPG agent on the vectorised environment, evaluating and
# reporting after every `training_steps` environment steps, then save a
# checkpoint once `max_steps` have been collected.
print("Training...")
pbar = trange(max_steps, unit="step")
while np.less(agent.steps[-1], max_steps):
    obs, info = env.reset()
    # Running episode return for every (environment, agent) pair.
    scores = np.zeros((num_envs, len(agent_ids)))
    completed_episode_scores = []
    steps = 0
    if INIT_HP["CHANNELS_LAST"]:
        obs = {agent_id: obs_channels_to_first(s) for agent_id, s in obs.items()}

    for idx_step in range(training_steps // num_envs):
        cont_actions, discrete_action = agent.get_action(
            obs=obs, training=True, infos=info
        )
        action = discrete_action if agent.discrete_actions else cont_actions

        # Step with the policy's own action so that the transition stored in
        # the replay buffer corresponds to what was actually executed.
        next_obs, reward, termination, truncation, info = env.step(action)

        scores += np.array(list(reward.values())).transpose()
        total_steps += num_envs
        steps += num_envs

        if INIT_HP["CHANNELS_LAST"]:
            next_obs = {
                agent_id: obs_channels_to_first(ns)
                for agent_id, ns in next_obs.items()
            }

        memory.save_to_memory(
            obs,
            cont_actions,
            reward,
            next_obs,
            termination,
            is_vectorised=True,
        )

        # Learn once the buffer holds a full batch and the learning delay has
        # passed. Every loop iteration advances `num_envs` environment steps,
        # so the learning frequency is rescaled accordingly.
        ready_to_learn = (
            len(memory) >= agent.batch_size and memory.counter > learning_delay
        )
        if agent.learn_step > num_envs:
            learn_step = agent.learn_step // num_envs
            if idx_step % learn_step == 0 and ready_to_learn:
                experiences = memory.sample(agent.batch_size)
                agent.learn(experiences)
        elif ready_to_learn:
            for _ in range(num_envs // agent.learn_step):
                experiences = memory.sample(agent.batch_size)
                agent.learn(experiences)

        obs = next_obs

        # Record and reset the score of every sub-environment whose episode
        # ended (any agent terminated or truncated), and reset its noise.
        reset_noise_indices = []
        term_array = np.array(list(termination.values())).transpose()
        trunc_array = np.array(list(truncation.values())).transpose()
        for idx, (d, t) in enumerate(zip(term_array, trunc_array)):
            if np.any(d) or np.any(t):
                # Copy: `scores[idx]` is a view that is zeroed right below.
                episode_score = scores[idx].copy()
                completed_episode_scores.append(episode_score)
                agent.scores.append(episode_score)
                scores[idx] = 0
                reset_noise_indices.append(idx)
        agent.reset_action_noise(reset_noise_indices)

    # Advance the bar by the steps actually collected so it stays in sync with
    # agent.steps even when training_steps is not a multiple of num_envs.
    pbar.update(steps)
    agent.steps[-1] += steps

    fitness = agent.test(
        env,
        swap_channels=INIT_HP["CHANNELS_LAST"],
        max_steps=eval_steps,
        loop=eval_loop,
        sum_scores=False,
    )

    # Mean return per agent over the episodes completed in this interval;
    # fall back to a message when no episode finished.
    if completed_episode_scores:
        mean_scores = np.mean(np.array(completed_episode_scores), axis=0)
    else:
        mean_scores = ["0 completed episodes"] * len(agent_ids)

    print(f"--- Global steps {total_steps} ---")
    print(f"Steps {agent.steps[-1]}")
    print("Scores:")
    for idx, sub_agent in enumerate(agent_ids):
        print(f"    {sub_agent} score: {mean_scores[idx]}")
    print("Fitness")
    for idx, sub_agent in enumerate(agent_ids):
        print(f"    {sub_agent} fitness: {fitness[idx]}")
    print("Previous 5 fitness avgs")
    recent_fitness = np.mean(agent.fitness[-5:], axis=0)
    for idx, sub_agent in enumerate(agent_ids):
        print(f"  {sub_agent} fitness average: {recent_fitness[idx]}")
    agent.steps.append(agent.steps[-1])

# Persist the trained agent.
path = "./models/MADDPG"
filename = "MADDPG_trained_agent.pt"
os.makedirs(path, exist_ok=True)
save_path = os.path.join(path, filename)
agent.save_checkpoint(save_path)

pbar.close()
env.close()

### Inference

In [1]:
import os
import imageio
import numpy as np
import supersuit as ss
import torch
from pettingzoo.atari import space_invaders_v2
from PIL import Image, ImageDraw
from agilerl.algorithms.maddpg import MADDPG

In [None]:
def _label_with_episode_number(frame, episode_num):
    """Return ``frame`` as a PIL image captioned with its episode number.

    Args:
        frame: Image array accepted by ``PIL.Image.fromarray``.
        episode_num: Zero-based episode index; the caption shows it one-based.

    Returns:
        The PIL image with "Episode: N" drawn near its top-left corner.
    """
    image = Image.fromarray(frame)
    # White text on dark frames, black text on bright ones.
    text_color = (255, 255, 255) if np.mean(frame) < 128 else (0, 0, 0)
    width, height = image.size[0], image.size[1]
    ImageDraw.Draw(image).text(
        (width / 20, height / 18), f"Episode: {episode_num+1}", fill=text_color
    )
    return image

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the rendering environment with the same preprocessing pipeline that
# was used for training.
env = space_invaders_v2.parallel_env(render_mode="rgb_array")
channels_last = True
if channels_last:
    env = ss.frame_skip_v0(env, 4)
    env = ss.clip_reward_v0(env, lower_bound=-1, upper_bound=1)
    env = ss.color_reduction_v0(env, mode="B")
    env = ss.resize_v1(env, x_size=84, y_size=84)
    env = ss.frame_stack_v1(env, 4)
env.reset()

# Observation dimensions: spaces exposing `.n` are treated as one-hot
# encodable; anything else falls back to the space's shape.
try:
    state_dim = [env.observation_space(agent).n for agent in env.agents]
    one_hot = True
except AttributeError:
    state_dim = [env.observation_space(agent).shape for agent in env.agents]
    one_hot = False

# Action dimensions: spaces exposing `.n` are treated as discrete; otherwise
# the first shape entry and the low/high bounds are recorded.
try:
    action_dim = [env.action_space(agent).n for agent in env.agents]
    discrete_actions = True
    max_action = None
    min_action = None
except AttributeError:
    action_dim = [env.action_space(agent).shape[0] for agent in env.agents]
    discrete_actions = False
    max_action = [env.action_space(agent).high for agent in env.agents]
    min_action = [env.action_space(agent).low for agent in env.agents]

# Reorder each (H, W, C) shape to (C, H, W).
if channels_last:
    state_dim = [(shape[2], shape[0], shape[1]) for shape in state_dim]

n_agents = env.num_agents
# Snapshot of the agent ids so that later changes to the environment's own
# list (e.g. when agents finish) cannot affect the bookkeeping below.
agent_ids = list(env.agents)

# Load the checkpoint written by the training cell.
path = "./models/MADDPG/MADDPG_trained_agent.pt"
maddpg = MADDPG.load(path, device)

episodes = 10  # number of evaluation episodes to roll out
max_steps = 500  # per-episode step cap

rewards = []  # total (all-agent) reward per episode
frames = []  # labelled frames collected for the GIF
indi_agent_rewards = {agent_id: [] for agent_id in agent_ids}  # per-agent history

# Roll out the loaded agent for `episodes` episodes, collecting labelled
# frames and per-agent rewards, then write all frames to a GIF.
for ep in range(episodes):
    state, info = env.reset()
    agent_reward = dict.fromkeys(agent_ids, 0)

    for _ in range(max_steps):
        if channels_last:
            # Add a batch axis, then move the channel axis to position 1.
            state = {
                agent_id: np.moveaxis(np.expand_dims(s, 0), 3, 1)
                for agent_id, s in state.items()
            }

        cont_actions, discrete_action = maddpg.get_action(
            state, training=False, infos=info
        )
        action = discrete_action if maddpg.discrete_actions else cont_actions

        # Capture the frame before the action is applied.
        frames.append(_label_with_episode_number(env.render(), episode_num=ep))

        env_action = {agent: a.squeeze() for agent, a in action.items()}
        state, reward, termination, truncation, info = env.step(env_action)

        for agent_id, r in reward.items():
            agent_reward[agent_id] += r

        episode_over = any(truncation.values()) or any(termination.values())
        if episode_over:
            break

    # Episode total across all agents.
    score = sum(agent_reward.values())
    rewards.append(score)
    for agent_id in agent_ids:
        indi_agent_rewards[agent_id].append(agent_reward[agent_id])

    print("-" * 15, f"Episode: {ep}", "-" * 15)
    print("Episodic Reward: ", rewards[-1])
    for agent_id, reward_list in indi_agent_rewards.items():
        print(f"{agent_id} reward: {reward_list[-1]}")

env.close()

# Save every collected frame as a single GIF.
gif_path = "./videos/"
os.makedirs(gif_path, exist_ok=True)
gif_file = os.path.join(gif_path, "space_invaders.gif")
imageio.mimwrite(gif_file, frames, duration=10)