In [1]:
import gymnasium 
import torch
import numpy as np
from gymnasium.wrappers import FrameStack, GrayScaleObservation, FrameStack, ResizeObservation, AtariPreprocessing, RecordEpisodeStatistics
from gymnasium.vector import VectorEnv
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv, VecVideoRecorder, VecEnv, VecMonitor, VecNormalize
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.atari_wrappers import AtariWrapper, MaxAndSkipEnv, WarpFrame, FireResetEnv, ClipRewardEnv, NoopResetEnv, EpisodicLifeEnv
from stable_baselines3 import PPO, DQN, TD3, SAC
from stable_baselines3.ppo.policies import CnnPolicy
from sb3_contrib import QRDQN
from matplotlib import pyplot as plt


In [2]:
from ale_py import ALEInterface
from stable_baselines3.common.env_util import make_atari_env

def make_env(env_id: str = "BreakoutNoFrameskip-v4", frame_skip: int = 4, n_stack: int = 6):
    """Build a single preprocessed Atari environment.

    Wrapper order (innermost first): grayscale -> resize to 84x84 ->
    max-and-skip -> episode statistics -> reward clipping -> frame stacking.
    No life-loss episode termination (EpisodicLifeEnv) is applied.

    Args:
        env_id: Gymnasium id of the environment to create.
        frame_skip: Number of emulator frames each agent action is repeated for.
        n_stack: Number of consecutive observations stacked into one observation.

    Returns:
        The wrapped environment. With the defaults its observations have
        shape ``(n_stack, 84, 84)``.
    """
    # Create the base environment; rgb_array rendering lets frames be captured later.
    env = gymnasium.make(env_id, render_mode="rgb_array")

    # NOTE(review): grayscale/resize run on every emulator frame because they sit
    # below MaxAndSkipEnv. Moving them above it would be cheaper but would change
    # the observations slightly, so the order is left as it was.
    env = GrayScaleObservation(env)
    env = ResizeObservation(env, (84, 84))
    env = MaxAndSkipEnv(env, skip=frame_skip)

    # Record statistics *before* clipping so the reported episode return is the
    # real game score rather than a sum of clipped rewards. It sits above
    # MaxAndSkipEnv so episode length is still counted in agent steps.
    env = RecordEpisodeStatistics(env=env, deque_size=200)

    # Only the learning signal is clipped.
    env = ClipRewardEnv(env)
    env = FrameStack(env, n_stack)
    return env

# Nine independent copies of the environment, stepped sequentially in one process.
# DummyVecEnv calls each factory once, so passing `make_env` itself is enough.
env = DummyVecEnv([make_env] * 9)

In [None]:
import envpool
from envpool.wrappers import GymWrapper  # Opcional, si necesitas compatibilidad con la API de Gymnasium

# Create the Atari environment with the desired parameters.
# NOTE(review): this cell rebinds `env`, replacing the DummyVecEnv built in the
# previous cell for every cell that runs afterwards.
# NOTE(review): envpool's documented Atari options appear to be `stack_num`,
# `img_height` / `img_width` and an `env_type` argument, with task ids such as
# "Breakout-v5" — the id and the `frame_stack` / `size` keywords below should be
# verified against the installed envpool version.
env = envpool.make(
    "BreakoutNoFrameskip-v4",
    num_envs=9,         # 9 vectorised environments (counterpart of the DummyVecEnv above)
    frame_skip=4,       # repeat each action for 4 frames (MaxAndSkipEnv)
    frame_stack=6,      # stack 6 frames (FrameStack)
    gray_scale=True,    # grayscale observations (GrayScaleObservation)
    size=84,            # resize observations to 84x84 (ResizeObservation)
    reward_clip=True    # clip rewards (ClipRewardEnv)
)

# Wrap the pool so it exposes the Gymnasium API (e.g. to plug it into Stable-Baselines3).
# NOTE(review): confirm that `envpool.wrappers.GymWrapper` exists in the installed version.
env = GymWrapper(env)

In [4]:
# Show the observation shape of the vectorised environment; the recorded output
# below is (6, 84, 84), i.e. the frame-stack dimension comes first.
env.observation_space.shape
# n_input_channels, height, width = env.observation_space.shape
# print(n_input_channels, height, width)

(6, 84, 84)

In [3]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from sb3_contrib import RecurrentPPO  


############################
# 1) Función de LR schedule
############################
def exp_decay_lr(initial_lr: float, min_lr: float = 5e-7, decay_rate: float = 10.0):
    """
    Build an exponentially decaying learning-rate schedule.

    The returned callable maps ``progress_remaining`` (1.0 at the start of
    training, 0.0 at the end) to
    ``max(initial_lr * exp(-decay_rate * (1 - progress_remaining)), min_lr)``.

    Args:
        initial_lr: Learning rate at the start of training.
        min_lr: Lower bound the schedule never goes below.
        decay_rate: Steepness of the decay; by the end of training the
            unclamped rate has been multiplied by ``exp(-decay_rate)``.
            The default of 10 reproduces the previously hard-coded constant.

    Returns:
        A function ``progress_remaining -> learning_rate``.
    """
    def lr_schedule(progress_remaining: float) -> float:
        # progress_remaining: 1 -> start, 0 -> end
        elapsed = 1 - progress_remaining
        current_lr = initial_lr * math.exp(-decay_rate * elapsed)
        return max(current_lr, min_lr)
    return lr_schedule

def linear_decay_lr(initial_lr: float, min_lr: float = 5e-7):
    """
    Build a linearly decaying learning-rate schedule.

    The returned callable scales ``initial_lr`` by ``progress_remaining``
    (1.0 at the start of training, 0.0 at the end) and never returns less
    than ``min_lr``.
    """
    def lr_schedule(progress_remaining: float) -> float:
        scaled_lr = initial_lr * progress_remaining
        # Clamp from below so the rate never reaches zero.
        if min_lr > scaled_lr:
            return min_lr
        return scaled_lr
    return lr_schedule

class CNNWithResidualAndAttention(BaseFeaturesExtractor):  # type: ignore
    """CNN feature extractor followed by multi-head attention layers.

    Pipeline: two strided 3x3 convolutions -> two stride-1 3x3 convolutions and
    a flatten -> linear projection to ``embed_dim`` -> ``num_attention_layers``
    ``nn.MultiheadAttention`` layers (4 heads each) -> linear + ReLU producing
    ``features_dim`` features.

    Despite the class name, ``forward`` applies no residual connection.

    Args:
        observation_space: Channel-first observation space; ``shape[0]`` is used
            as the number of input channels.
        features_dim: Size of the returned feature vector.
        num_attention_layers: How many attention layers are chained.
        embed_dim: Width of the embedding the attention layers operate on.
    """

    def __init__(self, observation_space, features_dim=512, num_attention_layers=1, embed_dim=96):
        super().__init__(observation_space, features_dim)

        in_channels = observation_space.shape[0]

        # First convolutional block: each stride-2 convolution roughly halves the spatial size.
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
        )

        # Second convolutional block: resolution-preserving convolutions, then flatten.
        # (The leading ReLU is kept so the layer indices inside the Sequential stay the same.)
        self.conv2 = nn.Sequential(
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Flatten()
        )

        # Run one sampled observation through the CNN to discover the flattened size.
        with torch.no_grad():
            probe = torch.as_tensor(observation_space.sample()[None]).float()
            n_flatten = self.conv2(self.conv1(probe)).shape[1]

        # Projection from the flattened CNN output to the attention width.
        self.projection = nn.Linear(n_flatten, embed_dim)

        # Stack of attention layers.
        self.attention_layers = nn.ModuleList(
            nn.MultiheadAttention(embed_dim=embed_dim, num_heads=4)
            for _ in range(num_attention_layers)
        )

        # Final projection to the size the policy expects.
        self.linear = nn.Sequential(
            nn.Linear(embed_dim, features_dim),
            nn.ReLU()
        )

    def forward(self, observations):
        """Map a batch of observations to ``features_dim`` features per sample."""
        # CNN trunk; conv2 ends in Flatten, so this is 2-D: (batch, n_flatten).
        flat = self.conv2(self.conv1(observations))

        # Project and add a leading axis, giving (1, batch, embed_dim).
        # NOTE(review): nn.MultiheadAttention defaults to batch_first=False, so the
        # leading axis of size 1 is treated as the sequence — each sample attends
        # over a single token. Confirm this is the intended use of attention.
        tokens = self.projection(flat).unsqueeze(0)

        for attention in self.attention_layers:
            tokens, _ = attention(tokens, tokens, tokens)

        # Drop the length-1 axis again and apply the output projection.
        return self.linear(tokens.squeeze(0))

class CNNTransformerFeaturesExtractor(BaseFeaturesExtractor):
    """
    Feature extractor that:
      1) Applies six 3x3 convolutions to (C, H, W) => (d_model, H', W').
      2) Reorders the feature map into a token sequence [H'*W', batch_size, d_model].
      3) Runs it through a TransformerEncoder (``num_layers`` layers, ``nhead`` heads).
      4) Mean-pools over the sequence => [batch_size, d_model].
      5) Applies a final projection => [batch_size, features_dim].

    Args:
        observation_space: Channel-first observation space; ``shape[0]`` is the
            number of input channels.
        features_dim: Size of the returned feature vector.
        d_model: Token embedding width used inside the transformer.
        num_layers: Number of transformer encoder layers.
        nhead: Number of attention heads per layer.
        dim_feedforward: Hidden width of each layer's feed-forward network.
        dropout: Dropout probability inside the transformer layers.
    """
    def __init__(
        self,
        observation_space: gymnasium.spaces.Box,
        features_dim: int = 512,     # final output size
        d_model: int = 128,         # internal embedding size (transformer)
        num_layers: int = 4,        # number of transformer layers
        nhead: int = 4,             # number of attention heads
        dim_feedforward: int = 512, # hidden size of the transformer feed-forward net
        dropout: float = 0.1
    ):
        super().__init__(observation_space, features_dim)

        # 1) Convolutional stem: three stride-2 convolutions shrink the image, three
        #    stride-1 convolutions refine it. nn.SiLU is PyTorch's built-in
        #    x * sigmoid(x) ("Swish"); the `Swish` name used previously is not
        #    defined or imported in this notebook, so construction raised NameError.
        self.conv_net = nn.Sequential(
            nn.Conv2d(observation_space.shape[0], 64, kernel_size=3, stride=2),
            nn.SiLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=2),
            nn.SiLU(),
            nn.Conv2d(64, 32, kernel_size=3, stride=2),
            nn.SiLU(),
            nn.Conv2d(32, 32, kernel_size=3, stride=1),
            nn.SiLU(),
            nn.Conv2d(32, 16, kernel_size=3, stride=1),
            nn.SiLU(),
            nn.Conv2d(16, d_model, kernel_size=3, stride=1),
            nn.SiLU()
        )

        # 2) Transformer encoder. batch_first=False means it expects
        #    [sequence_length, batch_size, d_model].
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=False
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers
        )

        # Dry run with one sampled observation to obtain the pooled feature size
        # (this also surfaces an error early if the image is too small for the stem).
        with torch.no_grad():
            sample_input = torch.as_tensor(observation_space.sample()[None]).float()
            n_flatten = self._encode(sample_input).shape[1]  # equals d_model

        # 3) Final projection to features_dim
        self.linear = nn.Sequential(
            nn.Linear(n_flatten, features_dim),
            nn.SiLU()
        )

    def _encode(self, x: torch.Tensor) -> torch.Tensor:
        """Run CNN + transformer and mean-pool the tokens: [b, C, H, W] -> [b, d_model]."""
        x = self.conv_net(x)                 # [b, d_model, H', W']
        x = x.flatten(start_dim=2)           # [b, d_model, H'*W']
        x = x.permute(2, 0, 1)               # [seq_len, b, d_model]
        x = self.transformer_encoder(x)      # [seq_len, b, d_model]
        return x.mean(dim=0)                 # mean over the spatial tokens

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: [batch_size, C, H, W]
        returns: [batch_size, features_dim]
        """
        return self.linear(self._encode(x))

import gymnasium
import torch
import torch.nn as nn

from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

import gymnasium
import torch
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from torch import Tensor
from muon import Muon

class CustomResNetCNNSkip(BaseFeaturesExtractor):
    """
    Convolutional feature extractor using only 3x3 kernels.

    Active path (used by ``forward``):
      conv1 (stride 2) -> conv2 (stride 2) -> conv3 (stride 1), each followed by
      BatchNorm and ReLU, then AdaptiveAvgPool2d to 8x8, flatten, and a
      Linear + ReLU head producing ``features_dim`` features.

    Two further modules are constructed but NOT applied in ``forward``:
      - ``res_block``: a stride-2 conv followed by a stride-1 conv (with BatchNorm).
        Applied to the conv1 output it yields the same shape as the conv3 output.
      - ``skip_proj``: conv (stride 2) -> BatchNorm -> ReLU -> 8x8 pooling ->
        flatten -> Linear, mapping the raw input straight to ``features_dim``.

    The observation layout is detected from ``observation_space.shape``: the
    smallest of the three dimensions is taken as the channel axis. Channel-last
    spaces such as (84, 84, 6) are permuted to channel-first in ``forward``;
    channel-first spaces such as (6, 84, 84) are used as they are.

    Args:
        observation_space: 3-D image observation space (HWC or CHW).
        features_dim: Size of the returned feature vector.

    Raises:
        ValueError: If the observation space is not 3-dimensional.
    """
    def __init__(self, observation_space: gymnasium.spaces.Box, features_dim: int = 512):
        super().__init__(observation_space, features_dim)

        shape = tuple(observation_space.shape)
        if len(shape) != 3:
            raise ValueError(f"Expected a 3-D image observation space, got shape {shape}")
        # Channel axis = smallest dimension (first occurrence wins on ties).
        self._channels_last = shape.index(min(shape)) == 2
        n_input_channels = shape[2] if self._channels_last else shape[0]

        # Main branch.
        # conv1: kernel=3, stride=2, padding=1 => 84 -> floor((84+2-3)/2)+1 = 42
        self.conv1 = nn.Sequential(
            nn.Conv2d(n_input_channels, 64, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        # conv2: kernel=3, stride=2, padding=1 => 42 -> floor((42+2-3)/2)+1 = 21
        self.conv2 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        # conv3: kernel=3, stride=1, padding=1 => keeps 21
        self.conv3 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        # Residual block (not applied in forward). The first conv has stride 2, so on
        # the conv1 output (42x42) it produces 21x21, matching the conv3 output.
        self.res_block = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
        )
        self.relu = nn.ReLU()
        # The main branch ends with 64 channels; pooling forces an 8x8 map,
        # i.e. 64 * 8 * 8 = 4096 values per sample.
        self.avgpool = nn.AdaptiveAvgPool2d((8, 8))

        conv_output_dim = 8 * 8 * 64  # 4096
        self.flatten = nn.Flatten()
        self.fc = nn.Sequential(
            nn.Linear(conv_output_dim, features_dim),
            nn.ReLU(),
        )

        # Skip branch (not applied in forward): works on the channel-first input.
        # Its conv emits 32 channels, so after 8x8 pooling there are 32 * 8 * 8 = 2048
        # features; the Linear layer is sized accordingly (it was sized for 4096
        # before, which could not have accepted this branch's output).
        skip_channels = 32
        self.skip_proj = nn.Sequential(
            nn.Conv2d(n_input_channels, skip_channels, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(skip_channels),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((8, 8)),
            nn.Flatten(),
            nn.Linear(skip_channels * 8 * 8, features_dim)
        )

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        """Map a batch of observations to ``(batch, features_dim)`` features."""
        # Bring channel-last batches (batch, H, W, C) to (batch, C, H, W).
        x = observations.permute(0, 3, 1, 2) if self._channels_last else observations

        # Main branch (spatial sizes shown for 84x84 inputs).
        x = self.conv1(x)       # (batch, 64, 42, 42)
        x = self.conv2(x)       # (batch, 64, 21, 21)
        x = self.conv3(x)       # (batch, 64, 21, 21)

        # Pool the *deepest* feature map. Previously the conv1 output was pooled,
        # so conv2 and conv3 were computed and then thrown away.
        x = self.avgpool(x)     # (batch, 64, 8, 8)
        x = self.flatten(x)     # (batch, 4096)
        return self.fc(x)       # (batch, features_dim)


def make_model(env: gymnasium.Env):
    """Create the PPO agent used for training.

    Uses SB3's built-in ``"CnnPolicy"`` with an AdamW optimizer, ReLU
    activations and a linearly decaying learning rate (1e-3 down to 1e-6).
    To try one of the custom extractors defined in this notebook, add
    ``features_extractor_class`` / ``features_extractor_kwargs`` (and optionally
    ``net_arch``) to ``policy_kwargs``.

    Args:
        env: Environment to train on. In this notebook it is a vectorised
            environment; a plain Gymnasium env is treated as a single env.

    Returns:
        An untrained ``PPO`` model bound to ``env``.
    """
    n_steps = 128
    # One minibatch spans the whole rollout (n_steps * n_envs). Deriving it from
    # the env keeps that true if the number of parallel envs changes; with 9 envs
    # this is the same 128 * 9 = 1152 as before.
    n_envs = getattr(env, "num_envs", 1)

    policy_kwargs = dict(
        optimizer_class=torch.optim.AdamW,
        activation_fn=nn.ReLU,
    )

    model = PPO(
        policy="CnnPolicy",
        env=env,
        verbose=1,
        # Alternative schedule: exp_decay_lr(1e-3, 5e-7)
        learning_rate=linear_decay_lr(1e-3, 1e-6),
        n_steps=n_steps,
        batch_size=n_steps * n_envs,
        # NOTE(review): SB3's defaults are gamma=0.99 and gae_lambda=0.95; the two
        # values below look swapped relative to those — confirm this is intended.
        gamma=0.95,
        gae_lambda=0.99,
        ent_coef=0.01,
        clip_range=0.1,
        policy_kwargs=policy_kwargs,
    )

    return model


# Build the PPO agent on the current `env` and train it for 10 million timesteps.
# `learn` returns the model, which is why the cell output shows the PPO object.
model = make_model(env)
model.learn(total_timesteps=10_000_000)

Using cuda device
-----------------------------
| time/              |      |
|    fps             | 931  |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 1152 |
-----------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 186         |
|    ep_rew_mean          | 1.44        |
| time/                   |             |
|    fps                  | 733         |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 2304        |
| train/                  |             |
|    approx_kl            | 0.002108797 |
|    clip_fraction        | 0           |
|    clip_range           | 0.1         |
|    entropy_loss         | -1.39       |
|    explained_variance   | 0.0197      |
|    learning_rate        | 0.001       |
|    loss                 | 0.0117      |
|    n_updates            | 10          |
|    policy_grad

<stable_baselines3.ppo.ppo.PPO at 0x1dd0fc40a40>

In [None]:
############ EVALUATION OF THE MODEL'S REWARD #################

from stable_baselines3.common.evaluation import evaluate_policy

# Evaluate on the environment the model was trained on, over 20 episodes.
# NOTE(review): that environment applies ClipRewardEnv, so the mean printed here is
# presumably computed from clipped rewards rather than the game score — verify.
# `std_reward` is returned but not reported.
vec_env = model.get_env()
mean_reward, std_reward = evaluate_policy(model, vec_env, n_eval_episodes=20, warn=False)
print(mean_reward)

In [10]:
# Save the trained agent next to the notebook.
# NOTE(review): the file name says "1M" while the training cell requests
# 10_000_000 timesteps — confirm which name is intended.
model.save('./ppo_breakout_1M')   


In [11]:
# GIF from images

import imageio
import numpy as np

# Roll the trained agent out on its own (vectorised) environment and save a GIF.
env = model.get_env()

# Folder for the alternative VecVideoRecorder route, which is not used here.
video_folder = "/kaggle/working/videos"
video_length = 7_000
frame_stride = 2  # keep one rendered frame out of every two

# Only the frames that end up in the GIF are stored; buffering every rendered
# frame and discarding half afterwards doubles peak memory for no benefit.
images = []

obs = env.reset()
img = env.render(mode="rgb_array")
for i in range(video_length + 1):
    if i % frame_stride == 0:
        images.append(np.array(img))
    if i == video_length:
        # Last frame captured: stop here instead of taking one more unused step.
        print("TERMINA")
        break
    action, _states = model.predict(obs, deterministic=False)
    # The VecEnv resets finished sub-environments automatically, so no manual reset.
    obs, reward, done, info = env.step(action)
    img = env.render(mode="rgb_array")

imageio.mimsave("10M_breakout.gif", images, duration=20)
env.close()

TERMINA
