In [None]:
# @title Atari Pong AI - Training Script with Custom CNN (Saving to Google Drive)

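# This script trains an AI agent to play Atari Pong using Stable Baselines3 (PPO algorithm)
# and Gymnasium. Logs and trained models are written to LOG_DIR, which is a local directory
# by default; to save to Google Drive, enable the mount step below and point LOG_DIR at the
# mounted folder.
# This version includes a custom-defined CNN architecture for the policy.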

import os

import gymnasium as gym
import torch as th
import torch.nn as nn
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback
# IMPORTANT: Import make_atari_env instead of make_vec_env for proper preprocessing
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import VecFrameStack

# # --- 0. Mount Google Drive ---
# print("--- Mounting Google Drive ---")
# from google.colab import drive
# drive.mount('/content/drive')
# print("Google Drive mounted successfully.")


# --- 1. Define Custom CNN Architecture ---
# This class defines the neural network that will process the game's image observations.
# It inherits from BaseFeaturesExtractor, which is the standard way to create
# custom feature extractors in Stable Baselines3.

class CustomCNN(BaseFeaturesExtractor):
    """
    Convolutional feature extractor for image observations.

    The network is three Conv2d + ReLU stages, a flatten, and one
    Linear + ReLU projection down to ``features_dim`` values.

    :param observation_space: The observation space of the environment.
        It is read as a channels-first (C, H, W) image space: ``shape[0]``
        is used as the number of input channels.
    :param features_dim: The number of features to extract from the observation.
    """

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 256):
        super().__init__(observation_space, features_dim)
        # Channels-first layout is assumed, so the first axis is the channel
        # count (e.g. 4 when four grayscale frames are stacked).
        n_input_channels = observation_space.shape[0]

        # Convolutional trunk. Layers can be added or removed here; the size of
        # the flattened output is measured below, so nothing else needs editing.
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 16, kernel_size=8, stride=4, padding=0),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2, padding=0),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.Flatten(),
        )

        # Measure the flattened feature size with one forward pass on a sampled
        # observation ([None] adds the batch axis) instead of hard-coding it.
        with th.no_grad():
            dummy_input = th.as_tensor(observation_space.sample()[None]).float()
            n_flatten = self.cnn(dummy_input).shape[1]

        # Final projection producing the feature vector handed to the policy.
        self.linear = nn.Sequential(
            nn.Linear(n_flatten, features_dim),
            nn.ReLU(),
        )

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """
        Map a batch of image observations to a batch of feature vectors.

        :param observations: Batch of observations, channels-first.
        :return: Tensor with ``features_dim`` values per observation.
        """
        # Stable Baselines3 already scales image observations to [0, 1] before
        # calling the features extractor (policy ``normalize_images=True``, the
        # default). Dividing by 255 again here would squash every input into
        # [0, 1/255], so the observations are used as they arrive.
        return self.linear(self.cnn(observations))


# --- Configuration ---
# Gymnasium ID of the Atari Pong environment.
ENV_ID = "ALE/Pong-v5"

# Directory that receives TensorBoard logs, checkpoints and the final model.
# This is a path relative to the working directory; point it at a mounted
# Google Drive folder if the artifacts should be stored there instead.
LOG_DIR = "./data/pong_ppo_custom_cnn_logs/"

# Training budget and checkpoint cadence, both in environment timesteps.
TOTAL_TIMESTEPS = 250_000
SAVE_FREQ = 100_000

# Number of environments stepped in parallel by the vectorized env.
N_ENVS = 4

# Make sure the output directory exists before anything tries to write to it.
os.makedirs(LOG_DIR, exist_ok=True)

# Startup banner summarising the run.
print(
    f"--- Starting Training for {ENV_ID} with Custom CNN ---",
    f"Logs and models will be saved in: {LOG_DIR}",
    f"Total timesteps: {TOTAL_TIMESTEPS}",
    f"Number of parallel environments: {N_ENVS}",
    sep="\n",
)

# --- Environment Setup ---
# make_atari_env wraps every sub-environment in SB3's AtariWrapper (grayscale,
# 84x84 resize, frame skipping, reward clipping, ...) but it does NOT stack
# frames. A single frame hides the ball's direction and speed, so the most
# recent N_STACK frames are stacked explicitly with VecFrameStack. The same
# wrapping is applied to the evaluation environment further down, because the
# trained policy only accepts observations of the shape it was trained on.
N_STACK = 4  # Number of consecutive frames presented to the policy

try:
    vec_env = make_atari_env(ENV_ID, n_envs=N_ENVS, seed=0)
    vec_env = VecFrameStack(vec_env, n_stack=N_STACK)
except Exception as e:
    print(f"ERROR: Failed to create environment '{ENV_ID}': {e}")
    # exit() does not stop a notebook cell, so execution used to continue and
    # fail later with "NameError: name 'vec_env' is not defined". Re-raising
    # stops the run at the real cause.
    raise RuntimeError(
        f"Could not create '{ENV_ID}'. Make sure the ALE package is installed "
        "(pip install ale-py); on Gymnasium >= 1.0 it must also be registered "
        "with gym.register_envs(ale_py) before the environment is created."
    ) from e
else:
    print(f"Successfully created and wrapped vectorized environment for {ENV_ID}")
    print(f"Corrected Observation space shape: {vec_env.observation_space.shape}")

# --- Model Definition with Custom Policy ---

# `policy_kwargs` is passed to the model constructor and tells PPO to use
# `CustomCNN` as the features extractor instead of the default one.
policy_kwargs = {
    "features_extractor_class": CustomCNN,
    "features_extractor_kwargs": dict(features_dim=256),
}

# 'CnnPolicy' selects the image policy; its features extractor is replaced by
# the custom one via `policy_kwargs`.
model = PPO(
    "CnnPolicy",
    vec_env,
    policy_kwargs=policy_kwargs,
    verbose=1,
    tensorboard_log=LOG_DIR,
    device="auto",  # Automatically uses GPU if available, otherwise CPU
)
print("\n--- PPO model initialized with Custom CNN Policy ---")
print("Model Architecture:")
print(model.policy)
print("---------------------------------------------------\n")


# --- Callbacks ---
# CheckpointCallback counts calls to the vectorized env's step(), and each call
# advances N_ENVS timesteps, so the frequency is divided by N_ENVS to save
# roughly every SAVE_FREQ total timesteps.
checkpoint_callback = CheckpointCallback(
    save_freq=max(SAVE_FREQ // N_ENVS, 1),
    save_path=LOG_DIR,
    name_prefix="pong_ppo_custom_model",
)
print(f"Checkpoint callback set to save every {SAVE_FREQ} total timesteps.")

# --- Training ---
# Interruptions and training errors are reported but not re-raised, so the
# weights learned so far are still written out by the save step below.
print("\n--- Starting Training Process ---")
try:
    model.learn(
        total_timesteps=TOTAL_TIMESTEPS,
        callback=checkpoint_callback,
        progress_bar=True,
    )
    print("\nTraining completed!")
except KeyboardInterrupt:
    print("\nTraining interrupted by user.")
except Exception as e:
    print(f"\nAn unexpected error occurred during training: {e}")

# --- Save Final Model ---
final_model_path = os.path.join(LOG_DIR, "pong_ppo_custom_final_model")
model.save(final_model_path)
print(f"Final model saved to: {final_model_path}.zip")

# --- Optional: Evaluation ---
print("\n--- Evaluation (Optional) ---")
# The evaluation env must be preprocessed exactly like the training env
# (including frame stacking), otherwise the loaded policy receives
# observations of the wrong shape.
eval_env = None
try:
    loaded_model = PPO.load(final_model_path)
    eval_env = VecFrameStack(make_atari_env(ENV_ID, n_envs=1), n_stack=N_STACK)

    num_episodes = 5
    for episode in range(num_episodes):
        obs = eval_env.reset()
        episode_reward = 0.0
        done = False
        print(f"Starting evaluation episode {episode + 1}/{num_episodes}...")
        while not done:
            action, _states = loaded_model.predict(obs, deterministic=True)
            # A VecEnv returns one entry per sub-environment; there is only
            # one here, so index 0 is the episode being evaluated.
            obs, rewards, dones, infos = eval_env.step(action)
            episode_reward += float(rewards[0])
            done = bool(dones[0])
        print(f"Episode {episode + 1} finished with reward: {episode_reward}")
    print("Evaluation complete.")
except Exception as e:
    # Evaluation is optional: report the problem and carry on.
    print(f"Error during evaluation: {e}")
    print("Evaluation skipped.")
finally:
    # Release the emulator even when evaluation fails part-way through.
    if eval_env is not None:
        eval_env.close()

# --- TensorBoard hint ---
# Tell the user how to inspect the training curves written to LOG_DIR.
print(
    "\nTo view training progress, you can use TensorBoard:",
    "Load TensorBoard in a new Colab cell with: %load_ext tensorboard",
    f"Then run: %tensorboard --logdir {LOG_DIR}",
    sep="\n",
)

--- Starting Training for ALE/Pong-v5 with Custom CNN ---
Logs and models will be saved in: ./data/pong_ppo_custom_cnn_logs/
Total timesteps: 250000
Number of parallel environments: 4
ERROR: Failed to create environment 'ALE/Pong-v5': Namespace ALE not found. Have you installed the proper package for ALE?


NameError: name 'vec_env' is not defined

: 