In [None]:
# @title Atari Pong AI - Installation Script (More Robust)

# This script aims to provide the most robust installation for Atari Pong
# using Stable Baselines3 and Gymnasium in Google Colab.

# IMPORTANT:
# 1. Start with a FRESH COLAB NOTEBOOK (Runtime -> Restart runtime).
# 2. Run this cell FIRST and wait for it to complete.
# 3. Carefully observe ALL output.
# 4. If the "Quick Environment Test" at the end FAILS, go to "Runtime -> Restart runtime"
#    and run *this entire cell again from scratch*. This is often necessary.
# 5. If it still fails after a couple of restarts and reruns, please share the FULL output.

print("--- Starting Robust Installation for Atari Pong ---")

# 0. Ensure pip is up-to-date
print("\n0. Upgrading pip...")
!pip install --upgrade pip

# 1. Install/Upgrade core Gymnasium and specific ale-py.
#    `gymnasium` is the successor to `gym`.
#    We explicitly install `ale-py` and then `gymnasium[atari]` to ensure order.
print("\n1. Installing/Upgrading ale-py and gymnasium[atari]...")
# Install ale-py first, explicitly, to ensure it's present for gymnasium[atari]
!pip install --upgrade ale-py
# Then install gymnasium with atari extras, which should now find ale-py
!pip install --upgrade gymnasium[atari]

# 2. Install Stable Baselines3 (SB3).
print("\n2. Installing/Upgrading stable-baselines3...")
!pip install --upgrade stable-baselines3

# 3. Install AutoROM.
#    This is CRUCIAL for Atari ROM management.
print("\n3. Installing AutoROM...")
!pip install autorom[accept-rom-license]

# 4. Run AutoROM.build() to download Atari ROMs.
#    This command needs to be run explicitly. This is the most common point of failure.
print("\n4. Running AutoROM.build() to download Atari ROMs. This may take a moment...")
print("Look for messages indicating ROMs are being downloaded/accepted.")
!python -m autorom.accept-rom-license

# 5. Install OpenCV Python (cv2).
print("\n5. Installing opencv-python...")
!pip install --upgrade opencv-python

print("\n--- Installation Steps Completed ---")

# --- Robust Environment Test ---
# This test attempts to create the Pong environment to verify installation.
print("\n--- Running Robust Environment Test for 'ALE/Pong-v5' ---")
try:
    import gymnasium as gym
    # Try importing ale_py directly to check if it's found
    try:
        import ale_py
        print(f"Successfully imported ale_py version: {ale_py.__version__}")
    except ImportError:
        print("ERROR: Could not import 'ale_py'. This indicates a fundamental installation issue.")
        raise

    # Attempt to make the environment
    env_test = gym.make("ALE/Pong-v5")
    env_test.reset()
    env_test.close()
    print(f"Successfully created and reset 'ALE/Pong-v5' environment.")
    print("This indicates that the Atari ROMs and dependencies are likely set up correctly.")
    print("\nSUCCESS: You can now proceed to the training script in a new cell.")
except Exception as e:
    print(f"\nFATAL ERROR: Failed to create 'ALE/Pong-v5' environment during test: {e}")
    print("This error means the Atari ROMs or the 'ale-py' library are NOT correctly set up.")
    print("\n--- TROUBLESHOOTING STEPS ---")
    print("1. Go to 'Runtime -> Restart runtime' in the Colab menu.")
    print("2. Run *this entire installation cell* again from scratch.")
    print("3. Carefully verify the output of `!python -m autorom.accept-rom-license` for ROM downloads.")
    print("4. If the error persists after 2-3 attempts, consider trying a different Colab instance or reporting the full error output.")
# @title Atari Pong AI - Training Script with Custom CNN (Saving to Google Drive)

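# This script trains an AI agent to play Atari Pong using Stable Baselines3 (PPO algorithm)
# and Gymnasium. Checkpoints and the final model are written under LOG_DIR, which is a
# local path by default; to keep them on Google Drive, enable the Drive mount below and
# point LOG_DIR at a folder inside the mounted drive.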
# This version includes a custom-defined CNN architecture for the policy.
!pip install stable-baselines3[extra]
!pip install gymnasium[atari,accept-rom-license]
!pip install torch torchvision

import os

# Imported for its side effect only: importing ale_py registers the "ALE/*"
# environment ids with Gymnasium, so this cell also works in a fresh kernel.
import ale_py  # noqa: F401
import gymnasium as gym
import torch as th
import torch.nn as nn
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback
# IMPORTANT: Import make_atari_env instead of make_vec_env for proper preprocessing
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import VecFrameStack

# # --- 0. Mount Google Drive ---
# print("--- Mounting Google Drive ---")
# from google.colab import drive
# drive.mount('/content/drive')
# print("Google Drive mounted successfully.")


# --- 1. Define Custom CNN Architecture ---
# This class defines the neural network that will process the game's image observations.
# It inherits from BaseFeaturesExtractor, which is the standard way to create
# custom feature extractors in Stable Baselines3.
class CustomCNN(BaseFeaturesExtractor):
    """
    Convolutional feature extractor for image observations.

    Four ``Conv2d -> BatchNorm2d -> ReLU`` stages are followed by a two-layer
    MLP head that projects the flattened feature map to ``features_dim`` values.

    :param observation_space: Image observation space in channel-first layout;
        ``observation_space.shape[0]`` is used as the number of input channels.
    :param features_dim: Size of the feature vector returned by ``forward``.

    NOTE(review): BatchNorm and Dropout behave differently in train and eval
    mode, which may interact badly with PPO's rollout/update split. The layers
    are kept so that the parameter layout of existing checkpoints is unchanged;
    consider removing them if training does not progress.
    """

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 512):
        super().__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[0]

        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=8, stride=4),
            nn.BatchNorm2d(32),
            nn.ReLU(),

            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),

            nn.Conv2d(64, 128, kernel_size=3, stride=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),

            nn.Conv2d(128, 128, kernel_size=3, stride=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),

            nn.Flatten()
        )

        # Infer the flattened feature size with one dummy forward pass.
        # The pass runs in eval mode so the BatchNorm running statistics are
        # not updated by the dummy batch, and uses zeros because only the
        # output shape matters here.
        self.cnn.eval()
        with th.no_grad():
            probe = th.zeros((1, *observation_space.shape))
            n_flatten = self.cnn(probe).shape[1]
        self.cnn.train()

        self.linear = nn.Sequential(
            nn.Linear(n_flatten, 1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, features_dim),
            nn.ReLU()
        )

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """
        Map a batch of image observations to feature vectors.

        :param observations: Batch of channel-first images. Stable Baselines3
            policies preprocess image observations (float cast and division by
            255 when ``normalize_images`` is enabled, the default) before
            calling the features extractor, so no rescaling is done here;
            dividing again would shrink the inputs to the range [0, 1/255].
        :return: Tensor with ``features_dim`` values per observation.
        """
        return self.linear(self.cnn(observations))

# --- Configuration ---
ENV_ID = "ALE/Pong-v5"  # The Gymnasium ID for Atari Pong

# Directory for TensorBoard logs, checkpoints and the final model.
# This is a local path; to keep results on Google Drive, mount Drive first and
# point LOG_DIR at a folder inside the mounted drive.
LOG_DIR = "./data/pong_ppo_custom_cnn_logs/"
TOTAL_TIMESTEPS = 250_000  # Total number of timesteps for training
SAVE_FREQ = 100_000  # Save model every X timesteps
N_ENVS = 4  # Number of parallel environments to run for vectorized training
N_STACK = 4  # Number of consecutive frames stacked into one observation

# Create the log directory if it doesn't exist
os.makedirs(LOG_DIR, exist_ok=True)

print(f"--- Starting Training for {ENV_ID} with Custom CNN ---")
print(f"Logs and models will be saved in: {LOG_DIR}")
print(f"Total timesteps: {TOTAL_TIMESTEPS}")
print(f"Number of parallel environments: {N_ENVS}")


# --- Environment Setup ---
def _make_stacked_env(n_envs, seed=None):
    """Build the Atari vec env used for both training and evaluation.

    `make_atari_env` applies the per-frame Atari preprocessing (its output here
    has shape (84, 84, 1)) but does NOT stack frames. A single frame carries no
    information about the ball's direction or speed, so the last N_STACK frames
    are stacked with `VecFrameStack`.
    """
    return VecFrameStack(make_atari_env(ENV_ID, n_envs=n_envs, seed=seed), n_stack=N_STACK)


vec_env = _make_stacked_env(N_ENVS, seed=0)
print(f"Successfully created and wrapped vectorized environment for {ENV_ID}")
print(f"Corrected Observation space shape: {vec_env.observation_space.shape}")

# --- Model Definition with Custom Policy ---

# `policy_kwargs` is a dictionary passed to the model constructor.
# It tells the PPO model to use our `CustomCNN` class as the feature extractor.
policy_kwargs = {
    "features_extractor_class": CustomCNN,
    "features_extractor_kwargs": dict(features_dim=256),
}

# The model is initialized with the 'CnnPolicy', whose default feature
# extractor is replaced by our custom one via `policy_kwargs`.
model = PPO(
    "CnnPolicy",
    vec_env,
    policy_kwargs=policy_kwargs,
    verbose=1,
    tensorboard_log=LOG_DIR,
    device="auto" # Automatically uses GPU if available, otherwise CPU
)
print("\n--- PPO model initialized with Custom CNN Policy ---")
print("Model Architecture:")
print(model.policy)
print("---------------------------------------------------\n")


# --- Callbacks ---
# The callback's save_freq is divided by N_ENVS so that a checkpoint is written
# roughly every SAVE_FREQ total timesteps across all parallel environments.
checkpoint_callback = CheckpointCallback(
    save_freq=max(SAVE_FREQ // N_ENVS, 1),
    save_path=LOG_DIR,
    name_prefix="pong_ppo_custom_model"
)
print(f"Checkpoint callback set to save every {SAVE_FREQ} total timesteps.")

# --- Training ---
print("\n--- Starting Training Process ---")
try:
    model.learn(
        total_timesteps=TOTAL_TIMESTEPS,
        callback=checkpoint_callback,
        progress_bar=True
    )
    print("\nTraining completed!")
except KeyboardInterrupt:
    print("\nTraining interrupted by user.")
except Exception as e:
    # Best effort: report the problem and still save whatever was learned below.
    print(f"\nAn unexpected error occurred during training: {e}")

# --- Save Final Model ---
final_model_path = os.path.join(LOG_DIR, "pong_ppo_custom_final_model")
model.save(final_model_path)
print(f"Final model saved to: {final_model_path}.zip")

# The training environments are no longer needed; release the emulators.
vec_env.close()

# --- Optional: Evaluation ---
print("\n--- Evaluation (Optional) ---")
# The loaded model expects the same preprocessed, frame-stacked observations it
# was trained on, so the evaluation env is built with the same helper.
try:
    loaded_model = PPO.load(final_model_path)
    eval_env = _make_stacked_env(1)
    try:
        num_episodes = 5
        for episode in range(num_episodes):
            obs = eval_env.reset()
            episode_reward = 0
            done = False
            print(f"Starting evaluation episode {episode + 1}/{num_episodes}...")
            while not done:
                action, _states = loaded_model.predict(obs, deterministic=True)
                obs, rewards, dones, _infos = eval_env.step(action)
                # The vec env returns arrays with one entry per environment.
                episode_reward += rewards[0]
                done = bool(dones[0])
            print(f"Episode {episode + 1} finished with reward: {episode_reward}")
    finally:
        # Close the evaluation env even if an episode fails midway.
        eval_env.close()
    print("Evaluation complete.")

except Exception as e:
    print(f"Error during evaluation: {e}")
    print("Evaluation skipped.")

print("\nTo view training progress, you can use TensorBoard:")
print("Load TensorBoard in a new Colab cell with: %load_ext tensorboard")
print(f"Then run: %tensorboard --logdir {LOG_DIR}")

--- Starting Robust Installation for Atari Pong ---

0. Upgrading pip...

1. Installing/Upgrading ale-py and gymnasium[atari]...
Collecting gymnasium[atari]
  Using cached gymnasium-1.2.0-py3-none-any.whl.metadata (9.9 kB)
Using cached gymnasium-1.2.0-py3-none-any.whl (944 kB)
Installing collected packages: gymnasium
  Attempting uninstall: gymnasium
    Found existing installation: gymnasium 1.1.1
    Uninstalling gymnasium-1.1.1:
      Successfully uninstalled gymnasium-1.1.1
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
stable-baselines3 2.6.0 requires gymnasium<1.2.0,>=0.29.1, but you have gymnasium 1.2.0 which is incompatible.
Successfully installed gymnasium-1.2.0

2. Installing/Upgrading stable-baselines3...
Collecting gymnasium<1.2.0,>=0.29.1 (from stable-baselines3)
  Using cached gymnasium-1.1.1-py3-none-any.whl.metadata (9.4 kB)
U

A.L.E: Arcade Learning Environment (version 0.11.2+ecc1138)
[Powered by Stella]


--- Starting Training for ALE/Pong-v5 with Custom CNN ---
Logs and models will be saved in: ./data/pong_ppo_custom_cnn_logs/
Total timesteps: 250000
Number of parallel environments: 4
Successfully created and wrapped vectorized environment for ALE/Pong-v5
Corrected Observation space shape: (84, 84, 1)
Using cuda device
Wrapping the env in a VecTransposeImage.

--- PPO model initialized with Custom CNN Policy ---
Model Architecture:
ActorCriticCnnPolicy(
  (features_extractor): CustomCNN(
    (cnn): Sequential(
      (0): Conv2d(1, 16, kernel_size=(8, 8), stride=(4, 4))
      (1): ReLU()
      (2): Conv2d(16, 32, kernel_size=(4, 4), stride=(2, 2))
      (3): ReLU()
      (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
      (5): ReLU()
      (6): Flatten(start_dim=1, end_dim=-1)
    )
    (linear): Sequential(
      (0): Linear(in_features=3136, out_features=256, bias=True)
      (1): ReLU()
    )
  )
  (pi_features_extractor): CustomCNN(
    (cnn): Sequential(
      (0): Conv2d(

Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 865      |
|    ep_rew_mean     | -20.7    |
| time/              |          |
|    fps             | 903      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 8192     |
---------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 874         |
|    ep_rew_mean          | -20.6       |
| time/                   |             |
|    fps                  | 794         |
|    iterations           | 2           |
|    time_elapsed         | 20          |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.011050285 |
|    clip_fraction        | 0.0326      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.78       |
|    explained_variance   | -2.32e-05   |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0935      |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00223    |
|    value_loss           | 0.32        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 873         |
|    ep_rew_mean          | -20.6       |
| time/                   |             |
|    fps                  | 770         |
|    iterations           | 3           |
|    time_elapsed         | 31          |
|    total_timesteps      | 24576       |
| train/                  |             |
|    approx_kl            | 0.010753626 |
|    clip_fraction        | 0.0416      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.77       |
|    explained_variance   | 3.13e-05    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.297       |
|    n_updates            | 20          |
|    policy_gradient_loss | -0.00246    |
|    value_loss           | 0.461       |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 877         |
|    ep_rew_mean          | -20.6       |
| time/                   |             |
|    fps                  | 758         |
|    iterations           | 4           |
|    time_elapsed         | 43          |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.014685741 |
|    clip_fraction        | 0.0545      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.77       |
|    explained_variance   | 3.76e-06    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.241       |
|    n_updates            | 30          |
|    policy_gradient_loss | -0.0034     |
|    value_loss           | 0.616       |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 862         |
|    ep_rew_mean          | -20.8       |
| time/                   |             |
|    fps                  | 752         |
|    iterations           | 5           |
|    time_elapsed         | 54          |
|    total_timesteps      | 40960       |
| train/                  |             |
|    approx_kl            | 0.013073241 |
|    clip_fraction        | 0.0339      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.76       |
|    explained_variance   | 4.29e-06    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.392       |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.00203    |
|    value_loss           | 0.789       |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 873          |
|    ep_rew_mean          | -20.7        |
| time/                   |              |
|    fps                  | 747          |
|    iterations           | 6            |
|    time_elapsed         | 65           |
|    total_timesteps      | 49152        |
| train/                  |              |
|    approx_kl            | 0.0068962555 |
|    clip_fraction        | 0.0099       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.77        |
|    explained_variance   | 2.5e-06      |
|    learning_rate        | 0.0003       |
|    loss                 | 0.627        |
|    n_updates            | 50           |
|    policy_gradient_loss | -9.29e-05    |
|    value_loss           | 0.967        |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 879         |
|    ep_rew_mean          | -20.7       |
| time/                   |             |
|    fps                  | 744         |
|    iterations           | 7           |
|    time_elapsed         | 77          |
|    total_timesteps      | 57344       |
| train/                  |             |
|    approx_kl            | 0.013456276 |
|    clip_fraction        | 0.0523      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.75       |
|    explained_variance   | 1.07e-06    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.491       |
|    n_updates            | 60          |
|    policy_gradient_loss | -0.00335    |
|    value_loss           | 1.12        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 875         |
|    ep_rew_mean          | -20.7       |
| time/                   |             |
|    fps                  | 742         |
|    iterations           | 8           |
|    time_elapsed         | 88          |
|    total_timesteps      | 65536       |
| train/                  |             |
|    approx_kl            | 0.011247346 |
|    clip_fraction        | 0.0377      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.76       |
|    explained_variance   | 1.43e-06    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.778       |
|    n_updates            | 70          |
|    policy_gradient_loss | -0.0024     |
|    value_loss           | 1.23        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 871         |
|    ep_rew_mean          | -20.7       |
| time/                   |             |
|    fps                  | 740         |
|    iterations           | 9           |
|    time_elapsed         | 99          |
|    total_timesteps      | 73728       |
| train/                  |             |
|    approx_kl            | 0.011167129 |
|    clip_fraction        | 0.0318      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.74       |
|    explained_variance   | 1.79e-07    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.591       |
|    n_updates            | 80          |
|    policy_gradient_loss | -0.00211    |
|    value_loss           | 1.27        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 884         |
|    ep_rew_mean          | -20.7       |
| time/                   |             |
|    fps                  | 738         |
|    iterations           | 10          |
|    time_elapsed         | 110         |
|    total_timesteps      | 81920       |
| train/                  |             |
|    approx_kl            | 0.009110609 |
|    clip_fraction        | 0.0267      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.74       |
|    explained_variance   | 5.96e-08    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.659       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.00159    |
|    value_loss           | 1.41        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 886         |
|    ep_rew_mean          | -20.7       |
| time/                   |             |
|    fps                  | 737         |
|    iterations           | 11          |
|    time_elapsed         | 122         |
|    total_timesteps      | 90112       |
| train/                  |             |
|    approx_kl            | 0.007900674 |
|    clip_fraction        | 0.0316      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.75       |
|    explained_variance   | 5.96e-08    |
|    learning_rate        | 0.0003      |
|    loss                 | 1.08        |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.00197    |
|    value_loss           | 1.39        |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 881          |
|    ep_rew_mean          | -20.7        |
| time/                   |              |
|    fps                  | 736          |
|    iterations           | 12           |
|    time_elapsed         | 133          |
|    total_timesteps      | 98304        |
| train/                  |              |
|    approx_kl            | 0.0069236504 |
|    clip_fraction        | 0.0365       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.74        |
|    explained_variance   | 5.96e-08     |
|    learning_rate        | 0.0003       |
|    loss                 | 0.654        |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.00213     |
|    value_loss           | 1.6          |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 878         |
|    ep_rew_mean          | -20.6       |
| time/                   |             |
|    fps                  | 735         |
|    iterations           | 13          |
|    time_elapsed         | 144         |
|    total_timesteps      | 106496      |
| train/                  |             |
|    approx_kl            | 0.016473968 |
|    clip_fraction        | 0.0544      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.77       |
|    explained_variance   | 5.96e-08    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.808       |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.00325    |
|    value_loss           | 1.52        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 886         |
|    ep_rew_mean          | -20.6       |
| time/                   |             |
|    fps                  | 734         |
|    iterations           | 14          |
|    time_elapsed         | 156         |
|    total_timesteps      | 114688      |
| train/                  |             |
|    approx_kl            | 0.013469819 |
|    clip_fraction        | 0.0343      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.79       |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 0.784       |
|    n_updates            | 130         |
|    policy_gradient_loss | -0.0028     |
|    value_loss           | 1.61        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 886         |
|    ep_rew_mean          | -20.6       |
| time/                   |             |
|    fps                  | 734         |
|    iterations           | 15          |
|    time_elapsed         | 167         |
|    total_timesteps      | 122880      |
| train/                  |             |
|    approx_kl            | 0.010034815 |
|    clip_fraction        | 0.025       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.79       |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 1.24        |
|    n_updates            | 140         |
|    policy_gradient_loss | -0.00187    |
|    value_loss           | 1.51        |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 901          |
|    ep_rew_mean          | -20.6        |
| time/                   |              |
|    fps                  | 733          |
|    iterations           | 16           |
|    time_elapsed         | 178          |
|    total_timesteps      | 131072       |
| train/                  |              |
|    approx_kl            | 0.0054440247 |
|    clip_fraction        | 0.0149       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.77        |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 1.03         |
|    n_updates            | 150          |
|    policy_gradient_loss | -0.000215    |
|    value_loss           | 1.55         |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 902          |
|    ep_rew_mean          | -20.6        |
| time/                   |              |
|    fps                  | 733          |
|    iterations           | 17           |
|    time_elapsed         | 189          |
|    total_timesteps      | 139264       |
| train/                  |              |
|    approx_kl            | 0.0059566884 |
|    clip_fraction        | 0.0214       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.76        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 0.551        |
|    n_updates            | 160          |
|    policy_gradient_loss | -0.00123     |
|    value_loss           | 1.53         |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 901         |
|    ep_rew_mean          | -20.7       |
| time/                   |             |
|    fps                  | 732         |
|    iterations           | 18          |
|    time_elapsed         | 201         |
|    total_timesteps      | 147456      |
| train/                  |             |
|    approx_kl            | 0.012393981 |
|    clip_fraction        | 0.0126      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.77       |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 0.505       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.000708   |
|    value_loss           | 1.61        |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 889          |
|    ep_rew_mean          | -20.6        |
| time/                   |              |
|    fps                  | 732          |
|    iterations           | 19           |
|    time_elapsed         | 212          |
|    total_timesteps      | 155648       |
| train/                  |              |
|    approx_kl            | 0.0076450724 |
|    clip_fraction        | 0.016        |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.76        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 0.745        |
|    n_updates            | 180          |
|    policy_gradient_loss | -0.000715    |
|    value_loss           | 1.58         |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 880          |
|    ep_rew_mean          | -20.7        |
| time/                   |              |
|    fps                  | 731          |
|    iterations           | 20           |
|    time_elapsed         | 223          |
|    total_timesteps      | 163840       |
| train/                  |              |
|    approx_kl            | 0.0080536865 |
|    clip_fraction        | 0.0242       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.74        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 0.759        |
|    n_updates            | 190          |
|    policy_gradient_loss | -0.00185     |
|    value_loss           | 1.61         |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 882         |
|    ep_rew_mean          | -20.7       |
| time/                   |             |
|    fps                  | 731         |
|    iterations           | 21          |
|    time_elapsed         | 235         |
|    total_timesteps      | 172032      |
| train/                  |             |
|    approx_kl            | 0.013127221 |
|    clip_fraction        | 0.0591      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.75       |
|    explained_variance   | 5.96e-08    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.822       |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.00399    |
|    value_loss           | 1.61        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 882         |
|    ep_rew_mean          | -20.7       |
| time/                   |             |
|    fps                  | 731         |
|    iterations           | 22          |
|    time_elapsed         | 246         |
|    total_timesteps      | 180224      |
| train/                  |             |
|    approx_kl            | 0.009198314 |
|    clip_fraction        | 0.0163      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.73       |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 0.71        |
|    n_updates            | 210         |
|    policy_gradient_loss | -0.000487   |
|    value_loss           | 1.6         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 884         |
|    ep_rew_mean          | -20.7       |
| time/                   |             |
|    fps                  | 731         |
|    iterations           | 23          |
|    time_elapsed         | 257         |
|    total_timesteps      | 188416      |
| train/                  |             |
|    approx_kl            | 0.008498326 |
|    clip_fraction        | 0.0178      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.73       |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 0.279       |
|    n_updates            | 220         |
|    policy_gradient_loss | -0.00125    |
|    value_loss           | 1.57        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 898         |
|    ep_rew_mean          | -20.7       |
| time/                   |             |
|    fps                  | 730         |
|    iterations           | 24          |
|    time_elapsed         | 269         |
|    total_timesteps      | 196608      |
| train/                  |             |
|    approx_kl            | 0.014956825 |
|    clip_fraction        | 0.0457      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.7        |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 0.523       |
|    n_updates            | 230         |
|    policy_gradient_loss | -0.00246    |
|    value_loss           | 1.54        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 912         |
|    ep_rew_mean          | -20.7       |
| time/                   |             |
|    fps                  | 730         |
|    iterations           | 25          |
|    time_elapsed         | 280         |
|    total_timesteps      | 204800      |
| train/                  |             |
|    approx_kl            | 0.011656603 |
|    clip_fraction        | 0.0377      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.65       |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 0.739       |
|    n_updates            | 240         |
|    policy_gradient_loss | -0.0024     |
|    value_loss           | 1.56        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 914         |
|    ep_rew_mean          | -20.6       |
| time/                   |             |
|    fps                  | 730         |
|    iterations           | 26          |
|    time_elapsed         | 291         |
|    total_timesteps      | 212992      |
| train/                  |             |
|    approx_kl            | 0.010436425 |
|    clip_fraction        | 0.0447      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.61       |
|    explained_variance   | -1.19e-07   |
|    learning_rate        | 0.0003      |
|    loss                 | 0.663       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.00249    |
|    value_loss           | 1.58        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 929         |
|    ep_rew_mean          | -20.6       |
| time/                   |             |
|    fps                  | 730         |
|    iterations           | 27          |
|    time_elapsed         | 302         |
|    total_timesteps      | 221184      |
| train/                  |             |
|    approx_kl            | 0.011665285 |
|    clip_fraction        | 0.0292      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.6        |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 0.714       |
|    n_updates            | 260         |
|    policy_gradient_loss | -0.00124    |
|    value_loss           | 1.58        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 916         |
|    ep_rew_mean          | -20.6       |
| time/                   |             |
|    fps                  | 730         |
|    iterations           | 28          |
|    time_elapsed         | 314         |
|    total_timesteps      | 229376      |
| train/                  |             |
|    approx_kl            | 0.009816372 |
|    clip_fraction        | 0.0253      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.59       |
|    explained_variance   | -1.19e-07   |
|    learning_rate        | 0.0003      |
|    loss                 | 1.03        |
|    n_updates            | 270         |
|    policy_gradient_loss | -0.0017     |
|    value_loss           | 1.55        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 891         |
|    ep_rew_mean          | -20.8       |
| time/                   |             |
|    fps                  | 729         |
|    iterations           | 29          |
|    time_elapsed         | 325         |
|    total_timesteps      | 237568      |
| train/                  |             |
|    approx_kl            | 0.008395543 |
|    clip_fraction        | 0.0121      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.61       |
|    explained_variance   | -1.19e-07   |
|    learning_rate        | 0.0003      |
|    loss                 | 0.833       |
|    n_updates            | 280         |
|    policy_gradient_loss | -0.000751   |
|    value_loss           | 1.54        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 901         |
|    ep_rew_mean          | -20.7       |
| time/                   |             |
|    fps                  | 729         |
|    iterations           | 30          |
|    time_elapsed         | 336         |
|    total_timesteps      | 245760      |
| train/                  |             |
|    approx_kl            | 0.013552099 |
|    clip_fraction        | 0.0487      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.61       |
|    explained_variance   | 5.96e-08    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.988       |
|    n_updates            | 290         |
|    policy_gradient_loss | -0.00295    |
|    value_loss           | 1.63        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 902         |
|    ep_rew_mean          | -20.7       |
| time/                   |             |
|    fps                  | 729         |
|    iterations           | 31          |
|    time_elapsed         | 347         |
|    total_timesteps      | 253952      |
| train/                  |             |
|    approx_kl            | 0.011779689 |
|    clip_fraction        | 0.0262      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.61       |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 1.23        |
|    n_updates            | 300         |
|    policy_gradient_loss | -0.00139    |
|    value_loss           | 1.61        |
-----------------------------------------



Training completed!
Final model saved to: ./data/pong_ppo_custom_cnn_logs/pong_ppo_custom_final_model.zip

--- Evaluation (Optional) ---
Starting evaluation episode 1/5...
Episode 1 finished with reward: -21.0
Starting evaluation episode 2/5...
Episode 2 finished with reward: -21.0
Starting evaluation episode 3/5...
Episode 3 finished with reward: -21.0
Starting evaluation episode 4/5...
Episode 4 finished with reward: -21.0
Starting evaluation episode 5/5...
Episode 5 finished with reward: -21.0
Evaluation complete.

To view training progress, you can use TensorBoard:
Load TensorBoard in a new Colab cell with: %load_ext tensorboard
Then run: %tensorboard --logdir ./data/pong_ppo_custom_cnn_logs/
