<a href="https://colab.research.google.com/github/ever-oli/MLby22/blob/main/videogameRL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install "gymnasium[atari]" torch numpy imageio imageio-ffmpeg matplotlib -q

import gymnasium as gym
import ale_py
from gymnasium.wrappers import AtariPreprocessing, FrameStackObservation
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import imageio
import base64
from IPython.display import HTML, display

# Register ALE environments
gym.register_envs(ale_py)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Executing on device: {device}\n")

# 1. Hyperparameters & Environment Setup
env_name = 'ALE/Pong-v5'
batch_size = 32        # transitions drawn from the replay buffer per gradient step
gamma = 0.99           # discount applied to the bootstrapped next-state value
epsilon_start = 1.0    # initial exploration probability
epsilon_end = 0.02     # floor for the exploration probability
epsilon_decay = 0.99   # multiplicative decay, applied once per episode
learning_rate = 1e-4
target_update = 10     # episodes between target-network synchronisations
memory_size = 10000    # replay buffer capacity (oldest transitions are evicted)
# NOTE(review): the original assignment had no right-hand side, which made the
# whole cell a SyntaxError. 50 is a placeholder for a short demo run -- confirm
# the intended episode count.
num_episodes = 50

# Initialize environment with frameskip=1 to disable internal skipping;
# AtariPreprocessing then applies its own 4-frame skip, resizes to 84x84
# grayscale and scales pixels, and FrameStackObservation stacks 4 observations.
env = gym.make(env_name, render_mode='rgb_array', frameskip=1)
env = AtariPreprocessing(env, screen_size=84, grayscale_obs=True, frame_skip=4, scale_obs=True)
env = FrameStackObservation(env, 4)

# 2. Replay Buffer
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random minibatch sampling."""

    def __init__(self, capacity):
        # A deque with maxlen drops the oldest entry once `capacity` is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition, converting both observations to NumPy arrays."""
        transition = (
            np.array(state),
            action,
            reward,
            np.array(next_state),
            done,
        )
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw `batch_size` stored transitions and return them column-wise.

        Returns a 5-tuple of arrays ``(states, actions, rewards, next_states,
        dones)``; rewards and dones are cast to float32.
        """
        picked = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(next_states),
            np.array(dones, dtype=np.float32),
        )

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.buffer)

# 3. Convolutional Neural Network (CNN)
class AtariCNN(nn.Module):
    """Convolutional Q-network mapping stacked frames to one value per action.

    Three conv layers feed a two-layer fully connected head. The first linear
    layer is fixed at 3136 inputs, which is 64 channels x 7 x 7 -- the conv
    output size for 84x84 inputs.
    """

    def __init__(self, input_channels, num_actions):
        super().__init__()
        feature_layers = [
            nn.Conv2d(input_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        head_layers = [
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, num_actions),
        ]
        # Kept as a single Sequential so parameter names stay `network.<idx>.*`.
        self.network = nn.Sequential(*feature_layers, *head_layers)

    def forward(self, x):
        """Return the Q-values produced by the network for input batch `x`."""
        q_values = self.network(x)
        return q_values

input_channels = 4  # matches the four observations stacked by the env wrapper
num_actions = env.action_space.n

# Online network: the one being optimised.
policy_net = AtariCNN(input_channels=input_channels, num_actions=num_actions)
policy_net.to(device)

# Target network: starts as an exact copy of the online network and is kept
# in eval mode; it is only used to compute bootstrap targets.
target_net = AtariCNN(input_channels=input_channels, num_actions=num_actions)
target_net.to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

memory = ReplayBuffer(capacity=memory_size)
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

# 4. Training Loop
# Epsilon-greedy DQN: act, store the transition, then take one optimiser step
# on a random minibatch once the buffer holds more than `batch_size` entries.
epsilon = epsilon_start
loss_fn = nn.SmoothL1Loss()  # built once rather than on every gradient step
print("Starting CNN DQN Training (Demo Mode)...")

for episode in range(num_episodes):
    state, _ = env.reset()
    total_reward = 0
    done = False

    while not done:
        # Explore with probability epsilon, otherwise act greedily.
        if random.random() > epsilon:
            with torch.no_grad():
                state_tensor = torch.as_tensor(
                    np.array(state), dtype=torch.float32, device=device
                ).unsqueeze(0)
                action = policy_net(state_tensor).argmax().item()
        else:
            action = env.action_space.sample()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Store `terminated`, not `done`: a time-limit truncation ends the
        # episode but the next state still has value, so the target must keep
        # bootstrapping from it. Only a true terminal state zeroes that term.
        memory.push(state, action, reward, next_state, terminated)
        state = next_state
        total_reward += reward

        if len(memory) > batch_size:
            b_state, b_action, b_reward, b_next_state, b_done = memory.sample(batch_size)

            b_state = torch.as_tensor(b_state, dtype=torch.float32, device=device)
            b_action = torch.as_tensor(b_action, dtype=torch.int64, device=device).unsqueeze(1)
            b_reward = torch.as_tensor(b_reward, dtype=torch.float32, device=device).unsqueeze(1)
            b_next_state = torch.as_tensor(b_next_state, dtype=torch.float32, device=device)
            b_done = torch.as_tensor(b_done, dtype=torch.float32, device=device).unsqueeze(1)

            # Q(s, a) for the actions that were actually taken.
            current_q_values = policy_net(b_state).gather(1, b_action)

            # One-step TD target from the target network; no gradient flows here.
            with torch.no_grad():
                max_next_q_values = target_net(b_next_state).max(1)[0].unsqueeze(1)
                target_q_values = b_reward + (gamma * max_next_q_values * (1 - b_done))

            loss = loss_fn(current_q_values, target_q_values)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    epsilon = max(epsilon_end, epsilon * epsilon_decay)

    # Sync the target network every `target_update` episodes.
    if episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())

    print(f"Episode {episode+1:3d} | Reward: {total_reward:.1f} | Epsilon: {epsilon:.2f}")

# Release the training emulator; this cell does not use `env` again.
env.close()

print("\nTraining Phase Complete. Initiating Render...")

# 5. Visualizing the Agent
# Play one greedy episode and record it. Frames are rendered from the SAME
# environment the agent observes: a second, separately stepped environment
# (different random no-op resets and sticky actions) drifts away from the game
# the agent is actually playing, so its frames would not show the agent's play.
eval_env = gym.make(env_name, render_mode='rgb_array', frameskip=1)
eval_env = AtariPreprocessing(eval_env, screen_size=84, grayscale_obs=True, frame_skip=4, scale_obs=True)
eval_env = FrameStackObservation(eval_env, 4)

policy_net.eval()
frames = []

try:
    state, _ = eval_env.reset()
    done = False

    while not done:
        # render() is forwarded through the wrappers to the base env and
        # returns the current RGB screen (render_mode='rgb_array').
        frames.append(eval_env.render())

        with torch.no_grad():
            state_tensor = torch.as_tensor(
                np.array(state), dtype=torch.float32, device=device
            ).unsqueeze(0)
            action = policy_net(state_tensor).argmax().item()

        state, reward, terminated, truncated, _ = eval_env.step(action)
        done = terminated or truncated
finally:
    # Always release the emulator, even if inference or stepping fails.
    eval_env.close()

print(f"Captured {len(frames)} frames. Encoding MP4...")
video_path = 'pong_agent.mp4'
imageio.mimsave(video_path, frames, fps=30)

# Read-only binary mode inside a context manager so the handle is closed.
with open(video_path, "rb") as video_handle:
    video_file = video_handle.read()
video_url = f"data:video/mp4;base64,{base64.b64encode(video_file).decode()}"

html_code = f"""
<video width="400" controls autoplay loop>
  <source src="{video_url}" type="video/mp4">
</video>
"""
display(HTML(html_code))
print("Process Complete.")

Executing on device: cuda

Starting CNN DQN Training (Demo Mode)...
Episode   1 | Reward: -20.0 | Epsilon: 0.99
Episode   2 | Reward: -20.0 | Epsilon: 0.98
Episode   3 | Reward: -21.0 | Epsilon: 0.97
Episode   4 | Reward: -21.0 | Epsilon: 0.96
Episode   5 | Reward: -21.0 | Epsilon: 0.95
Episode   6 | Reward: -21.0 | Epsilon: 0.94
Episode   7 | Reward: -21.0 | Epsilon: 0.93
Episode   8 | Reward: -21.0 | Epsilon: 0.92
Episode   9 | Reward: -21.0 | Epsilon: 0.91
Episode  10 | Reward: -20.0 | Epsilon: 0.90
Episode  11 | Reward: -20.0 | Epsilon: 0.90
Episode  12 | Reward: -21.0 | Epsilon: 0.89
Episode  13 | Reward: -21.0 | Epsilon: 0.88
Episode  14 | Reward: -21.0 | Epsilon: 0.87
Episode  15 | Reward: -19.0 | Epsilon: 0.86
Episode  16 | Reward: -21.0 | Epsilon: 0.85
Episode  17 | Reward: -18.0 | Epsilon: 0.84
Episode  18 | Reward: -20.0 | Epsilon: 0.83
Episode  19 | Reward: -21.0 | Epsilon: 0.83
Episode  20 | Reward: -20.0 | Epsilon: 0.82
Episode  21 | Reward: -21.0 | Epsilon: 0.81
Episode 



Process Complete.
