# Libraries & Environment

In [None]:
!pip install gym
!pip install gymnasium
!pip install ale-py==0.8.1
!pip install "autorom[accept-rom-license]==0.4.2"
!pip install pygame==2.1.0
!pip install imageio==2.36.1
!pip install "imageio-ffmpeg==0.5.1"
!pip install moviepy==2.1.1

In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import time
import os
from moviepy import ImageSequenceClip
import cv2
from gym.wrappers import AtariPreprocessing, FrameStack
from gym.wrappers import RecordVideo

# Neural Network

In [None]:
class NeuralNetwork(nn.Module):
    """Convolutional Q-network producing one value per action.

    The input is a float tensor of shape (batch, 4, H, W) holding raw pixel
    intensities; the first linear layer's 3136 inputs correspond to
    64 feature maps of 7x7, which is what 84x84 frames reduce to after the
    three convolutions.
    """

    def __init__(self, action_space, device):
        """Build the layers.

        Args:
            action_space: number of discrete actions (width of the output layer).
            device: device handle kept on the instance for callers; the module
                itself is not moved here.
        """
        super().__init__()
        self.device = device

        # Feature extractor: three strided convolutions, each followed by ReLU.
        feature_layers = [
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        # Value head: flatten, one hidden layer, then one output per action.
        head_layers = [
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, action_space),
        ]
        # Kept as a single Sequential named `net` so state_dict keys
        # ("net.0.weight", ...) stay stable for saved checkpoints.
        self.net = nn.Sequential(*feature_layers, *head_layers)

    def forward(self, x):
        """Scale pixel values by 1/255 and return the per-action values."""
        scaled = x / 255.0
        return self.net(scaled)

In [None]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling."""

    def __init__(self, size):
        """Create an empty buffer that keeps at most ``size`` transitions."""
        # A bounded deque silently drops the oldest entry once it is full.
        self.buffer = deque(maxlen=size)

    def add(self, obs, next_obs, action, reward, done):
        """Append one transition as an (obs, next_obs, action, reward, done) tuple."""
        transition = (obs, next_obs, action, reward, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Tuple of five arrays (obs, next_obs, actions, rewards, dones),
            each with the batch as its leading dimension.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one sequence per field.
        obs_col, next_obs_col, action_col, reward_col, done_col = zip(*picked)
        stacked_obs = np.stack(obs_col)
        stacked_next_obs = np.stack(next_obs_col)
        return (
            stacked_obs,
            stacked_next_obs,
            np.array(action_col),
            np.array(reward_col),
            np.array(done_col),
        )

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

# Helper Functions

In [None]:
def make_env(env_id, record = False, video_dir="/kaggle/working/videos"):
    """Create the preprocessed environment used for training and evaluation.

    Observations are resized to 84x84, converted to grayscale and stacked
    four at a time. The action space is seeded with the module-level ``seed``.

    Args:
        env_id: environment id passed to ``gym.make``.
        record: when true, the environment is created with
            ``render_mode="rgb_array"`` and wrapped in ``RecordVideo``.
        video_dir: folder the recorder writes to when ``record`` is true.

    Returns:
        The wrapped environment.
    """
    # Honour the caller's `record` flag rather than a notebook-level global,
    # so evaluation can turn recording on or off independently of training.
    if record:
        env = gym.make(env_id, render_mode="rgb_array")
        env = gym.wrappers.RecordVideo(env, video_dir)
    else:
        env = gym.make(env_id)
    env = gym.wrappers.RecordEpisodeStatistics(env)
    env = gym.wrappers.ResizeObservation(env, (84, 84))
    env = gym.wrappers.GrayScaleObservation(env)
    env = gym.wrappers.FrameStack(env, 4)
    # `seed` is the module-level hyperparameter defined in the config cell.
    env.action_space.seed(seed)
    return env

def linear_schedule(start_e, end_e, duration, t):
    """Linearly move from ``start_e`` towards ``end_e`` over ``duration`` steps.

    Returns the interpolated value at step ``t``, never going below ``end_e``.
    """
    interpolated = (end_e - start_e) / duration * t + start_e
    # Floor at end_e; on ties the interpolated value is returned.
    return end_e if end_e > interpolated else interpolated

def preprocess_frame(frame):
    """Turn an RGB frame into an 84x84 grayscale image divided by 255."""
    # Grayscale first, then area-interpolated downsizing, then scaling.
    return cv2.resize(
        cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY),
        (84, 84),
        interpolation=cv2.INTER_AREA,
    ) / 255.0

def play(env_id, q_network, device, seed=1, max_steps=5000, capture_video=False, video_output_dir='/kaggle/working/videos'):
    """Run one greedy evaluation episode using a copy of ``q_network``.

    Args:
        env_id: environment id passed on to ``make_env``.
        q_network: trained network whose weights are copied for acting.
        device: torch device used for the acting network and its inputs.
        seed: seed passed to ``env.reset``.
        max_steps: the episode is cut off once this many steps were taken.
        capture_video: when true, every episode is recorded to
            ``video_output_dir``.
        video_output_dir: target folder for the recordings made here.

    Returns:
        ``(total_rewards, total_steps)`` for the episode.
    """
    env = make_env(env_id, record=capture_video)
    if capture_video:
        # Use the Gymnasium recorder (same library as the env itself) and
        # record every episode. make_env may attach its own recorder as well;
        # this one is what writes to `video_output_dir`.
        env = gym.wrappers.RecordVideo(env, video_folder=video_output_dir, episode_trigger=lambda episode: True)

    try:
        # Act with a private copy so the caller's network is left untouched.
        action_network = NeuralNetwork(env.action_space.n, device=device).to(device)
        action_network.load_state_dict(q_network.state_dict())
        action_network.eval()

        # reset(seed=...) is the supported way to seed the environment, so no
        # separate env.seed() call is needed.
        state, _ = env.reset(seed=seed)
        total_rewards = 0
        total_steps = 0
        done = False
        with torch.no_grad():
            while not done:
                # np.asarray materialises stacked-frame containers as one array.
                state_tensor = torch.tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0).to(device)
                q_value = action_network(state_tensor)
                action = torch.argmax(q_value, dim=1).item()
                state, reward, terminated, truncated, info = env.step(action)
                total_rewards += reward
                total_steps += 1
                # The episode is over on either flag; stepping past a
                # truncation would act on an environment that needs a reset.
                done = terminated or truncated

                if total_steps == max_steps:
                    break
    finally:
        # Always release the environment (and flush any recorder), even if
        # the rollout raised.
        env.close()

    return total_rewards, total_steps

# Parameters & Hyperparameters

In [None]:
# Hyperparameters
learning_rate = 1e-4              # step size for the Adam optimizer
gamma = 0.99                      # discount factor in the TD target
batch_size = 32                   # transitions sampled per gradient step
target_network_frequency = 1000   # steps between target-network syncs
train_frequency = 4               # environment steps per gradient step
log_frequency = 10000             # logging interval, in environment steps
exploration_fraction = 0.1        # share of training over which epsilon decays
start_e = 1.0                     # initial epsilon
end_e = 0.01                      # final epsilon
learning_starts = 10000           # intended warm-up steps before learning
total_timesteps = 10000000        # total environment steps to train for
tau = 1.0
replay_buffer_size = 100000       # capacity of the replay buffer
seed = 21521992

# Seed every random number generator the notebook relies on: `random` drives
# epsilon-greedy and replay sampling, torch drives weight initialisation.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Parameters
env_id = 'BreakoutNoFrameskip-v4'
capture_video = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = make_env(env_id, capture_video)
num_actions = env.action_space.n

# Training

In [None]:
# Online and target networks start from identical weights; only the online
# network is optimised, the target network is refreshed by copying.
q_network = NeuralNetwork(num_actions, device = device).to(device)
target_network = NeuralNetwork(num_actions, device = device).to(device)
target_network.load_state_dict(q_network.state_dict())
target_network.eval()
optimizer = optim.Adam(q_network.parameters(), lr=learning_rate)
replay_buffer = ReplayBuffer(replay_buffer_size)
save_dir = f'/kaggle/working/runs/{env_id}__{seed}'
video_dir = f'/kaggle/working/videos/{env_id}__{seed}'
os.makedirs(save_dir,exist_ok=True)
os.makedirs(video_dir,exist_ok=True)

# Steps between checkpoint / results dumps.
checkpoint_frequency = 1000

# Seed the environment once, on the first reset. Re-seeding on every reset
# would restart the environment's RNG and replay the same start each episode.
obs, _ = env.reset(seed=seed)
obs = np.array(obs)
reward_buffer = deque(maxlen=100)   # returns of the most recent 100 episodes
reward_per_episode = 0.0
all_rewards = []                    # (global_step, episode_return) pairs
training_times = []                 # wall-clock seconds spent on each step


for global_step in range(total_timesteps):
    start_time = time.time()

    # Epsilon-greedy action selection with a linearly decaying epsilon.
    epsilon = linear_schedule(start_e, end_e, exploration_fraction * total_timesteps, global_step)
    if random.random() < epsilon:
        action = env.action_space.sample()
    else:
        with torch.no_grad():
            # np.asarray materialises stacked-frame containers as one array.
            q_values = q_network(torch.tensor(np.asarray(obs), dtype=torch.float32).unsqueeze(0).to(device))
            action = torch.argmax(q_values, dim=1).item()

    next_obs, reward, done, truncated, info = env.step(action)
    reward_per_episode += reward
    # Only true termination (`done`) is stored: a truncated episode did not
    # reach a terminal state, so its TD target should still bootstrap.
    replay_buffer.add(obs, next_obs, action, reward, done)
    obs = next_obs

    # Start a new episode on termination *or* truncation; continuing to step
    # a truncated environment without a reset is not valid.
    if done or truncated:
        obs, _ = env.reset()
        reward_buffer.append(reward_per_episode)
        all_rewards.append((global_step, reward_per_episode))
        reward_per_episode = 0.0

    # Q-network update: every `train_frequency` steps, once the warm-up of
    # `learning_starts` steps has passed and a full batch is available.
    if (
        global_step >= learning_starts
        and (global_step + 1) % train_frequency == 0
        and replay_buffer.size() > batch_size
    ):
        obs_batch, next_obs_batch, actions_batch, rewards_batch, dones_batch = replay_buffer.sample(batch_size)
        obs_batch = torch.tensor(obs_batch, dtype=torch.float32).to(device)
        next_obs_batch = torch.tensor(next_obs_batch, dtype=torch.float32).to(device)
        actions_batch = torch.tensor(actions_batch, dtype=torch.long).unsqueeze(1).to(device)
        rewards_batch = torch.tensor(rewards_batch, dtype=torch.float32).to(device)
        dones_batch = torch.tensor(dones_batch, dtype=torch.float32).to(device)

        # Compute targets using the formulation sample = r + gamma * max q(s',a')
        with torch.no_grad():
            target_max = target_network(next_obs_batch).max(dim=1)[0]
            td_target = rewards_batch + gamma * target_max * (1 - dones_batch)

        # Compute loss. squeeze(1) drops only the gathered action dimension,
        # so the batch dimension survives even for a batch of one.
        q_values = q_network(obs_batch).gather(1, actions_batch).squeeze(1)
        loss = nn.functional.mse_loss(q_values, td_target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    end_time = time.time()
    training_times.append(end_time - start_time)

    # Hard update of the target network.
    if (global_step + 1) % target_network_frequency == 0:
        target_network.load_state_dict(q_network.state_dict())

    # Periodic checkpoint of the weights plus the reward / timing history.
    if (global_step + 1) % checkpoint_frequency == 0:
        q_net_path = f"{save_dir}/Step__{global_step + 1}.pth"
        torch.save(q_network.state_dict(), q_net_path)
        np.savez_compressed(save_dir + '/results.npz', all_rewards=all_rewards, training_times = training_times)

    # Periodic progress report.
    if (global_step + 1) % log_frequency == 0:
        # No finished episode yet -> report NaN without a NumPy warning.
        average_reward = np.mean(reward_buffer) if reward_buffer else float('nan')
        print(f'Episode: {len(all_rewards)} Step: {global_step+1}/{total_timesteps} Average reward: {average_reward:.2f} Total training time: {sum(training_times):.2f} seconds')