# Libraries & Environment

In [None]:
!pip install gym
!pip install gymnasium
!pip install ale-py==0.8.1
!pip install "autorom[accept-rom-license]==0.4.2"
!pip install pygame==2.1.0
!pip install imageio==2.36.1
!pip install "imageio-ffmpeg==0.5.1"
!pip install moviepy==2.1.1

Collecting ale-py==0.8.1
  Downloading ale_py-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Downloading ale_py-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: ale-py
Successfully installed ale-py-0.8.1
Collecting autorom==0.4.2 (from autorom[accept-rom-license]==0.4.2)
  Downloading AutoROM-0.4.2-py3-none-any.whl.metadata (2.8 kB)
Collecting AutoROM.accept-rom-license (from autorom[accept-rom-license]==0.4.2)
  Downloading AutoROM.accept-rom-license-0.6.1.tar.gz (434 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.7/434.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.

In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
import random
from collections import deque
import time
import os
from moviepy import ImageSequenceClip
import cv2
from gym.wrappers import AtariPreprocessing, FrameStack
from gym.wrappers import RecordVideo
import numpy as np
from tqdm import tqdm

# Neural Network

In [None]:
class NeuralNetwork(nn.Module):
    """Convolutional network that maps a stack of frames to one score per action.

    The first convolution expects 4 input channels and the first linear layer
    expects 3136 flattened features, i.e. 64 channels x 7 x 7, which is what the
    three convolutions produce for an 84x84 input.
    """

    def __init__(self, action_space, device):
        """Build the layer stack.

        Args:
            action_space: Number of outputs of the final linear layer.
            device: Stored as ``self.device``. The module is not moved to it
                here; callers do that themselves with ``.to(device)``.
        """
        super().__init__()
        self.device = device

        conv_layers = [
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        ]
        dense_layers = [
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, action_space),
        ]
        # A single Sequential called ``net`` keeps the parameter names
        # (net.0.weight, ..., net.9.bias) that saved state dicts rely on.
        self.net = nn.Sequential(*conv_layers, *dense_layers)

    def forward(self, x):
        """Divide ``x`` by 255 and return the network output for it."""
        scaled = x / 255.0
        return self.net(scaled)

# Helper Functions

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [None]:
def make_env(env_id, capture_video = False, seed = 1, video_dir = "/kaggle/working/videos"):
    """Create the environment and wrap it for the convolutional policy.

    The wrappers record episode statistics, resize observations to 84x84,
    convert them to grayscale and stack 4 frames.

    Args:
        env_id: Environment id passed to ``gym.make``.
        capture_video: When True the environment is created with
            ``render_mode="rgb_array"`` and wrapped in ``RecordVideo``.
        seed: Seed applied to the action space only; the environment itself
            is seeded by whoever calls ``reset(seed=...)``.
        video_dir: Folder handed to ``RecordVideo`` when ``capture_video`` is
            True. Defaults to the folder that used to be hard-coded here.

    Returns:
        The wrapped environment.
    """
    if capture_video:
        env = gym.make(env_id, render_mode="rgb_array")
        env = gym.wrappers.RecordVideo(env, video_dir)
    else:
        env = gym.make(env_id)
    env = gym.wrappers.RecordEpisodeStatistics(env)
    env = gym.wrappers.ResizeObservation(env, (84, 84))
    env = gym.wrappers.GrayScaleObservation(env)
    env = gym.wrappers.FrameStack(env, 4)
    env.action_space.seed(seed)
    return env

def linear_schedule(start_e, end_e, duration, t):
    """Linearly interpolate from ``start_e`` towards ``end_e`` over ``duration`` steps.

    The value at step ``t`` is ``start_e + t * (end_e - start_e) / duration``,
    and the result is never smaller than ``end_e``.
    """
    per_step_change = (end_e - start_e) / duration
    current = per_step_change * t + start_e
    # Same tie-breaking as max(current, end_e): end_e wins only when strictly larger.
    return end_e if end_e > current else current

# Weight initialisation hook
def initialize_weights(m):
    """Re-initialise ``m`` if it is an ``nn.Linear`` layer; leave anything else alone.

    Linear weights get Kaiming-uniform initialisation for ReLU and the bias,
    when the layer has one, is set to zero. The notebook applies this through
    ``nn.Module.apply`` when building the population.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
    if m.bias is None:
        return
    nn.init.zeros_(m.bias)

# Novelty score of a behavior relative to the archive
def compute_novelty(new_behavior, behavior_archive, k=5, device=None):
    """Mean Euclidean distance from ``new_behavior`` to its nearest archive entries.

    Every archive entry and ``new_behavior`` are flattened to vectors. Entries
    at distance exactly 0 are ignored, and the mean is taken over the ``k``
    smallest remaining distances (or all of them if there are fewer than ``k``).

    Args:
        new_behavior: Array-like behavior characterization to score.
        behavior_archive: Sequence (list, deque, array) of earlier behavior
            characterizations with the same number of elements each.
        k: Number of nearest neighbours to average over.
        device: Torch device for the distance computation. When None, CUDA is
            used if available and the CPU otherwise. It is resolved at call
            time so the function no longer depends on a notebook global
            existing when this cell is executed.

    Returns:
        float: 0.0 for an empty archive, 0.1 when every archive entry is at
        distance 0, otherwise the mean of the nearest distances.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    archive_size = len(behavior_archive)
    if archive_size == 0:
        return 0.0

    archive = torch.as_tensor(
        np.array(behavior_archive), dtype=torch.float32, device=device
    ).reshape(archive_size, -1)
    query = torch.as_tensor(new_behavior, dtype=torch.float32, device=device).reshape(1, -1)

    # L2 norm (Euclidean distance) to every archive entry
    distances = torch.norm(archive - query, dim=1)
    distances = distances[distances > 0]
    if distances.numel() == 0:
        return 0.1  # smoothed fallback when only exact duplicates are present

    nearest, _ = torch.topk(distances, k=min(k, distances.numel()), largest=False)
    return torch.mean(nearest).item()
# Behavior characterization of a policy
def behavior_characterization(policy, env, device, seed=1, max_steps=200):
    """Roll out ``policy`` greedily and return the mean observation it visited.

    The policy is put in eval mode, the environment is reset with ``seed`` and
    the arg-max action is taken at every step. The rollout stops when the
    episode terminates, when it is truncated, or after ``max_steps`` steps.

    Args:
        policy: Module mapping a ``(1, ...)`` float tensor to per-action scores.
        env: Environment whose ``step`` returns the 5-tuple
            ``(obs, reward, terminated, truncated, info)``.
        device: Device the observation tensors are moved to.
        seed: Seed passed to ``env.reset``.
        max_steps: Maximum number of environment steps.

    Returns:
        numpy.ndarray: Element-wise mean of the visited observation tensors
        (each with a leading batch dimension of 1).
    """
    policy.eval()
    visited = []
    step = 0

    reset_result = env.reset(seed=seed)
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False

    while not done:
        # np.asarray also materialises lazily stacked frames, so no
        # wrapper-specific type check is needed.
        state_tensor = torch.tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0).to(device)

        with torch.no_grad():
            action_scores = policy(state_tensor)
            action = torch.argmax(action_scores, dim=1).item()

        next_state, _, terminated, truncated, _ = env.step(action)
        visited.append(state_tensor)
        # A truncated episode is over as well; stepping further would act on a finished env.
        done = bool(terminated or truncated)
        state = next_state
        step += 1
        if step == max_steps:
            break

    visited_tensor = torch.stack(visited, dim=0)  # (steps, 1, *obs_shape)
    return torch.mean(visited_tensor, dim=0).cpu().numpy()

def evaluate_policy(policy, env, device, seed=1, max_steps=200):
    """Roll out ``policy`` greedily and return its mean observation and total reward.

    The environment is reset with ``seed`` and the arg-max action is taken at
    every step. The rollout stops when the episode terminates, when it is
    truncated, or after ``max_steps`` steps.

    Args:
        policy: Module mapping a ``(1, ...)`` float tensor to per-action scores.
        env: Environment whose ``step`` returns the 5-tuple
            ``(obs, reward, terminated, truncated, info)``.
        device: Device the observation tensors are moved to.
        seed: Seed passed to ``env.reset``.
        max_steps: Maximum number of environment steps.

    Returns:
        tuple: ``(mean_observation, total_reward)`` where ``mean_observation``
        is a numpy array holding the element-wise mean of the visited
        observation tensors and ``total_reward`` is the sum of step rewards.
    """
    total_rewards = 0
    trajectory = []
    step = 0

    reset_result = env.reset(seed=seed)
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False

    while not done:
        # np.asarray also materialises lazily stacked frames, so no
        # wrapper-specific type check is needed.
        state_tensor = torch.tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0).to(device)

        with torch.no_grad():
            action_scores = policy(state_tensor)
            action = torch.argmax(action_scores, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        trajectory.append(state_tensor)
        total_rewards += reward
        # A truncated episode is over as well; stepping further would act on a finished env.
        done = bool(terminated or truncated)
        state = next_state
        step += 1
        if step == max_steps:
            break

    trajectory_tensor = torch.stack(trajectory, dim=0)  # (steps, 1, *obs_shape)
    return torch.mean(trajectory_tensor, dim=0).cpu().numpy(), total_rewards

def play(policy, env_id, device, seed, max_steps, video_dir):
    """Play one greedy episode with video capture and report its length and reward.

    A fresh video-recording environment is created through ``make_env`` and a
    new ``NeuralNetwork`` is loaded with ``policy``'s weights, so ``policy``
    itself is not modified. The episode stops on termination, on truncation,
    or after ``max_steps`` steps. The environment is always closed, including
    when the rollout raises.

    Args:
        policy: Module whose ``state_dict`` is copied into the acting network.
        env_id: Environment id forwarded to ``make_env``.
        device: Device for the acting network and the observation tensors.
        seed: Seed forwarded to ``make_env`` and used for ``env.reset``, the
            same way the training rollouts seed their resets.
        max_steps: Maximum number of environment steps.
        video_dir: Accepted for interface compatibility but not used here;
            the recording folder is decided inside ``make_env``.

    Returns:
        tuple: ``(steps_taken, total_reward)``.
    """
    env = make_env(env_id, capture_video = True, seed = seed)
    try:
        num_actions = env.action_space.n
        action_net = NeuralNetwork(num_actions, device).to(device)
        action_net.load_state_dict(policy.state_dict())
        action_net.eval()

        total_rewards = 0
        step = 0
        state, _ = env.reset(seed=seed)
        done = False
        while not done:
            # np.asarray also materialises lazily stacked frames.
            state_tensor = torch.tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0).to(device)
            with torch.no_grad():
                action_scores = action_net(state_tensor)
                action = torch.argmax(action_scores, dim=1).item()
            next_state, reward, terminated, truncated, _ = env.step(action)
            step += 1
            total_rewards += reward
            state = next_state
            # Truncation ends the episode too.
            done = bool(terminated or truncated)
            if step == max_steps:
                break
    finally:
        # Closing the env is also what lets the video recorder finish its file.
        env.close()
    return step, total_rewards

# Display a recorded gameplay video inline
def display_video(video_path):
    """Return an IPython ``Video`` object for ``video_path``.

    The video is embedded in the notebook output at 720x480.
    """
    # Imported locally: the notebook's import cell never brings ``Video`` into
    # scope, so calling this function used to raise NameError.
    from IPython.display import Video
    return Video(video_path, embed=True, width=720, height=480)

# Parameters & Hyperparameters

In [None]:
# --- Environment ---
env_id = 'BreakoutNoFrameskip-v4'
seed = 21_521_992          # passed to env.reset and to the action-space seeding
capture_video = True       # not referenced by the training cells visible in this notebook

# --- Evolution strategy ---
pop_size = 3               # number of policies kept in the population
n_workers = 3              # perturbed copies evaluated per iteration
sigma = 0.1                # standard deviation of the Gaussian parameter noise
alpha = 1e-3               # step size applied to the parameter update
max_iterations = 5_000     # number of training iterations
max_steps = 2_000          # cap on environment steps per rollout

# --- Reporting ---
update_frequency = 100     # print a progress line every this many iterations
log_frequency = 50         # save results/checkpoint and record a video every this many iterations

# --- Defined but not referenced by the training cells visible in this notebook ---
learning_rate = 1e-4
gamma = 0.99


# NS-ES Training

In [None]:
# NS-ES training loop.
#
# Each iteration:
#   1. evaluates every population member (reward + novelty),
#   2. samples one member with probability proportional to its novelty,
#   3. evaluates n_workers noisy copies of that member,
#   4. moves the member along the novelty-weighted average of those noises,
#   5. appends the updated member's behavior to the archive.
#
# Fixes relative to the previous version of this cell:
#   * the perturbed "copies" were the same network object, so the noise piled
#     up on the population member itself, and the update then used fresh noise
#     unrelated to what had been evaluated;
#   * stored state dicts referenced the live parameters, so every stored
#     "best" state changed whenever the network was updated afterwards;
#   * the loop variable shadowed the builtin ``iter``;
#   * the random generators were never seeded and the env was never closed.
all_best_states = []   # CPU snapshots of the best member, most recent `snapshot_window` iterations only
all_best_rewards = []
all_novelties = []
training_times = []

# Create the output folders
save_dir = f'/kaggle/working/runs/NS-ES/{env_id}__{seed}'
video_dir = f'/kaggle/working/videos/NS-ES/{env_id}__{seed}'
os.makedirs(save_dir,exist_ok=True)
os.makedirs(video_dir,exist_ok=True)
video_result_log = video_dir + '/video_log.txt'

# Snapshots are real copies now, so only the window that the checkpoint
# selection looks at is kept in memory.
snapshot_window = 100

# Make parameter noise and population sampling repeatable.
np.random.seed(seed)
torch.manual_seed(seed)

# Initialisation
env = make_env(env_id, capture_video = False)
num_actions = env.action_space.n

population = [NeuralNetwork(num_actions, device).apply(initialize_weights).to(device) for _ in range(pop_size)]
behavior_archive = deque(maxlen=500)
rewards = []

# Scratch network that receives each perturbed copy of the selected member.
candidate = NeuralNetwork(num_actions, device).to(device)

# Seed the archive with the behavior of every initial member
for policy in population:
    behavior_archive.append(behavior_characterization(policy, env, device, seed, max_steps))

for iteration in tqdm(range(max_iterations)):
    pop_rewards = []
    pop_behaviors = []
    pop_novelties = []

    start_time = time.time()

    # Reward and novelty of the current population
    for policy in population:
        behavior, reward = evaluate_policy(policy, env, device, seed, max_steps)
        pop_rewards.append(reward)
        pop_behaviors.append(behavior)
        pop_novelties.append(compute_novelty(behavior, behavior_archive))

    best_index = int(np.argmax(pop_rewards))
    all_best_rewards.append(max(pop_rewards))
    # Detached CPU copy: later in-place updates must not change the snapshot.
    all_best_states.append({
        name: tensor.detach().to("cpu", copy=True)
        for name, tensor in population[best_index].state_dict().items()
    })
    if len(all_best_states) > snapshot_window:
        all_best_states.pop(0)

    # Sample the member to update, proportionally to novelty
    novelty_total = sum(pop_novelties)
    if novelty_total > 0:
        probs = [novelty / novelty_total for novelty in pop_novelties]
    else:
        # No novelty signal at all: fall back to a uniform choice.
        probs = [1.0 / pop_size] * pop_size
    selected_index = np.random.choice(range(pop_size), p=probs)
    selected_policy = population[selected_index]

    # Evaluate n_workers perturbed copies and remember the noise of each one
    new_novelties = []
    worker_noises = []   # worker_noises[w][p] is the noise worker w added to parameter p
    for _ in range(n_workers):
        candidate.load_state_dict(selected_policy.state_dict())
        noises = []
        for param, source_param in zip(candidate.parameters(), selected_policy.parameters()):
            if source_param.requires_grad:
                noise = torch.normal(0, sigma, size=param.size(), device=param.device)
                param.data.add_(noise)
            else:
                noise = None
            noises.append(noise)
        worker_noises.append(noises)
        bc = behavior_characterization(candidate, env, device, seed, max_steps)
        new_novelties.append(compute_novelty(bc, behavior_archive))

    # Update the selected member with the novelty-weighted average noise.
    # tanh squashes the raw novelty values (sigmoid would be an alternative).
    worker_weights = torch.tanh(torch.tensor(new_novelties, dtype=torch.float32, device=device))
    for p_idx, param in enumerate(selected_policy.parameters()):
        if not param.requires_grad:
            continue
        weighted_noise = torch.zeros_like(param.data)
        for w_idx in range(n_workers):
            weighted_noise.add_(worker_weights[w_idx] * worker_noises[w_idx][p_idx])
        weighted_noise.div_(n_workers)
        param.data.add_(alpha * (1 / sigma) * weighted_noise)

    # Add the updated member's behavior to the archive
    bc = behavior_characterization(selected_policy, env, device, seed, max_steps)
    all_novelties.append(compute_novelty(bc, behavior_archive))
    behavior_archive.append(bc)

    training_times.append(time.time() - start_time)

    if (iteration+1) % update_frequency == 0:
        best_new_novelty = np.max(all_novelties[-5:])
        best_new_reward = np.max(all_best_rewards[-5:])
        print(f"Iteration {iteration+1}/{max_iterations}, Best new novelty: {best_new_novelty:.2f}, Best new reward: {best_new_reward:.2f}, Total executing time: {sum(training_times):.2f} seconds")

    if (iteration+1) % log_frequency == 0:
        # Save the reward / timing curves
        np.savez_compressed(os.path.join(save_dir,f'results.npz'),
                            rewards = all_best_rewards, training_times = training_times)

        # Best snapshot among the retained window (aligned with the tail of the reward list)
        recent_rewards = all_best_rewards[-len(all_best_states):]
        best_policy_state = all_best_states[np.argsort(recent_rewards)[-1]]

        # The checkpoint stays a list of state dicts, but holds only the best
        # recent state: the snapshots are full copies, so saving the whole
        # window at every checkpoint would be very large.
        torch.save([best_policy_state], os.path.join(save_dir,f"Iteration_{iteration+1}.pth"))

        # Play one episode with that policy and record the video
        best_policy = NeuralNetwork(num_actions, device).to(device)
        best_policy.load_state_dict(best_policy_state)
        best_policy.eval()
        total_step, total_reward = play(best_policy, env_id, device, seed, max_steps, video_dir)

env.close()

# NSR-ES Training

In [None]:
# NSR-ES blends novelty and reward in its update; the two weights sum to 1.
novelty_weight = 0.4
reward_weight = 1.0 - novelty_weight

In [None]:
# NSR-ES training loop.
#
# Same structure as the NS-ES loop, except that every perturbed copy is scored
# by both novelty and reward, blended with novelty_weight / reward_weight.
#
# Fixes relative to the previous version of this cell:
#   * the perturbed "copies" were the same network object, so the noise piled
#     up on the population member itself, and the update then used fresh noise
#     unrelated to what had been evaluated;
#   * stored state dicts referenced the live parameters, so every stored
#     "best" state changed whenever the network was updated afterwards;
#   * the loop variable shadowed the builtin ``iter``;
#   * the random generators were never seeded and the env was never closed.
all_best_states = []   # CPU snapshots of the best member, most recent `snapshot_window` iterations only
all_best_rewards = []
all_novelties = []
training_times = []

# Create the output folders
save_dir = f'/kaggle/working/runs/NSR-ES/{env_id}__{seed}'
video_dir = f'/kaggle/working/videos/NSR-ES/{env_id}__{seed}'
os.makedirs(save_dir,exist_ok=True)
os.makedirs(video_dir,exist_ok=True)
video_result_log = video_dir + '/video_log.txt'

# Snapshots are real copies now, so only the window that the checkpoint
# selection looks at is kept in memory.
snapshot_window = 100

# Make parameter noise and population sampling repeatable.
np.random.seed(seed)
torch.manual_seed(seed)

# Initialisation
env = make_env(env_id, capture_video = False)
num_actions = env.action_space.n

population = [NeuralNetwork(num_actions, device).apply(initialize_weights).to(device) for _ in range(pop_size)]
behavior_archive = deque(maxlen=500)
rewards = []

# Scratch network that receives each perturbed copy of the selected member.
candidate = NeuralNetwork(num_actions, device).to(device)

# Seed the archive with the behavior of every initial member
for policy in population:
    behavior_archive.append(behavior_characterization(policy, env, device, seed, max_steps))

for iteration in tqdm(range(max_iterations)):
    pop_rewards = []
    pop_behaviors = []
    pop_novelties = []

    start_time = time.time()

    # Reward and novelty of the current population
    for policy in population:
        behavior, reward = evaluate_policy(policy, env, device, seed, max_steps)
        pop_rewards.append(reward)
        pop_behaviors.append(behavior)
        pop_novelties.append(compute_novelty(behavior, behavior_archive))

    best_index = int(np.argmax(pop_rewards))
    all_best_rewards.append(max(pop_rewards))
    # Detached CPU copy: later in-place updates must not change the snapshot.
    all_best_states.append({
        name: tensor.detach().to("cpu", copy=True)
        for name, tensor in population[best_index].state_dict().items()
    })
    if len(all_best_states) > snapshot_window:
        all_best_states.pop(0)

    # Sample the member to update, proportionally to novelty
    novelty_total = sum(pop_novelties)
    if novelty_total > 0:
        probs = [novelty / novelty_total for novelty in pop_novelties]
    else:
        # No novelty signal at all: fall back to a uniform choice.
        probs = [1.0 / pop_size] * pop_size
    selected_index = np.random.choice(range(pop_size), p=probs)
    selected_policy = population[selected_index]

    # Evaluate n_workers perturbed copies and remember the noise of each one
    new_novelties = []
    new_rewards = []
    worker_noises = []   # worker_noises[w][p] is the noise worker w added to parameter p
    for _ in range(n_workers):
        candidate.load_state_dict(selected_policy.state_dict())
        noises = []
        for param, source_param in zip(candidate.parameters(), selected_policy.parameters()):
            if source_param.requires_grad:
                noise = torch.normal(0, sigma, size=param.size(), device=param.device)
                param.data.add_(noise)
            else:
                noise = None
            noises.append(noise)
        worker_noises.append(noises)
        bc, score = evaluate_policy(candidate, env, device, seed, max_steps)
        new_novelties.append(compute_novelty(bc, behavior_archive))
        new_rewards.append(score)

    # Per-worker weight: blend of squashed novelty and squashed reward
    # (tanh keeps both on a comparable scale; sigmoid would be an alternative).
    novelty_scores = torch.tanh(torch.tensor(new_novelties, dtype=torch.float32, device=device))
    reward_scores = torch.tanh(torch.tensor(new_rewards, dtype=torch.float32, device=device))
    worker_weights = novelty_weight * novelty_scores + reward_weight * reward_scores

    # Update the selected member with the weighted average noise
    for p_idx, param in enumerate(selected_policy.parameters()):
        if not param.requires_grad:
            continue
        weighted_noise = torch.zeros_like(param.data)
        for w_idx in range(n_workers):
            weighted_noise.add_(worker_weights[w_idx] * worker_noises[w_idx][p_idx])
        weighted_noise.div_(n_workers)
        param.data.add_(alpha * (1 / sigma) * weighted_noise)

    # Add the updated member's behavior to the archive
    bc = behavior_characterization(selected_policy, env, device, seed, max_steps)
    all_novelties.append(compute_novelty(bc, behavior_archive))
    behavior_archive.append(bc)

    training_times.append(time.time() - start_time)

    if (iteration+1) % update_frequency == 0:
        best_new_novelty = np.max(all_novelties[-5:])
        best_new_reward = np.max(all_best_rewards[-5:])
        print(f"Iteration {iteration+1}/{max_iterations}, Best new novelty: {best_new_novelty:.2f}, Best new reward: {best_new_reward:.2f}, Total executing time: {sum(training_times):.2f} seconds")

    if (iteration+1) % log_frequency == 0:
        # Save the reward / timing curves
        np.savez_compressed(os.path.join(save_dir,f'results.npz'),
                            rewards = all_best_rewards, training_times = training_times)

        # Best snapshot among the retained window (aligned with the tail of the reward list)
        recent_rewards = all_best_rewards[-len(all_best_states):]
        best_policy_state = all_best_states[np.argsort(recent_rewards)[-1]]

        # The checkpoint stays a list of state dicts, but holds only the best
        # recent state: the snapshots are full copies, so saving the whole
        # window at every checkpoint would be very large.
        torch.save([best_policy_state], os.path.join(save_dir,f"Iteration_{iteration+1}.pth"))

        # Play one episode with that policy and record the video
        best_policy = NeuralNetwork(num_actions, device).to(device)
        best_policy.load_state_dict(best_policy_state)
        best_policy.eval()
        total_step, total_reward = play(best_policy, env_id, device, seed, max_steps, video_dir)

env.close()
