# Libraries & Environment

In [None]:
!pip install gym
!pip install gymnasium
!pip install ale-py==0.8.1
!pip install "autorom[accept-rom-license]==0.4.2"
!pip install pygame==2.1.0
!pip install imageio==2.36.1
!pip install "imageio-ffmpeg==0.5.1"
!pip install moviepy==2.1.1

Collecting ale-py==0.8.1
  Downloading ale_py-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Downloading ale_py-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: ale-py
Successfully installed ale-py-0.8.1
Collecting autorom==0.4.2 (from autorom[accept-rom-license]==0.4.2)
  Downloading AutoROM-0.4.2-py3-none-any.whl.metadata (2.8 kB)
Collecting AutoROM.accept-rom-license (from autorom[accept-rom-license]==0.4.2)
  Downloading AutoROM.accept-rom-license-0.6.1.tar.gz (434 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.7/434.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.

In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
import random
from collections import deque
import time
import os
from moviepy import ImageSequenceClip
import cv2
from gym.wrappers import AtariPreprocessing, FrameStack
from gym.wrappers import RecordVideo
import numpy as np
from tqdm import tqdm

# Neural Network

In [None]:
class NeuralNetwork(nn.Module):
    """Convolutional network that maps a stack of frames to one score per action.

    The first convolution expects 4 input channels and the first linear layer
    expects 3136 flattened features, i.e. 64 channels x 7 x 7, which is what
    the three convolutions produce for 84x84 inputs.

    Args:
        action_space: Number of output units (one score per discrete action).
        device: Stored as ``self.device``. The module is NOT moved to this
            device here; callers do that themselves with ``.to(device)``.
    """

    def __init__(self, action_space, device):
        super().__init__()
        self.device = device

        # Layers are instantiated in forward order and packed into a single
        # Sequential, so parameters keep the names ``net.<index>.*``.
        feature_layers = [
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        head_layers = [
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, action_space),
        ]
        self.net = nn.Sequential(*feature_layers, *head_layers)

    def forward(self, x):
        """Divide the input by 255 and return the network's per-action outputs."""
        scaled = x / 255.0
        return self.net(scaled)

# Helper Functions

In [None]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [None]:
def make_env(env_id, capture_video=False, seed=1, video_folder="/kaggle/working/videos"):
    """Create the environment and apply the observation wrappers used in this notebook.

    Wrappers are applied in this order: optional ``RecordVideo``,
    ``RecordEpisodeStatistics``, ``ResizeObservation`` to 84x84,
    ``GrayScaleObservation`` and ``FrameStack`` with 4 frames.

    Args:
        env_id: Environment id passed to ``gym.make``.
        capture_video: When true, the environment is created with
            ``render_mode="rgb_array"`` and wrapped in ``RecordVideo``.
        seed: Used only to seed the action space; episode seeding is left to
            the caller's ``env.reset(seed=...)``.
        video_folder: Folder handed to ``RecordVideo`` when ``capture_video``
            is true. Defaults to the path that used to be hard-coded, so
            existing callers behave exactly as before.

    Returns:
        The wrapped environment.
    """
    if capture_video:
        env = gym.make(env_id, render_mode="rgb_array")
        env = gym.wrappers.RecordVideo(env, video_folder)
    else:
        env = gym.make(env_id)
    env = gym.wrappers.RecordEpisodeStatistics(env)
    env = gym.wrappers.ResizeObservation(env, (84, 84))
    env = gym.wrappers.GrayScaleObservation(env)
    env = gym.wrappers.FrameStack(env, 4)
    env.action_space.seed(seed)
    return env

def linear_schedule(start_e, end_e, duration, t):
    """Linearly interpolate from ``start_e`` towards ``end_e`` and floor at ``end_e``.

    Args:
        start_e: Value at ``t = 0``.
        end_e: Value reached at ``t = duration``; also the lower bound of the result.
        duration: Number of steps over which the change is spread.
        t: Current step.

    Returns:
        ``start_e + t * (end_e - start_e) / duration``, but never below ``end_e``.
    """
    per_step_change = (end_e - start_e) / duration
    current = start_e + per_step_change * t
    # Same tie-breaking as max(current, end_e): keep `current` unless end_e is larger.
    return end_e if end_e > current else current

# Revised novelty computation: a score is produced for every entry currently in
# the archive, so each stored behaviour takes part in the novelty ranking
# whenever the archive is updated.
def compute_novelty(behavior_archive, k=5):
    """Score every archived behaviour by its mean distance to its nearest neighbours.

    Args:
        behavior_archive: Sequence of array-like behaviour descriptors that
            support subtraction with one another.
        k: How many of the smallest distances are averaged for each entry.

    Returns:
        A list with one score per archive entry, in archive order. Each score
        is the mean of the ``k`` smallest Euclidean (``np.linalg.norm``)
        distances from that entry to all other entries.
    """
    entries = list(behavior_archive)
    scores = []
    for idx, current in enumerate(entries):
        gaps = []
        for other_idx, other in enumerate(entries):
            if other_idx == idx:
                # An entry is never compared against itself.
                continue
            gaps.append(np.linalg.norm(current - other))
        gaps.sort()
        scores.append(np.mean(gaps[:k]))
    return scores

def behavior_characterization(policy, env, device, seed=1, max_steps=200):
    """Roll out ``policy`` greedily and return the mean observation it visited.

    The policy is put in eval mode, the environment is reset with ``seed`` and
    the arg-max action is taken at every step. The rollout stops when the
    episode terminates, is truncated, or ``max_steps`` steps have been taken.

    Args:
        policy: Module mapping a ``(1, *obs_shape)`` float tensor to per-action scores.
        env: Environment whose ``step`` returns the 5-tuple
            ``(obs, reward, terminated, truncated, info)``.
        device: Device the observation tensors are moved to.
        seed: Seed passed to ``env.reset``.
        max_steps: Maximum number of environment steps.

    Returns:
        NumPy array of shape ``(1, *obs_shape)``: the element-wise mean of the
        observations the policy acted on.
    """
    policy.eval()

    state = env.reset(seed=seed)
    # reset() may return (observation, info) or just the observation.
    if isinstance(state, tuple):
        state = state[0]

    visited = []
    steps = 0
    done = False
    while not done:
        # np.asarray materialises lazy frame stacks and is a no-op for arrays,
        # so no wrapper-specific type check is needed.
        state_tensor = torch.tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0).to(device)

        with torch.no_grad():
            action = torch.argmax(policy(state_tensor), dim=1).item()

        state, _, terminated, truncated, _ = env.step(action)
        visited.append(state_tensor)
        # A truncated episode is over too; stepping it further is invalid.
        done = bool(terminated or truncated)

        steps += 1
        if steps == max_steps:
            break

    stacked = torch.stack(visited, dim=0)  # (steps, 1, *obs_shape)
    return torch.mean(stacked, dim=0).cpu().numpy()

def evaluate_policy(policy, env, device, seed=1, max_steps=200):
    """Roll out ``policy`` greedily for one episode and return its behaviour and reward.

    The environment is reset with ``seed`` and the arg-max action is taken at
    every step. The rollout stops when the episode terminates, is truncated,
    or ``max_steps`` steps have been taken.

    Args:
        policy: Module mapping a ``(1, *obs_shape)`` float tensor to per-action scores.
        env: Environment whose ``step`` returns the 5-tuple
            ``(obs, reward, terminated, truncated, info)``.
        device: Device the observation tensors are moved to.
        seed: Seed passed to ``env.reset``.
        max_steps: Maximum number of environment steps.

    Returns:
        Tuple ``(mean_observation, total_reward)`` where ``mean_observation``
        is a NumPy array of shape ``(1, *obs_shape)`` (element-wise mean of the
        observations the policy acted on) and ``total_reward`` is the sum of
        the rewards collected.
    """
    # Same mode as behavior_characterization, so both rollouts are comparable.
    policy.eval()

    state = env.reset(seed=seed)
    # reset() may return (observation, info) or just the observation.
    if isinstance(state, tuple):
        state = state[0]

    total_rewards = 0
    trajectory = []
    steps = 0
    done = False
    while not done:
        # np.asarray materialises lazy frame stacks and is a no-op for arrays,
        # so no wrapper-specific type check is needed.
        state_tensor = torch.tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0).to(device)

        with torch.no_grad():
            action = torch.argmax(policy(state_tensor), dim=1).item()

        state, reward, terminated, truncated, _ = env.step(action)
        trajectory.append(state_tensor)
        total_rewards += reward
        # A truncated episode is over too; stepping it further is invalid.
        done = bool(terminated or truncated)

        steps += 1
        if steps == max_steps:
            break

    stacked = torch.stack(trajectory, dim=0)  # (steps, 1, *obs_shape)
    return torch.mean(stacked, dim=0).cpu().numpy(), total_rewards

def play(policy, env_id, device, seed, max_steps, video_dir):
    """Play one recorded episode with a copy of ``policy`` and report its outcome.

    A fresh video-capturing environment is created with ``make_env``, the
    weights of ``policy`` are copied into a new ``NeuralNetwork``, and the
    arg-max action is taken until the episode terminates, is truncated, or
    ``max_steps`` steps have been taken. The environment is always closed.

    Args:
        policy: Network whose ``state_dict`` is copied for playback.
        env_id: Environment id forwarded to ``make_env``.
        device: Device the playback network and observations live on.
        seed: Forwarded to ``make_env`` and used to seed the episode reset, so
            playback uses the same episode seed as the rollout helpers.
        max_steps: Maximum number of environment steps.
        video_dir: Accepted for interface compatibility but not used here.
            NOTE(review): the recording folder is whatever ``make_env``
            configures; forward this value once ``make_env`` takes a folder.

    Returns:
        Tuple ``(steps_taken, total_reward)``.
    """
    env = make_env(env_id, capture_video=True, seed=seed)
    try:
        num_actions = env.action_space.n
        action_net = NeuralNetwork(num_actions, device).to(device)
        action_net.load_state_dict(policy.state_dict())
        action_net.eval()

        total_rewards = 0
        step = 0
        state = env.reset(seed=seed)
        # reset() may return (observation, info) or just the observation.
        if isinstance(state, tuple):
            state = state[0]

        done = False
        while not done:
            # np.asarray materialises lazy frame stacks and is a no-op for arrays.
            state_tensor = torch.tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0).to(device)
            with torch.no_grad():
                action = torch.argmax(action_net(state_tensor), dim=1).item()

            state, reward, terminated, truncated, _ = env.step(action)
            step += 1
            total_rewards += reward
            # Stop on termination, truncation, or when the step budget is spent.
            done = bool(terminated or truncated) or step == max_steps
    finally:
        # Close even if the rollout raises, so the recorder is released.
        env.close()
    return step, total_rewards

# Display a recorded gameplay video inline in the notebook.
def display_video(video_path):
    """Return an embedded IPython ``Video`` object for ``video_path``.

    Args:
        video_path: Path of the video file to embed.

    Returns:
        An ``IPython.display.Video`` created with ``embed=True``, width 720
        and height 480.
    """
    # Imported locally: the notebook's import cell never brings `Video` into
    # scope, so the bare name would raise NameError.
    from IPython.display import Video

    return Video(video_path, embed=True, width=720, height=480)

# Parameters & Hyperparameters

In [None]:
# --- Hyperparameters ---
seed = 21521992
gamma = 0.99
alpha = 0.001
sigma = 0.1                # std of the Gaussian noise added to parameters on mutation

pop_size = 20              # population size
max_gens = 200             # number of generations
num_episodes_per_eval = 1
max_steps = 2000           # upper bound on the number of steps per episode
distance_count = 5         # k (kNN) used when computing novelty

update_frequency = 10
log_frequency = 10         # save a checkpoint and record a video every this many generations

# --- Parameters ---
env_id = 'BreakoutNoFrameskip-v4'
capture_video = True

# NS-ES Training

In [None]:
save_dir = f'/kaggle/working/runs/NS-ESc/{env_id}__{seed}'
video_dir = f'/kaggle/working/videos/NS-ESc/{env_id}__{seed}'
os.makedirs(save_dir, exist_ok=True)
os.makedirs(video_dir, exist_ok=True)
video_result_log = video_dir + '/video_log.txt'

# Seed every RNG this cell draws from (network initialisation, mutation noise,
# parent sampling) so that a Restart & Run All reproduces the run.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Number of top-ranked individuals carried over unchanged to the next generation.
elite_count = 10

# Initialisation
env = make_env(env_id, capture_video=False)
num_actions = env.action_space.n

population = [NeuralNetwork(num_actions, device).to(device) for _ in range(pop_size)]
behavior_archive = deque(maxlen=500)
training_times = []
all_best_rewards = []

# Compute the behaviour characterisation of each initial individual and archive it
for policy in population:
    behavior_archive.append(behavior_characterization(policy, env, device, seed, max_steps))

best_policy_state = None
best_reward = float('-inf')

for gen in tqdm(range(max_gens)):
    pop_behaviors = []
    pop_rewards = []

    start_time = time.time()

    # Evaluate the population.
    # Argument order must match evaluate_policy(policy, env, device, seed, max_steps).
    for policy in population:
        behavior, reward = evaluate_policy(policy, env, device, seed, max_steps)
        pop_behaviors.append(behavior)
        pop_rewards.append(reward)

    all_best_rewards.append(np.max(pop_rewards))

    # Add this generation's behaviours to the archive
    behavior_archive.extend(pop_behaviors)

    # Score novelty; the last pop_size archive entries are the ones just added
    novelties = compute_novelty(behavior_archive, k=distance_count)
    pop_novelties = novelties[-pop_size:]

    # Keep the elite_count most novel individuals (argsort is ascending)
    sorted_indices = np.argsort(pop_novelties)
    top_indices = sorted_indices[-elite_count:]
    remain_indices = sorted_indices[:-elite_count]
    new_population = [population[i] for i in top_indices]
    # Ascending order puts the most novel individual last, not first.
    best_policy_state = population[top_indices[-1]].state_dict()

    # Parents for mutation: every elite, topped up with randomly drawn non-elite
    # individuals until there are pop_size - elite_count parents. Each draw is
    # independent, so a non-elite parent may be picked more than once. With
    # pop_size = 20 and elite_count = 10 no top-up happens.
    mutant_indices = []
    mutant_indices.extend(top_indices)
    while len(mutant_indices) < pop_size - elite_count:
        mutant_indices.append(np.random.choice(remain_indices, replace=False))

    for idx in mutant_indices:
        parent = population[idx]
        child = NeuralNetwork(num_actions, device).to(device)
        for child_param, parent_param in zip(child.parameters(), parent.parameters()):
            noise = torch.normal(mean=0.0, std=sigma, size=parent_param.shape, device=parent_param.device)
            child_param.data = parent_param.data + noise  # child = parent + Gaussian noise
        new_population.append(child)

    population = new_population
    training_times.append(time.time() - start_time)

    # Periodically save the best individual and the run statistics
    if (gen + 1) % log_frequency == 0:
        torch.save(best_policy_state, os.path.join(save_dir, f"Generation_{gen + 1}.pt"))
        np.savez_compressed(os.path.join(save_dir,f'results.npz'),
                            rewards = all_best_rewards, training_times = training_times)

        # Play one episode with the best individual and record a video
        best_policy = NeuralNetwork(num_actions, device).to(device)
        best_policy.load_state_dict(best_policy_state)
        best_policy.eval()
        total_step, total_reward = play(best_policy, env_id, device, seed, max_steps, video_dir)

# NSR-ES Training

In [None]:
# --- Hyperparameters ---
seed = 21521992
gamma = 0.99
alpha = 0.001
sigma = 0.1                # std of the Gaussian noise added to parameters on mutation

pop_size = 20              # population size
max_gens = 200             # number of generations
num_episodes_per_eval = 1
max_steps = 2000           # upper bound on the number of steps per episode
distance_count = 5         # k (kNN) used when computing novelty

update_frequency = 10
log_frequency = 10         # save a checkpoint and record a video every this many generations

# Weights of the novelty and reward terms in the combined score; they sum to 1.
novelty_weight = 0.5
reward_weight = 1 - novelty_weight

# --- Parameters ---
env_id = 'BreakoutNoFrameskip-v4'
capture_video = True

In [None]:
save_dir = f'/kaggle/working/runs/NSR-ESc/{env_id}__{seed}'
video_dir = f'/kaggle/working/videos/NSR-ESc/{env_id}__{seed}'
os.makedirs(save_dir, exist_ok=True)
os.makedirs(video_dir, exist_ok=True)
video_result_log = video_dir + '/video_log.txt'

# Seed every RNG this cell draws from (network initialisation, mutation noise,
# parent sampling) so that a Restart & Run All reproduces the run.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Number of top-ranked individuals carried over unchanged to the next generation.
elite_count = 10

# Initialisation
env = make_env(env_id, capture_video=False)
num_actions = env.action_space.n

population = [NeuralNetwork(num_actions, device).to(device) for _ in range(pop_size)]
behavior_archive = deque(maxlen=500)
training_times = []
all_best_rewards = []

# Compute the behaviour characterisation of each initial individual and archive it
for policy in population:
    behavior_archive.append(behavior_characterization(policy, env, device, seed, max_steps))

best_policy_state = None
best_reward = float('-inf')

for gen in tqdm(range(max_gens)):
    pop_behaviors = []
    pop_rewards = []

    start_time = time.time()

    # Evaluate the population.
    # Argument order must match evaluate_policy(policy, env, device, seed, max_steps).
    for policy in population:
        behavior, reward = evaluate_policy(policy, env, device, seed, max_steps)
        pop_behaviors.append(behavior)
        pop_rewards.append(reward)

    # Add this generation's behaviours to the archive
    behavior_archive.extend(pop_behaviors)

    all_best_rewards.append(np.max(pop_rewards))

    # Score novelty (the last pop_size archive entries are the ones just added)
    # and combine it with reward into a single weighted score per individual
    novelties = compute_novelty(behavior_archive, k=distance_count)
    pop_novelties = novelties[-pop_size:]
    pop_scores = novelty_weight * np.array(pop_novelties) + reward_weight * np.array(pop_rewards)

    # Keep the elite_count highest-scoring individuals (argsort is ascending)
    sorted_indices = np.argsort(pop_scores)
    top_indices = sorted_indices[-elite_count:]
    remain_indices = sorted_indices[:-elite_count]
    new_population = [population[i] for i in top_indices]
    # Ascending order puts the highest-scoring individual last, not first.
    best_policy_state = population[top_indices[-1]].state_dict()

    # Parents for mutation: every elite, topped up with randomly drawn non-elite
    # individuals until there are pop_size - elite_count parents. Each draw is
    # independent, so a non-elite parent may be picked more than once. With
    # pop_size = 20 and elite_count = 10 no top-up happens.
    mutant_indices = []
    mutant_indices.extend(top_indices)
    while len(mutant_indices) < pop_size - elite_count:
        mutant_indices.append(np.random.choice(remain_indices, replace=False))

    for idx in mutant_indices:
        parent = population[idx]
        child = NeuralNetwork(num_actions, device).to(device)
        for child_param, parent_param in zip(child.parameters(), parent.parameters()):
            noise = torch.normal(mean=0.0, std=sigma, size=parent_param.shape, device=parent_param.device)
            child_param.data = parent_param.data + noise  # child = parent + Gaussian noise
        new_population.append(child)

    population = new_population
    training_times.append(time.time() - start_time)

    # Periodically save the best individual and the run statistics
    if (gen + 1) % log_frequency == 0:
        torch.save(best_policy_state, os.path.join(save_dir, f"Generation_{gen + 1}.pt"))
        np.savez_compressed(os.path.join(save_dir,f'results.npz'),
                            rewards = all_best_rewards, training_times = training_times)

        # Play one episode with the best individual and record a video
        best_policy = NeuralNetwork(num_actions, device).to(device)
        best_policy.load_state_dict(best_policy_state)
        best_policy.eval()
        total_step, total_reward = play(best_policy, env_id, device, seed, max_steps, video_dir)