In [None]:
import math
from collections.abc import Sequence

import torch.nn as nn
from tqdm.notebook import tqdm_notebook as tqdm
# from tqdm import tqdm
import gym_agent as ga

In [None]:
# Shared run configuration used by the training cells below.
total_timesteps = 200000  # training budget handed to agent.fit
max_episode_steps = 500  # forwarded to the env factory through env_kwargs
# NOTE(review): the directory name says "LunarLander-v2" while env_id below is
# 'LunarLander-v3' — confirm the mismatch is intentional before renaming.
chkpt_dir = "checkpoints/LunarLander-v2/DQN"

env_id = 'LunarLander-v3'

# DQN

In [None]:
class DeepQNetwork(nn.Module):
    """Multi-layer perceptron: ``n_inp`` -> hidden ``features`` -> ``n_out``.

    Every hidden ``nn.Linear`` is followed by ``nn.ReLU``; the last linear
    layer is left without an activation.

    Args:
        n_inp: Size of the input vector.
        n_out: Size of the output vector.
        features: Hidden layer widths, in order. Any sequence of ints is
            accepted (list, tuple, ...). An empty sequence produces a single
            linear layer from ``n_inp`` to ``n_out``.
    """

    def __init__(self, n_inp: int, n_out: int, features: Sequence[int] = (64, 64)):
        super().__init__()

        # Unpack into a fresh list: works for tuples as well as lists and
        # never aliases the caller's (or the default) object.
        sizes = [n_inp, *features, n_out]
        output_idx = len(sizes) - 2  # index of the final linear layer

        self.net = nn.Sequential()

        for idx, (fan_in, fan_out) in enumerate(zip(sizes, sizes[1:])):
            self.net.append(nn.Linear(fan_in, fan_out))

            # No activation after the output layer.
            if idx < output_idx:
                self.net.append(nn.ReLU())

    def forward(self, X):
        """Run ``X`` through the layer stack and return the raw output."""
        return self.net(X)

In [None]:
def net_factory(n_inp: int = 8, n_out: int = 4, features: tuple[int, ...] = (256, 256)):
    """Build a fresh ``DeepQNetwork``.

    Called with no arguments it builds the same network as before: 8 inputs,
    4 outputs and two hidden layers of 256 units (presumably sized for the
    LunarLander observation/action spaces — confirm if the env changes).

    Args:
        n_inp: Input size of the network.
        n_out: Output size of the network.
        features: Hidden layer widths, in order.

    Returns:
        A newly constructed ``DeepQNetwork``.
    """
    return DeepQNetwork(
        n_inp=n_inp,
        n_out=n_out,
        # DeepQNetwork is handed a list, as in the original call.
        features=list(features),
    )

In [None]:
# DQN policy: networks are produced by `net_factory`, learning rate 1e-3.
policy = ga.core.TargetPolicy(lr=1e-3, net_factory_func=net_factory)

In [None]:
# DQN agent: environments are created by ga.make with the shared env id and
# episode step limit.
agent = ga.core.DQN(
    policy=policy,
    env_factory_fn=ga.make,
    env_kwargs=dict(id=env_id, max_episode_steps=max_episode_steps),
    eps_decay_steps=100000,
)

In [None]:
# Use the shared `total_timesteps` setting instead of repeating the literal
# 200000, so the DQN and A2C runs are driven by the same configuration value.
agent.fit(total_timesteps=total_timesteps, save_best=True, save_dir=chkpt_dir, progress_bar=tqdm)

In [None]:
agent.load(chkpt_dir, "best")

In [None]:
agent.play(jupyter=True)

In [None]:
agent.plot_scores(rolling_window=100)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_rolling_stats(scores, window):
    """
    Plot rolling min, mean, and max of a sequence of scores,
    with a shaded region between min and max.

    Parameters:
    - scores: list, tuple, NumPy array or pandas Series of numeric values
    - window: int, rolling window size

    Raises:
    - ValueError: if `scores` is empty or `window` is smaller than 1.
    """
    # len() rather than truthiness: `not scores` is ambiguous (or an error)
    # for NumPy arrays and pandas Series.
    if len(scores) == 0:
        raise ValueError("Scores list cannot be empty.")
    if window < 1:
        raise ValueError("Window size must be at least 1.")

    # Create pandas Series
    s = pd.Series(scores)

    # Compute rolling statistics; with pandas' default min_periods the first
    # `window - 1` entries are NaN and are simply not drawn.
    rolling = s.rolling(window)
    rolling_max = rolling.max()
    rolling_mean = rolling.mean()
    rolling_min = rolling.min()

    # Plot on an explicit Axes instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(10, 6))

    # Original scores
    ax.plot(s, label="Original Scores", color="gray", alpha=0.5)

    # Shaded region between rolling min and max
    ax.fill_between(s.index, rolling_min, rolling_max,
                    color="skyblue", alpha=0.3, label="Min–Max Range")

    # Mean line
    ax.plot(rolling_mean, label=f"Rolling Mean (window={window})", color="blue", linewidth=2)

    ax.set_title("Rolling Statistics of Scores")
    ax.set_xlabel("Index")
    ax.set_ylabel("Score")
    ax.legend()
    ax.grid(True)
    plt.show()


In [None]:
scores = [10, 12, 15, 20, 18, 22, 19, 25, 30, 28, 26, 24]
plot_rolling_stats(scores, window=3)


# A2C

In [None]:
class Feature_Extractor(nn.Module):
    """MLP trunk: ``n_inp`` -> ``features``, with ReLU after every linear layer.

    Args:
        n_inp: Size of the input vector.
        features: Layer widths, in order. Any sequence of ints is accepted
            (list, tuple, ...). The output width is the last entry.
    """

    def __init__(self, n_inp: int, features: Sequence[int] = (256, 256)):
        super().__init__()

        # Unpack into a fresh list: works for tuples as well as lists and
        # never aliases the caller's (or the default) object.
        sizes = [n_inp, *features]

        self.net = nn.Sequential()

        for fan_in, fan_out in zip(sizes, sizes[1:]):
            self.net.append(nn.Linear(fan_in, fan_out))
            self.net.append(nn.ReLU())

    def forward(self, X):
        """Run ``X`` through the layer stack and return the extracted features."""
        return self.net(X)

In [None]:
def feature_extractor_factory():
    """Return a new ``Feature_Extractor`` for 8 inputs with two 256-unit layers."""
    hidden_widths = [256, 256]
    extractor = Feature_Extractor(n_inp=8, features=hidden_widths)
    return extractor

def actor_factory():
    """Build the actor network: a feature trunk followed by a 4-output linear head.

    The trunk comes from ``feature_extractor_factory`` so its architecture is
    defined in one place; each call still creates its own trunk instance, so
    no weights are shared with other networks.

    Returns:
        ``nn.Sequential`` of the trunk and ``nn.Linear(256, 4)``
        (presumably one output per action — confirm against the env).
    """
    return nn.Sequential(
        feature_extractor_factory(),
        # Input width must match the trunk's last layer width (256).
        nn.Linear(256, 4),
    )

def critic_factory():
    """Build the critic network: a feature trunk followed by a single-output linear head.

    The trunk comes from ``feature_extractor_factory`` so its architecture is
    defined in one place; each call still creates its own trunk instance, so
    no weights are shared with other networks.

    Returns:
        ``nn.Sequential`` of the trunk and ``nn.Linear(256, 1)``.
    """
    return nn.Sequential(
        feature_extractor_factory(),
        # Input width must match the trunk's last layer width (256).
        nn.Linear(256, 1),
    )

In [None]:
# Actor-critic policy built from the two factories above; only the actor
# learning rate is overridden.
# The `feature_extractor_func=feature_extractor_factory` option is left disabled.
policy = ga.core.ActorCriticPolicy(
    actor_lr=0.00083,
    critic_factory_func=critic_factory,
    actor_factory_func=actor_factory,
)

In [None]:
# A2C agent on 4 environments created by ga.make.
# Options left at their library defaults (previously tried values):
#   n_steps=8, entropy_coef=0.001, gamma=0.995
agent = ga.core.A2C(
    policy=policy,
    env_factory_fn=ga.make,
    env_kwargs=dict(id=env_id, max_episode_steps=max_episode_steps),
    num_envs=4,
    gae_lambda=0.95,
)

In [None]:
# NOTE(review): save_dir is hard-coded to a "LunarLander-v2" folder although
# env_id is 'LunarLander-v3' — confirm the directory name is intentional.
agent.fit(total_timesteps=total_timesteps, save_best=True, save_dir="checkpoints/LunarLander-v2/A2C", progress_bar=tqdm)

In [None]:
agent.plot_scores()

In [None]:
max(agent.mean_scores)

In [None]:
import torch
import numpy as np
from stable_baselines3.common.buffers import RolloutBuffer

In [None]:
buffer = RolloutBuffer(buffer_size=5, observation_space=agent.envs.single_observation_space, action_space=agent.envs.single_action_space, device=agent.device, n_envs=4)

In [None]:
indice = np.arange(0, 20, 5)

In [None]:
agent.memory.observations[indice].shape

In [None]:
agent.memory.episode_starts.shape

In [None]:
# Copy five steps of data from agent.memory into the SB3 RolloutBuffer, one
# buffer.add() call per step.
# `indice` is [0, 5, 10, 15], so `indice + i` selects four entries spaced 5
# apart — assumes agent.memory stores each env's 5 steps contiguously in a
# flat array; TODO confirm against the memory implementation.
for i in range(5):
    obs = agent.memory.observations[indice + i]
    actions = agent.memory.actions[indice + i].flatten()
    rewards = agent.memory.rewards[indice + i].flatten()
    # NOTE(review): indexed with `[i]`, unlike every other field here, which
    # uses `[indice + i]` — confirm episode_starts really has a different layout.
    eps_starts = agent.memory.episode_starts[i].flatten()
    # values and log_probs are converted to torch tensors before being added.
    values = torch.from_numpy(agent.memory.values[indice + i].flatten())
    log_probs = torch.from_numpy(agent.memory.log_probs[indice + i].flatten())

    buffer.add(
        obs,
        actions,
        rewards,
        eps_starts,
        values,
        log_probs
    )


In [None]:
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv

# Train a Stable-Baselines3 A2C baseline, round-trip it through disk, then
# render a bounded rollout.

# One environment running in a subprocess (n_envs=1)
vec_env = make_vec_env("LunarLander-v3", n_envs=1, vec_env_cls=SubprocVecEnv)

model = A2C("MlpPolicy", vec_env, verbose=1)
model.learn(total_timesteps=100000)
model.save("temp")

del model # remove to demonstrate saving and loading

model = A2C.load("temp")

obs = vec_env.reset()
# Bounded instead of `while True`: the original loop never terminated, so the
# cell (and every cell after it) could not complete under "Run All".
for _ in range(1_000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render("human")

In [None]:
model.rollout_buffer.observations.shape

In [None]:
model.policy

In [None]:
# Reload the saved A2C model and render a bounded rollout.
model = A2C.load("temp")

obs = vec_env.reset()
# Bounded instead of `while True`: the original loop never terminated, so the
# cell (and every cell after it) could not complete under "Run All".
for _ in range(1_000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render("human")

In [None]:
vec_env.action_space

In [None]:
import numpy as np

In [None]:
a = np.stack([[1, 2, 3], [4, 5, 6]])
a

In [None]:
from gym_agent.core.vec_env.subproc_vec_env import SubprocVecEnv
import gym_agent as ga

from stable_baselines3 import A2C

In [None]:
env = SubprocVecEnv([lambda: ga.make(id="LunarLander-v3", max_episode_steps=20, render_mode="rgb_array") for _ in range(4)])

In [None]:
# Step the vectorised env with random actions and render it.
# The original `while not done` loop never updated `done`, so it could not
# exit; a fixed step budget keeps the cell finite.
obs, _ = env.reset()
for _step in range(100):
    obs, *_ = env.step(env.action_space.sample())

    env.render("human")

In [None]:
from gymnasium import Env

In [None]:
import numpy as np

In [None]:
scores = []
current_scores = np.array([1.0, 2.2, 3.0, 4.5])
dones = np.array([False, True, False, True])
rewards = np.array([0.5, 1.0, 0.2, 0.3])

In [None]:
current_scores += rewards
current_scores

In [None]:
scores.extend(current_scores[dones].tolist())

In [None]:
current_scores[dones] = 0.0
current_scores

In [None]:
scores , current_scores

In [None]:
# Number of multiplicative decay steps needed for epsilon to fall from
# eps_start to eps_end (or below).
eps_start = 1.0
eps_end = 0.01
eps_decay = 0.9999998

# Closed form of "multiply by eps_decay until eps <= eps_end":
#   eps_start * eps_decay**i <= eps_end
#   <=>  i >= log(eps_end / eps_start) / log(eps_decay)
# This replaces a Python loop of roughly 23 million iterations.
# max(0, ...) covers eps_start <= eps_end, where no step is needed.
i = max(0, math.ceil(math.log(eps_end / eps_start) / math.log(eps_decay)))

# Epsilon after those i steps, as the loop would have left it.
eps = eps_start * eps_decay ** i

print(i)

In [None]:
# Per-step multiplicative factor d such that eps_start * d**time_step == eps_end.
# NOTE(review): `time_step` is not assigned in any visible cell; on a fresh
# kernel this raises NameError — define it before running this cell.
(eps_end/eps_start) ** (1/time_step)

In [None]:
import numpy as np

In [None]:
a = np.ma.array([[2, 1],[0, 0],[3, 4]], mask=[[0, 0],[1,1],[0,0]], dtype=int)

In [None]:
np.sort(a)

In [None]:
a.dtype

In [None]:
b = {"a": 3, "b": 4}
c = {"a": None, "b": 3}

In [None]:
# NOTE(review): the most recent visible binding of `a` is the masked array
# built with np.ma.array, which has no `.update` method — presumably a dict
# was intended as the target here; confirm and rebind before running.
a.update(b)
a

In [None]:
import numpy as np

In [None]:
np.empty_like("3")

In [None]:
np.empty_like((4, 2))

In [None]:
import torch

In [None]:
m = torch.tensor([-6, 0, 3])

In [None]:
sigmoid_m = torch.sigmoid(m)

In [None]:
# PyTorch has no `torch.stop_gradient` (that name comes from TensorFlow/JAX);
# `Tensor.detach()` returns a tensor that is cut off from the autograd graph.
# Also uses `sigmoid_m`, the name actually defined above (`sigma_m` was a typo).
sigmoid_m.detach()