In [4]:
%load_ext autoreload
%autoreload 2
# Prepare the environment: make sure `easypip` is importable, then use its
# `easyimport` helper to bring in the remaining dependencies.
try:
    from easypip import easyimport
except ModuleNotFoundError:
    import sys
    from subprocess import run

    # Invoke pip through the interpreter that runs this kernel, so the package
    # is installed into the kernel's own environment rather than into
    # whichever `pip` executable happens to come first on PATH.
    assert (
        run([sys.executable, "-m", "pip", "install", "easypip"]).returncode == 0
    ), "Could not install easypip"
    from easypip import easyimport

# NOTE(review): `easyimport` presumably installs the named package when it is
# missing and returns the imported module -- confirm against the easypip docs.
easyimport("swig")
# The object returned for "bbrl_utils" exposes `setup`; it is called here with
# the maze MDP support switched off.
easyimport("bbrl_utils").setup(maze_mdp=False)

import os
import copy
import numpy as np
import gymnasium as gym
import math
import bbrl_gymnasium  # noqa: F401
import torch
import torch.nn as nn
from bbrl.agents import Agent, Agents, TemporalAgent
from bbrl_utils.algorithms import EpochBasedAlgo
from bbrl_utils.nn import build_mlp, setup_optimizer, soft_update_params
from bbrl_utils.notebook import setup_tensorboard
from bbrl.visu.plot_policies import plot_policy
from omegaconf import OmegaConf
from td3_ddpg import DDPG, run_ddpg, TD3, run_td3
import utils
import matplotlib.pyplot as plt
from wrappers import FeatureFilterWrapper, ObsTimeExtensionWrapper, ActionTimeExtensionWrapper

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Experiment configuration. It is turned into an OmegaConf object before being
# handed to the algorithm, so the `${...}` placeholders in `base_dir` are
# OmegaConf interpolations, not Python format fields.
params = dict(
    save_best=False,
    base_dir="${gym_env.env_name}/ddpg-S${algorithm.seed}_${current_time:}",
    collect_stats=True,
    # Enable to get an insight into the learned policy
    # (at the price of a much slower evaluation).
    plot_agents=True,
    algorithm=dict(
        seed=1,
        max_grad_norm=0.5,
        epsilon=0.02,
        n_envs=1,
        n_steps=100,
        nb_evals=10,
        discount_factor=0.8,
        buffer_size=1e6,
        batch_size=64,
        tau_target=0.05,
        eval_interval=2_000,
        max_epochs=6_000,
        # Minimum number of transitions collected before learning starts.
        learning_starts=10_000,
        action_noise=0.1,
        # Hidden layer sizes of the actor and critic networks.
        architecture=dict(
            actor_hidden_size=[400, 300],
            critic_hidden_size=[400, 300],
        ),
    ),
    gym_env=dict(
        env_name="CartPoleContinuous-v1",
    ),
    actor_optimizer=dict(
        classname="torch.optim.Adam",
        lr=1e-3,
        # eps=5e-5,  (disabled)
    ),
    critic_optimizer=dict(
        classname="torch.optim.Adam",
        lr=1e-3,
        # eps=5e-5,  (disabled)
    ),
)

In [6]:
def feature_filter_wrapper_1(env):
    """Wrap ``env`` in a ``FeatureFilterWrapper`` configured with feature index 1.

    NOTE(review): presumably this hides the observation component stored at
    index 1 from the agent -- confirm against ``wrappers.FeatureFilterWrapper``.
    """
    filtered_env = FeatureFilterWrapper(env, 1)
    return filtered_env

def feature_filter_wrapper_3(env):
    """Wrap ``env`` in a ``FeatureFilterWrapper`` configured with feature index 3.

    NOTE(review): presumably this hides the observation component stored at
    index 3 from the agent -- confirm against ``wrappers.FeatureFilterWrapper``.
    """
    filtered_env = FeatureFilterWrapper(env, 3)
    return filtered_env

def feature_filter_wrapper_both(env):
    """Stack two ``FeatureFilterWrapper`` layers: index 3 inside, index 1 outside.

    NOTE(review): assuming each layer drops its feature from the observation
    vector, filtering the higher index first means neither index is shifted by
    the other removal -- confirm against ``wrappers.FeatureFilterWrapper``.
    """
    without_index_3 = FeatureFilterWrapper(env, 3)
    without_both = FeatureFilterWrapper(without_index_3, 1)
    return without_both


# Wrapper factories combining ObsTimeExtensionWrapper and/or ActionTimeExtensionWrapper with feature filtering
def obs_time_extension_wrapper_dx(env):
    """Filter feature index 1 out of ``env``, then add ``ObsTimeExtensionWrapper``.

    The filtering is delegated to ``feature_filter_wrapper_1``; the observation
    time extension is the outermost layer.
    NOTE(review): the ``dx`` suffix suggests index 1 is a velocity component of
    the observation -- confirm against the environment's observation layout.
    """
    filtered_env = feature_filter_wrapper_1(env)
    return ObsTimeExtensionWrapper(filtered_env)

def obs_time_extension_wrapper_dtheta(env):
    """Filter feature index 3 out of ``env``, then add ``ObsTimeExtensionWrapper``.

    The filtering is delegated to ``feature_filter_wrapper_3``; the observation
    time extension is the outermost layer.
    NOTE(review): the ``dtheta`` suffix suggests index 3 is an angular velocity
    component -- confirm against the environment's observation layout.
    """
    filtered_env = feature_filter_wrapper_3(env)
    return ObsTimeExtensionWrapper(filtered_env)

def obs_time_extension_wrapper_both(env):
    """Filter feature indices 3 and 1 out of ``env``, then add ``ObsTimeExtensionWrapper``.

    The filtering is delegated to ``feature_filter_wrapper_both``; the
    observation time extension is the outermost layer.
    """
    filtered_env = feature_filter_wrapper_both(env)
    return ObsTimeExtensionWrapper(filtered_env)

def action_time_extension_wrapper_dx(env):
    """Filter feature index 1 out of ``env``, then add ``ActionTimeExtensionWrapper``.

    The filtering is delegated to ``feature_filter_wrapper_1``; the action time
    extension is the outermost layer.
    """
    filtered_env = feature_filter_wrapper_1(env)
    return ActionTimeExtensionWrapper(filtered_env)

def action_time_extension_wrapper_dtheta(env):
    """Filter feature index 3 out of ``env``, then add ``ActionTimeExtensionWrapper``.

    The filtering is delegated to ``feature_filter_wrapper_3``; the action time
    extension is the outermost layer.
    """
    filtered_env = feature_filter_wrapper_3(env)
    return ActionTimeExtensionWrapper(filtered_env)

def action_time_extension_wrapper_both(env):
    """Filter feature indices 3 and 1 out of ``env``, then add ``ActionTimeExtensionWrapper``.

    The filtering is delegated to ``feature_filter_wrapper_both``; the action
    time extension is the outermost layer.
    """
    filtered_env = feature_filter_wrapper_both(env)
    return ActionTimeExtensionWrapper(filtered_env)

def full_extension_wrapper_dx(env):
    """Filter feature index 1 out of ``env`` and add both time-extension wrappers.

    Layers, from innermost to outermost: ``feature_filter_wrapper_1``,
    ``ActionTimeExtensionWrapper``, ``ObsTimeExtensionWrapper``.
    """
    filtered_env = feature_filter_wrapper_1(env)
    action_extended_env = ActionTimeExtensionWrapper(filtered_env)
    return ObsTimeExtensionWrapper(action_extended_env)

def full_extension_wrapper_dtheta(env):
    """Filter feature index 3 out of ``env`` and add both time-extension wrappers.

    Layers, from innermost to outermost: ``feature_filter_wrapper_3``,
    ``ActionTimeExtensionWrapper``, ``ObsTimeExtensionWrapper``.
    """
    filtered_env = feature_filter_wrapper_3(env)
    action_extended_env = ActionTimeExtensionWrapper(filtered_env)
    return ObsTimeExtensionWrapper(action_extended_env)

def full_extension_wrapper_both(env):
    """Filter feature indices 3 and 1 out of ``env`` and add both time-extension wrappers.

    Layers, from innermost to outermost: ``feature_filter_wrapper_both``,
    ``ActionTimeExtensionWrapper``, ``ObsTimeExtensionWrapper``.
    """
    filtered_env = feature_filter_wrapper_both(env)
    action_extended_env = ActionTimeExtensionWrapper(filtered_env)
    return ObsTimeExtensionWrapper(action_extended_env)


# TESTS

In [None]:
# Single DDPG test run with one wrapper configuration.
results_test = {}

# Wrapper factory under test; swap in any other factory defined above to
# evaluate a different configuration.
fonction_wrappers_ici = obs_time_extension_wrapper_both

# DDPG receives the OmegaConf version of `params` and a list of wrapper
# factories (a single one here).
ddpg_instance = DDPG(
    OmegaConf.create(params),
    [fonction_wrappers_ici],
)

# NOTE(review): `running_rewwards` is misspelled, but the plotting cell reads
# the series under this exact name, so the spelling is kept.
(
    critic_losses,
    actor_losses,
    rewards_per_step,
    steps,
    best_rewards,
    running_rewwards,
) = run_ddpg(ddpg_instance)

Matplotlib backend: module://matplotlib_inline.backend_inline


  0%|          | 0/6000 [00:00<?, ?it/s]

In [None]:
def plot_learning_curves(steps, curves, suptitle):
    """Draw up to four learning curves on a 2x2 grid of panels.

    Args:
        steps: x values shared by every curve.
        curves: iterable of ``(series, title, ylabel, legend_label)`` tuples;
            panels are filled in row-major order.
        suptitle: title displayed above the whole figure.

    Returns:
        The matplotlib figure that was created.
    """
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    fig.suptitle(suptitle, fontsize=16)

    for ax, (series, title, ylabel, legend_label) in zip(axes.flat, curves):
        ax.plot(steps, series, label=legend_label)
        ax.set_xlabel("Steps")
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.legend()

    # Keep panel titles, axis labels and the figure title from overlapping.
    fig.tight_layout()
    return fig


plot_learning_curves(
    steps,
    [
        # Critic loss
        (critic_losses, "Critic Losses", "Loss", "Critic Losses test"),
        # Actor loss
        (actor_losses, "Actor Losses", "Loss", "Actor Losses test"),
        # Reward per step
        (rewards_per_step, "Rewards per Step", "Reward", "Rewards per Step test"),
        # This panel plots the running rewards (not `best_rewards`), so it is
        # captioned as running rewards.
        (running_rewwards, "Running Rewards", "Running Reward", "Running Rewards test"),
    ],
    "Learning Curves for test",
)
# Render the figure explicitly so the cell does not end on an object repr.
plt.show()