In [1]:
import gymnasium as gym
import numpy as np

from stable_baselines3 import DDPG
from stable_baselines3 import SAC
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
import importlib

import math
import matplotlib.pyplot as plt

In [None]:
# Train SAC on the baseline pendulum environment and save the trained model.
from envs.pendulum import PendulumEnv as baseline

# NOTE(review): render_mode='human' presumably renders every training step,
# which would throttle training; confirm whether the env accepts
# render_mode=None and use that for headless runs.
env = baseline(render_mode='human')

# The exploration noise needs one entry per action dimension.
n_actions = env.action_space.shape[-1]
print(env.action_space)
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# Fixed seed so that Restart & Run All repeats the same training run (as far
# as the hardware allows); without it every execution yields different results.
model = SAC("MlpPolicy", env, action_noise=action_noise, verbose=1, seed=0)
model.learn(total_timesteps=10000, log_interval=10)
model.save("./saved_models/sac_baseline")
# From here on `env` refers to the environment held by the model (after the
# library's own wrapping), not the raw PendulumEnv created above.
env = model.get_env()

In [3]:
# Train DDPG on the baseline pendulum environment and save the trained model.
from envs.pendulum import PendulumEnv as baseline

# NOTE(review): the recorded run logged only ~27 fps; rendering every step with
# render_mode='human' is a likely cause. Confirm whether the env accepts
# render_mode=None and use that for headless training.
env = baseline(render_mode='human')

# The exploration noise needs one entry per action dimension.
n_actions = env.action_space.shape[-1]
print(env.action_space)
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# Fixed seed so that Restart & Run All repeats the same training run (as far
# as the hardware allows); without it every execution yields different results.
model = DDPG("MlpPolicy", env, action_noise=action_noise, verbose=1, seed=0)
model.learn(total_timesteps=10000, log_interval=10)
model.save("./saved_models/ddpg_baseline")
# From here on `env` refers to the Monitor/DummyVecEnv-wrapped environment held
# by the model, not the raw PendulumEnv created above.
env = model.get_env()

Box(-1.0, 1.0, (1,), float32)
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 32       |
|    ep_rew_mean     | -23.6    |
| time/              |          |
|    episodes        | 10       |
|    fps             | 27       |
|    time_elapsed    | 11       |
|    total_timesteps | 320      |
| train/             |          |
|    actor_loss      | 1.21     |
|    critic_loss     | 0.0743   |
|    learning_rate   | 0.001    |
|    n_updates       | 204      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 25.4     |
|    ep_rew_mean     | -24.6    |
| time/              |          |
|    episodes        | 20       |
|    fps             | 27       |
|    time_elapsed    | 18       |
|    total_timesteps | 507      |
| train/             |          |
|    actor_loss      | 2.

In [4]:
# Train SAC on the fully-randomised pendulum variant and save the trained model.
from envs.pendulum_fullRandom import PendulumEnv as full_random

# NOTE(review): the recorded run logged only ~28 fps; rendering every step with
# render_mode='human' is a likely cause. Confirm whether the env accepts
# render_mode=None and use that for headless training.
env = full_random(render_mode='human')

# The exploration noise needs one entry per action dimension.
n_actions = env.action_space.shape[-1]
print(env.action_space)
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# Fixed seed so that Restart & Run All repeats the same training run (as far
# as the hardware allows); without it every execution yields different results.
model = SAC("MlpPolicy", env, action_noise=action_noise, verbose=1, seed=0)
model.learn(total_timesteps=10000, log_interval=10)
model.save("./saved_models/sac_fullRandom")
# From here on `env` refers to the Monitor/DummyVecEnv-wrapped environment held
# by the model, not the raw PendulumEnv created above.
env = model.get_env()

Box(-1.0, 1.0, (1,), float32)
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 25.6     |
|    ep_rew_mean     | -27      |
| time/              |          |
|    episodes        | 10       |
|    fps             | 28       |
|    time_elapsed    | 8        |
|    total_timesteps | 256      |
| train/             |          |
|    actor_loss      | 0.0893   |
|    critic_loss     | 0.0382   |
|    ent_coef        | 0.955    |
|    ent_coef_loss   | -0.0777  |
|    learning_rate   | 0.0003   |
|    n_updates       | 155      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 24.6     |
|    ep_rew_mean     | -27.3    |
| time/              |          |
|    episodes        | 20       |
|    fps             | 28       |
|    time_elapsed    | 17       |
|    total_timesteps | 49

In [5]:
# Train DDPG on the fully-randomised pendulum variant and save the trained model.
from envs.pendulum_fullRandom import PendulumEnv as full_random

# NOTE(review): the recorded run logged only ~27 fps; rendering every step with
# render_mode='human' is a likely cause. Confirm whether the env accepts
# render_mode=None and use that for headless training.
env = full_random(render_mode='human')

# The exploration noise needs one entry per action dimension.
n_actions = env.action_space.shape[-1]
print(env.action_space)
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# Fixed seed so that Restart & Run All repeats the same training run (as far
# as the hardware allows); without it every execution yields different results.
model = DDPG("MlpPolicy", env, action_noise=action_noise, verbose=1, seed=0)
model.learn(total_timesteps=10000, log_interval=10)
model.save("./saved_models/ddpg_fullRandom")
# From here on `env` refers to the Monitor/DummyVecEnv-wrapped environment held
# by the model, not the raw PendulumEnv created above.
env = model.get_env()

Box(-1.0, 1.0, (1,), float32)
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 63.4     |
|    ep_rew_mean     | -36.8    |
| time/              |          |
|    episodes        | 10       |
|    fps             | 27       |
|    time_elapsed    | 23       |
|    total_timesteps | 634      |
| train/             |          |
|    actor_loss      | 0.952    |
|    critic_loss     | 0.115    |
|    learning_rate   | 0.001    |
|    n_updates       | 533      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 312      |
|    ep_rew_mean     | -32.3    |
| time/              |          |
|    episodes        | 20       |
|    fps             | 27       |
|    time_elapsed    | 225      |
|    total_timesteps | 6231     |
| train/             |          |
|    actor_loss      | 0.

In [6]:
# Train SAC on the piecewise-reward pendulum variant and save the trained model.
from envs.pendulum_piecewiseReward import PendulumEnv as piecewise

# NOTE(review): the recorded run logged only ~28 fps; rendering every step with
# render_mode='human' is a likely cause. Confirm whether the env accepts
# render_mode=None and use that for headless training.
env = piecewise(render_mode='human')

# The exploration noise needs one entry per action dimension.
n_actions = env.action_space.shape[-1]
print(env.action_space)
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# Fixed seed so that Restart & Run All repeats the same training run (as far
# as the hardware allows); without it every execution yields different results.
model = SAC("MlpPolicy", env, action_noise=action_noise, verbose=1, seed=0)
model.learn(total_timesteps=10000, log_interval=10)
model.save("./saved_models/sac_piecewiseReward")
# From here on `env` refers to the Monitor/DummyVecEnv-wrapped environment held
# by the model, not the raw PendulumEnv created above.
env = model.get_env()

Box(-1.0, 1.0, (1,), float32)
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 20.6     |
|    ep_rew_mean     | -16.9    |
| time/              |          |
|    episodes        | 10       |
|    fps             | 28       |
|    time_elapsed    | 7        |
|    total_timesteps | 206      |
| train/             |          |
|    actor_loss      | -0.235   |
|    critic_loss     | 0.313    |
|    ent_coef        | 0.969    |
|    ent_coef_loss   | -0.0522  |
|    learning_rate   | 0.0003   |
|    n_updates       | 105      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.9     |
|    ep_rew_mean     | -18.8    |
| time/              |          |
|    episodes        | 20       |
|    fps             | 28       |
|    time_elapsed    | 15       |
|    total_timesteps | 45

In [7]:
# Train DDPG on the piecewise-reward pendulum variant and save the trained model.
from envs.pendulum_piecewiseReward import PendulumEnv as piecewise

# NOTE(review): the recorded run logged only ~26-27 fps; rendering every step
# with render_mode='human' is a likely cause. Confirm whether the env accepts
# render_mode=None and use that for headless training.
env = piecewise(render_mode='human')

# The exploration noise needs one entry per action dimension.
n_actions = env.action_space.shape[-1]
print(env.action_space)
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# Fixed seed so that Restart & Run All repeats the same training run (as far
# as the hardware allows); without it every execution yields different results.
model = DDPG("MlpPolicy", env, action_noise=action_noise, verbose=1, seed=0)
model.learn(total_timesteps=10000, log_interval=10)
model.save("./saved_models/ddpg_piecewiseReward")
# From here on `env` refers to the Monitor/DummyVecEnv-wrapped environment held
# by the model, not the raw PendulumEnv created above.
env = model.get_env()

Box(-1.0, 1.0, (1,), float32)
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.4     |
|    ep_rew_mean     | -16.7    |
| time/              |          |
|    episodes        | 10       |
|    fps             | 26       |
|    time_elapsed    | 8        |
|    total_timesteps | 224      |
| train/             |          |
|    actor_loss      | 0.993    |
|    critic_loss     | 0.238    |
|    learning_rate   | 0.001    |
|    n_updates       | 122      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.9     |
|    ep_rew_mean     | -15.5    |
| time/              |          |
|    episodes        | 20       |
|    fps             | 27       |
|    time_elapsed    | 16       |
|    total_timesteps | 437      |
| train/             |          |
|    actor_loss      | 1.

In [8]:
# Train SAC on the deflection-penalising pendulum variant and save the trained model.
from envs.pendulum_penalizeDeflection import PendulumEnv as penalize

# NOTE(review): the recorded run logged only ~28 fps; rendering every step with
# render_mode='human' is a likely cause. Confirm whether the env accepts
# render_mode=None and use that for headless training.
env = penalize(render_mode='human')

# The exploration noise needs one entry per action dimension.
n_actions = env.action_space.shape[-1]
print(env.action_space)
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# Fixed seed so that Restart & Run All repeats the same training run (as far
# as the hardware allows); without it every execution yields different results.
model = SAC("MlpPolicy", env, action_noise=action_noise, verbose=1, seed=0)
model.learn(total_timesteps=10000, log_interval=10)
model.save("./saved_models/sac_penalizeDeflection")
# From here on `env` refers to the Monitor/DummyVecEnv-wrapped environment held
# by the model, not the raw PendulumEnv created above.
env = model.get_env()

Box(-1.0, 1.0, (1,), float32)
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.6     |
|    ep_rew_mean     | -17.1    |
| time/              |          |
|    episodes        | 10       |
|    fps             | 28       |
|    time_elapsed    | 8        |
|    total_timesteps | 226      |
| train/             |          |
|    actor_loss      | -0.202   |
|    critic_loss     | 0.222    |
|    ent_coef        | 0.963    |
|    ent_coef_loss   | -0.0627  |
|    learning_rate   | 0.0003   |
|    n_updates       | 125      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.9     |
|    ep_rew_mean     | -15.4    |
| time/              |          |
|    episodes        | 20       |
|    fps             | 28       |
|    time_elapsed    | 15       |
|    total_timesteps | 43

In [9]:
# Train DDPG on the deflection-penalising pendulum variant and save the trained model.
from envs.pendulum_penalizeDeflection import PendulumEnv as penalize

# NOTE(review): the recorded run logged only ~27 fps; rendering every step with
# render_mode='human' is a likely cause. Confirm whether the env accepts
# render_mode=None and use that for headless training.
env = penalize(render_mode='human')

# The exploration noise needs one entry per action dimension.
n_actions = env.action_space.shape[-1]
print(env.action_space)
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# Fixed seed so that Restart & Run All repeats the same training run (as far
# as the hardware allows); without it every execution yields different results.
model = DDPG("MlpPolicy", env, action_noise=action_noise, verbose=1, seed=0)
model.learn(total_timesteps=10000, log_interval=10)
model.save("./saved_models/ddpg_penalizeDeflection")
# From here on `env` refers to the Monitor/DummyVecEnv-wrapped environment held
# by the model, not the raw PendulumEnv created above.
env = model.get_env()

Box(-1.0, 1.0, (1,), float32)
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 34.3     |
|    ep_rew_mean     | -14.8    |
| time/              |          |
|    episodes        | 10       |
|    fps             | 27       |
|    time_elapsed    | 12       |
|    total_timesteps | 343      |
| train/             |          |
|    actor_loss      | 0.728    |
|    critic_loss     | 0.207    |
|    learning_rate   | 0.001    |
|    n_updates       | 211      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 129      |
|    ep_rew_mean     | -30      |
| time/              |          |
|    episodes        | 20       |
|    fps             | 27       |
|    time_elapsed    | 95       |
|    total_timesteps | 2581     |
| train/             |          |
|    actor_loss      | -0