In [2]:
import numpy as np
import torch
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.buffers import RolloutBuffer
from stable_baselines3.common.vec_env import DummyVecEnv

# === Setup Pendulum environment ===
env = gym.make("Pendulum-v1")
dummy_env = DummyVecEnv([lambda: env])

model = A2C("MlpPolicy", dummy_env, verbose=0, device="cpu")

n_steps = 5
buffer = RolloutBuffer(
    buffer_size=n_steps,
    observation_space=env.observation_space,
    action_space=env.action_space,
    device=model.device,
    gamma=model.gamma,
    gae_lambda=model.gae_lambda,
)

# === Globals ===
step_count = 0
obs_last = None
action_last = None
value_last = None
log_prob_last = None

def reset_env():
    global obs_last
    obs_last = dummy_env.reset()
    return obs_last

def get_action(obs):
    global obs_last, action_last, value_last, log_prob_last
    obs_last = np.array(obs, dtype=np.float32).reshape((1, -1))
    obs_tensor = torch.tensor(obs_last).float().to(model.device)
    with torch.no_grad():
        action_tensor, value_tensor, log_prob_tensor = model.policy.forward(obs_tensor)
    action_last = action_tensor
    value_last = value_tensor
    log_prob_last = log_prob_tensor
    return action_tensor.cpu().numpy()[0]

def my_step(action):
    next_obs, reward, terminated, truncated, _ = env.step(np.array(action))
    done = terminated or truncated
    return next_obs, reward, done

def store_transition(reward, done, next_obs):
    global step_count, obs_last, action_last, value_last, log_prob_last
    reward = np.array([reward], dtype=np.float32)
    done = np.array([done], dtype=bool)
    next_obs = np.array(next_obs, dtype=np.float32).reshape((1, -1))
    buffer.add(obs_last, action_last, reward, done, value_last, log_prob_last)
    step_count += 1
    obs_last = next_obs
    if step_count % n_steps == 0:
        with torch.no_grad():
            last_val = model.policy.predict_values(torch.tensor(next_obs).float().to(model.device))
        buffer.compute_returns_and_advantage(last_val, dones=done)
        model.train()

def save_model(path="a2c_pendulum"):
    model.save(path)

def load_model(path="a2c_pendulum"):
    global model
    model = A2C.load(path)
    model.set_env(dummy_env)
    return True


In [3]:
def compute_a2c_loss(policy, rollout_data, value_coef=0.5, entropy_coef=0.01):
    observations = rollout_data.observations
    actions = rollout_data.actions
    returns = rollout_data.returns
    advantages = rollout_data.advantages
    old_log_probs = rollout_data.old_log_prob

    # Get action distribution and value predictions
    dist = policy.get_distribution(observations)
    value_preds = policy.predict_values(observations)

    # Log probs and entropy from the current policy
    new_log_probs = dist.log_prob(actions)
    entropy = dist.entropy().mean()

    # Actor loss
    policy_loss = -(advantages * new_log_probs).mean()

    # Critic loss
    value_loss = torch.nn.functional.mse_loss(returns, value_preds)

    # Total loss
    total_loss = policy_loss + value_coef * value_loss - entropy_coef * entropy
    return total_loss


In [4]:
import numpy as np
import torch
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.buffers import RolloutBuffer
from stable_baselines3.common.vec_env import DummyVecEnv

# === Setup Pendulum environment ===
env = gym.make("Pendulum-v1")
dummy_env = DummyVecEnv([lambda: env])

model = A2C("MlpPolicy", dummy_env, verbose=0, device="cpu")

n_steps = 5
buffer = RolloutBuffer(
    buffer_size=n_steps,
    observation_space=env.observation_space,
    action_space=env.action_space,
    device=model.device,
    gamma=model.gamma,
    gae_lambda=model.gae_lambda,
)

# === Globals ===
step_count = 0
obs_last = None
action_last = None
value_last = None
log_prob_last = None

def reset_env():
    global obs_last
    obs_last = dummy_env.reset()
    return obs_last

def get_action(obs):
    global obs_last, action_last, value_last, log_prob_last
    obs_last = np.array(obs, dtype=np.float32).reshape((1, -1))
    obs_tensor = torch.tensor(obs_last).float().to(model.device)
    with torch.no_grad():
        action_tensor, value_tensor, log_prob_tensor = model.policy.forward(obs_tensor)
    action_last = action_tensor
    value_last = value_tensor
    log_prob_last = log_prob_tensor
    return action_tensor.cpu().numpy()[0]

def my_step(action):
    next_obs, reward, terminated, truncated, _ = env.step(np.array(action))
    done = terminated or truncated
    return next_obs, reward, done

def store_transition(reward, done, next_obs):
    global step_count, obs_last, action_last, value_last, log_prob_last
    reward = np.array([reward], dtype=np.float32)
    done = np.array([done], dtype=bool)
    next_obs = np.array(next_obs, dtype=np.float32).reshape((1, -1))
    buffer.add(obs_last, action_last, reward, done, value_last, log_prob_last)
    step_count += 1
    obs_last = next_obs
    if step_count % n_steps == 0:
        with torch.no_grad():
            last_val = model.policy.predict_values(torch.tensor(next_obs).float().to(model.device))

        buffer.compute_returns_and_advantage(last_val, dones=done)

        # Manual A2C training loop
        model.policy.train()
        model.policy.optimizer.zero_grad()
        for rollout_data in buffer.get(batch_size=None):
            loss = compute_a2c_loss(model.policy, rollout_data)
            loss.backward()
        model.policy.optimizer.step()

        buffer.reset()


def save_model(path="a2c_pendulum"):
    model.save(path)

def load_model(path="a2c_pendulum"):
    global model
    model = A2C.load(path)
    model.set_env(dummy_env)
    return True


In [5]:
obs_last = reset_env()

In [6]:
action = get_action(obs_last)
next_obs, reward, done = my_step(action)
store_transition(reward, done, next_obs)

In [7]:
print("Step count:", step_count)

Step count: 1


In [8]:
def train_multiple_episodes(n_episodes=100):
    global step_count
    obs = reset_env()
    for episode in range(n_episodes):
        
        done = False
        episode_reward = 0
        while not done:
            action = get_action(obs)
            next_obs, reward, done = my_step(action)
            store_transition(reward, done, next_obs)
            obs = next_obs
            episode_reward += reward
            if done:
                break  # Optional, since the loop exits on `done` anyway
        print(f"Episode {episode + 1}: Reward = {episode_reward:.2f}, Total Steps = {step_count}")


In [None]:
train_multiple_episodes(1000)

Episode 1: Reward = -765.26, Total Steps = 201
Episode 2: Reward = -2.14, Total Steps = 202
Episode 3: Reward = -2.81, Total Steps = 203
Episode 4: Reward = -3.95, Total Steps = 204
Episode 5: Reward = -5.67, Total Steps = 205
Episode 6: Reward = -7.94, Total Steps = 206
Episode 7: Reward = -10.58, Total Steps = 207
Episode 8: Reward = -13.24, Total Steps = 208
Episode 9: Reward = -15.75, Total Steps = 209
Episode 10: Reward = -14.06, Total Steps = 210
Episode 11: Reward = -11.79, Total Steps = 211
Episode 12: Reward = -9.42, Total Steps = 212
Episode 13: Reward = -7.41, Total Steps = 213
Episode 14: Reward = -5.27, Total Steps = 214
Episode 15: Reward = -3.87, Total Steps = 215
Episode 16: Reward = -2.83, Total Steps = 216
Episode 17: Reward = -1.85, Total Steps = 217
Episode 18: Reward = -1.28, Total Steps = 218
Episode 19: Reward = -0.99, Total Steps = 219
Episode 20: Reward = -0.72, Total Steps = 220
Episode 21: Reward = -0.53, Total Steps = 221
Episode 22: Reward = -0.30, Total St

  value_loss = torch.nn.functional.mse_loss(returns, value_preds)


Episode 90: Reward = -0.90, Total Steps = 290
Episode 91: Reward = -1.29, Total Steps = 291
Episode 92: Reward = -1.92, Total Steps = 292
Episode 93: Reward = -2.54, Total Steps = 293
Episode 94: Reward = -3.65, Total Steps = 294
Episode 95: Reward = -5.09, Total Steps = 295
Episode 96: Reward = -7.21, Total Steps = 296
Episode 97: Reward = -9.45, Total Steps = 297
Episode 98: Reward = -12.07, Total Steps = 298
Episode 99: Reward = -15.03, Total Steps = 299
Episode 100: Reward = -14.62, Total Steps = 300
Episode 101: Reward = -12.59, Total Steps = 301
Episode 102: Reward = -10.39, Total Steps = 302
Episode 103: Reward = -8.17, Total Steps = 303
Episode 104: Reward = -6.04, Total Steps = 304
Episode 105: Reward = -4.22, Total Steps = 305
Episode 106: Reward = -3.00, Total Steps = 306
Episode 107: Reward = -2.30, Total Steps = 307
Episode 108: Reward = -1.65, Total Steps = 308
Episode 109: Reward = -1.15, Total Steps = 309
Episode 110: Reward = -0.89, Total Steps = 310
Episode 111: Rewar

: 

In [None]:
import gymnasium as gym

from stable_baselines3.common.policies import MlpPolicy
from stable_baselines3.common import make_vec_env
from stable_baselines3 import A2C

# Parallel environments
env = make_vec_env('CartPole-v1', n_envs=4)

model = A2C(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("a2c_cartpole")

del model # remove to demonstrate saving and loading

model = A2C.load("a2c_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

Using cuda device




------------------------------------
| rollout/              |          |
|    ep_len_mean        | 28.1     |
|    ep_rew_mean        | 28.1     |
| time/                 |          |
|    fps                | 1633     |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -0.593   |
|    explained_variance | 0.0306   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.273    |
|    value_loss         | 26.4     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 30.7     |
|    ep_rew_mean        | 30.7     |
| time/                 |          |
|    fps                | 1832     |
|    iterations         | 200      |
|    time_elapsed       | 2        |
|    total_timesteps    | 4000     |
| train/                |          |
|

DependencyNotInstalled: pygame is not installed, run `pip install "gymnasium[classic-control]"`

In [5]:
import gymnasium as gym
import matplotlib.pyplot as plt
from stable_baselines3 import A2C

# Create the Pendulum environment
env = gym.make('Pendulum-v1')

# Initialize the A2C model
model = A2C('MlpPolicy', env, verbose=1)

# Train the model (you can adjust the number of timesteps)
model.learn(total_timesteps=10000)

# Run the model and store the rewards for each step
obs, _ = env.reset()  # Correcting for the tuple returned by reset()
done = False
rewards = []
steps = []

step_count = 0
while not done:
    action, _states = model.predict(obs, deterministic=True)
    result = env.step(action)  # Get the step result

    print(result)

    # If there are 4 returned values (older Gym versions)
    if len(result) == 4:
        obs, reward, done, info = result
    # If there are 3 returned values (newer Gym versions)
    elif len(result) == 3:
        obs, reward, done = result
        info = {}  # Assign an empty dictionary if info is not returned


    print(action, obs, reward, done, info)

    rewards.append(reward)
    steps.append(step_count)
    
    # Print the reward for each step (optional)
    print(f"Step {step_count}, Reward: {reward}")
    
    step_count += 1

    plt.figure(figsize=(10, 6))
    plt.plot(steps, rewards, label="Reward per Step")
    plt.xlabel("Step")
    plt.ylabel("Reward")
    plt.title("Reward vs. Step in Pendulum Environment")
    plt.legend()
    plt.show()



env.close()




Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 200       |
|    ep_rew_mean        | -1.39e+03 |
| time/                 |           |
|    fps                | 398       |
|    iterations         | 100       |
|    time_elapsed       | 1         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.42     |
|    explained_variance | 7.89e-05  |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | -47.6     |
|    std                | 1         |
|    value_loss         | 833       |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 200       |
|    ep_rew_mean        | -1.52e+03 |
| time/                 |           |
|    fps                | 484       |
|    iterations         | 200       |
|    time_elapsed       | 2         |
|    total_t

NameError: name 'reward' is not defined