# Multi-Agent Bidding (Baselines)

In this walkthrough, we'll give a brief example of how to use the custom bidding environment and how to implement some baselines.

### Custom Environment Interaction

In [1]:
# imports
import gymnasium as gym
from torch.utils.tensorboard import SummaryWriter
import numpy as np
%load_ext tensorboard

# stable baselines
from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

# custom environment
from envs.bidding import BiddingEnv

# Instantiate the custom bidding environment with its default arguments.
env = BiddingEnv()

# NOTE(review): render() is called here before any reset(); presumably the
# environment populates its state on construction — confirm in envs/bidding.py.
env.render() # choose from verbose, bids, plot

In [2]:
# reset() hands back an (observation, info) pair, so unpack it instead of
# binding the whole tuple to `obs`.
obs, info = env.reset()

# Uncomment to inspect the spaces the environment exposes:
# print('Observation space: \n', env.observation_space)
# print('Action space: \n', env.action_space)
# print('Random Action (Bid Matrix): \n', env.action_space.sample())
# print('Random Observation: \n', env.observation_space.sample())

env.render(mode='human')

# Drive the environment with random bids for at most 10 steps.
for step in range(10):

    obs, reward, done, truncated, info = env.step(env.action_space.sample())
    env.render(mode='human')
    print(f'\n Step {step + 1} with reward = {reward}')

    # The episode is over when it terminates (`done`) or is cut short (`truncated`).
    if done or truncated:
        env.render(mode='human')
        print("Completed, final reward =", reward)
        break

print('optimal reward = ', env.optimal_reward())

Step: 0

Robots:
  Robot 1: Robot at (5, 6) with type B-navbot
  Robot 2: Robot at (1, 1) with type B-navbot
  Robot 3: Robot at (5, 9) with type A-humanbot
  Robot 4: Robot at (7, 0) with type C-embedbot
  Robot 5: Robot at (2, 5) with type B-navbot
  Robot 6: Robot at (2, 4) with type A-humanbot
  Robot 7: Robot at (9, 3) with type C-embedbot
  Robot 8: Robot at (5, 1) with type A-humanbot
  Robot 9: Robot at (5, 6) with type C-embedbot

Tasks:
  Task 1: Task at (6, 9) with prize 99 and type B-transport
  Task 2: Task at (0, 7) with prize 54 and type A-manipulation
  Task 3: Task at (2, 0) with prize 48 and type B-transport
  Task 4: Task at (7, 4) with prize 16 and type A-manipulation
  Task 5: Task at (7, 0) with prize 10 and type A-manipulation
  Task 6: Task at (7, 4) with prize 70 and type B-transport

Bidding Matrix:
╒══════════╤══════════╤══════════╤══════════╤══════════╤══════════╕
│   Task 1 │   Task 2 │   Task 3 │   Task 4 │   Task 5 │   Task 6 │
╞══════════╪══════════╪════

### Benchmarking with Stable Baselines

In [3]:
# Fresh environment instance, reused by the baseline cells that follow.
env = BiddingEnv()

# check stable baselines compatibility
check_env(env, warn=True)

# tensorboard logging
# NOTE(review): this variable is not referenced by any cell visible here; the
# PPO cell passes its own literal tensorboard_log path — confirm it is still needed.
tensorboard_log_dir = "./runs/baselines/bidding_stable_baselines"



In [4]:
# 1) baseline random policy
class RandomPolicy:
    """Baseline that ignores the observation and samples from the action space."""

    def __init__(self, action_space):
        # Keep a handle on the space; every prediction is drawn from it.
        self.action_space = action_space

    def predict(self, observation):
        """Return a fresh sample from the stored action space.

        `observation` is accepted so the call shape matches other policies,
        but it is never read.
        """
        sampler = self.action_space
        return sampler.sample()

# tensorboard random policy
log_dir = "./runs/baselines/random_policy"
writer = SummaryWriter(log_dir)

# evaluate random policy
random_policy = RandomPolicy(env.action_space)
obs, _ = env.reset(seed=42)
total_reward, total_opt_reward = 0, 0

# per-step histories, kept alongside the tensorboard scalars
action_mean, action_std, rewards = [], [], []

for step in range(100):  # number of evaluation steps
    action = random_policy.predict(obs)

    obs, reward, done, _, _ = env.step(action)

    # Compute each per-step statistic once so the logged value and the
    # accumulated/stored value are guaranteed to be the same number.
    mean_bid, std_bid = np.mean(action), np.std(action)
    opt_reward = env.optimal_reward()

    total_reward += reward
    total_opt_reward += opt_reward

    # log to tensorboard (all four scalars share the "Random_Policy" group)
    writer.add_scalar("Random_Policy/AvgBid", mean_bid, step)
    writer.add_scalar("Random_Policy/StdBid", std_bid, step)
    writer.add_scalar("Random_Policy/Reward", reward, step)
    writer.add_scalar("Random_Policy/OptimalReward", opt_reward, step)

    action_mean.append(mean_bid)
    action_std.append(std_bid)
    rewards.append(reward)

    # print('avg bid of ', np.mean(action), ' with std of ', np.std(action), ' gave reward of ', reward)

    # start a new episode whenever the current one finishes
    if done:
        obs, _ = env.reset()

print(f"Total reward for random policy: {round(total_reward, 2)} out of optimal {round(total_opt_reward, 2)}")

writer.close() # close tensorboard writer

Total reward for random policy: 49565.61 out of optimal 52959.15


In [5]:
# 2) baseline ppo policy
vec_env = make_vec_env(lambda: BiddingEnv(), n_envs=1) # vectorize env for ppo
ppo_model = PPO("MultiInputPolicy", vec_env, verbose=1, tensorboard_log="./runs/baselines/sb3_ppo")
ppo_model.learn(total_timesteps=100000)

# evaluate ppo policy (the freshly trained in-memory model is used directly;
# nothing is saved to or loaded from disk in this cell)
mean_reward, std_reward = evaluate_policy(ppo_model, vec_env, n_eval_episodes=10)
print(f"Mean reward for PPO: {mean_reward}, Std: {std_reward}")

# close envs (unnecessary in existing close() implementation)
vec_env.close()

Using cpu device
Logging to ./runs/baselines/sb3_ppo/PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 497      |
| time/              |          |
|    fps             | 3285     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | 503         |
| time/                   |             |
|    fps                  | 2476        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.019647548 |
|    clip_fraction        | 0.242       |
|    clip_range           | 0.2         |
|    entropy_loss         | -76.6       |
|    explaine

In [29]:
# 3) baseline heuristic policy
class HeuristicPolicy:
    """Constant-bid baseline: every cell of the bid matrix gets the same value.

    Args:
        bidding_matrix: template object whose ``.shape`` fixes the shape of
            every action returned by ``predict``.
        bid: value written into every cell. Defaults to 1, the value this
            baseline has always bid.
    """

    def __init__(self, bidding_matrix, bid=1):
        self.bidding_matrix = bidding_matrix
        self.bid = bid

    def predict(self, observation):
        """Return a float matrix filled with ``self.bid``; ``observation`` is ignored."""
        # NOTE(review): the previous comment cited a prize range of (0, 4) and
        # "always bid 2", while the code bid 1. The default keeps the executed
        # behavior; pass bid=2 if that was the intent — confirm.
        return np.full(self.bidding_matrix.shape, self.bid, dtype=float)

# tensorboard heuristic policy
log_dir = "./runs/baselines/heuristic_policy"
writer = SummaryWriter(log_dir)

# evaluate heuristic policy
heuristic_policy = HeuristicPolicy(env.bidding_matrix)
obs, _ = env.reset(seed=42)
total_reward = 0

# The loop index must be bound to `step` here: the scalars below are logged
# against it, and relying on a `step` left over from another cell would put
# every point on the same x-value (or raise NameError on a fresh kernel).
for step in range(1000):  # 1000 steps for example
    action = heuristic_policy.predict(obs)
    obs, reward, done, _, _ = env.step(action)
    total_reward += reward

    # log to tensorboard under this policy's own tag group
    writer.add_scalar("Heuristic_Policy/AvgBid", np.mean(action), step)
    writer.add_scalar("Heuristic_Policy/StdBid", np.std(action), step)
    writer.add_scalar("Heuristic_Policy/Reward", reward, step)

    # start a new episode whenever the current one finishes
    if done:
        obs, _ = env.reset()

print(f"Total reward for heuristic policy: {total_reward}")

writer.close() # close tensorboard writer

Total reward for heuristic policy: -1777.2110281092396
