In [2]:
!pip install stable-baselines3 gymnasium tensorboard

Collecting stable-baselines3
  Downloading stable_baselines3-2.3.2-py3-none-any.whl.metadata (5.1 kB)
Collecting gymnasium
  Using cached gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Downloading stable_baselines3-2.3.2-py3-none-any.whl (182 kB)
Using cached gymnasium-0.29.1-py3-none-any.whl (953 kB)
Installing collected packages: gymnasium, stable-baselines3
  Attempting uninstall: gymnasium
    Found existing installation: gymnasium 1.0.0
    Uninstalling gymnasium-1.0.0:
      Successfully uninstalled gymnasium-1.0.0
Successfully installed gymnasium-0.29.1 stable-baselines3-2.3.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
from gymnasium.vector import SyncVectorEnv
from torch.utils.tensorboard import SummaryWriter

# Stable-Baselines3 callback that mirrors the agent's chosen action into TensorBoard
class ActionLoggingCallback(BaseCallback):
    """Write the action taken at each training step to a TensorBoard scalar.

    Args:
        log_dir: Directory handed to ``SummaryWriter`` for the event files.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, log_dir, verbose=0):
        super().__init__(verbose)
        self.writer = SummaryWriter(log_dir)

    def _on_step(self) -> bool:
        """Record the first entry of the current step's actions.

        The scalar is plotted against ``self.num_timesteps``. Always returns
        ``True`` (SB3 stops training when a callback returns ``False``).
        """
        step_actions = self.locals["actions"]  # actions chosen for the current step
        self.writer.add_scalar(
            "Action/Guessed_Bid",
            step_actions[0],  # only the first entry is logged
            self.num_timesteps,
        )
        return True

    def _on_training_end(self) -> None:
        """Close the TensorBoard writer once training has finished."""
        self.writer.close()

# One-step bidding task: the agent picks a bid and is scored on its distance from 100
class BiddingEnv(gym.Env):
    """Single-step environment with 200 discrete bids and a constant observation.

    The reward is ``-abs(action - 100)``, so it peaks at 0 for a bid of 100.
    Every call to ``step`` ends the episode.
    """

    def __init__(self):
        super().__init__()
        # Bids are the integers 0..199
        self.action_space = gym.spaces.Discrete(200)
        # A single float32 value bounded to [0, 1]
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Reset to the constant observation and return ``(observation, info)``.

        ``seed`` is forwarded to the parent class; ``options`` is accepted but unused.
        """
        super().reset(seed=seed)
        self.state = np.zeros(1, dtype=np.float32)
        return self.state, {}

    def step(self, action):
        """Score ``action`` and terminate the episode.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``terminated`` is always ``True`` and ``truncated`` always ``False``.
        """
        distance_from_target = abs(action - 100)
        self.state = np.zeros(1, dtype=np.float32)
        return self.state, -distance_from_target, True, False, {}

# Factory for places that need a callable which builds the environment
def make_env():
    """Return a newly constructed ``BiddingEnv``."""
    bidding_env = BiddingEnv()
    return bidding_env

# Instantiate the environment.
# Stable-Baselines3 does not consume gymnasium.vector environments: handed a
# SyncVectorEnv it treats it as one ordinary env and wraps it again (the
# "Wrapping the env in a DummyVecEnv" log line). Passing the plain environment
# lets SB3 apply its own Monitor + DummyVecEnv wrappers around the real
# Discrete(200) action space.
env = make_env()

# Set up the TensorBoard log directory
tensorboard_log_dir = "./runs/bidding_ppo"
callback = ActionLoggingCallback(log_dir=tensorboard_log_dir)

# Initialize the PPO model with TensorBoard logging enabled
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=tensorboard_log_dir)

try:
    # Start training, with custom logging of guessed values
    model.learn(total_timesteps=10000, callback=callback)
finally:
    # Release the writer and the environment even if training is interrupted;
    # the callback only closes its writer when training ends normally.
    callback.writer.close()
    env.close()


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./runs/bidding_ppo/PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -47.9    |
| time/              |          |
|    fps             | 4442     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1            |
|    ep_rew_mean          | -48.2        |
| time/                   |              |
|    fps                  | 2945         |
|    iterations           | 2            |
|    time_elapsed         | 1            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0002213335 |
|    clip_fraction        | 0            |
|    clip_rang

In [5]:
import gymnasium as gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import random

# Minimal TensorBoard logger for the hand-written training loop
class ActionLoggingCallback:
    """Log one (action, reward) pair per call under an auto-incrementing step.

    Args:
        log_dir: Directory handed to ``SummaryWriter`` for the event files.
    """

    def __init__(self, log_dir):
        self.writer = SummaryWriter(log_dir)
        self.episode = 0  # step index used for the next pair of scalars

    def log(self, action, reward):
        """Write ``action`` and ``reward`` at the current step, then advance the step."""
        step = self.episode
        scalars = (
            ("Action/Guessed_Bid", action),
            ("Reward/Episode_Reward", reward),
        )
        for tag, value in scalars:
            self.writer.add_scalar(tag, value, step)
        self.episode = step + 1

    def close(self):
        """Close the underlying TensorBoard writer."""
        self.writer.close()

# One-step bidding task: the agent picks a bid and is scored on its distance from 100
class BiddingEnv(gym.Env):
    """Single-step environment with 200 discrete bids and a constant observation.

    The reward is ``-abs(action - 100)``, so it peaks at 0 for a bid of 100.
    Every call to ``step`` ends the episode.
    """

    def __init__(self):
        super().__init__()
        # Bids are the integers 0..199
        self.action_space = gym.spaces.Discrete(200)
        # A single float32 value bounded to [0, 1]
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Reset to the constant observation and return ``(observation, info)``.

        ``seed`` is forwarded to the parent class; ``options`` is accepted but unused.
        """
        super().reset(seed=seed)
        self.state = np.zeros(1, dtype=np.float32)
        return self.state, {}

    def step(self, action):
        """Score ``action`` and terminate the episode.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``terminated`` is always ``True`` and ``truncated`` always ``False``.
        """
        distance_from_target = abs(action - 100)
        self.state = np.zeros(1, dtype=np.float32)
        return self.state, -distance_from_target, True, False, {}

# Instantiate the environment
env = BiddingEnv()

# Seed the two sources of randomness used below (the epsilon draw and the
# action-space sampler) so a fresh "Restart & Run All" reproduces the same run.
seed = 42
random.seed(seed)
env.action_space.seed(seed)

# Initialize the Q-table
q_table = np.zeros((1, env.action_space.n))  # Only one state in our environment
learning_rate = 0.1
discount_factor = 0.99
epsilon = 1.0  # Start with exploration
epsilon_decay = 0.995
min_epsilon = 0.01
total_episodes = 10000

# Set up TensorBoard logging
tensorboard_log_dir = "./runs/bidding_qlearning"
callback = ActionLoggingCallback(log_dir=tensorboard_log_dir)

try:
    # Q-learning training loop
    for episode in range(total_episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0

        while not done:
            # Epsilon-greedy action selection
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  # Explore
            else:
                action = np.argmax(q_table[0])  # Exploit known best action

            # Take the chosen action and observe the reward
            _, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            total_reward += reward

            # Q-value update. A terminal transition has no successor state, so
            # its target is the reward alone; bootstrapping from max Q there
            # would fold a non-existent future return into the estimate.
            old_q_value = q_table[0, action]
            next_max = 0.0 if terminated else np.max(q_table[0])
            q_table[0, action] = (1 - learning_rate) * old_q_value + learning_rate * (
                reward + discount_factor * next_max
            )

        # Log once per episode: the last action taken and the episode's total reward
        callback.log(action, total_reward)

        # Decay epsilon to reduce exploration over time, never dropping below the floor
        epsilon = max(min_epsilon, epsilon * epsilon_decay)
finally:
    # Flush the TensorBoard writer and release the env even if training is interrupted
    callback.close()
    env.close()


### Self-coded test: custom environment skeleton

In [None]:
import gymnasium as gym
import numpy as np
from gymnasium import spaces
from stable_baselines3 import A2C

class CustomEnv(gym.Env):
    """Skeleton Gymnasium environment with 200 discrete actions and one bounded observation.

    The transition and reward logic is not written yet: ``step`` raises
    ``NotImplementedError`` until it is filled in. ``reset`` returns an
    all-zero placeholder observation so the Gymnasium return contract
    ``(observation, info)`` is already honoured.
    """

    # Gymnasium reads supported render modes from the "render_modes" key
    # (the dotted "render.modes" key belongs to the legacy gym API).
    metadata = {"render_modes": ["human"]}

    def __init__(self):
        super().__init__()
        self.action_space = spaces.Discrete(200)  # Actions from 0 to 199
        self.observation_space = spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)

    # The methods below must live at class level; defined inside __init__ they
    # would be local functions and never become part of the environment.

    def step(self, action):
        """Advance the environment by one action.

        Must return ``(observation, reward, terminated, truncated, info)``
        once implemented.

        Raises:
            NotImplementedError: always, until the environment logic is written.
        """
        raise NotImplementedError("CustomEnv.step has not been implemented yet")

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        ``seed`` is forwarded to the parent class; ``options`` is accepted but unused.
        The observation is an all-zero placeholder matching ``observation_space``.
        """
        super().reset(seed=seed)
        observation = np.zeros(self.observation_space.shape, dtype=self.observation_space.dtype)
        return observation, {}

    def render(self):
        """Rendering is not implemented; this is a no-op."""
        pass

    def close(self):
        """No resources are held, so there is nothing to release."""
        pass

    
# Build the skeleton environment and hand it to an A2C agent with an MLP policy
env = CustomEnv()
model = A2C(policy="MlpPolicy", env=env, verbose=1)

        
