In [None]:
import gymnasium as gym
import matplotlib.pyplot as plt
from stable_baselines3 import PPO

# Create Gym environment
# NOTE(review): nothing else in this cell reads `env`; SelfPlayEnv builds its
# own CartPole-v1 instance, so this one appears to be unused and is never
# closed — confirm before removing.
env = gym.make("CartPole-v1")


# Define self-play logic
class SelfPlayEnv(gym.Env):
    """CartPole-v1 with a random velocity disturbance applied after every step.

    "Agent 1" is the policy being trained; it acts through ``action_space``.
    "Agent 2" is a stand-in adversary: after each step it draws one of the
    ``disturbance_space`` options (left, none, right) uniformly at random and
    nudges the cart velocity accordingly. A non-zero nudge costs Agent 1
    ``DISTURBANCE_PENALTY`` reward.

    The class follows the Gymnasium API: ``reset`` accepts ``seed`` and
    ``options`` and returns ``(obs, info)``; ``step`` returns
    ``(obs, reward, terminated, truncated, info)``.
    """

    # Velocity change applied per unit of disturbance direction.
    DISTURBANCE_SCALE = 0.01
    # Reward deducted whenever a non-zero disturbance is applied.
    DISTURBANCE_PENALTY = 0.1

    def __init__(self, render_mode=None):
        """Create the wrapped CartPole environment.

        Args:
            render_mode: Forwarded to ``gym.make`` so that ``render()`` can
                produce output. ``None`` (the default) disables rendering.
        """
        super().__init__()
        self.render_mode = render_mode
        self.env = gym.make("CartPole-v1", render_mode=render_mode)
        self.action_space = self.env.action_space  # Agent 1 actions
        self.observation_space = self.env.observation_space

        # Disturbance action space for Agent 2
        self.disturbance_space = gym.spaces.Discrete(3)  # Left, None, Right

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: Optional seed. It seeds both this environment's random
                generator (used for the disturbance) and the wrapped env.
            options: Passed through to the wrapped environment.

        Returns:
            The ``(obs, info)`` pair produced by the wrapped environment.
        """
        # Seeds self.np_random so the disturbance sequence is reproducible.
        super().reset(seed=seed)
        return self.env.reset(seed=seed, options=options)

    def step(self, action):
        """Apply Agent 1's action, then Agent 2's random disturbance.

        Returns:
            ``(obs, reward, terminated, truncated, info)``. ``obs`` reflects
            the disturbed velocity; ``terminated``/``truncated``/``info`` are
            returned exactly as computed by the wrapped step.
        """
        # Agent 1 action
        obs, reward, terminated, truncated, info = self.env.step(action)

        # Agent 2 disturbance: index 0/1/2 of disturbance_space -> -1/0/+1.
        direction = int(self.np_random.integers(self.disturbance_space.n)) - 1

        # Rebuild the state instead of mutating it in place, so this also
        # works when the wrapped env stores its state as an immutable tuple.
        cartpole = self.env.unwrapped
        x, x_dot, theta, theta_dot = cartpole.state
        x_dot = x_dot + direction * self.DISTURBANCE_SCALE
        cartpole.state = (x, x_dot, theta, theta_dot)

        # Keep the observation consistent with the state the next step uses.
        obs = obs.copy()
        obs[1] = x_dot

        # Adjust reward for self-play scenario: penalize disturbances
        reward -= abs(direction) * self.DISTURBANCE_PENALTY
        return obs, reward, terminated, truncated, info

    def render(self, mode="human"):
        """Render the wrapped environment and return whatever it returns.

        ``mode`` is accepted only for backward compatibility and is ignored;
        the render mode is the one chosen at construction time.
        """
        return self.env.render()

    def close(self):
        """Release the wrapped environment's resources."""
        self.env.close()


# Build the disturbed CartPole environment that PPO trains against
self_play_env = SelfPlayEnv()

# Fit a PPO agent with an MLP policy, showing a progress bar while it learns
model = PPO(policy="MlpPolicy", env=self_play_env, verbose=1)
model.learn(total_timesteps=50000, progress_bar=True)

# Roll the trained policy out for 200 steps, recording each step's reward
rewards = []
obs, info = self_play_env.reset()
for _ in range(200):
    action, _ = model.predict(obs)
    obs, reward, terminated, truncated, _ = self_play_env.step(action)
    rewards.append(reward)
    done = terminated or truncated
    if done:
        # Episode is over: start a fresh one so the rollout can continue
        obs, info = self_play_env.reset()

# Plot the recorded rewards against the step index
plt.plot(rewards)
plt.xlabel("Steps")
plt.ylabel("Reward")
plt.title("Rewards Over Time")
plt.show()


In [None]:
import gymnasium as gym
import matplotlib.pyplot as plt
from stable_baselines3 import PPO


# Define Self-Play Environment
class SelfPlayEnv(gym.Env):
    """CartPole-v1 wrapper in which a random "Agent 2" perturbs the cart.

    Agent 1 (the learner) chooses the regular CartPole action. After that
    action is simulated, Agent 2 picks left / none / right uniformly at
    random from ``disturbance_space`` and shifts the cart velocity by
    ``VELOCITY_NUDGE`` in that direction. Each non-zero disturbance reduces
    the step reward by ``PENALTY``.
    """

    # Magnitude of the velocity shift caused by one disturbance.
    VELOCITY_NUDGE = 0.01
    # Reward penalty charged when a disturbance is applied.
    PENALTY = 0.1

    def __init__(self, render_mode=None):
        """Build the wrapped environment.

        Args:
            render_mode: Optional render mode handed to ``gym.make``; without
                one, ``render()`` has nothing to draw.
        """
        super().__init__()
        self.render_mode = render_mode
        self.env = gym.make("CartPole-v1", render_mode=render_mode)
        self.action_space = self.env.action_space  # Agent 1 actions
        self.observation_space = self.env.observation_space

        # Disturbance action space for Agent 2
        self.disturbance_space = gym.spaces.Discrete(3)  # Left, None, Right

    def reset(self, seed=None, options=None):
        """Reset the wrapped environment and return ``(obs, info)``.

        ``seed`` also seeds this environment's own random generator, which
        makes the disturbance sequence reproducible.
        """
        super().reset(seed=seed)
        obs, info = self.env.reset(seed=seed, options=options)
        return obs, info

    def step(self, action):
        """Simulate Agent 1's action followed by Agent 2's disturbance.

        Returns:
            ``(obs, reward, terminated, truncated, info)`` where ``obs``
            already contains the disturbed velocity and ``reward`` includes
            the disturbance penalty.
        """
        # Agent 1 action
        obs, reward, terminated, truncated, info = self.env.step(action)

        # Agent 2 disturbance (random force): draw an index from the
        # disturbance space with the env's seeded generator and centre it on
        # zero, giving -1 (left), 0 (none) or +1 (right).
        choice = int(self.np_random.integers(self.disturbance_space.n))
        disturbance = choice - 1

        # Write back a rebuilt state rather than mutating in place; in-place
        # assignment fails if the state is held as a tuple.
        base_env = self.env.unwrapped
        position, velocity, angle, angular_velocity = base_env.state
        velocity = velocity + disturbance * self.VELOCITY_NUDGE
        base_env.state = (position, velocity, angle, angular_velocity)

        # Report the velocity the next step will actually start from.
        obs = obs.copy()
        obs[1] = velocity

        # Adjust reward for self-play scenario: penalize disturbances
        reward -= abs(disturbance) * self.PENALTY

        return obs, reward, terminated, truncated, info

    def render(self):
        """Render the wrapped environment and return its result."""
        return self.env.render()

    def close(self):
        """Close the wrapped environment."""
        self.env.close()


# Create the environment and train a PPO agent (MLP policy) on it
self_play_env = SelfPlayEnv()
model = PPO(policy="MlpPolicy", env=self_play_env, verbose=1)
model.learn(total_timesteps=50000)

# Run the trained policy for 200 steps, logging the step index and reward
rewards = []
steps = []
obs, _ = self_play_env.reset()

for step in range(200):
    action, _ = model.predict(obs)
    obs, reward, terminated, truncated, _ = self_play_env.step(action)
    steps.append(step)
    rewards.append(reward)
    if not (terminated or truncated):
        continue
    # The episode finished: begin a new one and keep collecting
    obs, _ = self_play_env.reset()

# Draw reward versus step index
plt.plot(steps, rewards, label="Rewards")
plt.xlabel("Steps")
plt.ylabel("Reward")
plt.title("Rewards Over Steps in Self-Play Environment")
plt.legend()
plt.show()


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Sizes used by the dummy example below
state_dim = 4     # length of each simulated state vector
num_actions = 3   # number of actions the network scores
num_bins = 10     # number of discrete value classes per action

# Example Q-value range, and the evenly spaced values that span it
min_value = -1
max_value = 1
bins = torch.linspace(start=min_value, end=max_value, steps=num_bins)


# Neural network for Q-value classification
class QNetwork(nn.Module):
    """Two-layer MLP that outputs, per action, logits over value bins.

    Args:
        state_dim: Size of the last dimension of the input states.
        num_actions: Number of actions scored by the network.
        num_bins: Number of discrete value bins per action.
        hidden_dim: Width of the hidden layer (defaults to 128).
    """

    def __init__(
        self,
        state_dim: int,
        num_actions: int,
        num_bins: int,
        hidden_dim: int = 128,
    ) -> None:
        super().__init__()
        self.num_actions = num_actions
        self.num_bins = num_bins
        self.fc = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_actions * num_bins),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return logits of shape ``(-1, num_actions, num_bins)``.

        All leading dimensions of ``x`` are flattened into the first output
        dimension, so a single unbatched state yields a batch of one.
        """
        flat_logits = self.fc(x)
        return flat_logits.view(-1, self.num_actions, self.num_bins)


# Initialize network, loss, and optimizer
q_net = QNetwork(state_dim, num_actions, num_bins)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(q_net.parameters(), lr=0.001)

# Dummy training loop: fit the classifier to randomly generated targets
for epoch in range(100):
    # Simulate a batch of states
    states = torch.rand((32, state_dim))

    # Simulate Q-values uniformly in [min_value, max_value)
    q_values = torch.rand((32, num_actions)) * (max_value - min_value) + min_value

    # Assign each Q-value to the index of the nearest of the `num_bins`
    # evenly spaced values spanning [min_value, max_value]. Rounding (rather
    # than truncating) avoids always picking the lower neighbour and lets the
    # top bin be used; the clamp guards against values outside the range.
    scaled = (q_values - min_value) / (max_value - min_value) * (num_bins - 1)
    target_bins = scaled.round().long().clamp(0, num_bins - 1)

    # Forward pass
    logits = q_net(states)  # Shape: (batch_size, num_actions, num_bins)

    # Loss calculation: one cross-entropy term per action, summed
    loss = sum(
        criterion(logits[:, action, :], target_bins[:, action])
        for action in range(num_actions)
    )

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")
