In [1]:
#Ch Mubashir_56892.
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# -----------------------------
# 1. Dummy Environment and Data
# -----------------------------
# Toy problem: 4 one-hot encoded states and 2 discrete actions.
num_states, num_actions = 4, 2
gamma = 0.99  # Discount factor applied to future rewards

# One hand-written 3-step episode. Rows of the identity matrix are the
# one-hot state vectors; the episode visits states 1, 0 and 2 in that order.
episode_states = torch.eye(num_states, dtype=torch.float32)[[1, 0, 2]]
episode_actions = torch.tensor([0, 1, 0])        # action taken at each step
episode_rewards = torch.tensor([1.0, 0.5, 2.0])  # reward received at each step

# -----------------------------
# 2. Define a simple Policy Network
# -----------------------------
class PolicyNetwork(nn.Module):
    """Single-layer softmax policy.

    Maps a state vector to a probability distribution over actions using one
    linear layer followed by a softmax over the last dimension.
    """

    def __init__(self, state_dim: int, action_dim: int) -> None:
        """Create the linear layer mapping ``state_dim`` inputs to ``action_dim`` logits."""
        super().__init__()
        self.fc = nn.Linear(state_dim, action_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return action probabilities for ``x``; the last dimension sums to 1."""
        logits = self.fc(x)
        return logits.softmax(dim=-1)

# Build the policy for the toy problem and an Adam optimizer over its weights.
policy_net = PolicyNetwork(state_dim=num_states, action_dim=num_actions)
optimizer = optim.Adam(params=policy_net.parameters(), lr=0.01)

# -----------------------------
# 3. Calculate Cumulative Reward
# -----------------------------
# Discounted return for every step, G_t = r_t + gamma * G_{t+1}, computed in a
# single backward sweep over the episode. Writing into a preallocated tensor
# avoids the O(n^2) cost of list.insert(0, ...) and the slow conversion of a
# list of 0-d tensors back into a tensor.
# The dtype follows the promotion of the rewards with gamma, so integer
# rewards would still yield floating-point returns.
discounted_rewards = torch.zeros_like(
    episode_rewards, dtype=torch.result_type(episode_rewards, gamma)
)
cumulative_reward = 0  # return accumulated from the steps after the current one
for t in reversed(range(len(episode_rewards))):
    cumulative_reward = episode_rewards[t] + gamma * cumulative_reward
    discounted_rewards[t] = cumulative_reward
print("Discounted / Cumulative Rewards:", discounted_rewards)

# -----------------------------
# 4. Compute Policy Loss (REINFORCE)
# -----------------------------
# log pi(a_t | s_t) for the actions actually taken. Categorical.log_prob clamps
# the probabilities away from 0 before taking the log, so an underflowing
# softmax output cannot turn into -inf/NaN the way a bare torch.log would.
# The result also keeps one entry per step even for a one-step episode,
# which a dimensionless squeeze() would collapse to a 0-d tensor.
log_probs = torch.distributions.Categorical(
    probs=policy_net(episode_states)
).log_prob(episode_actions)
# REINFORCE objective: minimize -sum_t log pi(a_t | s_t) * G_t.
policy_loss = -torch.sum(log_probs * discounted_rewards)
print("Policy Loss:", policy_loss.item())

# -----------------------------
# 5. Compute Value Function & Bellman Error
# -----------------------------
# Dummy value network: one linear layer producing a scalar V(s) per state.
value_net = nn.Linear(num_states, 1)
# squeeze(-1) removes only the size-1 output axis, so a one-step episode stays
# 1-D and the [1:] slice below remains valid.
predicted_values = value_net(episode_states).squeeze(-1)

# Bellman target: R_t + gamma * V(s_{t+1}), with V = 0 after the final step.
# new_zeros keeps the terminal value on the same dtype/device as the predictions.
next_values = torch.cat([predicted_values[1:], predicted_values.new_zeros(1)])
# Detach so the bootstrap target is treated as a constant: gradients of the
# error should flow only through V(s_t), not through V(s_{t+1}).
bellman_target = (episode_rewards + gamma * next_values).detach()
bellman_error = nn.MSELoss()(predicted_values, bellman_target)
print("Bellman Error (MSE between V and target):", bellman_error.item())

# -----------------------------
# 6. Compute Mean Squared Error (MSE) Example
# -----------------------------
# Regress the value predictions directly onto the discounted rewards
# (mean-reduced squared error, the same loss nn.MSELoss() applies by default).
mse_loss = nn.functional.mse_loss(predicted_values, discounted_rewards)
print("Mean Squared Error (MSE):", mse_loss.item())


Discounted / Cumulative Rewards: tensor([3.4552, 2.4800, 2.0000])
Policy Loss: 4.843930244445801
Bellman Error (MSE between V and target): 1.569450855255127
Mean Squared Error (MSE): 7.567729949951172


Explanation of Metrics:

Cumulative Reward / Discounted Reward :

Sum of the rewards from each time step onward, with the discount factor applied: G_t = r_t + gamma * G_{t+1}.

Measures the total “goodness” of the episode from that step on.

Policy Loss :

For REINFORCE, policy loss = -sum(log_prob * discounted_reward)

Minimizing this loss maximizes the expected cumulative reward by updating the policy parameters.

Bellman Error :

Difference between predicted value V(s) and Bellman target R + gamma*V(s').

Used in value-based or actor-critic methods.

Mean Squared Error (MSE) :

MSE between the predicted values and the discounted rewards (an alternative regression target for the value baseline).