In [1]:
import numpy as np
import collections

# --- Gridworld Environment Configuration ---
GRID_SIZE = 5
A_POS = (0, 1)
A_PRIME_POS = (4, 1)
B_POS = (0, 3)
B_PRIME_POS = (2, 3)
GAMMA = 0.9  # Discount factor

# --- Simulation Configuration ---
NUM_EPISODES = 50000
MAX_STEPS_PER_EPISODE = 1000

# Actions: 0: North, 1: South, 2: East, 3: West
ACTIONS = [np.array([-1, 0]), np.array([1, 0]), np.array([0, 1]), np.array([0, -1])]

def get_policy(state):
    """
    Defines the policy pi(a|s).
    For this problem, we use a uniform random policy.
    """
    return np.random.choice(len(ACTIONS))

def get_next_state_and_reward(state, action, reward_setting=1):
    """
    Returns the next state and reward based on the current state and action.
    """
    # Handle special states A and B
    if state == A_POS:
        reward = 10 if reward_setting == 1 else 16
        return A_PRIME_POS, reward
    if state == B_POS:
        reward = 5 if reward_setting == 1 else 11
        return B_PRIME_POS, reward

    # Calculate potential next state
    next_state = tuple((np.array(state) + ACTIONS[action]))

    # Check for moving off the grid
    if not (0 <= next_state[0] < GRID_SIZE and 0 <= next_state[1] < GRID_SIZE):
        reward = -1 if reward_setting == 1 else 5
        return state, reward # Stay in the same state

    # Standard move
    reward = 0 if reward_setting == 1 else 6
    return next_state, reward

def compute_value_function(reward_setting):
    """
    Performs Monte Carlo simulation to compute the state-value function.
    """
    # Initialize returns dictionary to store a list of returns for each state
    returns = collections.defaultdict(list)

    print(f"\n⏳ Running simulation for Reward Setting {reward_setting}...")

    for episode in range(NUM_EPISODES):
        # Start each episode from a random state
        initial_state = (np.random.randint(GRID_SIZE), np.random.randint(GRID_SIZE))
        current_state = initial_state
        episode_history = []

        # Generate an episode
        for _ in range(MAX_STEPS_PER_EPISODE):
            action = get_policy(current_state)
            next_state, reward = get_next_state_and_reward(current_state, action, reward_setting)
            episode_history.append((current_state, reward))
            current_state = next_state

        # Calculate returns for the episode (First-Visit MC)
        G = 0
        visited_states = set()
        # Iterate backwards through the episode history
        for state, reward in reversed(episode_history):
            G = reward + GAMMA * G
            # If this is the first time we've visited the state in this episode
            if state not in visited_states:
                returns[state].append(G)
                visited_states.add(state)

    # Calculate the average return for each state to get the value function
    value_function = np.zeros((GRID_SIZE, GRID_SIZE))
    for r in range(GRID_SIZE):
        for c in range(GRID_SIZE):
            state = (r, c)
            if returns[state]:
                value_function[r, c] = np.mean(returns[state])

    return value_function

# --- Main Execution ---
# Calculate value function for the first reward setting
v_setting1 = compute_value_function(reward_setting=1)

# Calculate value function for the second reward setting
v_setting2 = compute_value_function(reward_setting=2)

# --- Display Results ---
np.set_printoptions(precision=1, suppress=True)

print("\n---")
print("\n## Value Function under Reward Setting 1")
print("Rewards: Off-grid=-1, A=+10, B=+5, Other=0")
print(v_setting1)

print("\n---")
print("\n## Value Function under Reward Setting 2")
print("Rewards: Off-grid=5, A=+16, B=+11, Other=6")
print(v_setting2)

print("\n---")
print("\n## Difference (V_setting2 - V_setting1)")
print(v_setting2 - v_setting1)


⏳ Running simulation for Reward Setting 1...

⏳ Running simulation for Reward Setting 2...

---

## Value Function under Reward Setting 1
Rewards: Off-grid=-1, A=+10, B=+5, Other=0
[[ 5.3  8.4  4.9  4.6  3. ]
 [ 2.6  3.1  2.2  1.5  1.3]
 [ 0.7  0.7  0.4 -0.2  0.2]
 [-0.2 -0.4 -0.3 -0.3 -0.3]
 [-0.5 -0.9 -0.6 -0.5 -0.5]]

---

## Value Function under Reward Setting 2
Rewards: Off-grid=5, A=+16, B=+11, Other=6
[[62.2 62.8 61.7 58.8 59.4]
 [55.7 54.2 52.6 51.7 53.6]
 [50.1 47.  45.2 43.2 48.4]
 [46.3 42.  41.7 42.4 47. ]
 [47.  40.7 43.1 45.2 49.1]]

---

## Difference (V_setting2 - V_setting1)
[[57.  54.4 56.8 54.2 56.4]
 [53.1 51.1 50.5 50.2 52.3]
 [49.4 46.3 44.8 43.4 48.2]
 [46.5 42.4 42.  42.7 47.3]
 [47.5 41.6 43.8 45.7 49.5]]
