In [3]:
import numpy as np
# Define the grid world environment
grid = np.array([
    [0, 0, 0, 1],
    [0, 1, 0, -1],
    [0, 0, 0, 0],
    [1, 0, 1, 0]
])

# Define possible actions (up, down, left, right)
actions = [(0, -1), (0, 1), (-1, 0), (1, 0)]
# Define the discount factor
gamma = 0.9

# Initialize the state-value function
state_values = np.zeros(grid.shape)
# Initialize the policy (random policy)
policy = np.ones((*grid.shape, len(actions))) / len(actions)

# Helper function to check if a state is valid
def is_valid_state(state):
    i, j = state
    return 0 <= i < grid.shape[0] and 0 <= j < grid.shape[1]


# Helper function to check if a state is terminal
def is_terminal_state(state):
    return grid[state[0], state[1]] in [-1, 1]


# Helper function to take an action in the environment
def take_action(state, action):
    i, j = state
    di, dj = action
    new_state = (i + di, j + dj)
    return new_state if is_valid_state(new_state) else state

# Helper function to calculate the expected value for a state
def calculate_expected_value(state, action, state_values):
    i, j = state
    expected_value = 0
    for a in range(len(actions)):
        new_state = take_action(state, actions[a])
        expected_value += policy[i, j, a] * (grid[new_state[0], new_state[1]] + gamma * state_values[new_state[0], new_state[1]])
    return expected_value

def policy_iteration(episodes):
    for _ in range(episodes):
        episode = []
        state = (2, 0)

        while not is_terminal_state(state):
            action = np.random.choice(len(actions), p=policy[state[0], state[1]])
            new_state = take_action(state, actions[action])
            episode.append((state, action, grid[new_state[0], new_state[1]]))
            state = new_state

        returns = 0
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            returns = gamma * returns + reward
            state_values[state[0], state[1]] += (1 / len(episode)) * (returns - state_values[state[0], state[1]])

    return state_values

def value_iteration():
    theta = 1e-6
    while True:
        delta = 0
        for i in range(grid.shape[0]):
            for j in range(grid.shape[1]):
                if is_terminal_state((i, j)):
                    continue
                v = state_values[i, j]
                new_state_values = [calculate_expected_value((i, j), a, state_values) for a in range(len(actions))]
                state_values[i, j] = max(new_state_values)
                delta = max(delta, abs(v - state_values[i, j]))
        if delta < theta:
            break

    return state_values



In [4]:
if __name__ == "__main__":
    print("Policy Iteration:")
    monte_carlo_result = policy_iteration(1000)
    print(monte_carlo_result)

    print("\nValue Iteration:")
    value_iteration_result = value_iteration()
    print(value_iteration_result)


Policy Iteration:
[[0.65639178 0.88307755 0.80650206 0.        ]
 [0.88668829 0.         0.38908785 0.        ]
 [0.90082    0.79006757 0.6867741  0.46459491]
 [0.         0.88596878 0.         0.42844117]]

Value Iteration:
[[ 0.54972415  0.65001732  0.57811278  0.        ]
 [ 0.6937504   0.          0.23014871  0.        ]
 [ 0.72874891  0.70527341  0.44477035 -0.06977614]
 [ 0.          0.84991809  0.          0.42600068]]
