In [1]:
import numpy as np

# Side length of the square grid world
grid_size = 4

# Reward table: -0.1 for every cell, overridden for the special cells below
rewards = np.full(shape=(grid_size, grid_size), fill_value=-0.1)
rewards[2, [1, 3]] = -1  # Danger states s3,2 and s3,4
rewards[3, 3] = 10  # Goal state s4,4

# Hand-picked initial value function V_0(s): zero everywhere except the
# danger cells and the goal cell
V = np.zeros_like(rewards)
V[2, [1, 3]] = -1.0  # Danger states s3,2 and s3,4
V[3, 3] = 10.0  # Goal state s4,4

# Each action maps to its (row offset, column offset) on the grid.
# Insertion order matters: value_iteration iterates this dict in order.
actions = dict(
    Up=(-1, 0),
    Down=(1, 0),
    Left=(0, -1),
    Right=(0, 1),
)

# Transition model: probability of the chosen action, and of each other action
prob_intended, prob_other = 0.7, 0.1

# Discount factor
gamma = 0.9

def get_next_state(state, action):
    """Return the (row, col) reached by applying an action offset to a state.

    Args:
        state: Indexable whose first two items are the current row and column.
        action: Indexable whose first two items are the row and column offsets.

    Returns:
        A 2-tuple ``(row + row_offset, col + col_offset)``. No bounds check is
        performed here; the result may lie outside the grid.
    """
    new_row = state[0] + action[0]
    new_col = state[1] + action[1]
    return (new_row, new_col)

def is_valid_state(state):
    """Tell whether a (row, col) state lies inside the grid.

    Both coordinates must fall in ``[0, grid_size)``, where ``grid_size`` is
    the module-level side length of the square grid.
    """
    row_inside = 0 <= state[0] < grid_size
    if not row_inside:
        # Row is already out of range; the column is not inspected.
        return row_inside
    return 0 <= state[1] < grid_size

def value_iteration(V, rewards, actions, prob_intended, prob_other, gamma, iterations=1, goal_state=(3, 3)):
    """Run synchronous value-iteration sweeps over a 2-D grid world.

    Transition model: when an action is chosen, the move in that direction
    happens with probability ``prob_intended``; each of the *other* entries in
    ``actions`` happens instead with probability ``prob_other``. A move that
    would leave the grid keeps the agent in its current cell. The reward
    collected is that of the cell the agent ends up in (its own cell when the
    move is blocked), plus ``gamma`` times that cell's value from the previous
    sweep.

    The grid dimensions are taken from ``V`` itself, so rectangular grids and
    grids of any size work without relying on module-level state.

    Args:
        V: 2-D array-like with the current value estimates. It is not modified.
        rewards: 2-D array-like of per-cell rewards, same shape as ``V``.
        actions: Mapping of action name -> (row offset, col offset). Iteration
            order determines the order in which terms are summed.
        prob_intended: Probability that the chosen action is executed.
        prob_other: Probability of each other action being executed instead.
        gamma: Discount factor.
        iterations: Number of full sweeps to perform (0 returns a copy of V).
        goal_state: (row, col) of the cell whose value is never updated, or
            ``None`` to update every cell. Defaults to ``(3, 3)``.

    Returns:
        A new float ndarray with the value estimates after the sweeps.

    Raises:
        ValueError: If ``V`` is not 2-D or ``rewards`` has a different shape.
    """
    # Work on a float copy: an integer-typed V would silently truncate every
    # backed-up value when it is written into the array.
    V_new = np.array(V, dtype=float)
    rewards = np.asarray(rewards)
    if V_new.ndim != 2:
        raise ValueError("V must be a 2-D array, got %d dimension(s)" % V_new.ndim)
    if rewards.shape != V_new.shape:
        raise ValueError(
            "rewards shape %s does not match V shape %s" % (rewards.shape, V_new.shape)
        )

    n_rows, n_cols = V_new.shape
    goal = None if goal_state is None else tuple(goal_state)

    for _ in range(iterations):
        # Synchronous update: read from V_new (previous sweep), write to V_temp.
        V_temp = np.copy(V_new)
        for i in range(n_rows):
            for j in range(n_cols):
                if (i, j) == goal:
                    continue  # Value of goal state remains unchanged

                # Return of actually moving in each direction. This does not
                # depend on which action was *chosen*, so compute it once per
                # state instead of once per (chosen, executed) pair.
                outcome = {}
                for name, offset in actions.items():
                    ni, nj = i + offset[0], j + offset[1]
                    if not (0 <= ni < n_rows and 0 <= nj < n_cols):
                        ni, nj = i, j  # Blocked by the boundary: stay put
                    outcome[name] = rewards[ni, nj] + gamma * V_new[ni, nj]

                # Expected return of each chosen action; the intended term is
                # added first, then the others in dict order.
                values = []
                for chosen in actions:
                    value = prob_intended * outcome[chosen]
                    for executed in actions:
                        if executed != chosen:
                            value += prob_other * outcome[executed]
                    values.append(value)

                V_temp[i, j] = max(values)
        V_new = V_temp
    return V_new

# Run a single synchronous sweep of value iteration starting from V_0
# NOTE(review): the output recorded below this cell does not look like one
# sweep from V_0 (one sweep gives V_1[0, 0] = -0.1) — re-run on a fresh kernel
# to confirm.
V_new = value_iteration(
    V,
    rewards,
    actions,
    prob_intended=prob_intended,
    prob_other=prob_other,
    gamma=gamma,
    iterations=1,
)

# Show the value function produced by that sweep
print("V_1(s):", V_new, sep="\n")


V_1(s):
[[-0.41769113  2.9514063   5.19934689  8.42034327]
 [ 2.89844461  4.75501819  9.52775666 11.74553166]
 [ 5.0075909   9.54244782 12.82952442 16.77110887]
 [ 8.91908285 12.5015446  16.94229861 10.        ]]
