In [5]:
import numpy as np

# Define the grid world environment
environment = np.array([
    [0, 0, 0, 0],
    [0, -1, 0, -1],
    [0, 0, 0, -1],
    [0, -1, 0, 1]
])

# Define the Q-table
q_table = np.zeros((4, 4, 4))  # 4x4 grid world

# Define the learning parameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.1  # Exploration rate

# Training loop
num_episodes = 1000
max_steps_per_episode = 100

for episode in range(num_episodes):
    state = (0, 0)  # Start state
    done = False
    steps = 0

    while not done and steps < max_steps_per_episode:
        # Epsilon-greedy exploration
        if np.random.rand() < epsilon:
            action = np.random.randint(4)
        else:
            action = np.argmax(q_table[state])

        # Take action and observe the next state and reward
        if action == 0:  # Up
            next_state = (state[0] - 1, state[1])
        elif action == 1:  # Down
            next_state = (state[0] + 1, state[1])
        elif action == 2:  # Left
            next_state = (state[0], state[1] - 1)
        elif action == 3:  # Right
            next_state = (state[0], state[1] + 1)

        # Check for boundary constraints
        next_state = (
            max(0, min(next_state[0], environment.shape[0] - 1)),
            max(0, min(next_state[1], environment.shape[1] - 1))
        )

        reward = environment[next_state]

        # Q-table update
        q_table[state][action] = (1 - alpha) * q_table[state][action] + alpha * (
                reward + gamma * np.max(q_table[next_state]))

        state = next_state
        steps += 1

        # Check if the goal is reached
        if reward == 1:
            print("Goal reached!")
            done = True

# Print the learned Q-table
print("Learned Q-table:")
print(q_table)


Goal reached!
Goal reached!
Goal reached!
Goal reached!
Goal reached!
Learned Q-table:
[[[ 0.          0.          0.          0.        ]
  [ 0.         -1.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.         -0.99999999  0.          0.        ]]

 [[ 0.          0.          0.         -0.97972444]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.         -0.5217031  -0.65132156]
  [ 0.         -0.336196    0.         -0.271     ]]

 [[ 0.          0.          0.          0.        ]
  [-0.19       -0.19        0.          0.        ]
  [ 0.          0.          0.         -0.1       ]
  [-0.1         0.40951     0.          0.        ]]

 [[ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]]]
