In [11]:
import numpy as np

In [27]:
# define the grid world environment
environment = np.array([[0, 0, 0, 0],
                       [0, -1, 0, -1],
                       [0, 0, 0, -1],
                       [1, -1, 0, 1]])

In [28]:
# define the q-table
q_table = np.zeros((4, 4, 4))

In [29]:
# define the hyperparameters
learning_rate = 0.1
discount_factor = 0.99
epsilon = 0.1
num_episodes = 1000
max_steps_per_episode = 100

In [30]:
# Q-learning algorithm
for episode in range(num_episodes):
    state = (0, 0)  # Initial state

    for step in range(max_steps_per_episode):
        # Choose an action using epsilon-greedy policy
        if np.random.uniform(0, 1) < epsilon:
            action = np.random.randint(0, 4)  # Random action
        else:
            action = np.argmax(q_table[state])

        # Perform the action and observe the next state and reward
        next_state = None
        reward = None

        if action == 0:  # Up
            if state[0] > 0:
                next_state = (state[0] - 1, state[1])
        elif action == 1:  # Down
            if state[0] < 3:
                next_state = (state[0] + 1, state[1])
        elif action == 2:  # Left
            if state[1] > 0:
                next_state = (state[0], state[1] - 1)
        elif action == 3:  # Right
            if state[1] < 3:
                next_state = (state[0], state[1] + 1)

        if next_state is not None:
            reward = environment[next_state]

        # Update the Q-table using the Q-learning update rule
        if next_state is not None and reward is not None:
            q_table[state][action] += learning_rate * (reward + discount_factor * np.max(q_table[next_state]) - q_table[state][action])

        # Update the state
        state = next_state

        if state is None or environment[state] == 1:  # Reached the goal state or an invalid state
            break

In [31]:
# Test the learned policy
state = (0, 0)   # Initial state
steps = 0        # Number of steps taken

while environment[state] != 1 and steps < max_steps_per_episode:
    action = np.argmax(q_table[state])

    if action == 0:  # Up
        if state[0] > 0:
            state = (state[0] - 1, state[1])
    elif action == 1:  # Down
        if state[0] < 3:
            state = (state[0] + 1, state[1])
    elif action == 2:  # Left
        if state[1] > 0:
            state = (state[0], state[1] - 1)
    elif action == 3:  # Right
        if state[1] < 3:
            state = (state[0], state[1] + 1)

    steps += 1

if steps < max_steps_per_episode:
    print("Goal reached!")
else:
    print("Goal not reached!")

Goal not reached!


###### experiment

In [36]:
import numpy as np

# Define the grid world environment
environment = np.array([
    [0, 0, 0, 0],
    [0, -1, 0, -1],
    [0, 0, 0, -1],
    [0, -1, 0, 1]
])

# Define the Q-table
q_table = np.zeros((4, 4, 4))  # 4x4 grid with 4 possible actions

# Define the hyperparameters
learning_rate = 0.1
discount_factor = 0.99
epsilon = 0.1
num_episodes = 1000
max_steps_per_episode = 100

# Q-learning algorithm
for episode in range(num_episodes):
    state = (0, 0)  # Initial state

    for step in range(max_steps_per_episode):
        # Choose an action using epsilon-greedy policy
        if np.random.uniform(0, 1) < epsilon:
            action = np.random.randint(0, 4)  # Random action
        else:
            action = np.argmax(q_table[state])

        # Perform the action and observe the next state and reward
        next_state = None
        reward = None

        if action == 0:  # Up
            if state[0] > 0:
                next_state = (state[0] - 1, state[1])
        elif action == 1:  # Down
            if state[0] < 3:
                next_state = (state[0] + 1, state[1])
        elif action == 2:  # Left
            if state[1] > 0:
                next_state = (state[0], state[1] - 1)
        elif action == 3:  # Right
            if state[1] < 3:
                next_state = (state[0], state[1] + 1)

        if next_state is not None:
            reward = environment[next_state]

        # Update the Q-table using the Q-learning update rule
        if next_state is not None and reward is not None:
            q_table[state][action] += learning_rate * (reward + discount_factor * np.max(q_table[next_state]) - q_table[state][action])

        # Update the state
        state = next_state

        if state is None or environment[state] == 1:  # Reached the goal state or an invalid state
            break

# Test the learned policy
state = (0, 0)  # Initial state
steps = 0  # Number of steps taken

while environment[state] != 1 and steps < max_steps_per_episode:
    action = np.argmax(q_table[state])

    if action == 0:  # Up
        if state[0] > 0:
            state = (state[0] - 1, state[1])
    elif action == 1:  # Down
        if state[0] < 3:
            state = (state[0] + 1, state[1])
    elif action == 2:  # Left
        if state[1] > 0:
            state = (state[0], state[1] - 1)
    elif action == 3:  # Right
        if state[1] < 3:
            state = (state[0], state[1] + 1)

    steps += 1

if steps < max_steps_per_episode:
    print("Goal reached!")
else:
    print("Goal not reached!")


Goal not reached!
