In [13]:
import numpy as np

### Algorithm 1

In [None]:
# create a simple environment (grid world)
# 0 represents an empty space
# 1 represents a wall
# 2 represents the goal
# The agent starts at (0, 0) and can move up, down, left, or right

In [15]:
# Grid World environment
environment = np.array([[0, 0, 0, 1],
                       [0, 1, 0, 1],
                       [0, 0, 0, 1],
                       [1, 0, 2, 1]])

In [16]:
# Set the parameters
learning_rate = 0.1           # learning rate
discount_factor = 0.9         # Discount factor
num_episodes = 1000           # Number of episodes
max_steps_per_episode = 100  # Maximum number of steps per episode

In [17]:
# Initialize the Q-table
num_states = np.prod(environment.shape)
num_actions = 4       # Up, Down, Left, Right
q_table = np.zeros((num_states, num_actions))

In [18]:
# Convert the environment into a 1D state representation
def get_state_index(state):
    rows, cols = environment.shape
    return state[0] * cols + state[1]

In [35]:
# Q-learning algorithm
for episode in range(num_episodes):
    state = (0, 0)    # Initial state
    total_reward = 0  # Total reward for the episode
    
    for step in range(max_steps_per_episode):
        # Choose an action based on the Q-table (epsilon-greedy strategy)
        if np.random.rand() < 0.1:
            action = np.random.randint(num_actions)
        else:
            state_index = get_state_index(state)
            action = np.argmax(q_table[state_index])
            
        # Perform the action and observe the next state and reward
        if action == 0:      # Up
            next_state = (state[0] - 1, state[1])
        elif action == 1:    # Down
            next_state = (state[0] + 1, state[1])
        elif action == 2:    # Left
            next_state = (state[0], state[1] - 1)
        elif action == 1:    # Right
            next_state = (state[0], state[1] + 1)
            
        if next_state[0] < 0 or next_state[0] >= environment.shape[0] or next_state[1] < 0 or next_state[1] >= environment.shape[1] or environment[next_state] == 1:
            # Invalid move, stay in the current state and receive a negative reward
            next_state = state
            reward = -1
        elif environment[next_state] == 2:
            # Reached the goal, receive a positive reward
            reward = 10
        else:
            # Move to a valid empty space, receive a small negative reward
            reward = -0.1
        
        # Update the Q-table
        state_index = get_state_index(state)
        next_state_index = get_state_index(next_state)
        q_table[state_index, action] += learning_rate * (reward + discount_factor * np.max(q_table[next_state_index]) - q_table[state_index, action])
        
        total_reward += reward
        state = next_state
        
        # Check if the goal is reached
        if environment[state] == 2:
            print("Goal has been reached")
            break

In [36]:
print(f"Episode {episode + 1}: Total Reward = {total_reward}")

Episode 1000: Total Reward = -13.599999999999982
