In [2]:
import numpy as np
import random

# Define the environment
class GridWorld:
    """Deterministic grid-world environment for tabular reinforcement learning.

    Cell markers:
        'S' - start cell, 'G' - goal cell, 'X' - obstacle, ' ' - free cell.

    Every step yields a reward of -1, except the step that lands on the goal,
    which yields +10 and ends the episode.

    Args:
        grid: Optional 2-D sequence of single-character cell markers that
            contains exactly one 'S' and exactly one 'G'. When omitted, the
            built-in 5x5 layout is used.

    Raises:
        ValueError: If the grid is not two-dimensional, or if it does not
            contain exactly one start and exactly one goal marker.
    """

    # (row delta, column delta) applied by each action.
    _MOVES = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
    }

    def __init__(self, grid=None):
        if grid is None:
            grid = [
                ['S', ' ', ' ', ' ', 'G'],
                [' ', 'X', ' ', 'X', ' '],
                [' ', 'X', ' ', 'X', ' '],
                [' ', ' ', ' ', 'X', ' '],
                [' ', 'X', ' ', ' ', ' '],
            ]
        self.grid = np.array(grid)
        if self.grid.ndim != 2:
            raise ValueError("grid must be a 2-D array of cell markers")

        # Derive the board size from the grid itself so that movement bounds
        # and state indexing stay correct for any layout.
        self.n_rows, self.n_cols = self.grid.shape

        self.start = self._locate('S')
        self.goal = self._locate('G')
        self.state = self.start
        self.actions = ['up', 'down', 'left', 'right']

    def _locate(self, marker):
        """Return the (row, col) of the single cell holding `marker`."""
        cells = np.argwhere(self.grid == marker)
        if len(cells) != 1:
            raise ValueError(
                f"grid must contain exactly one {marker!r} cell, found {len(cells)}"
            )
        row, col = cells[0]
        # Plain ints keep states comparable with ordinary tuples.
        return (int(row), int(col))

    def reset(self):
        """Move the agent back to the start cell and return that state."""
        self.state = self.start
        return self.state

    def step(self, action):
        """Apply `action` and return `(next_state, reward, done)`.

        Moves that would leave the grid are clamped to the border, and moves
        into an obstacle leave the agent where it was. An action that is not
        one of `self.actions` also leaves the position unchanged, but still
        costs the regular step penalty.
        """
        row, col = self.state
        d_row, d_col = self._MOVES.get(action, (0, 0))

        # Clamp to the board so the agent cannot walk off an edge.
        row = min(max(row + d_row, 0), self.n_rows - 1)
        col = min(max(col + d_col, 0), self.n_cols - 1)

        next_state = (row, col)

        if self.grid[row, col] == 'X':
            next_state = self.state  # Stay in the same place if hit an obstacle

        reward = -1
        done = False
        if next_state == self.goal:
            reward = 10
            done = True

        self.state = next_state
        return next_state, reward, done

    def get_state_index(self, state):
        """Flatten a `(row, col)` state into a row-major integer index."""
        return state[0] * self.n_cols + state[1]

# Define the Q-learning algorithm
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The environment is expected to expose an `actions` list and a
    `get_state_index(state)` method that maps a state to a Q-table row.

    Args:
        env: Environment the agent acts in.
        alpha: Learning rate applied to each temporal-difference error.
        gamma: Discount factor for the value of the next state.
        epsilon: Probability of picking a uniformly random action.
    """

    def __init__(self, env, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

        # Size the table from the environment's grid (one row per cell) so it
        # matches any board size; fall back to the historical 25 rows when the
        # environment does not expose a grid.
        grid = getattr(env, 'grid', None)
        n_states = int(np.size(grid)) if grid is not None else 25
        self.q_table = np.zeros((n_states, len(env.actions)))

    def choose_action(self, state):
        """Return an action for `state` using the epsilon-greedy rule.

        With probability `epsilon` a random action is returned; otherwise the
        action with the highest Q-value is returned (ties resolve to the
        first such action in `env.actions`).
        """
        if random.random() < self.epsilon:
            return random.choice(self.env.actions)

        state_index = self.env.get_state_index(state)
        return self.env.actions[np.argmax(self.q_table[state_index])]

    def learn(self, state, action, reward, next_state):
        """Apply one Q-learning update for the observed transition.

        Q(s, a) += alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))
        """
        state_index = self.env.get_state_index(state)
        next_state_index = self.env.get_state_index(next_state)
        action_index = self.env.actions.index(action)

        # Bootstrap from the best value currently estimated for the next state.
        td_target = reward + self.gamma * np.max(self.q_table[next_state_index])
        td_error = td_target - self.q_table[state_index][action_index]

        self.q_table[state_index][action_index] += self.alpha * td_error

# Train the agent
env = GridWorld()
agent = QLearningAgent(env)

# Run a fixed number of episodes; each one starts from the environment's
# reset state and continues until the environment reports termination.
episodes = 1000
for episode in range(episodes):
    state = env.reset()

    while True:
        # Pick an action, observe the transition, update the Q-table.
        action = agent.choose_action(state)
        next_state, reward, done = env.step(action)
        agent.learn(state, action, reward, next_state)
        state = next_state

        if done:
            break

# Test the agent
state = env.reset()
done = False
total_reward = 0

# The environment is deterministic, so a greedy policy that reaches the goal
# never needs to visit a cell twice; one step per cell is therefore enough,
# and the cap keeps a non-converged policy from looping forever.
max_test_steps = int(env.grid.size)

# Evaluate the learned policy itself: switch exploration off for the rollout
# and restore the original setting afterwards.
saved_epsilon = agent.epsilon
agent.epsilon = 0.0
try:
    for _ in range(max_test_steps):
        action = agent.choose_action(state)
        next_state, reward, done = env.step(action)
        total_reward += reward
        state = next_state

        print(f"Action: {action}, State: {next_state}, Reward: {reward}")

        if done:
            break
finally:
    agent.epsilon = saved_epsilon

if not done:
    print(f"Goal not reached within {max_test_steps} steps")

print(f"Total Reward: {total_reward}")


Action: right, State: (0, 1), Reward: -1
Action: right, State: (0, 2), Reward: -1
Action: right, State: (0, 3), Reward: -1
Action: right, State: (0, 4), Reward: 10
Total Reward: 7
