# In [None]:
import numpy as np
import random

# ------------------------
# Robot Environment
# ------------------------
class RobotEnv:
    """Deterministic square grid world with one goal and one penalty cell.

    Positions are ``(x, y)`` tuples with both coordinates in
    ``[0, size - 1]``. The robot starts at ``(0, 0)`` and the episode ends
    when it reaches the goal in the bottom-right corner
    ``(size - 1, size - 1)``. The obstacle at ``(2, 2)`` does not block
    movement; stepping onto it only costs a larger penalty.

    Attributes:
        size: Side length of the grid.
        start: Cell the robot is placed on by ``reset()``.
        goal: Terminal cell (reward +10).
        obstacle: Penalty cell (reward -5).
        pos: Current position of the robot.
    """

    def __init__(self, size=5):
        """Create the grid and place the robot on the start cell.

        Args:
            size: Side length of the square grid (default 5).
        """
        self.size = size
        self.start = (0, 0)
        # Derive the goal from the grid size so it is always a reachable
        # cell; for the default size this is (4, 4).
        self.goal = (size - 1, size - 1)
        self.obstacle = (2, 2)
        self.reset()

    def reset(self):
        """Move the robot back to the start cell and return that position."""
        self.pos = self.start
        return self.pos

    def step(self, action):
        """Apply one action and advance the robot.

        Args:
            action: 0 = up (x - 1), 1 = down (x + 1), 2 = left (y - 1),
                3 = right (y + 1). Moves that would leave the grid are
                clamped to the border; any other value leaves the robot
                in place.

        Returns:
            A ``(next_pos, reward, done)`` tuple. The reward is +10 on
            reaching the goal (``done`` is True), -5 on the obstacle cell
            and -1 for every other move.
        """
        x, y = self.pos

        if action == 0:
            x = max(0, x - 1)
        elif action == 1:
            x = min(self.size - 1, x + 1)
        elif action == 2:
            y = max(0, y - 1)
        elif action == 3:
            y = min(self.size - 1, y + 1)

        next_pos = (x, y)
        # Persist the move: without this every step would start again from
        # the reset position and the goal could never be reached.
        self.pos = next_pos

        if next_pos == self.goal:
            return next_pos, 10, True
        if next_pos == self.obstacle:
            return next_pos, -5, False
        return next_pos, -1, False  # small penalty for movement

# ------------------------
# Q-Learning Robot
# ------------------------
# Train a tabular Q-learning agent on the grid world.
env = RobotEnv()
n_actions = 4               # 0=up, 1=down, 2=left, 3=right
# One Q-value per (x, y, action); sized from the environment so the table
# always matches the grid instead of assuming 5x5.
Q = np.zeros((env.size, env.size, n_actions))

alpha = 0.1                 # learning rate
gamma = 0.9                 # discount factor
epsilon = 0.3               # exploration rate
episodes = 2000
# Upper bound on steps per episode so a single episode can never run
# forever; a truncated episode simply restarts from the start cell.
max_steps = 20 * env.size ** 2

for _ in range(episodes):
    state = env.reset()
    done = False

    for _step in range(max_steps):
        x, y = state

        # Epsilon-greedy strategy
        if random.random() < epsilon:
            action = random.randint(0, n_actions - 1)   # explore
        else:
            action = int(np.argmax(Q[x, y]))            # exploit

        next_state, reward, done = env.step(action)
        nx, ny = next_state

        # Update Q-values; a terminal state has no future return, so do
        # not bootstrap from it.
        best_next = 0.0 if done else np.max(Q[nx, ny])
        Q[x, y, action] += alpha * (
            reward + gamma * best_next - Q[x, y, action]
        )

        state = next_state
        if done:
            break

# ------------------------
# Test Trained Robot
# ------------------------
# Follow the greedy (argmax-Q) policy from the start cell and record the
# visited positions.
state = env.reset()
path = [state]
done = False
# The greedy policy is deterministic, so if the goal has not been reached
# after size**2 steps the robot is revisiting cells and would loop forever.
max_test_steps = env.size ** 2

for _ in range(max_test_steps):
    x, y = state
    action = int(np.argmax(Q[x, y]))
    state, reward, done = env.step(action)
    path.append(state)
    if done:
        break

print("Robot's optimal learned path:")
print(path)
if not done:
    print("Warning: greedy policy did not reach the goal within the step limit.")
