In [5]:
import numpy as np
import random


In [6]:

# ------------------------
# Robot Environment
# ------------------------
class RobotEnv:
    """Square grid world in which a robot walks from a start cell to a goal cell.

    Positions are ``(x, y)`` tuples of grid indices in ``[0, size - 1]``.
    Stepping onto the goal ends the episode with reward +10. Stepping onto
    the obstacle costs -5 but does not block the move or end the episode.
    Every other move costs -1.
    """

    def __init__(self, size=5):
        """Create a ``size`` x ``size`` grid and place the robot on the start cell.

        Args:
            size: Side length of the grid; must be at least 1.

        Raises:
            ValueError: If ``size`` is smaller than 1.
        """
        if size < 1:
            raise ValueError(f"size must be >= 1, got {size}")
        self.size = size
        self.start = (0, 0)
        # The goal is the far corner of the grid. Deriving it from `size`
        # keeps it reachable for every grid size (it is (4, 4) for the
        # default size of 5).
        self.goal = (size - 1, size - 1)
        self.obstacle = (2, 2)
        self.reset()

    def reset(self):
        """Move the robot back to the start cell and return that position."""
        self.pos = self.start
        return self.pos

    def step(self, action):
        """Apply one move and return ``(position, reward, done)``.

        Args:
            action: 0=up (x - 1), 1=down (x + 1), 2=left (y - 1),
                3=right (y + 1). Moves that would leave the grid are clamped
                to the border; any other value leaves the position unchanged.

        Returns:
            A tuple of the new ``(x, y)`` position, the reward for the move,
            and a flag that is True only when the goal has been reached.
        """
        x, y = self.pos
        last = self.size - 1

        # Actions: 0=up, 1=down, 2=left, 3=right
        if action == 0:
            x = max(0, x - 1)
        elif action == 1:
            x = min(last, x + 1)
        elif action == 2:
            y = max(0, y - 1)
        elif action == 3:
            y = min(last, y + 1)

        self.pos = (x, y)

        # The goal is checked first so it wins if it coincides with the obstacle.
        if self.pos == self.goal:
            return self.pos, 10, True
        if self.pos == self.obstacle:
            return self.pos, -5, False
        return self.pos, -1, False

In [7]:
# ------------------------
# Q-Learning
# ------------------------
env = RobotEnv()

# One action-value per (x, y, action). The table is sized from the
# environment so it always matches the grid instead of assuming 5 x 5.
N_ACTIONS = 4
Q = np.zeros((env.size, env.size, N_ACTIONS))

alpha = 0.1      # step size applied to each temporal-difference update
gamma = 0.9      # discount applied to the best next-state value
epsilon = 0.3    # probability of picking a random action
episodes = 2000
MAX_STEPS = 50   # per-episode step cap so an episode cannot run forever

# Seed the only source of randomness used below so that
# Restart Kernel -> Run All reproduces the same Q-table.
SEED = 0
random.seed(SEED)

for _ in range(episodes):
    state = env.reset()
    done = False

    for step in range(MAX_STEPS):
        x, y = state

        # Epsilon-greedy: explore with probability epsilon, otherwise exploit.
        if random.random() < epsilon:
            action = random.randrange(N_ACTIONS)
        else:
            action = int(np.argmax(Q[x, y]))

        next_state, reward, done = env.step(action)
        nx, ny = next_state

        # Q-learning update. A terminal transition has no future return,
        # so only non-terminal steps bootstrap from the next state.
        target = reward if done else reward + gamma * np.max(Q[nx, ny])
        Q[x, y, action] += alpha * (target - Q[x, y, action])

        state = next_state

        if done:
            break


In [8]:

# ------------------------
# Test the trained robot
# ------------------------
# Greedy rollout: from the start cell, always take the highest-valued action
# until the environment reports the goal or the step budget is used up.
state = env.reset()
path = [state]
done = False

for step in range(MAX_STEPS):  # step cap guards against a policy that cycles
    row, col = state
    best_action = np.argmax(Q[row, col])
    state, reward, done = env.step(best_action)
    path += [state]
    if done:
        break

# Show the full Q-table, then the sequence of visited cells.
print("Learned Q-values:", Q, sep="\n")

print("\nRobot’s optimal learned path:", path, sep="\n")


Learned Q-values:
[[[-1.39065582 -0.43406231 -1.39065581 -0.434062  ]
  [-0.4340621   0.62881827 -1.39065584  0.62882   ]
  [ 0.62881976  1.8098     -0.43406286  1.80979985]
  [ 1.61474061  3.122       0.44875564  2.77923595]
  [ 0.70796055  4.57193866  0.72450409  1.37982998]]

 [[-1.46181154 -1.21039337 -0.83238872  0.62881999]
  [-0.47997571  1.32379574 -0.45431471  1.8098    ]
  [ 0.62881989 -0.8780001   0.62881997  3.122     ]
  [ 1.80979944  4.58        1.80979997  4.57999998]
  [ 2.91538254  6.2         3.0422935   4.40749729]]

 [[-1.74757095 -1.80233902 -1.6051075   0.99082361]
  [-0.81069234  3.03217112 -1.42269484 -1.75500186]
  [ 1.55035614  4.22609884  0.56882622  4.58      ]
  [ 3.12199998  6.19999981 -0.87800016  6.2       ]
  [ 4.57999999  8.          4.58        6.19999999]]

 [[-1.22997316 -1.15077802 -1.33570736  1.17696124]
  [-1.00756789  4.55677847 -0.99523064  2.17379054]
  [-1.32487744  6.19306694  0.75140027  3.15476075]
  [ 4.46461676  8.          4.07788308  