In [1]:
import numpy as np
import random

# -----------------------------
# Simple GridWorld Environment
# -----------------------------
class GridWorld:
    """Square grid world with a single agent.

    Positions are ``(row, col)`` tuples. The agent starts at ``(0, 0)`` and an
    episode is reported as finished once it stands on
    ``(size - 1, size - 1)``.
    """

    def __init__(self, size=4):
        """Build a ``size`` x ``size`` grid and put the agent on the start cell."""
        self.size = size
        self.start, self.goal = (0, 0), (size - 1, size - 1)
        self.reset()

    def reset(self):
        """Move the agent back to the start cell and return that position."""
        self.pos = self.start
        return self.pos

    def step(self, action):
        """Apply one move and report the outcome.

        Actions: 0 = up (row - 1), 1 = down (row + 1), 2 = left (col - 1),
        3 = right (col + 1). A move that would leave the grid is clamped, so
        the agent stays on the border cell. Any other action value leaves the
        position unchanged.

        Returns:
            ``(position, reward, done)``: reward is 1 and done is True when
            the resulting position equals the goal, otherwise 0 and False.
        """
        row, col = self.pos

        # Exactly one coordinate changes per action; max/min keep it in bounds.
        if action == 0:
            row = max(row - 1, 0)
        elif action == 1:
            row = min(row + 1, self.size - 1)
        elif action == 2:
            col = max(col - 1, 0)
        elif action == 3:
            col = min(col + 1, self.size - 1)

        self.pos = (row, col)

        # The reward is given only on the goal cell, which also ends the episode.
        done = bool(self.pos == self.goal)
        return self.pos, int(done), done

# -----------------------------
# Q-Learning Algorithm
# -----------------------------
# Fixed seed so that Restart & Run All reproduces the same Q-table.
# Only the stdlib `random` module is used for exploration below.
SEED = 0
random.seed(SEED)

env = GridWorld(size=4)

# Number of discrete actions understood by GridWorld.step:
# 0 = up, 1 = down, 2 = left, 3 = right.
n_actions = 4

# Q-table indexed as Q[row, col, action]; sized from the environment so that
# changing the grid size above does not silently break the indexing.
Q = np.zeros((env.size, env.size, n_actions))

alpha = 0.1      # learning rate
gamma = 0.9      # discount factor
epsilon = 0.3    # exploration probability
episodes = 2000

for _ in range(episodes):
    state = env.reset()
    done = False

    while not done:
        x, y = state

        # Epsilon-greedy action selection
        if random.random() < epsilon:
            action = random.randrange(n_actions)   # explore
        else:
            # np.argmax returns the first maximum, so ties resolve to the
            # lowest action index.
            action = int(np.argmax(Q[x, y]))       # exploit

        next_state, reward, done = env.step(action)
        nx, ny = next_state

        # Q-learning target: no bootstrapping from a terminal state, so the
        # update does not rely on Q[goal] happening to stay at zero.
        if done:
            target = reward
        else:
            target = reward + gamma * np.max(Q[nx, ny])

        # Q-learning update rule
        Q[x, y, action] += alpha * (target - Q[x, y, action])

        state = next_state

# -----------------------------
# Test the trained agent
# -----------------------------
state = env.reset()
done = False
path = [state]

print("Learned Q-values:")
print(Q)

# The greedy policy and the environment are both deterministic, so a rollout
# that has not finished after visiting as many cells as the grid contains has
# revisited a cell and would cycle forever. Cap the rollout instead of
# hanging the notebook on a poorly trained Q-table.
max_steps = env.size * env.size

for _ in range(max_steps):
    x, y = state
    action = int(np.argmax(Q[x, y]))   # always exploit during evaluation
    state, reward, done = env.step(action)
    path.append(state)
    if done:
        break

print("\nPath taken by the agent:")
print(path)

if not done:
    print("Warning: greedy policy did not reach the goal within", max_steps, "steps.")


Learned Q-values:
[[[0.53144099 0.58880048 0.531441   0.59049   ]
  [0.59049    0.65548617 0.531441   0.6561    ]
  [0.6561     0.72899998 0.59049    0.729     ]
  [0.72899999 0.81       0.6561     0.72899999]]

 [[0.40992949 0.30520302 0.43292838 0.65597117]
  [0.59049    0.64521947 0.54448622 0.72899907]
  [0.63371818 0.76128498 0.60086144 0.81      ]
  [0.729      0.9        0.729      0.81      ]]

 [[0.49125071 0.00730628 0.03697619 0.06985335]
  [0.23244184 0.03552389 0.09077878 0.80051481]
  [0.62383582 0.55591331 0.53722415 0.9       ]
  [0.81       1.         0.80999994 0.89999999]]

 [[0.11636897 0.         0.         0.        ]
  [0.24365956 0.00733798 0.01164024 0.        ]
  [0.29908555 0.11152262 0.01381338 0.86491483]
  [0.         0.         0.         0.        ]]]

Path taken by the agent:
[(0, 0), (0, 1), (0, 2), (0, 3), (1, 3), (2, 3), (3, 3)]
