### Import libraries

In [1]:
import numpy as np
import random

### Define GridWorld Environment

In [16]:
class GridWorld:
    """Square grid in which an agent walks from the top-left to the bottom-right cell.

    Positions are ``(row, col)`` tuples. The agent starts at ``(0, 0)`` after
    every reset and the goal is the cell ``(size - 1, size - 1)``.
    """

    def __init__(self, size=5):
        """Build a ``size`` x ``size`` grid (5x5 by default) with the agent at the start."""
        self.size = size
        self.goal = (size - 1, size - 1)
        self.reset()

    def reset(self):
        """Put the agent back on ``(0, 0)`` and return that position."""
        start = (0, 0)
        self.agent_pos = start
        return start

    def step(self, action):
        """Apply one action and return ``(new_pos, reward, done)``.

        Actions: 0 = Up, 1 = Down, 2 = Left, 3 = Right. A move that would leave
        the grid is clamped at the wall, and any other action value leaves the
        agent where it is.

        The reward is 10 when the move lands on the goal from a different cell
        and -1 otherwise (including moves that do not change the position).
        ``done`` is True whenever the resulting position is the goal.
        """
        row, col = self.agent_pos
        last = self.size - 1

        # (action code, resulting cell) pairs; each move is clamped at its wall.
        candidates = (
            (0, (max(row - 1, 0), col)),     # Up
            (1, (min(row + 1, last), col)),  # Down
            (2, (row, max(col - 1, 0))),     # Left
            (3, (row, min(col + 1, last))),  # Right
        )

        new_pos = (row, col)
        for code, target in candidates:
            if action == code:
                new_pos = target
                break

        moved = new_pos != self.agent_pos
        done = new_pos == self.goal
        # Only actually *entering* the goal pays out; everything else costs a step.
        reward = 10 if (moved and done) else -1

        self.agent_pos = new_pos
        return new_pos, reward, done

    def state_to_index(self, pos):
        """Flatten a ``(row, col)`` position into a row-major integer index."""
        row_offset = pos[0] * self.size
        return row_offset + pos[1]


### Define Q-learning Agent

In [17]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The agent keeps one row of action values per grid cell
    (``env.size * env.size`` states) and one column per action (4).

    Parameters
    ----------
    env : object
        Environment exposing ``size``, ``reset()``, ``step(action)`` returning
        ``(next_pos, reward, done)``, and ``state_to_index(pos)``.
    alpha : float
        Learning rate used in the Q-value update.
    gamma : float
        Discount factor applied to the bootstrapped next-state value.
    epsilon : float
        Initial probability of taking a random (exploratory) action.
    epsilon_decay : float
        Multiplicative factor applied to ``epsilon`` after each episode.
    epsilon_min : float
        Floor that decay will not push ``epsilon`` below.
    """

    def __init__(self, env, alpha=0.1, gamma=0.9, epsilon=1.0,
                 epsilon_decay=0.995, epsilon_min=0.1):
        self.env = env
        self.q_table = np.zeros((env.size * env.size, 4))  # states = 25 for 5x5, 4 actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

    def choose_action(self, state_index):
        """Return an action index for ``state_index`` using epsilon-greedy selection.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value (first one on ties).
        The result is always a plain Python ``int``.
        """
        if random.uniform(0, 1) < self.epsilon:
            # Derive the range from the table so it cannot drift from its width.
            return random.randint(0, self.q_table.shape[1] - 1)
        return int(np.argmax(self.q_table[state_index]))

    def learn(self, state, action, reward, next_state, done=False):
        """Apply one Q-learning update for the transition just observed.

        ``Q[s, a] += alpha * (reward + gamma * max_a' Q[s', a'] - Q[s, a])``

        When ``done`` is True the transition ended the episode, so there is no
        future return and the bootstrap term is dropped.
        """
        predict = self.q_table[state][action]
        future = 0.0 if done else np.max(self.q_table[next_state])
        target = reward + self.gamma * future
        self.q_table[state][action] += self.alpha * (target - predict)

    def train(self, episodes=500, max_steps=50, log_every=50):
        """Run ``episodes`` training episodes against ``self.env``.

        Each episode ends when the environment reports ``done`` or after
        ``max_steps`` steps. ``epsilon`` is decayed once per episode and never
        drops below ``epsilon_min`` as a result of that decay. Progress is
        printed every ``log_every`` episodes (pass 0 or None to silence it).
        """
        for episode in range(episodes):
            state_pos = self.env.reset()
            state = self.env.state_to_index(state_pos)
            done = False
            step = 0

            while not done and step < max_steps:
                action = self.choose_action(state)
                next_pos, reward, done = self.env.step(action)
                next_state = self.env.state_to_index(next_pos)

                # Hitting the step cap is a truncation, not a terminal state,
                # so only a genuine ``done`` suppresses bootstrapping.
                self.learn(state, action, reward, next_state, done)

                state = next_state
                step += 1

            if self.epsilon > self.epsilon_min:
                # Clamp so the last decay step cannot undershoot the floor.
                self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

            if log_every and episode % log_every == 0:
                print(f"Episode {episode} complete, Epsilon: {self.epsilon:.3f}")

### Train the agent

In [18]:
# Seed Python's RNG so the epsilon-greedy exploration, and therefore the
# learned Q-table, is the same on every Restart & Run All.
SEED = 42
random.seed(SEED)

env = GridWorld()   # Create 5x5 grid environment
agent = QLearningAgent(env)

print("Training started for 5x5 grid...")
agent.train(episodes=500)  # Train agent

Training started for 5x5 grid...
Episode 0 complete, Epsilon: 0.995
Episode 50 complete, Epsilon: 0.774
Episode 100 complete, Epsilon: 0.603
Episode 150 complete, Epsilon: 0.469
Episode 200 complete, Epsilon: 0.365
Episode 250 complete, Epsilon: 0.284
Episode 300 complete, Epsilon: 0.221
Episode 350 complete, Epsilon: 0.172
Episode 400 complete, Epsilon: 0.134
Episode 450 complete, Epsilon: 0.104


### Save Q-table

In [19]:
# Persist the learned action-value table in NumPy's .npy format so it can be
# reloaded later with np.load("q_table.npy").
np.save(file="q_table.npy", arr=agent.q_table)
print("Training complete. Q-table saved to 'q_table.npy'.")

Training complete. Q-table saved to 'q_table.npy'.
