In [4]:
import numpy as np
import matplotlib.pyplot as plt

In [20]:
# Grid World Environment
class GridWorld:
    """Square grid environment with a fixed start cell and a fixed goal cell.

    States are ``(row, col)`` tuples. The agent starts at ``(0, 0)`` and the
    goal is the cell ``(size - 1, size - 1)``. Moves that would leave the grid
    are clamped, so the agent stays on the border cell.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, size=4):
        """Create a ``size`` x ``size`` grid with the agent on the start cell.

        Args:
            size: Number of rows and columns; must be at least 1.

        Raises:
            ValueError: If ``size`` is smaller than 1 (the start cell would
                not exist and ``render`` would fail later).
        """
        if size < 1:
            raise ValueError(f"size must be at least 1, got {size}")

        self.size = size
        self.goal = (size - 1, size - 1)  # Bottom-right corner is the goal
        self.actions = ['up', 'down', 'left', 'right']  # Possible actions
        self.state = (0, 0)  # Start at top-left corner

    def reset(self):
        """Reset the agent to the starting position and return that state."""
        self.state = (0, 0)
        return self.state

    def step(self, action):
        """Move the agent one cell and report the outcome.

        Args:
            action: One of ``'up'``, ``'down'``, ``'left'``, ``'right'``.
                ``'up'``/``'down'`` change the first coordinate,
                ``'left'``/``'right'`` the second.

        Returns:
            A ``(state, reward, done)`` tuple. ``reward`` is 10 and ``done``
            is True when the new state equals the goal; otherwise the reward
            is -1 and ``done`` is False.

        Raises:
            ValueError: If ``action`` is not one of the four supported moves.
                The state is left untouched in that case.
        """
        x, y = self.state

        # Perform the action, clamping at the grid borders.
        if action == 'up':
            x = max(x - 1, 0)
        elif action == 'down':
            x = min(x + 1, self.size - 1)
        elif action == 'left':
            y = max(y - 1, 0)
        elif action == 'right':
            y = min(y + 1, self.size - 1)
        else:
            # An unknown action used to be a silent no-op that still cost -1,
            # which hides typos in the caller; fail loudly instead.
            raise ValueError(
                f"Unknown action {action!r}; expected one of {self.actions}"
            )

        self.state = (x, y)

        # Check if the goal is reached
        if self.state == self.goal:
            reward = 10  # Reward for reaching the goal
            done = True  # Episode ends
        else:
            reward = -1  # Small penalty for each step
            done = False

        return self.state, reward, done

    # Render the Grid World
    def render(self):
        """Print the grid as text.

        Empty cells are ".", the goal is "G" and the agent is "A". The agent
        is drawn last, so it covers the goal marker when standing on it.
        """
        grid = np.full((self.size, self.size), ".", dtype=str)  # Initialize grid with "."
        grid[self.goal] = "G"  # Mark the goal
        grid[self.state] = "A"  # Mark the agent's position

        # Print the grid
        print("\n".join(" ".join(row) for row in grid))
        print()

# Q-Learning Algorithm
class QLearning:
    """Tabular Q-learning agent for a square grid environment.

    The value table has one entry per (row, column, action), where the action
    axis follows the order of ``env.actions``.
    """

    def __init__(self, env, alpha=0.1, gamma=0.9, epsilon=0.1):
        """Store the hyper-parameters and create an all-zero value table.

        Args:
            env: Environment exposing ``size`` and ``actions``.
            alpha: Learning rate.
            gamma: Discount factor.
            epsilon: Probability of picking a random action.
        """
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        table_shape = (env.size, env.size, len(env.actions))
        self.q_table = np.zeros(table_shape)

    def choose_action(self, state):
        """Return an action for ``state`` using an epsilon-greedy rule."""
        should_explore = np.random.uniform(0, 1) < self.epsilon
        if should_explore:
            # Exploration branch: any action, uniformly at random.
            return np.random.choice(self.env.actions)

        # Exploitation branch: the action with the largest stored value.
        row, col = state
        best_index = np.argmax(self.q_table[row, col])
        return self.env.actions[best_index]

    def update_q_table(self, state, action, reward, next_state):
        """Apply one Q-learning update for the observed transition.

        The entry for (``state``, ``action``) is moved a fraction ``alpha``
        of the way toward ``reward + gamma * max(Q[next_state])``.
        """
        row, col = state
        next_row, next_col = next_state
        action_pos = self.env.actions.index(action)

        current = self.q_table[row, col, action_pos]
        best_next = np.max(self.q_table[next_row, next_col])
        td_target = reward + self.gamma * best_next

        self.q_table[row, col, action_pos] = current + self.alpha * (td_target - current)



# Training the Agent
def train_agent(env, agent, episodes=1000, render_every=100):
    """Train ``agent`` on ``env`` for a number of complete episodes.

    Every episode starts from ``env.reset()`` and runs until ``env.step``
    reports ``done``. After each step the observed transition is handed to
    ``agent.update_q_table``.

    Args:
        env: Environment providing ``reset()``, ``step(action)`` returning
            ``(next_state, reward, done)``, and ``render()``.
        agent: Learner providing ``choose_action(state)`` and
            ``update_q_table(state, action, reward, next_state)``.
        episodes: Number of episodes to run.
        render_every: Print a progress line and render the environment after
            every this-many episodes. ``0`` or ``None`` turns the progress
            output off instead of failing on the modulo.
    """
    for episode in range(episodes):
        state = env.reset()
        done = False

        while not done:
            action = agent.choose_action(state)  # Choose action
            next_state, reward, done = env.step(action)  # Take action
            agent.update_q_table(state, action, reward, next_state)  # Update Q-table
            state = next_state

        # Rendering happens after the episode has ended, so the picture shows
        # the final state of that episode.
        if render_every and (episode + 1) % render_every == 0:
            print(f"Episode {episode + 1} completed")
            env.render()


In [16]:
# Testing the Agent
def test_agent(env, agent):
    state = env.reset()
    done = False
    steps = 0

    print("Testing the agent...")
    while not done:
        action = agent.choose_action(state)  # Choose action (no exploration)
        next_state, reward, done = env.step(action)
        print(f"State: {state}, Action: {action}, Next State: {next_state}, Reward: {reward}")
        state = next_state
        steps += 1

    print(f"Goal reached in {steps} steps!")
    env.render()

In [21]:
# Seed NumPy's global RNG, which the agent uses for exploration, so that the
# printed run is reproducible after Restart Kernel -> Run All.
np.random.seed(42)

env = GridWorld(size=4)
agent = QLearning(env)

print("Training the agent...")
train_agent(env, agent, episodes=1000, render_every=100)

# test_agent prints its own "Testing the agent..." header, so only emit the
# separating blank line here; printing the header again duplicated it.
print()
test_agent(env, agent)

Training the agent...
Episode 100 completed
. . . .
. . . .
. . . .
. . . A

Episode 200 completed
. . . .
. . . .
. . . .
. . . A

Episode 300 completed
. . . .
. . . .
. . . .
. . . A

Episode 400 completed
. . . .
. . . .
. . . .
. . . A

Episode 500 completed
. . . .
. . . .
. . . .
. . . A

Episode 600 completed
. . . .
. . . .
. . . .
. . . A

Episode 700 completed
. . . .
. . . .
. . . .
. . . A

Episode 800 completed
. . . .
. . . .
. . . .
. . . A

Episode 900 completed
. . . .
. . . .
. . . .
. . . A

Episode 1000 completed
. . . .
. . . .
. . . .
. . . A


Testing the agent...
Testing the agent...
State: (0, 0), Action: right, Next State: (0, 1), Reward: -1
State: (0, 1), Action: right, Next State: (0, 2), Reward: -1
State: (0, 2), Action: down, Next State: (1, 2), Reward: -1
State: (1, 2), Action: down, Next State: (2, 2), Reward: -1
State: (2, 2), Action: down, Next State: (3, 2), Reward: -1
State: (3, 2), Action: right, Next State: (3, 3), Reward: 10
Goal reached in 6 steps!
. . . .
. . . .
. . . .
. . . A

## References

- [Grid World representation for a neural network](https://stackoverflow.com/questions/36850302/grid-world-representation-for-a-neural-network)
- [Fundamentals of Reinforcement Learning: Navigating Gridworld with Dynamic Programming](https://medium.com/gradientcrescent/fundamentals-of-reinforcement-learning-navigating-gridworld-with-dynamic-programming-9b98a6f20310)