In [1]:
import numpy as np
import random

In [2]:
# Environment: a square grid world whose cells are addressed as (row, col).
GRID_SIZE = 4
GOAL = (3, 3)
START = (0, 0)

# Hyperparameters
ALPHA = 0.1      # Learning rate
GAMMA = 0.9      # Discount factor
EPSILON = 0.2    # Exploration rate
EPISODES = 100
MAX_STEPS = 20   # Step cap per training episode

# Reproducibility: every stochastic choice in this notebook goes through the
# stdlib `random` module, so seeding it here makes Restart & Run All repeatable.
SEED = 42
random.seed(SEED)

# Fail fast if the grid is resized without moving the goal/start with it.
assert all(0 <= coord < GRID_SIZE for coord in GOAL), "GOAL lies outside the grid"
assert all(0 <= coord < GRID_SIZE for coord in START), "START lies outside the grid"

# Q-table: maps (state, action) -> Q-value. Re-running this cell resets learning.
q_table = {}

In [3]:
def get_state_key(x, y):
    """Pack grid coordinates into the tuple used as a state key in the Q-table."""
    key = x, y
    return key

def get_q_value(state, action):
    """Return the stored Q-value for (state, action), or 0.0 if never set."""
    key = (state, action)
    if key in q_table:
        return q_table[key]
    return 0.0

def set_q_value(state, action, value):
    """Store `value` as the Q-value for (state, action) in the global Q-table."""
    key = (state, action)
    q_table[key] = value

def select_action(state, epsilon):
    """Pick an action index (0-3) for `state` using an epsilon-greedy rule.

    With probability `epsilon` a uniformly random action is returned;
    otherwise the action with the highest Q-value is returned. Ties go to
    the lowest action index.
    """
    explore = random.random() < epsilon
    if explore:
        return random.randint(0, 3)

    # Greedy scan: only a strictly larger value replaces the current best,
    # so the earliest action wins ties.
    best_action = 0
    best_value = get_q_value(state, 0)
    for candidate in (1, 2, 3):
        candidate_value = get_q_value(state, candidate)
        if candidate_value > best_value:
            best_action, best_value = candidate, candidate_value
    return best_action

def execute_action(x, y, action):
    """Apply `action` to position (x, y) and return the resulting (x, y).

    Actions index the offsets 0=Up, 1=Down, 2=Left, 3=Right. A move that
    would leave the grid is clamped, so the agent stays on the edge cell.
    """
    deltas = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # Up, Down, Left, Right
    step_x, step_y = deltas[action]
    last_index = GRID_SIZE - 1

    # Keep each coordinate inside [0, GRID_SIZE - 1]
    new_x = max(0, min(last_index, x + step_x))
    new_y = max(0, min(last_index, y + step_y))
    return new_x, new_y

def train_episode():
    """Run one Q-learning episode, updating the global Q-table in place.

    The episode starts from a uniformly random non-goal cell and runs until
    the goal is reached or MAX_STEPS steps have been taken. Actions are
    chosen epsilon-greedily using EPSILON.

    The goal is handled as a terminal state: an episode never starts on it,
    and a transition into it uses the bare reward as its update target
    rather than bootstrapping from Q-values stored for the goal cell.

    Returns:
        None. All learning happens through set_q_value.
    """
    # Starting on the goal would make the agent act from a terminal state and
    # learn Q-values for it (e.g. +10 for bumping the wall and "re-entering"
    # the goal), which would then inflate every target that bootstraps from it.
    start_cells = [
        (row, col)
        for row in range(GRID_SIZE)
        for col in range(GRID_SIZE)
        if (row, col) != GOAL
    ]
    if not start_cells:
        return  # No non-goal cell exists, so there is nothing to learn.
    x, y = random.choice(start_cells)

    for _ in range(MAX_STEPS):
        state = get_state_key(x, y)
        action = select_action(state, EPSILON)

        # Execute action
        nx, ny = execute_action(x, y, action)
        next_state = get_state_key(nx, ny)

        # Reward: +10 for entering the goal, small step cost otherwise
        reached_goal = (nx, ny) == GOAL
        reward = 10 if reached_goal else -0.1

        # Q-learning update; a terminal next state has no future value
        old_q = get_q_value(state, action)
        if reached_goal:
            max_next_q = 0.0
        else:
            max_next_q = max(get_q_value(next_state, a) for a in range(4))
        new_q = old_q + ALPHA * (reward + GAMMA * max_next_q - old_q)
        set_q_value(state, action, new_q)

        # Move to next state and stop once the goal is reached
        x, y = nx, ny
        if reached_goal:
            break

def test_agent(max_steps=15):
    """Roll out the greedy policy from START and return the visited cells.

    Args:
        max_steps: Upper bound on the number of moves. The default of 15
            matches the previously hard-coded cap; raise it for larger grids.

    Returns:
        List of (x, y) positions beginning with START. The last entry is
        GOAL only if the agent got there within `max_steps` moves.
    """
    x, y = START
    path = [(x, y)]

    for _ in range(max_steps):
        if (x, y) == GOAL:
            break

        state = get_state_key(x, y)
        action = select_action(state, 0)  # epsilon=0: always exploit
        x, y = execute_action(x, y, action)
        path.append((x, y))

    return path

In [4]:
# --- Training loop: one call per episode, progress line every 20 episodes ---
print("Training...")
for episode in range(EPISODES):
    train_episode()
    if (episode + 1) % 20 != 0:
        continue
    print(f"Episode {episode + 1}/{EPISODES}")

print("\nTraining complete!")

# --- Greedy rollout from START using the learned Q-table ---
print("\nTesting agent path:")
path = test_agent()
print(f"Start: {START}, Goal: {GOAL}")
print(f"Path: {' -> '.join(str(p) for p in path)}")
print(f"Steps taken: {len(path) - 1}")

# --- Text rendering of the grid: G = goal, * = visited cell, . = untouched ---
print("\nGrid visualization:")
visited = set(path)
for i in range(GRID_SIZE):
    row_text = ""
    for j in range(GRID_SIZE):
        cell = (i, j)
        if cell == GOAL:
            symbol = "G"
        elif cell in visited:
            symbol = "*"
        else:
            symbol = "."
        # Each symbol is followed by one space, including the last in a row
        row_text += symbol + " "
    print(row_text)

Training...
Episode 20/100
Episode 40/100
Episode 60/100
Episode 80/100
Episode 100/100

Training complete!

Testing agent path:
Start: (0, 0), Goal: (3, 3)
Path: (0, 0) -> (1, 0) -> (1, 1) -> (1, 2) -> (1, 3) -> (2, 3) -> (3, 3)
Steps taken: 6

Grid visualization:
* . . . 
* * * * 
. . . * 
. . . G 
