In [1]:
# Q-learning from scratch on a 4x4 grid world.
#
# Cell codes, as the transition/reward code in this notebook reads them:
#   0 = open cell (a step onto it is rewarded -1)
#   1 = blocked cell (a move onto it is rejected)
#   2 = goal (+100, ends the episode)
#   3 = penalty cell (-100, ends the episode)
# NOTE(review): the agent's start (0, 0) itself holds a 1. Only the destination
# of a move is inspected, so the start can be left but never re-entered --
# confirm this is intended.
grid = [
    [int(code) for code in row]
    for row in ("1000", "0100", "0020", "0003")  # one string per grid row
]

print(grid)

[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 2, 0], [0, 0, 0, 3]]


In [10]:
# Moves are (d_row, d_col) offsets, in this order: Right, Down, Left, Up.
actions = [(0, 1), (1, 0), (0, -1), (-1, 0)]

# Every (row, col) coordinate of the 4x4 grid, in row-major order.
states = [(row, col) for row in range(4) for col in range(4)]

# Tabular Q-function: one entry per (state, action) pair, all starting at 0.
q_table = dict.fromkeys(((s, a) for s in states for a in actions), 0)


In [11]:
import random

# Parameters
alpha = 0.5 # Learning rate
gamma = 0.95 # Discount factor
epsilon = 0.1 # Exploration rate
num_episodes = 10000
random_seed = 42 # Fixed seed so a fresh Restart & Run All reproduces the same training run

# Training draws from the global `random` generator (exploration test and
# random action choice), so seed it before any episode runs.
random.seed(random_seed)

# Function to get the next state and reward
def get_next_state_and_reward(state, action):
    """Apply ``action`` to ``state`` on the module-level ``grid`` and score the move.

    Args:
        state: Current ``(row, col)`` position; used to index ``grid``.
        action: ``(d_row, d_col)`` offset added to ``state``.

    Returns:
        A ``(next_state, reward)`` tuple:

        * If the move leaves the grid or lands on a cell holding ``1``, the
          agent stays where it is: ``(state, -1)``.
        * Landing on a cell holding ``2`` yields ``+100``.
        * Landing on a cell holding ``3`` yields ``-100``.
        * Any other cell yields ``-1``.

        This function does not decide whether an episode is over; callers
        inspect the grid value of the returned state themselves.
    """
    row, col = state
    d_row, d_col = action
    next_row, next_col = row + d_row, col + d_col

    # Bounds are taken from the grid itself rather than a hard-coded 4, so the
    # same code works for grids of any size. The column check only runs once
    # the row index is known to be valid.
    in_bounds = 0 <= next_row < len(grid) and 0 <= next_col < len(grid[next_row])
    if not in_bounds or grid[next_row][next_col] == 1:
        return state, -1 # Rejected move: stay put and pay the step cost

    cell = grid[next_row][next_col]
    if cell == 2:
        reward = 100 # Reached the goal
    elif cell == 3:
        reward = -100 # Stepped onto the penalty cell
    else:
        reward = -1 # Ordinary step cost

    return (next_row, next_col), reward

# Q-Learning: tabular, epsilon-greedy training over `num_episodes` episodes
for episode in range(num_episodes):
    state = (0, 0) # Every episode begins in the top-left corner
    done = False

    while not done:
        # Epsilon-greedy selection: with probability epsilon pick a random
        # action, otherwise the highest-valued one for the current state
        # (max() keeps the earliest action in `actions` on ties).
        action = (
            random.choice(actions)
            if random.uniform(0, 1) < epsilon
            else max(actions, key=lambda a: q_table[(state, a)])
        )

        next_state, reward = get_next_state_and_reward(state, action)

        # Blend the old estimate with the one-step target
        # (reward + discounted best value available from the successor).
        next_max = max(q_table[(next_state, candidate)] for candidate in actions)
        old_value = q_table[(state, action)]
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[(state, action)] = new_value

        state = next_state

        # Cells holding 2 or 3 end the episode.
        done = grid[state[0]][state[1]] in (2, 3)


In [12]:
def test_agent():
    state = (0, 0)
    done = False
    path = []
    
    while not done:
        action = max(actions, key=lambda x: q_table[(state, x)])
        next_state, reward = get_next_state_and_reward(state, action)
        path.append(state)
        state = next_state
        
        if grid[state[0]][state[1]] == 2 or grid[state[0]][state[1]] == 3:
            done = True
    
    return path

# Roll out the learned greedy policy once from (0, 0) and show the states it visited
path = test_agent()
print("Path taken by the agent:", path)


Path taken by the agent: [(0, 0), (0, 1), (0, 2), (1, 2)]
