In [1]:
import numpy as np
import random



In [3]:
# Grid World parameters
GRID_SIZE = 5  # Side length of the square grid (cells are indexed 0..GRID_SIZE-1)
START = (0, 0)  # Cell where every training episode begins
GOAL = (4, 4)  # Terminal cell; reaching it ends the episode
OBSTACLE_COUNT = 3  # Number of random obstacles
ACTIONS = ['UP', 'DOWN', 'LEFT', 'RIGHT']  # Order defines the action axis of the Q-table
EPISODES = 500  # Training episodes
ALPHA = 0.1  # Learning rate
GAMMA = 0.9  # Discount factor
EPSILON = 0.1  # Exploration factor

# Reproducibility: obstacle placement and epsilon-greedy exploration both draw
# from the `random` module, so fix its seed to get the same run after a kernel
# restart. NumPy's global generator is seeded as well in case other cells use it.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)



In [5]:
# Initialize Q-table
# Shape is (GRID_SIZE, GRID_SIZE, len(ACTIONS)): one value per (cell, action)
# pair, all starting at zero. The last axis follows the order of ACTIONS.
Q_table = np.zeros((GRID_SIZE, GRID_SIZE, len(ACTIONS)))



In [7]:
# Place obstacles at random locations
def generate_obstacles(count=None, grid_size=None, start=None, goal=None):
    """Pick distinct random obstacle cells, never the start or the goal cell.

    Every argument is optional and falls back to the matching module constant,
    so a plain ``generate_obstacles()`` call keeps working.

    Args:
        count: Number of obstacles to place; defaults to OBSTACLE_COUNT.
            Values of zero or less yield an empty set.
        grid_size: Side length of the square grid; defaults to GRID_SIZE.
        start: Cell that must stay free; defaults to START.
        goal: Cell that must stay free; defaults to GOAL.

    Returns:
        A set of ``(x, y)`` tuples containing exactly ``count`` cells.

    Raises:
        ValueError: If ``count`` is larger than the number of free cells.
            (Rejection sampling would loop forever in that situation.)
    """
    count = OBSTACLE_COUNT if count is None else count
    grid_size = GRID_SIZE if grid_size is None else grid_size
    start = START if start is None else start
    goal = GOAL if goal is None else goal

    # Enumerate every cell an obstacle is allowed to occupy.
    reserved = (start, goal)
    free_cells = [
        (x, y)
        for x in range(grid_size)
        for y in range(grid_size)
        if (x, y) not in reserved
    ]

    if count > len(free_cells):
        raise ValueError(
            f"Cannot place {count} obstacles: only {len(free_cells)} free cells "
            f"on a {grid_size}x{grid_size} grid"
        )

    # Sampling without replacement guarantees distinct cells and always
    # terminates, unlike retrying random draws until enough are collected.
    return set(random.sample(free_cells, max(count, 0)))

# Sample the obstacle layout once; get_reward() looks cells up in this set.
OBSTACLES = generate_obstacles()



In [11]:
# Reward function
def get_reward(state):
    """Score the cell the agent has just entered.

    Returns 10 when ``state`` equals GOAL, -5 when it is one of OBSTACLES,
    and -1 for any other cell. The goal check takes precedence.
    """
    if state == GOAL:
        return 10
    hit_obstacle = state in OBSTACLES
    # Ordinary moves still cost a little so that shorter paths score higher.
    return -5 if hit_obstacle else -1



In [15]:
# Get next state based on action
def get_next_state(state, action):
    """Return the cell reached by taking ``action`` from ``state``.

    'UP'/'DOWN' decrease/increase the first coordinate and 'LEFT'/'RIGHT'
    decrease/increase the second one. A move that would leave the grid is
    clamped at the border, so the agent stays where it is. Any other action
    value leaves the position unchanged.
    """
    first, second = state
    if action == 'UP':
        return (max(0, first - 1), second)
    if action == 'DOWN':
        return (min(GRID_SIZE - 1, first + 1), second)
    if action == 'LEFT':
        return (first, max(0, second - 1))
    if action == 'RIGHT':
        return (first, min(GRID_SIZE - 1, second + 1))
    return (first, second)



In [17]:
# Choose action using epsilon-greedy strategy
def choose_action(state):
    """Pick an action name for ``state`` with an epsilon-greedy policy.

    With probability EPSILON a uniformly random entry of ACTIONS is returned
    (explore). Otherwise the action with the largest Q-value for this cell is
    returned (exploit); on ties np.argmax selects the earliest action.
    """
    explore = random.uniform(0, 1) < EPSILON
    if explore:
        return random.choice(ACTIONS)

    cell_values = Q_table[state[0], state[1]]
    greedy_index = np.argmax(cell_values)
    return ACTIONS[greedy_index]



In [21]:
# Train the agent
for episode in range(EPISODES):
    # Each episode restarts at START and runs until the agent reaches GOAL.
    state = START
    while state != GOAL:
        # Act epsilon-greedily, then observe the resulting cell and its reward.
        action = choose_action(state)
        next_state = get_next_state(state, action)
        reward = get_reward(next_state)

        # Temporal-difference target: the immediate reward plus the discounted
        # best value currently estimated for the next cell.
        action_index = ACTIONS.index(action)
        best_next_q = np.max(Q_table[next_state[0], next_state[1]])
        td_target = reward + GAMMA * best_next_q

        # Move the current estimate a fraction ALPHA of the way to the target.
        td_error = td_target - Q_table[state[0], state[1], action_index]
        Q_table[state[0], state[1], action_index] += ALPHA * td_error

        # Continue the episode from the cell just entered.
        state = next_state



In [23]:
# Print final Q-table
# One call emits the heading and the learned values; the newline separator
# puts the array on the line below the heading.
print("Trained Q-table:", Q_table, sep="\n")


Trained Q-table:
[[[-1.43326060e+00 -4.34062000e-01 -1.47999834e+00 -5.15679173e-01]
  [-2.35027744e+00  6.27373967e-01 -2.54375246e+00 -2.99874539e+00]
  [-9.95000000e-01 -8.38077602e-01 -7.52695402e-01 -7.08772010e-01]
  [-6.79346521e-01 -7.48869959e-01 -9.93557030e-01 -1.38173900e+00]
  [-5.00000000e-01 -3.05830900e-01 -2.43910000e-01 -5.00000000e-01]]

 [[-1.49545090e+00  3.59062498e-01 -4.71823790e-01  6.28820000e-01]
  [-5.72039275e-01  1.80980000e+00 -4.61090657e-01 -2.27761694e+00]
  [-1.85855907e+00  3.12138078e+00 -4.37362944e-01 -6.04116054e-01]
  [-5.43657824e-01  5.51979133e-01 -1.36481000e+00 -5.47626203e-01]
  [-9.50000000e-01 -2.52860657e-01 -4.26010253e-01 -4.83089539e-01]]

 [[-1.55501225e+00 -1.81232153e+00 -9.22399656e-01  1.80854969e+00]
  [ 6.05120511e-01  3.07111125e+00  5.90642516e-01  3.12200000e+00]
  [-2.20422004e+00  4.58000000e+00  1.77771476e+00  3.88929262e+00]
  [-3.38075004e-01  6.05456478e+00 -5.47199399e-01 -3.28122100e-01]
  [-1.90000000e-01  2.72999