In [1]:
import numpy as np
import pygame
import time

# Map layout, one string per row. 'S' is the start cell, 'G' the goal,
# 'H' a hole, and 'F' a plain traversable cell.
gridworld = np.array([list(row) for row in ("SFFF", "FHFH", "FFFH", "HFFG")])

# Reward associated with each cell type.
rewards = dict(S=0, G=1, H=-1, F=0)

def get_next_state(state, action, grid=None):
    """Return the cell reached by taking ``action`` from ``state``.

    Moves are deterministic and clamped at the grid border, so stepping
    into a wall leaves the position unchanged.

    Args:
        state: ``(row, col)`` pair of the current cell.
        action: One of ``'up'``, ``'down'``, ``'left'`` or ``'right'``.
            Any other value leaves the position unchanged.
        grid: Optional 2-D array whose shape bounds the move.  Defaults to
            the module-level ``gridworld`` so existing two-argument calls
            behave exactly as before.

    Returns:
        The ``(row, col)`` tuple of the resulting cell.
    """
    if grid is None:
        grid = gridworld
    i, j = state
    if action == 'up':
        i = max(i - 1, 0)
    elif action == 'down':
        i = min(i + 1, grid.shape[0] - 1)
    elif action == 'left':
        j = max(j - 1, 0)
    elif action == 'right':
        j = min(j + 1, grid.shape[1] - 1)
    return (i, j)

# Row/column offset applied by each action.
_MC_MOVES = {
    'up': (-1, 0),
    'down': (1, 0),
    'left': (0, -1),
    'right': (0, 1),
}


def _mc_step(state, action, shape):
    """Return the cell reached from ``state`` by ``action``, clamped to ``shape``."""
    d_row, d_col = _MC_MOVES[action]
    row = min(max(state[0] + d_row, 0), shape[0] - 1)
    col = min(max(state[1] + d_col, 0), shape[1] - 1)
    return (row, col)


def monte_carlo(grid, rewards, num_episodes, gamma):
    """Estimate state values with every-visit Monte Carlo rollouts.

    Each episode starts in cell ``(0, 0)`` and picks one of the four moves
    uniformly at random until it enters a ``'G'`` or ``'H'`` cell.  Moves are
    bounded by the shape of ``grid`` itself (not by any module-level grid),
    so the function works for any 2-D layout passed in.

    Args:
        grid: 2-D array of cell-type labels; ``'G'`` and ``'H'`` cells end
            an episode.
        rewards: Mapping from cell-type label to the reward received on
            entering a cell of that type.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        Float array with the same shape as ``grid`` holding the running
        mean of the discounted returns observed from each cell.  Cells that
        were never visited, and terminal cells (which are never updated),
        keep the value 0.

    Note:
        An episode only ends on a ``'G'`` or ``'H'`` cell, so a grid where
        none is reachable from ``(0, 0)`` makes this loop forever.
    """
    actions = ['up', 'down', 'left', 'right']
    state_values = np.zeros(grid.shape)
    state_counts = np.zeros(grid.shape)

    for _ in range(num_episodes):
        episode = []  # (state, reward) pairs in visit order
        state = (0, 0)  # Starting state

        while grid[state] not in ('G', 'H'):
            action = np.random.choice(actions)
            next_state = _mc_step(state, action, grid.shape)
            episode.append((state, rewards[grid[next_state]]))
            state = next_state

        # MC update: walk the episode backwards so the discounted return
        # can be accumulated in a single pass.
        G = 0
        for visited, reward in reversed(episode):
            G = gamma * G + reward
            state_counts[visited] += 1
            # Incremental mean over every visit to this cell.
            error = G - state_values[visited]
            state_values[visited] += error / state_counts[visited]

    return state_values

def greedy_action(state, state_values, grid):
    """Pick the move leading to the highest-valued neighbouring cell.

    Moves into ``'H'`` cells are never chosen while an alternative exists.
    Moves that are clamped at the border and therefore leave the agent in
    place are only used when no real move is available: otherwise a cell
    whose own value beats its neighbours would be selected again and again
    and the agent would never advance.

    Args:
        state: ``(row, col)`` pair of the current cell.
        state_values: 2-D array of value estimates indexed by ``[row, col]``.
        grid: 2-D array of cell-type labels used to detect holes.

    Returns:
        The chosen action name.  Ties go to the first action in the order
        up, down, left, right.  If every move ends in a hole, a uniformly
        random action is returned.

    Note:
        Successor cells come from ``get_next_state(state, action)``, which
        bounds moves by the module-level ``gridworld`` rather than ``grid``.
    """
    actions = ['up', 'down', 'left', 'right']
    current = tuple(state)
    best_value = -np.inf
    best_action = None
    stay_action = None  # first non-hole move that leaves the agent in place
    for action in actions:
        next_state = get_next_state(state, action)
        i, j = next_state
        if grid[next_state] == 'H':  # Avoid holes if possible
            continue
        if next_state == current:
            # Clamped at the border: no progress, keep only as a fallback.
            if stay_action is None:
                stay_action = action
            continue
        if state_values[i, j] > best_value:
            best_value = state_values[i, j]
            best_action = action
    if best_action is not None:
        return best_action
    if stay_action is not None:
        # Every real move ends in a hole; staying put is the safe option.
        return stay_action
    # If all neighbors are holes, pick one anyway
    return np.random.choice(actions)

# Estimate state values from Monte Carlo rollouts, then report them.
gamma = 0.999
num_episodes = 10000
mc_state_values = monte_carlo(
    grid=gridworld,
    rewards=rewards,
    num_episodes=num_episodes,
    gamma=gamma,
)
print("Monte Carlo State Values:\n", mc_state_values)

# ---- Visualization ----
# Replays the greedy policy derived from the learned state values in a
# pygame window, advancing one move every half second until the agent
# reaches a hole or the goal, or the window is closed.
cell_size = 100
rows, cols = gridworld.shape

colors = {
    'S': (0, 255, 0),
    'G': (255, 215, 0),
    'H': (255, 0, 0),
    'F': (200, 200, 200),
}
agent_color = (0, 0, 255)

pygame.init()
try:
    screen = pygame.display.set_mode((cols * cell_size, rows * cell_size))
    pygame.display.set_caption("Gridworld Simulation")

    state = (0, 0)
    running = True
    while running:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                running = False
        if not running:
            # Window closed: stop right away instead of drawing, sleeping
            # and moving the agent one more time.
            break

        # Draw grid
        for i in range(rows):
            for j in range(cols):
                rect = pygame.Rect(j * cell_size, i * cell_size, cell_size, cell_size)
                pygame.draw.rect(screen, colors[gridworld[i, j]], rect)
                pygame.draw.rect(screen, (0, 0, 0), rect, 2)

        # Draw agent, centred in its cell and scaled with the cell size
        # (centre offset 50 and radius 20 for the default cell_size of 100).
        center_x = state[1] * cell_size + cell_size // 2
        center_y = state[0] * cell_size + cell_size // 2
        pygame.draw.circle(screen, agent_color, (center_x, center_y), cell_size // 5)

        pygame.display.flip()
        time.sleep(0.5)

        if gridworld[state] == 'H':
            print("Robot fell into hole!")
            time.sleep(1)
            break

        if gridworld[state] == 'G':
            print("Goal reached!")
            time.sleep(1)
            break

        action = greedy_action(state, mc_state_values, gridworld)
        state = get_next_state(state, action)
finally:
    # Always release the display, even if drawing or stepping raised.
    pygame.quit()



pygame 2.6.1 (SDL 2.32.54, Python 3.11.13)
Hello from the pygame community. https://www.pygame.org/contribute.html
Monte Carlo State Values:
 [[-0.96449308 -0.97055509 -0.9502557  -0.97387664]
 [-0.96369223  0.         -0.92438193  0.        ]
 [-0.9330156  -0.83427111 -0.73605725  0.        ]
 [ 0.         -0.67508361 -0.10407832  0.        ]]
Goal reached!
