In [4]:
import numpy as np

In [5]:
# Map layout: each string is one grid row and each character is one cell.
# 'S' is the cell at (0, 0) and 'G' is the cell at (3, 3).
# NOTE(review): the letters look like the Frozen Lake convention
# (Start / Frozen / Hole / Goal) -- confirm.
gridworld = np.array([list(row) for row in ("SFFF", "FHFH", "FFFH", "HFFG")])

# Reward looked up by cell type.
rewards = dict(S=0, G=1, H=-1, F=0)

In [6]:
def monte_carlo(grid, rewards, num_episodes, gamma):
    """Estimate state values of a uniformly random policy with every-visit Monte Carlo.

    Each episode starts at cell (0, 0) and picks one of up/down/left/right
    uniformly at random until the agent stands on a 'G' cell. Moves that would
    leave the grid are clipped, so the agent stays where it is. The reward for
    a step is looked up from the type of the cell that is *entered*.

    Only 'G' ends an episode: every other cell type (including 'H') is walked
    through like any other cell, and there is no step limit, so ``grid`` must
    contain a 'G' cell reachable from (0, 0) or the function never returns.

    Args:
        grid: 2-D NumPy array of single-character cell types.
        rewards: Mapping from cell type to the reward for entering that cell.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        Array with the same shape as ``grid`` holding the running mean of the
        sampled returns for every cell. Cells that never start a step (for
        example 'G') keep the value 0.
    """
    # Boundaries come from the grid that was passed in, not from a module-level
    # global, so the function works for grids of any shape.
    n_rows, n_cols = grid.shape
    action_names = ['up', 'down', 'left', 'right']
    # (row offset, column offset) for each action.
    moves = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}

    state_values = np.zeros(grid.shape)
    state_counts = np.zeros(grid.shape)

    for _ in range(num_episodes):
        episode = []
        state = (0, 0)  # Starting state

        # Generate an episode by following a random policy
        while grid[state] != 'G':
            action = np.random.choice(action_names)
            d_row, d_col = moves[str(action)]
            next_state = (
                min(max(state[0] + d_row, 0), n_rows - 1),
                min(max(state[1] + d_col, 0), n_cols - 1),
            )

            episode.append((state, action, rewards[grid[next_state]]))
            state = next_state

        # Walk the episode backwards so the discounted return G can be built
        # incrementally; every visit updates the running mean of its cell.
        G = 0
        for visited_state, _action, reward in reversed(episode):
            G = gamma * G + reward
            state_counts[visited_state] += 1
            state_values[visited_state] += (
                (G - state_values[visited_state]) / state_counts[visited_state]
            )

    return state_values

In [7]:
def get_next_state(state, action, shape=None):
    """Return the cell reached by taking ``action`` from ``state``.

    Moves that would leave the grid are clipped, so the returned cell equals
    ``state`` when the agent walks into a wall.

    Args:
        state: ``(row, col)`` index of the current cell.
        action: One of ``'up'``, ``'down'``, ``'left'`` or ``'right'``.
        shape: Optional ``(rows, cols)`` of the grid. Defaults to the shape of
            the module-level ``gridworld``.

    Returns:
        ``(row, col)`` tuple of the next cell.

    Raises:
        ValueError: If ``action`` is not one of the four known moves.
    """
    if shape is None:
        shape = gridworld.shape

    if action == 'up':
        return (max(state[0] - 1, 0), state[1])
    elif action == 'down':
        return (min(state[0] + 1, shape[0] - 1), state[1])
    elif action == 'left':
        return (state[0], max(state[1] - 1, 0))
    elif action == 'right':
        return (state[0], min(state[1] + 1, shape[1] - 1))

    # Falling through used to return None silently, which corrupted the
    # caller's state; fail loudly instead.
    raise ValueError(f"Unknown action: {action!r}")

In [8]:
# Test the algorithms
num_episodes = 1000
gamma = 0.99
alpha = 0.01  # not used by monte_carlo; kept in case other cells read it
seed = 0

# monte_carlo draws its actions from NumPy's global RNG, so seed it to make
# the printed table reproducible across Restart & Run All.
np.random.seed(seed)

print("Monte Carlo State Values:")
mc_state_values = monte_carlo(gridworld, rewards, num_episodes, gamma)
print(mc_state_values)

Monte Carlo State Values:
[[-1.52513704 -1.76561138 -1.74542163 -2.1360147 ]
 [-1.89606685 -1.72467013 -2.17876184 -2.32467573]
 [-2.20178226 -1.925548   -1.70095107 -1.59494335]
 [-2.45207124 -1.72427187 -0.70435172  0.        ]]


In [9]:
def greedy_action(state, state_values, grid):
    """Pick the move whose destination cell has the highest estimated value.

    Candidate moves are tried in the order up, down, left, right. Moves that
    would leave ``grid`` are skipped, and on ties the earliest candidate wins.
    Only ``state_values`` of the destination is compared; the immediate reward
    of the move is not taken into account, and nothing here prevents the
    resulting policy from cycling between cells.

    Args:
        state: ``(row, col)`` index of the current cell.
        state_values: 2-D array of value estimates indexed like ``grid``.
        grid: 2-D NumPy array describing the map; its shape defines the walls.

    Returns:
        The chosen action name, or ``None`` when no move stays on the grid
        (or no destination value compares greater than ``-inf``).
    """
    # Use the grid that was passed in to decide which moves hit a wall.
    n_rows, n_cols = grid.shape
    candidates = (
        ('up', -1, 0),
        ('down', 1, 0),
        ('left', 0, -1),
        ('right', 0, 1),
    )

    best_action = None
    best_value = -np.inf

    for action, d_row, d_col in candidates:
        row = state[0] + d_row
        col = state[1] + d_col
        if not (0 <= row < n_rows and 0 <= col < n_cols):
            continue  # blocked by the grid boundary
        value = state_values[row, col]
        if value > best_value:
            best_value = value
            best_action = action

    return best_action


In [10]:
import pygame
import time

# Initialize Pygame
pygame.init()
cell_size = 100
rows, cols = gridworld.shape
screen = pygame.display.set_mode((cols * cell_size, rows * cell_size))
pygame.display.set_caption("Gridworld Simulation")

# Colors
colors = {
    'S': (0, 255, 0),   # Green
    'G': (255, 215, 0), # Gold
    'H': (255, 0, 0),   # Red
    'F': (200, 200, 200), # Gray
}
agent_color = (0, 0, 255)  # Blue

# Simulation: replay the greedy policy derived from mc_state_values, one move
# every half second, until the goal is reached, the window is closed, or the
# policy is shown to be stuck.
state = (0, 0)
visited = set()  # cells the agent has already moved from
running = True
try:
    while running:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                running = False
        if not running:
            # Leave immediately instead of drawing and moving one more time.
            break

        # Draw grid
        for i in range(rows):
            for j in range(cols):
                rect = pygame.Rect(j * cell_size, i * cell_size, cell_size, cell_size)
                pygame.draw.rect(screen, colors[gridworld[i, j]], rect)
                pygame.draw.rect(screen, (0, 0, 0), rect, 2)

        # Draw agent
        x, y = state[1] * cell_size + 10, state[0] * cell_size + 10
        pygame.draw.circle(screen, agent_color, (x + 40, y + 40), 20)

        pygame.display.flip()
        time.sleep(0.5)

        # Stop if goal reached
        if gridworld[state] == 'G':
            print("Goal reached!")
            time.sleep(1)
            break

        # The policy and the transitions are deterministic, so coming back to
        # a cell we already moved from means the agent would loop forever.
        if state in visited:
            print("Greedy policy is cycling without reaching the goal; stopping.")
            break
        visited.add(state)

        # Move according to greedy policy
        action = greedy_action(state, mc_state_values, gridworld)
        if action is None:
            print("No valid move available; stopping.")
            break
        next_state = get_next_state(state, action)
        state = next_state
finally:
    # Always release the window, even if drawing or the policy raised.
    pygame.quit()
