In [30]:
import numpy as np

In [31]:
gridworld = np.array([
    ['S', 'F', 'F', 'F'],
    ['F', 'H', 'F', 'H'],
    ['F', 'F', 'F', 'H'],
    ['H', 'F', 'F', 'G']
])

rewards = {
    'S': 0,
    'G': 1,
    'H': -1,
    'F': 0
}

In [32]:
def monte_carlo(grid, rewards, num_episodes, gamma):
    state_values = np.zeros(grid.shape)
    state_counts = np.zeros(grid.shape)

    for _ in range(num_episodes):
        episode = []
        state = (0, 0)  # Starting state

        # Generate an episode by following a random policy
        while grid[state] != 'G':
            action = np.random.choice(['up', 'down', 'left', 'right'])
            next_state = get_next_state(state, action)

            episode.append((state, action, rewards[grid[next_state]]))
            state = next_state

        # Update state values using Monte Carlo returns
        G = 0
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            G = gamma * G + reward
            state_counts[state] += 1
            state_values[state] += (G - state_values[state]) / state_counts[state]

    return state_values

In [24]:
def td_learning(grid, rewards, num_episodes, alpha=0.1, gamma=0.9):
    state_values = np.zeros(grid.shape)

    for _ in range(num_episodes):
        state = (0, 0)  
        total_return = 0

        while grid[state] != 'G':  
            action = np.random.choice(['up', 'down', 'left', 'right'])
            next_state = get_next_state(state, action)
            reward = rewards[grid[next_state]]

            # TD update
            state_values[state] += alpha * (reward + gamma * state_values[next_state] - state_values[state])

            
            state = next_state
            total_return += reward



    return state_values


In [33]:
def get_next_state(state, action):
    if action == 'up':
        return (max(state[0] - 1, 0), state[1])
    elif action == 'down':
        return (min(state[0] + 1, gridworld.shape[0] - 1), state[1])
    elif action == 'left':
        return (state[0], max(state[1] - 1, 0))
    elif action == 'right':
        return (state[0], min(state[1] + 1, gridworld.shape[1] - 1))

In [34]:
# Test the algorithms
num_episodes = 1000
gamma = 0.999
alpha = 0.001

print("Monte Carlo State Values:")
mc_state_values = monte_carlo(gridworld, rewards, num_episodes, gamma)
print(mc_state_values)

Monte Carlo State Values:
[[-11.96824146 -11.77575085 -11.29470863 -11.16235986]
 [-12.1494958  -11.43388503 -10.71450314 -10.08352739]
 [-12.01687505 -10.40232521  -8.59337213  -6.45277331]
 [-12.26812146  -9.67536061  -5.4254866    0.        ]]


In [26]:
# Test the algorithms
num_episodes = 1000
gamma = 0.999
alpha = 0.001

print("TDo State Values:")
mc_state_values = td_learning(gridworld, rewards, num_episodes, gamma)
print(mc_state_values)

TDo State Values:
[[-4.01634905 -3.40125655  0.19590713 -4.14523913]
 [-1.65408572 -1.33075715 -3.02979456 -0.11132895]
 [-0.13785342 -1.90503838 -0.46943199  0.99999255]
 [-3.60262286  0.5838343   0.9995866   0.        ]]


In [35]:
def greedy_action(state, state_values, grid):
    actions = ['up', 'down', 'left', 'right']
    best_value = -np.inf
    best_action = None
    for action in actions:
        next_state = get_next_state(state, action)
        i, j = next_state
        if grid[next_state] == 'H':  # Avoid hole
            continue
        if state_values[i, j] > best_value:
            best_value = state_values[i, j]
            best_action = action
    if best_action is None:  
        best_action = np.random.choice(actions)
    return best_action

In [38]:
import pygame
import time

pygame.init()
cell_size = 100
rows, cols = gridworld.shape
screen = pygame.display.set_mode((cols * cell_size, rows * cell_size))
pygame.display.set_caption("Gridworld Simulation")

# Colors
colors = {
    'S': (0, 255, 0),   
    'G': (255, 215, 0), 
    'H': (255, 0, 0),   
    'F': (200, 200, 200), 
}
agent_color = (0, 0, 255) 

# Simulation
state = (0, 0)
running = True
while running:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False

    #grid
    for i in range(rows):
        for j in range(cols):
            rect = pygame.Rect(j * cell_size, i * cell_size, cell_size, cell_size)
            pygame.draw.rect(screen, colors[gridworld[i, j]], rect)
            pygame.draw.rect(screen, (0, 0, 0), rect, 2)

    #agent
    x, y = state[1] * cell_size + 10, state[0] * cell_size + 10
    pygame.draw.circle(screen, agent_color, (x + 40, y + 40), 20)

    pygame.display.flip()
    time.sleep(0.5)

    if gridworld[state] == 'H':
        print("Robot fell into hole!")
        time.sleep(1)
        break

    if gridworld[state] == 'G':
        print("Goal reached!")
        time.sleep(1)
        break

    # Move by policy
    action = greedy_action(state, mc_state_values, gridworld)
    next_state = get_next_state(state, action)
    state = next_state

pygame.quit()


Goal reached!
