In [34]:
import numpy as np

gridworld = np.array([
    ['S', 'F', 'F', 'F'],
    ['F', 'F', 'F', 'H'],
    ['F', 'F', 'F', 'H'],
    ['H', 'F', 'F', 'G']
])

rewards = {
    'S': 0,
    'G': 1,
    'H': -1,
    'F': 0
}


In [35]:
def td_learning(grid, rewards, num_episodes, alpha=0.1, gamma=0.9):
    state_values = np.zeros(grid.shape)

    returns_per_episode = []
    
    for _ in range(num_episodes):
        state = (0, 0)  
        total_return = 0

        while grid[state] != 'G':  
            action = np.random.choice(['up', 'down', 'left', 'right'])
            next_state = get_next_state(state, action)
            reward = rewards[grid[next_state]]

            # TD update
            state_values[state] += alpha * (reward + gamma * state_values[next_state] - state_values[state])

            
            state = next_state
            total_return += reward

            returns_per_episode.append(total_return)


    return state_values


In [36]:
def get_next_state(state, action):
    if action == 'up':
        return (max(state[0] - 1, 0), state[1])
    elif action == 'down':
        return (min(state[0] + 1, gridworld.shape[0] - 1), state[1])
    elif action == 'left':
        return (state[0], max(state[1] - 1, 0))
    elif action == 'right':
        return (state[0], min(state[1] + 1, gridworld.shape[1] - 1))

In [37]:
# Test the algorithms TD
num_episodes = 1000
gamma = 0.99
alpha = 0.01

print("Monte TD State Values:")
mc_state_values = td_learning(gridworld, rewards, num_episodes, gamma)
print(mc_state_values)

Monte TD State Values:
[[-0.05069306  0.31305861  0.34230059 -3.98600449]
 [-0.35842554 -1.14114669 -0.25966003 -5.27359306]
 [-3.46923705  0.64141026  0.45392436 -0.99280506]
 [-0.27823858  0.88576582  0.99986905  0.        ]]


In [38]:
def greedy_action(state, state_values, grid):
    actions = ['up', 'down', 'left', 'right']
    best_action = None
    best_value = -np.inf

    for action in actions:
        next_state = get_next_state(state, action)
        if next_state == state:
            continue
        value = state_values[next_state]
        if value > best_value:
            best_value = value
            best_action = action

    return best_action


In [39]:
import pygame
import time

# Initialize Pygame

pygame.init()
cell_size = 100
rows, cols = gridworld.shape
screen = pygame.display.set_mode((cols * cell_size, rows * cell_size))
pygame.display.set_caption("Gridworld Simulation")

# Colors
colors = {
    'S': (0, 255, 0),   
    'G': (255, 215, 0), 
    'H': (255, 0, 0),   
    'F': (200, 200, 200), 
}
agent_color = (0, 0, 255) 

# Simulation
state = (0, 0)
running = True
while running:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False

    #grid
    for i in range(rows):
        for j in range(cols):
            rect = pygame.Rect(j * cell_size, i * cell_size, cell_size, cell_size)
            pygame.draw.rect(screen, colors[gridworld[i, j]], rect)
            pygame.draw.rect(screen, (0, 0, 0), rect, 2)

    #agent
    x, y = state[1] * cell_size + 10, state[0] * cell_size + 10
    pygame.draw.circle(screen, agent_color, (x + 40, y + 40), 20)

    pygame.display.flip()
    time.sleep(0.5)

    if gridworld[state] == 'H':
        print("Robot fell into hole!")
        time.sleep(1)
        break

    if gridworld[state] == 'G':
        print("Goal reached!")
        time.sleep(1)
        break

    # Move according to greedy policy
    action = greedy_action(state, mc_state_values, gridworld)
    next_state = get_next_state(state, action)
    state = next_state

pygame.quit()
