In [3]:
pip install imageio

Collecting imageio
  Downloading imageio-2.37.0-py3-none-any.whl.metadata (5.2 kB)
Downloading imageio-2.37.0-py3-none-any.whl (315 kB)
Installing collected packages: imageio
Successfully installed imageio-2.37.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
import numpy as np
import random
import matplotlib.pyplot as plt
from PIL import Image
import os

# Ambiente e parâmetros
n_rows, n_cols = 4, 4
goal_state = (3, 3)
obstacles = [(1, 2)]  # Obstáculos para rotas alternativas
actions = ['up', 'down', 'left', 'right']
arrow_map = {'up': '↑', 'down': '↓', 'left': '←', 'right': '→'}

episodes = 2000
snapshot_interval = 100  # snapshots a cada 100 episódios
alpha = 0.1
gamma = 0.9
epsilon_start = 1.0
epsilon_end = 0.1
decay_rate = 0.005

frames = []

def is_terminal(state):
    return state == goal_state

def get_next_state(state, action):
    i, j = state
    if action == 'up': i = max(i - 1, 0)
    elif action == 'down': i = min(i + 1, n_rows - 1)
    elif action == 'left': j = max(j - 1, 0)
    elif action == 'right': j = min(j + 1, n_cols - 1)
    return (i, j) if (i, j) not in obstacles else state

def get_reward(state):
    if state == goal_state:
        return 10
    elif state in obstacles:
        return -10
    else:
        return -1

# Inicializa Q-table
Q = {(i, j): {a: 0.0 for a in actions} for i in range(n_rows) for j in range(n_cols) if (i, j) not in obstacles}

for ep in range(episodes):
    epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-decay_rate * ep)
    state = (0, 0)

    while not is_terminal(state):
        if random.random() < epsilon:
            action = random.choice(actions)
        else:
            action = max(Q[state], key=Q[state].get)

        next_state = get_next_state(state, action)
        reward = get_reward(next_state)
        next_best = max(Q[next_state], key=Q[next_state].get) if next_state in Q else None
        target = reward
        if next_best is not None:
            target += gamma * Q[next_state][next_best]
        Q[state][action] += alpha * (target - Q[state][action])
        state = next_state

    if (ep + 1) % snapshot_interval == 0 or ep == episodes - 1:
        policy_grid = np.full((n_rows, n_cols), '', dtype=object)
        q_values_for_color = np.full((n_rows, n_cols), np.nan)

        for i in range(n_rows):
            for j in range(n_cols):
                pos = (i, j)
                if pos in obstacles:
                    policy_grid[i, j] = '■'
                    q_values_for_color[i, j] = np.nan
                elif pos == goal_state:
                    policy_grid[i, j] = '★'
                    q_values_for_color[i, j] = np.nan
                elif pos in Q:
                    best_action = max(Q[pos], key=Q[pos].get)
                    best_value = Q[pos][best_action]
                    policy_grid[i, j] = f"{arrow_map[best_action]} {best_value:.2f}"
                    q_values_for_color[i, j] = best_value
                else:
                    policy_grid[i, j] = ''
                    q_values_for_color[i, j] = np.nan

        fig, ax = plt.subplots(figsize=(6, 6))
        ax.set_title(f"Política após {ep+1} episódios (melhor ação)")

        valid_q = q_values_for_color[~np.isnan(q_values_for_color)]
        min_q, max_q = valid_q.min(), valid_q.max()
        norm_q = (q_values_for_color - min_q) / (max_q - min_q + 1e-8)

        colors = np.empty(policy_grid.shape, dtype=object)
        for i in range(n_rows):
            for j in range(n_cols):
                if np.isnan(norm_q[i, j]):
                    colors[i, j] = '#FFFFFF'  # branco para obstáculo e objetivo
                else:
                    intensity = 1 - norm_q[i, j]  # mais escuro = maior valor
                    red_int = int(255 * intensity)
                    green_int = int(255 * intensity)
                    blue_int = 255
                    colors[i, j] = f'#{red_int:02x}{green_int:02x}{blue_int:02x}'

        table = ax.table(cellText=policy_grid, cellColours=colors, loc='center', cellLoc='center')
        table.scale(1, 1.5)  # escala maior para melhor leitura
        for key, cell in table.get_celld().items():
            cell.set_fontsize(14)

        ax.axis('off')
        fig.canvas.draw()

        image = np.frombuffer(fig.canvas.tostring_rgb(), dtype='uint8')
        image = image.reshape(fig.canvas.get_width_height()[::-1] + (3,))
        frames.append(Image.fromarray(image))
        plt.close(fig)

# Salvar resultados
output_dir = "./qlearning_outputs"
os.makedirs(output_dir, exist_ok=True)

gif_path = os.path.join(output_dir, "qlearning_bestaction.gif")
frames[0].save(gif_path, save_all=True, append_images=frames[1:], duration=1000, loop=0)

for idx, frame in enumerate(frames):
    frame.save(os.path.join(output_dir, f"frame_{idx+1:03d}.png"))

pdf_path = os.path.join(output_dir, "qlearning_bestaction_slides.pdf")
frames[0].save(pdf_path, save_all=True, append_images=frames[1:])

print("GIF salvo em:", gif_path)
print("PDF salvo em:", pdf_path)


  image = np.frombuffer(fig.canvas.tostring_rgb(), dtype='uint8')


GIF salvo em: ./qlearning_outputs\qlearning_bestaction.gif
PDF salvo em: ./qlearning_outputs\qlearning_bestaction_slides.pdf
