Aprendisaje por refuerzo rompecabezas
Diego Roberto Arancbia Delgado
link del repositorio:https://github.com/drArancibiaDelgado/sis420-/tree/main/Examenes/Final

In [3]:
import numpy as np
# Implementación del entorno del rompecabezas 
class PuzzleEnv:
    def __init__(self):
        self.n_rows = 4
        self.n_cols = 5
        self.state = np.arange(self.n_rows * self.n_cols).reshape((self.n_rows, self.n_cols))
        np.random.shuffle(self.state)
        self.goal_state = np.arange(self.n_rows * self.n_cols).reshape((self.n_rows, self.n_cols))

    def reset(self):
        np.random.shuffle(self.state)
        return self._get_state()

    def _get_state(self):
        return tuple(map(tuple, self.state))  # Convertimos la matriz en una tupla de tuplas para que sea hashable

    def step(self, action):
        # Encontrar la posición de la celda vacía (representada por el valor 0)
        zero_pos = np.argwhere(self.state == 0)[0]
        row, col = zero_pos

        # Determinar la nueva posición del vacío basado en la acción
        if action == 0 and row > 0:  # Arriba
            new_row, new_col = row - 1, col
        elif action == 1 and row < self.n_rows - 1:  # Abajo
            new_row, new_col = row + 1, col
        elif action == 2 and col > 0:  # Izquierda
            new_row, new_col = row, col - 1
        elif action == 3 and col < self.n_cols - 1:  # Derecha
            new_row, new_col = row, col + 1
        else:
            return self._get_state(), -1, False  # Acción inválida

        # Intercambiar los valores de la celda vacía y la celda objetivo
        self.state[row, col], self.state[new_row, new_col] = self.state[new_row, new_col], self.state[row, col]

        # Calcular la recompensa
        reward = 1 if np.array_equal(self.state, self.goal_state) else -0.1

        # Comprobar si el rompecabezas está resuelto
        done = np.array_equal(self.state, self.goal_state)

        return self._get_state(), reward, done

    def render(self):
        print(self.state)


In [4]:
import random
#la formula que se utiliza para el qlearning en este codigo es la siguiente
#Q(s,a) = Q(s,a) + alpha * (reward + gamma * max(Q(s',a')) - Q(s,a))
# se define una clase QLearningAgent que tiene los siguientes metodos
# __init__ = inicializa los valores de alpha, gamma, epsilon y la tabla q
# choose_action = elige una accion aleatoria o la mejor accion segun la tabla q
# update = actualiza la tabla q segun la formula de qlearning
  
class QLearningAgent:
    def __init__(self, n_actions, state_shape, alpha=0.5, gamma=0.99, epsilon=0.8):
        self.n_actions = n_actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_table = {}

    def choose_action(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.randint(0, self.n_actions - 1)
        else:
            if state not in self.q_table:
                self.q_table[state] = np.zeros(self.n_actions)
            return np.argmax(self.q_table[state])

    def update(self, state, action, reward, next_state):
        #glosario
        #state = estado actual
        #action = accion actual
        #reward = recompensa
        #next_state = estado siguiente
        #Q(s,a) = Q(s,a) + alpha * (reward + gamma * max(Q(s',a')) - Q(s,a))
        if state not in self.q_table:
            self.q_table[state] = np.zeros(self.n_actions)
        if next_state not in self.q_table:
            self.q_table[next_state] = np.zeros(self.n_actions)
        best_next_action = np.argmax(self.q_table[next_state])
        td_target = reward + self.gamma * self.q_table[next_state][best_next_action]
        td_error = td_target - self.q_table[state][action]
        self.q_table[state][action] += self.alpha * td_error


In [5]:
# Se crea una instancia del entorno y del agente, y se entrena el agente

env = PuzzleEnv()
agent = QLearningAgent(n_actions=4, state_shape=(env.n_rows, env.n_cols))

n_episodes = 100000
max_steps_per_episode = 100

for episode in range(n_episodes):
    state = env.reset()
    done = False
    steps = 0

    while not done and steps < max_steps_per_episode:
        action = agent.choose_action(state)
        next_state, reward, done = env.step(action)
        agent.update(state, action, reward, next_state)
        state = next_state
        steps += 1

    if episode % 5000 == 0:
        print(f"Episode {episode} completed")

print("Training finished")
env.render()


Episode 0 completed
Episode 5000 completed
Episode 10000 completed
Episode 15000 completed
Episode 20000 completed
Episode 25000 completed
Episode 30000 completed
Episode 35000 completed
Episode 40000 completed
Episode 45000 completed
Episode 50000 completed
Episode 55000 completed
Episode 60000 completed
Episode 65000 completed
Episode 70000 completed
Episode 75000 completed
Episode 80000 completed
Episode 85000 completed
Episode 90000 completed
Episode 95000 completed
Training finished
[[15 10  2  5  0]
 [12 19  8 18  9]
 [16 11  3 17 14]
 [13  7  1  4  6]]
