## Importación de gymnasium para entrenador de modelos

In [None]:
!pip install -q gymnasium
!pip install gymnasium[toy-text]

In [None]:
import gymnasium as gym
import random
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Build a non-slippery FrozenLake environment that renders to an RGB array,
# so the board can be displayed inline with matplotlib.
# (Pass render_mode="human" instead to use Gymnasium's own viewer.)
environment = gym.make(
    "FrozenLake-v1",
    render_mode="rgb_array",
    is_slippery=False,
)
environment.reset()

# Draw the board as it looks right after the reset.
tablero = environment.render()
plt.imshow(tablero)

In [None]:
from os import environ  # NOTE(review): unused in the visible cells -- confirm before removing

# The Q-table holds one value per (state, action) pair, all starting at zero.
nb_states, nb_actions = (
    environment.observation_space.n,
    environment.action_space.n,
)
qtable = np.zeros(shape=(nb_states, nb_actions))
print(qtable)

In [None]:
# Apply one fixed action to the environment and redraw the board to see where
# the agent ended up. The result of step() is intentionally discarded here.
# NOTE(review): per the FrozenLake documentation action 2 means "move right";
# confirm against the installed Gymnasium version.
action = 2
environment.step(action)
# Render the board after the move and show it inline.
tablero = environment.render()
plt.imshow(tablero)

In [None]:
# Sample a random action from the action space, apply it, and report what the
# environment returned for that transition.
action = environment.action_space.sample()
(new_state, reward, terminated, truncated, info) = environment.step(action)
print(f'Reward = {reward} terminated = {terminated} truncated = {truncated} new_state = {new_state}')

# Show the board after the move.
tablero = environment.render()
plt.imshow(tablero)


In [None]:
# Hyperparameters
episodes = 1000        # Number of training episodes
alpha = 0.5            # Learning rate
gamma = 0.9            # Discount factor

# Per-episode result ("Success" / "Failure"), kept so it can be plotted later
outcomes = []

print('Q-table before training:')
print(qtable)
# To watch the agent while it trains, recreate the environment with
# render_mode="human" before running this loop.

# Training
for i in range(episodes):
    # Start a new episode; reset() returns (observation, info)
    state, info = environment.reset()
    done = False

    # Assume the episode fails until a reward is actually collected
    outcomes.append("Failure")

    # Keep playing until the episode is terminated or truncated
    while not done:
        # Exploit the best known action when this state already has a positive
        # Q-value; otherwise fall back to a random action
        action = (
            np.argmax(qtable[state])
            if np.max(qtable[state]) > 0
            else environment.action_space.sample()
        )

        # Move the agent in the chosen direction
        new_state, reward, terminated, truncated, info = environment.step(action)
        done = terminated or truncated

        # Q-learning update:
        # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        qtable[state, action] += alpha * (
            reward + gamma * np.max(qtable[new_state]) - qtable[state, action]
        )

        # Advance to the state we just reached
        state = new_state

        # Any non-zero reward marks the episode as won
        if reward:
            outcomes[-1] = "Success"
    print(i)
print()
print('===========================================')
print('Q-table after training:')
print(qtable)

In [None]:
# WITH CONTINUOUS IMPROVEMENT (epsilon-greedy exploration with linear decay)

# Release the previous environment before rebinding the name; otherwise the
# old instance and whatever render resources it holds are leaked.
environment.close()

# NOTE(review): with render_mode="human" Gymnasium draws every step in its own
# viewer, which presumably makes 1000 training episodes slow and needs a
# display. Switch to "rgb_array" if the training does not need to be watched.
environment = gym.make("FrozenLake-v1", is_slippery=False,
                       render_mode="human")

# Start from a fresh, all-zero Q-table sized for the environment being trained
qtable = np.zeros((environment.observation_space.n, environment.action_space.n))

# Hyperparameters
episodes = 1000        # Total number of episodes
alpha = 0.5            # Learning rate
gamma = 0.9            # Discount factor
epsilon = 1.0          # Probability of picking a random action
epsilon_decay = 0.001  # Fixed amount subtracted from epsilon after each episode

# List of outcomes to plot
outcomes = []

print('Q-table before training:')
print(qtable)

# Training
for _ in range(episodes):
    state = environment.reset()[0]
    done = False

    # By default, we consider our outcome to be a failure
    outcomes.append("Failure")

    # Keep stepping until the episode is terminated or truncated
    while not done:
        # Generate a random number between 0 and 1
        rnd = np.random.random()

        # Explore: with probability epsilon take a random action
        if rnd < epsilon:
            action = environment.action_space.sample()
        # Exploit: otherwise take the action with the highest Q-value
        else:
            action = np.argmax(qtable[state])

        # Apply the action; step() returns the 5-tuple
        # (observation, reward, terminated, truncated, info)
        new_state, reward, terminated, truncated, info = environment.step(action)
        done = terminated or truncated

        # Q-learning update:
        # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        qtable[state, action] = qtable[state, action] + \
                                alpha * (reward + gamma * np.max(qtable[new_state]) - qtable[state, action])

        # Update our current state
        state = new_state

        # If we have a reward, it means that our outcome is a success
        if reward:
            outcomes[-1] = "Success"

    # Decay epsilon linearly, never letting it drop below zero
    epsilon = max(epsilon - epsilon_decay, 0)

print()
print('===========================================')
print('Q-table after training:')
print(qtable)

# Plot outcomes: one bar per episode on a categorical Success/Failure axis
plt.figure(figsize=(12, 5))
plt.xlabel("Run number")
plt.ylabel("Outcome")
ax = plt.gca()
ax.set_facecolor('#efeeea')
plt.bar(range(len(outcomes)), outcomes, color="#0A047A", width=1.0)
plt.show()