# **Imports**

In [1]:
import enum
import typing
import urllib
import collections
import abc
import numpy as np

# Definición de las diferentes acciones

In [2]:
class Action(enum.Enum):
    """Each of the figures a player can choose in a round."""
    # Every member's value is its human-readable label (emoji plus name).
    ROCK = '🪨 Rock'
    PAPER = '🧻 Paper'
    SCISSORS = '✂️ Scissors'
    LIZARD = '🦎 Lizard'
    SPOCK = '🖖 Spock'

# Definición de las diferentes recompensas según cada acción

In [3]:
# For every figure, the two figures it defeats.
_DEFEATS = {
    Action.ROCK: (Action.SCISSORS, Action.LIZARD),
    Action.PAPER: (Action.ROCK, Action.SPOCK),
    Action.SCISSORS: (Action.PAPER, Action.LIZARD),
    Action.LIZARD: (Action.PAPER, Action.SPOCK),
    Action.SPOCK: (Action.ROCK, Action.SCISSORS),
}

# Reward seen by the first player, keyed by
# (first player's move, second player's move):
# 0 for a tie, 1 when the first move wins, -1 when it loses.
MOVES_AND_REWARDS = {
    (mine, theirs): 0 if mine is theirs else (1 if theirs in _DEFEATS[mine] else -1)
    for mine in Action
    for theirs in Action
}

# Creación del juego

In [4]:
class Game:
    """Referee for single rounds of Rock-Paper-Scissors-Lizard-Spock.

    The game keeps no state between rounds; the only configuration is
    whether each round is printed as it is played.
    """

    # Value of ``render_mode`` that makes ``play`` print every round.
    RENDER_MODE_HUMAN = 'human'

    def __init__(self, render_mode=None):
        """Create a game.

        :param render_mode: ``Game.RENDER_MODE_HUMAN`` to print each round;
            any other value (the default is ``None``) plays silently.
        """
        self.render_mode = render_mode

    def play(self, p1_action, p2_action):
        """Play one round and return its reward for the first player.

        :param p1_action: Move chosen by the first player.
        :param p2_action: Move chosen by the second player.
        :returns: The ``MOVES_AND_REWARDS`` entry for the pair of moves.
        :raises KeyError: If the pair of moves is not in the rewards table.
        """
        result = MOVES_AND_REWARDS[(p1_action, p2_action)]
        # Compare against the declared constant rather than repeating the
        # literal, so both stay in sync.
        if self.render_mode == self.RENDER_MODE_HUMAN:
            self.render(p1_action, p2_action, result)
        return result

    @staticmethod
    def render(p1_action, p2_action, result):
        """Print a one-line, human-readable summary of a round.

        :param p1_action: Move chosen by the first player.
        :param p2_action: Move chosen by the second player.
        :param result: Reward of the first player: 0, 1 or -1.
        :raises ValueError: If ``result`` is not 0, 1 or -1.
        """
        if result == 0:
            print(f'{p1_action.value} tie!')
        elif result == 1:
            print(f'{p1_action.value} beats {p2_action.value}')
        elif result == -1:
            print(f'{p2_action.value} beats {p1_action.value}')
        else:
            raise ValueError(f'{p1_action}, {p2_action}, {result}')

# Definición de la clase Transition

In [6]:
class Transition(typing.NamedTuple):
    """Represents the transition from one state to the next."""
    prev_state: int              # State the transition starts from
    next_state: int              # State the transition ends in
    action: Action               # Action that caused this transition
    reward: typing.SupportsFloat # Reward obtained

# **Clase Agent abstracta**

In [7]:
class Agent(metaclass=abc.ABCMeta):
    """Abstract base for every player: a name plus a decision policy."""

    @abc.abstractmethod
    def __init__(self, name: str):
        """Store the agent's name.

        Subclasses must override this method and are expected to call it
        through ``super().__init__(name=...)``.

        :param name: The name of the agent.
        """
        self.name = name

    def __str__(self):
        return self.name

    @abc.abstractmethod
    def decide(self, state: int) -> Action:
        """Choose the action to take in the given state.

        :param state: The state the agent is currently in.
        :returns: The action to carry out.
        """

    def update(self, transition: Transition):
        """Update the agent's internal state, if it has any.

        The base implementation does nothing.

        :param transition: Information about the transition just performed.
        """

# **Definición de nuestro agente (TechNoir)**

In [80]:
class TechNoirAgent(Agent):
    """Agent that learns a tabular Q-function and plays ε-greedily.

    ``q_table`` maps each state to a ``{action: q_value}`` dictionary.
    States that have never been updated are treated as having a q-value of
    0 for every action.
    """

    def __init__(self, q_table: typing.Any=None):
        """Initialise the agent.

        :param q_table: Optional pre-existing Q-table. When omitted the
            agent starts with an empty table.
        """
        super().__init__(name='TechNoir Agent')
        # Store the table on the instance in both cases; testing against
        # None keeps an explicitly supplied (even empty) table.
        self.q_table = q_table if q_table is not None else {}

    @staticmethod
    def _blank_q_values():
        """Return a fresh q-value row with every action valued at 0."""
        return {action: 0 for action in Action}

    def decide(self, state:int, 𝜀: typing.SupportsFloat=0) -> Action:
        """Choose the action to play.

        :param state: The state the agent is currently in.
        :param 𝜀: A value between 0 and 1: following the ε-greedy strategy,
            the probability of picking the action at random among all the
            available ones. Optional; defaults to 0 (never pick at random).
        :returns: The action to play.
        """
        if np.random.random() < 𝜀:
            # Explore: any action, uniformly at random.
            return np.random.choice(list(Action))

        # Exploit: the action with the highest known q-value. On ties,
        # ``max`` keeps the first action in the row's iteration order.
        q_values = self.q_table.get(state, self._blank_q_values())
        return max(q_values, key=q_values.get)

    def update(self, t: Transition, 𝛼=0.01, 𝛾=0.95):
        """Learn from one experienced transition (Q-learning update).

        :param t: All the information about the transition to learn from.
        :param 𝛼: Learning rate: weight given to the new estimate against
            the stored q-value. Defaults to 0.01.
        :param 𝛾: Discount applied to the best q-value of the next state.
            Ranges from 0 (no influence) to 1 (same influence as the
            immediate reward). Defaults to 0.95.
        """
        prev_q_values = self.q_table.get(t.prev_state, self._blank_q_values())
        next_q_values = self.q_table.get(t.next_state, self._blank_q_values())

        max_q_value = max(next_q_values.values())

        # Blend the stored value with the bootstrapped target.
        target = t.reward + 𝛾 * max_q_value
        prev_q_values[t.action] = (1 - 𝛼) * prev_q_values[t.action] + 𝛼 * target

        # Needed when prev_state was unseen and the row is a new dictionary.
        self.q_table[t.prev_state] = prev_q_values

    def __strQTable__(self) -> str:
        """Return a textual representation of the agent's Q-table.

        :returns: One ``State <state>: <q_values>`` line per stored state,
            each terminated by a newline.
        """
        return "".join(
            f"State {state}: {q_values}\n"
            for state, q_values in self.q_table.items()
        )

# Carga del dataset para entrenamiento

In [81]:
# A bare ``import urllib`` does not load the ``request`` submodule, so it is
# imported explicitly here to keep this cell self-contained.
import urllib.request

dataset_url = 'https://blazaid.github.io/Aprendizaje-profundo/Datasets/rock-paper-scissors-lizard-spock.trn'

# Opponent moves used for training, in the order they appear in the file.
# Each non-blank line is looked up as an ``Action`` member name
# (case-insensitively); an unknown name raises KeyError.
player2_actions = []
with urllib.request.urlopen(dataset_url) as f:
    for line in f:
        move = line.decode('utf-8').strip().upper()
        if move:
            player2_actions.append(Action[move])

# Entrenamiento del agente

In [82]:
# Exploration rate: starts fully random and decays linearly to 0 over the
# dataset. The step is 0 for an empty dataset to avoid dividing by zero.
𝜀 = 1
𝛿𝜀 = 𝜀 / len(player2_actions) if player2_actions else 0

game = Game()
agent = TechNoirAgent()

state = 0  # The environment (game) is stateless, so the state never changes
for p2_action in player2_actions:
    # Same identifier glyph as the assignment above. Python NFKC-normalises
    # identifiers, so look-alike glyphs name the same variable, but mixing
    # them is misleading.
    p1_action = agent.decide(state, 𝜀)
    reward = game.play(p1_action, p2_action)

    # Let the agent learn from the round just played
    agent.update(Transition(
        prev_state=state,
        next_state=state,
        action=p1_action,
        reward=reward
    ))

    # Decay 𝜀, clamping at 0 so rounding error never makes it negative
    𝜀 = max(0, 𝜀 - 𝛿𝜀)

# Resultados del entrenamiento

In [83]:
# Show the Q-table held by the trained agent.
print(agent.q_table)

{0: {<Action.ROCK: '🪨 Rock'>: 3.6283617857855823, <Action.PAPER: '🧻 Paper'>: 3.6040215510640126, <Action.SCISSORS: '✂️ Scissors'>: 3.639882834789625, <Action.LIZARD: '🦎 Lizard'>: 3.451654015970638, <Action.SPOCK: '🖖 Spock'>: 3.969101960343152}}


# Definición de los otros agentes para competición

In [84]:
class Botnifacio(Agent):
    """Opponent that ignores the state and plays uniformly at random."""

    def __init__(self):
        super().__init__(name='Botnifacio')

    def decide(self, state: int) -> Action:
        """Return one of the available actions, each equally likely.

        :param state: Current state; not used.
        :returns: The randomly chosen action.
        """
        options = list(Action)
        return np.random.choice(options)


class Gustabot(Agent):
    """Opponent that plays from a random distribution over the actions.

    The distribution is drawn at construction time and drawn again on
    every call to ``update``.
    """

    def __init__(self):
        super().__init__(name='Gustabot')
        self._resample_weights()

    def decide(self, state:int) -> Action:
        """Sample an action according to the current weights.

        :param state: Current state; not used.
        :returns: The sampled action.
        """
        return np.random.choice(list(Action), p=self.weights)

    def update(self, transition: Transition):
        """Discard the current distribution and draw a new one.

        :param transition: The transition just performed; not used.
        """
        self._resample_weights()

    def _resample_weights(self):
        """Draw one random weight per action and normalise them to sum 1."""
        # Sized from the enum so the weights always match the list passed
        # to ``np.random.choice`` in ``decide``.
        weights = np.random.random(len(Action))
        self.weights = weights / weights.sum()

# **Competición**

In [94]:
import itertools

# Rounds played per pairing: a warm-up phase in which agents may learn,
# followed by the phase that is actually scored.
FRIENDLY_SETS = 10000
COMPETITION_SETS = 1000

# Agent classes taking part in the round-robin.
competitors = [
    Botnifacio,
    Gustabot,
    TechNoirAgent,
]

# One entry per class, inserted in alphabetical order of the agents' names
# (a throwaway instance is built just to read each name).
leaderboard = dict.fromkeys(
    sorted(competitors, key=lambda cls: str(cls())),
    0,
)
game = Game()
for p1, p2 in itertools.combinations(leaderboard, 2):
    # Fresh instances for every pairing
    p1, p2 = p1(), p2()

    # Friendly phase: both agents are updated after each round, the second
    # one with the reward negated
    s = 0
    for i in range(FRIENDLY_SETS):
        a1, a2 = p1.decide(s), p2.decide(s)
        reward = game.play(a1, a2)
        p1.update(Transition(prev_state=s, next_state=s, action=a1, reward=reward))
        p2.update(Transition(prev_state=s, next_state=s, action=a2, reward=-reward))

    # Competition phase: rewards are accumulated and agents are not updated
    s = 0
    r1 = r2 = 0
    for i in range(COMPETITION_SETS):
        a1, a2 = p1.decide(s), p2.decide(s)
        reward = game.play(a1, a2)
        r1 += reward
        r2 -= reward

    # Global standings: 3 points for a win, 1 point each for a draw
    if r1 == r2:
        leaderboard[type(p1)] += 1
        leaderboard[type(p2)] += 1
    elif r1 > r2:
        leaderboard[type(p1)] += 3
    else:
        leaderboard[type(p2)] += 3

    print(f'{p1}: {r1}, {p2}: {r2}')


print('LEADERBOARD')
# Instantiate each class only to print its name, highest score first
standings = [(cls(), points) for cls, points in leaderboard.items()]
standings.sort(key=lambda entry: entry[1], reverse=True)
for c, r in standings:
    print(f'{r:<10}\t{c}')

Botnifacio: 15, Gustabot: -15
Botnifacio: 9, TechNoir Agent: -9
Gustabot: 546, TechNoir Agent: -546
LEADERBOARD
6         	Botnifacio
3         	Gustabot
0         	TechNoir Agent
