In [2]:
import logging
from collections import namedtuple
import random
from typing import Callable
from copy import deepcopy
from itertools import accumulate
from operator import xor
from matplotlib import pyplot as plt
import numpy as np
import math
import json
import os
from copy import deepcopy, copy
from collections import Counter


logging.basicConfig(
    format="[%(asctime)s] %(levelname)s: %(message)s",
    datefmt="%H:%M:%S",
    level=logging.DEBUG,
)

In [3]:
Nimply = namedtuple("Nimply", "row, num_objects")

In [4]:
class Nim:
    def __init__(self, num_rows: int, k: int = None, state = None) -> None:
        if state:
            self._rows = state
        else:
            self._rows = [i * 2 + 1 for i in range(num_rows)]
            self._k = k
            self._sticks = sum(self._rows)

    def __bool__(self):
        return sum(self._rows) > 0

    def __str__(self):
        return "<" + " ".join(str(_) for _ in self._rows) + ">"

    def __eq__(self, __o: object) -> bool:
        # return self._sticks == __o._sticks
        return Counter(self._rows) == Counter(__o._rows)
        
    def __hash__(self) -> int:
        return hash(Counter(self._rows).__str__())
        
    @property
    def rows(self) -> tuple:
        return tuple(self._rows)

    @property
    def k(self) -> int:
        return self._k

    @property
    def sticks(self) -> int:
        return self._sticks

    def nimming(self, ply: Nimply) -> None:
        row, num_objects = ply
        assert self._rows[row] >= num_objects
        assert self._k is None or num_objects <= self._k
        self._rows[row] -= num_objects

    # -1 if I don't win, 0 if I make the last winnning move
    def give_reward(self) -> int:
        -1 * int(not self.__bool__())

    def get_state_and_reward(self):
        return self._rows, self.give_reward()
         

In [5]:
def nim_sum(state: Nim) -> int:
    *_, result = accumulate(state.rows, xor)
    return result


def cook_status(state: Nim, complete=False) -> dict:
    cooked = dict()
    cooked["possible_moves"] = [
        (r, o) for r, c in enumerate(state.rows) for o in range(1, c + 1) if state.k is None or o <= state.k
    ]
    cooked["active_rows_number"] = sum(o > 0 for o in state.rows)
    cooked["shortest_row"] = min((x for x in enumerate(state.rows) if x[1] > 0), key=lambda y: y[1])[0]
    cooked["longest_row"] = max((x for x in enumerate(state.rows)), key=lambda y: y[1])[0]
    cooked["completation"] = sum(state.rows)/state.sticks
    if complete:
        brute_force = list()
        cooked["nim_sum"] = nim_sum(state)
        for m in cooked["possible_moves"]:
            tmp = deepcopy(state)
            tmp.nimming(m)
            brute_force.append((m, nim_sum(tmp)))
        cooked["brute_force"] = brute_force
    return cooked

In [6]:
def optimal_strategy(state: Nim) -> Nimply:
    data = cook_status(state, complete = True)
    return next((bf for bf in data["brute_force"] if bf[1] == 0), random.choice(data["brute_force"]))[0]


In [7]:
def aggressive(state: Nim) -> Nimply:
    """Pick always the entire row if the number of active rows is odd"""
    data = cook_status(state)
    if data['active_rows_number'] % 2 == 0:
        # random move
        row, num_objects = random.choice(data['possible_moves'])
    else:
        # aggressive move
        row = data['longest_row']
        num_objects = state.rows[row]
    return Nimply(row, num_objects)

In [8]:
strategy = (aggressive, optimal_strategy)

nim = Nim(4)
logging.debug(f"status: Initial board  -> {nim}")
player = 0
while nim:
    ply = strategy[player](nim)
    logging.debug(f"{player}: {ply}")
    nim.nimming(ply)
    logging.debug(f"status: After player {player} -> {nim}")
    player = 1 - player
winner = 1 - player
logging.info(f"status: Player {winner} won!")

[09:11:57] DEBUG: status: Initial board  -> <1 3 5 7>
[09:11:57] DEBUG: 0: Nimply(row=3, num_objects=4)
[09:11:57] DEBUG: status: After player 0 -> <1 3 5 3>
[09:11:57] DEBUG: 1: (2, 4)
[09:11:57] DEBUG: status: After player 1 -> <1 3 1 3>
[09:11:57] DEBUG: 0: Nimply(row=1, num_objects=3)
[09:11:57] DEBUG: status: After player 0 -> <1 0 1 3>
[09:11:57] DEBUG: 1: (3, 3)
[09:11:57] DEBUG: status: After player 1 -> <1 0 1 0>
[09:11:57] DEBUG: 0: Nimply(row=2, num_objects=1)
[09:11:57] DEBUG: status: After player 0 -> <1 0 0 0>
[09:11:57] DEBUG: 1: (0, 1)
[09:11:57] DEBUG: status: After player 1 -> <0 0 0 0>
[09:11:57] INFO: status: Player 1 won!


In [9]:
class Agent():
    def __init__(self, nim, alpha = 0.1, random_factor = 0.2) -> None:
        self.G = {}
        self.init_reward(nim)

    def init_reward(self, nim) -> dict:
        result = []
        def generate_states(rows, current_state):
            if len(current_state) == len(rows):
                # We have generated a valid state 
                result.append(current_state)
                return
            # Generate all possible states for the current row
            for i in range(rows[len(current_state)] + 1):
                new_state = current_state + [i]
                generate_states(rows, new_state)
        
        generate_states(nim.rows, [])
        for state in result:
            self.G[Nim(NIM_SIZE, state=state)] = np.random.uniform(low=1.0, high=0.1)            

### Reinforcement learning strategies


In [10]:
# Costants
NUM_EPOCHS = 0
NIM_SIZE = 3
OPPONENT = aggressive

In [12]:
# Create the agent
game = Nim(NIM_SIZE)
agent = Agent(game)

print(len(agent.G))

for i in range(NUM_EPOCHS):
    logging.debug(f"Starting the game #{i}")
    current_game = Nim(NIM_SIZE)
    turn = True
    while current_game:
        if turn:
            # my agent turn
            state, _ = current_game.get_state_and_reward() # forse è inutile
            action = agent.choose_action(current_game)
            current_game.nimming(action)
            state, reward = current_game.get_state_and_reward()
            agent.update_history(state, reward)
        else:
            action = OPPONENT(current_game)
            current_game.nimming(action)
        turn = not turn
    winner = int(not turn)
    logging.debug(f"The game ended. Player {winner} wins")
    agent.learn()




40
