In [383]:
__author__ = "Aiello Davide"
import logging
from collections import namedtuple
import random
import numpy as np
from typing import Callable
from copy import deepcopy
from itertools import accumulate
from operator import xor       

# Nim Class

In [384]:
# A single move: remove `num_objects` objects from the row at index `row`.
Nimply = namedtuple(typename="Nimply", field_names="row, num_objects")

In [385]:
class Nim:
    """Board of a Nim game: a list of rows, each holding a number of objects.

    Row ``i`` starts with ``2 * i + 1`` objects.  When ``k`` is not ``None``,
    a single move may remove at most ``k`` objects.
    """

    def __init__(self, num_rows: int, k: int = None) -> None:
        """Create a board with ``num_rows`` rows and an optional per-move cap ``k``."""
        self._rows = [i * 2 + 1 for i in range(num_rows)]
        self._k = k
        self._steps = 0  # initialised here; not read or updated inside this class

    def __str__(self):
        return "<" + " ".join(str(_) for _ in self._rows) + ">"

    def __bool__(self):
        """True while at least one object is left on the board."""
        return sum(self._rows) > 0

    @property
    def rows(self) -> tuple:
        """Snapshot of the row sizes as an immutable tuple."""
        return tuple(self._rows)

    @property
    def k(self) -> int:
        """Maximum number of objects per move, or ``None`` for no limit."""
        return self._k

    def is_game_over(self) -> bool:
        """True when no objects are left (the negation of ``bool(self)``)."""
        return not self

    def nimming(self, ply: "Nimply") -> None:
        """Apply a move in place.

        ``ply`` is a ``(row, num_objects)`` pair.  Raises ``AssertionError``
        when the move is illegal: fewer than one object, more objects than the
        row holds, or more than ``k`` objects when a cap is set.
        """
        row, num_objects = ply
        # Without this check a zero move is a silent no-op and a negative one
        # would *add* objects to the row.
        assert num_objects >= 1, "a move must remove at least one object"
        assert self._rows[row] >= num_objects
        assert self._k is None or num_objects <= self._k
        self._rows[row] -= num_objects

    def give_reward(self):
        """Return 1 when the board is empty, 0 otherwise.

        This method never returns a negative value; a penalty for losing has
        to be applied by the caller.
        """
        return 0 if self else 1

    def possible_moves(self):
        """List every legal ``(row, num_objects)`` pair, ordered by row then count."""
        moves = []
        for r, c in enumerate(self._rows):
            # Cap the range up front instead of enumerating and then filtering.
            limit = c if self._k is None else min(c, self._k)
            moves.extend((r, o) for o in range(1, limit + 1))
        return moves

# Nim-sum

In [386]:
# algorithm taken from professor's code
def nim_sum(state: "Nim") -> int:
    """Return the nim-sum of ``state``: the bitwise XOR of all its row sizes.

    Only ``state.rows`` is read.  A state without rows yields 0 (the XOR
    identity) instead of failing on an empty sequence.
    """
    result = 0
    for row_size in state.rows:
        result ^= row_size
    return result


def cook_status(state: Nim) -> dict:
    """Collect facts about ``state`` that the hand-written strategies use.

    Returned keys:
      * ``possible_moves``: every ``(row, num_objects)`` pair allowed by the
        row sizes and by ``state.k`` (no cap when ``k`` is ``None``).
      * ``active_rows_number``: number of rows that still hold objects.
      * ``shortest_row``: index of the smallest non-empty row, or ``None``
        when every row is empty.
      * ``longest_row``: index of the largest row.
      * ``nim_sum``: nim-sum of the current state.
      * ``brute_force``: list of ``(move, nim_sum_after_move)`` pairs, one per
        possible move.
    """
    rows = state.rows
    k = state.k

    cooked = dict()
    cooked["possible_moves"] = [
        (r, o)
        for r, c in enumerate(rows)
        for o in range(1, (c if k is None else min(c, k)) + 1)
    ]
    # Number of rows with more than zero objects.
    cooked["active_rows_number"] = sum(c > 0 for c in rows)
    # `default` keeps a finished game (no non-empty row) from raising ValueError.
    cooked["shortest_row"] = min(
        ((r, c) for r, c in enumerate(rows) if c > 0), key=lambda y: y[1], default=(None, 0)
    )[0]
    cooked["longest_row"] = max(enumerate(rows), key=lambda y: y[1], default=(None, 0))[0]

    current = nim_sum(state)
    cooked["nim_sum"] = current

    # A move changes a single row, so the nim-sum after it is the current
    # nim-sum with the old row size XOR-ed out and the new one XOR-ed in.
    # This gives the same value as copying the state and replaying the move.
    cooked["brute_force"] = [
        ((r, o), current ^ rows[r] ^ (rows[r] - o)) for r, o in cooked["possible_moves"]
    ]

    return cooked

def optimal_strategy(state: Nim) -> Nimply:
    """Play a move that leaves a nim-sum of 0, or a random move if none does.

    The first move listed in ``cook_status(state)["brute_force"]`` whose
    resulting nim-sum is 0 is chosen.  The result is wrapped in ``Nimply`` so
    it matches the declared return type.
    """
    brute_force = cook_status(state)["brute_force"]
    for move, resulting_nim_sum in brute_force:
        if resulting_nim_sum == 0:
            return Nimply(*move)
    # Only draw a random move when no zero-nim-sum move exists.
    return Nimply(*random.choice(brute_force)[0])

# Agent Class

In [387]:
class Agent(object):
    """Epsilon-greedy learner that keeps one value estimate per move.

    ``G`` maps a move ``(row, num_objects)`` to its current estimate.
    Estimates start as random values in ``[0.1, 1.0)`` and are adjusted by
    ``learn()`` from the ``(move, reward)`` pairs recorded during a match.
    """

    def __init__(self, states, alpha=0.15, random_factor=0.2, random_decay=10e-5):
        """Set up the agent.

        ``states`` is an object exposing ``possible_moves()``; every move it
        returns gets an initial estimate.  ``alpha`` is the learning rate,
        ``random_factor`` the probability of playing a random move, and
        ``random_decay`` the amount subtracted from ``random_factor`` on every
        ``learn()`` call.
        """
        self.state_history = []  # (move, reward) pairs recorded since the last learn()
        self.alpha = alpha
        self.random_factor = random_factor
        self.random_decay = random_decay
        self.G = {}
        self.init_reward(states)

    def init_reward(self, state):
        """Give every move returned by ``state.possible_moves()`` a random estimate."""
        for move in state.possible_moves():
            self.G[move] = np.random.uniform(low=0.1, high=1.0)

    def choose_action(self, possible_moves):
        """Pick a move from ``possible_moves`` and return it as a ``Nimply``.

        With probability ``random_factor`` the move is drawn uniformly at
        random; otherwise the move with the highest estimate in ``G`` is
        taken.  Raises ``ValueError`` when ``possible_moves`` is empty.
        """
        if not possible_moves:
            raise ValueError("choose_action() needs at least one possible move")

        if np.random.random() < self.random_factor:
            return Nimply(*random.choice(possible_moves))

        best_move = None
        best_value = float("-inf")
        for move in possible_moves:
            # `>=` means that among equally valued moves the last one wins.
            if self.G[move] >= best_value:
                best_move = move
                best_value = self.G[move]
        return Nimply(*best_move)

    def update_state_history(self, move, reward):
        """Record that ``move`` was played and earned ``reward``."""
        self.state_history.append((move, reward))

    def learn(self):
        """Update the estimates from the recorded history, then reset it.

        The history is walked from the most recent entry backwards.  Each
        move's estimate is pulled by a fraction ``alpha`` towards ``target``,
        the sum of the rewards of the entries recorded *after* it; the entry's
        own reward is added to ``target`` only after its own update.
        """
        target = 0

        for prev, reward in reversed(self.state_history):
            self.G[prev] = self.G[prev] + self.alpha * (target - self.G[prev])
            target += reward

        self.state_history = []
        # Explore a little less after every match, but never go below zero.
        self.random_factor = max(0.0, self.random_factor - self.random_decay)

# Task 3.4: reinforcement learning

In [388]:
def dummy(state: Nim) -> Nimply:
    """Remove exactly one object from a randomly chosen non-empty row."""
    non_empty_rows = []
    for index, size in enumerate(state.rows):
        if size > 0:
            non_empty_rows.append(index)
    return Nimply(random.choice(non_empty_rows), 1)

def pure_random(state: Nim) -> Nimply:
    """Play a uniformly random legal amount from a random non-empty row.

    The amount is drawn from 1 up to the row size, limited to ``state.k``
    when a cap is set.  ``state.k`` being ``None`` means no cap.
    """
    row = random.choice([r for r, c in enumerate(state.rows) if c > 0])
    size = state.rows[row]
    # Comparing against k only when it is set avoids a TypeError for k=None.
    upper = size if state.k is None else min(size, state.k)
    return Nimply(row, random.randint(1, upper))

def shortest_row(state: Nim) -> Nimply:
    """Play on the smallest non-empty row.

    The whole row is taken when the cap allows it.  When the row holds more
    than ``state.k`` objects, a random amount between 1 and ``state.k`` is
    taken instead.  ``state.k`` being ``None`` means no cap.
    """
    row = min((x for x in enumerate(state.rows) if x[1] > 0), key=lambda y: y[1])[0]
    size = state.rows[row]
    # Use the cap of the state being played, not a module-level variable.
    limit = state.k
    if limit is not None and size > limit:
        num_objects = random.randint(1, limit)
    else:
        num_objects = size
    return Nimply(row, num_objects)

def Davide_strategy(state: Nim) -> Nimply:
    """Take as many objects as allowed from a random non-empty row.

    A non-empty row is chosen uniformly at random and emptied, or reduced by
    ``state.k`` objects when the row holds more than the cap.  ``state.k``
    being ``None`` means no cap.  When every remaining row holds a single
    object this amounts to taking one object from a random row.

    Raises ``ValueError`` when the board has no objects left.
    """
    non_empty_rows = [r for r, c in enumerate(state.rows) if c > 0]
    if not non_empty_rows:
        raise ValueError("no move available: every row is empty")

    row = random.choice(non_empty_rows)
    size = state.rows[row]
    # Use the cap of the state being played, not a module-level variable.
    limit = state.k
    num_objects = size if limit is None else min(size, limit)
    return Nimply(row, num_objects)

def reinforcement_learning(agent, NIM_SIZE, k, NUM_MATCHES, num_epochs=500, opponent=None):
    """Train ``agent`` against an opponent and return its win rate per epoch.

    Each epoch plays ``NUM_MATCHES`` matches on a fresh ``Nim(NIM_SIZE, k)``
    board.  The opponent moves first and the agent second; whoever removes the
    last object wins.  After every agent move the board's ``give_reward()`` is
    recorded, a lost match adds a ``-1`` entry for the agent's last move, and
    ``agent.learn()`` runs once per match.

    Args:
        agent: object providing ``choose_action``, ``update_state_history``
            and ``learn``.
        NIM_SIZE: number of rows of the board.
        k: maximum number of objects per move (``None`` for no limit).
        NUM_MATCHES: matches per epoch.
        num_epochs: number of epochs to play.
        opponent: callable mapping a ``Nim`` state to a move; defaults to
            ``Davide_strategy``.

    Returns:
        A list with one entry per epoch: the percentage of matches won by the
        agent in that epoch.
    """
    if opponent is None:
        opponent = Davide_strategy

    winrateHistory = []
    for _ in range(num_epochs):
        won = 0
        for _ in range(NUM_MATCHES):
            nim = Nim(NIM_SIZE, k)
            player = 0
            # Reset per match: the opponent may end the game before the agent
            # has moved, and a move from an earlier match must not be reused.
            last_agent_ply = None
            while nim:
                if player == 0:
                    nim.nimming(opponent(nim))
                else:
                    last_agent_ply = agent.choose_action(nim.possible_moves())
                    nim.nimming(last_agent_ply)
                    agent.update_state_history(last_agent_ply, nim.give_reward())
                player = 1 - player

            # `player` was flipped after the final move, so flip it back.
            winner = 1 - player
            if winner == 1:
                won += 1
            elif last_agent_ply is not None:
                agent.update_state_history(last_agent_ply, -1)
            agent.learn()
        winrateHistory.append(won / NUM_MATCHES * 100)
    return winrateHistory

## Main

In [389]:
# Show INFO-level messages from the root logger (used for the report below).
logging.getLogger().setLevel(logging.DEBUG)

# Training configuration.
ALPHA = 0.1  # learning rate passed to the agent
RANDOM_FACTOR = 0.4  # initial probability of a random move
NIM_SIZE = 7  # number of rows; the longest row starts with 2 * 6 + 1 = 13 objects
# Per-move cap. 7**7 is far above the longest row, so it never restricts a move.
# NOTE: shortest_row and Davide_strategy read this module-level name directly.
k = NIM_SIZE**NIM_SIZE             
NUM_MATCHES = 100  # matches per training batch


# The board built here is only used to enumerate the moves the agent must know.
nim = Nim(NIM_SIZE, k)
agent = Agent(nim, alpha=ALPHA, random_factor=RANDOM_FACTOR)      
# One win-rate percentage per training batch.
winrateHistory = reinforcement_learning(agent, NIM_SIZE, k, NUM_MATCHES)

# Report every 20th batch; stop once three of the reported batches reached 100%.
count = 0
for i, wr in enumerate(winrateHistory):
    if i % 20 == 0:
        logging.info(f"{i}: {wr}")
        if wr == 100:
            count += 1
        if count == 3:
            break

INFO:root:0: 17.0
INFO:root:20: 42.0
INFO:root:40: 100.0
INFO:root:60: 100.0
INFO:root:80: 100.0
