In [75]:
import logging
import random
from typing import Callable
from copy import deepcopy
from itertools import accumulate
from operator import xor
from matplotlib import pyplot as plt
import numpy as np
import math
import json
import os
from copy import deepcopy, copy
from nim_utils import *

logging.basicConfig(
    format="[%(asctime)s] %(levelname)s: %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

In [76]:
class Agent():
    def __init__(self, nim, alpha = 0.1, random_factor = 0.2) -> None:
        self.state_history = []
        self.alpha = alpha
        self.random_factor = random_factor
        # Q table, initially empty
        self.Q = {}
        self.init_q(nim)

    def init_q(self, nim) -> dict:
        result = []
        def generate_states(rows, current_state):
            if len(current_state) == len(rows):
                # We have generated a valid state 
                result.append(current_state)
                return
            # Generate all possible states for the current row
            for i in range(rows[len(current_state)] + 1):
                new_state = current_state + [i]
                generate_states(rows, new_state)
        
        generate_states(nim.rows, [])
        for state in result:
            x = Nim(len(nim.rows), state=state)
            for action in x.possible_moves():
                self.Q[(x, action)] = np.random.uniform(low=1.0, high=0.1)     

    def choose_action(self, state) -> Nimply:
        maxG = -10e15
        next_move = None
        possible_moves = state.possible_moves()
        if np.random.random() < self.random_factor:
            next_move = random.choice(possible_moves) 
        else:
            for action in possible_moves:
                # print(f"Check {new_state}")
                if self.Q[(state, action)] >= maxG:
                    next_move = action
                    maxG = self.Q[(state, action)]
        return next_move

    def update_state_history(self, state, action,  reward):
        self.state_history.append(((state, action), reward))

    def learn(self):
        target = 0
        for prev, reward in reversed(self.state_history):
            state, action = prev
            print(f"{state}{action}")
            self.Q[prev] = self.Q[prev] + self.alpha * (target - self.Q[prev])
            target += reward
        self.state_history = []
        self.random_factor -= 10e-5  # decrease random factor each episode of play

    def get_strategy(self) -> Callable:
        def agent_strategy(state: Nim) -> Nimply:
            action = self.choose_action(state)
            # print(f"Choosen action: {action}")
            return Nimply(action[0], action[1])
        return agent_strategy

In [77]:
def evaluate_against(strategy: Callable, against: Callable, NIM_SIZE: int, NUM_MATCHES = 100) -> float:
    opponent = (strategy, against)
    won = 0
    for _ in range(NUM_MATCHES):
        nim = Nim(NIM_SIZE)
        player = 0
        while nim:
            # logging.debug(nim)
            ply = opponent[player](nim)
            nim.nimming(ply)
            player = 1 - player
        if player == 1: # winner is the zero
            won += 1
        # logging.debug(f"player {1 - player} has won.")
    return won / NUM_MATCHES

### Reinforcement learning strategies


In [78]:
# Costants
NUM_EPOCHS = 5000
NIM_SIZE_learn = 2
OPPONENTS = [aggressive]

# Create the agent
game = Nim(NIM_SIZE_learn)
print(f"size = {len(game.rows)}")
agent = Agent(game, alpha=0.1, random_factor=.1)

states = agent.Q

size = 2


In [79]:
# for state in states:
#     print(f"{state[0]}, {state[1]}: {states[state]}")

len(states)

16

In [80]:
# for state in states:
#     print(f"{state}: {states[state]}")
win_logs = []
scores = []
for i in range(NUM_EPOCHS):
    # logging.debug(f"Starting the game #{i}")
    current_game = Nim(NIM_SIZE_learn)
    turn = True
    OPPONENT = random.choice(OPPONENTS)
    while current_game:
        if turn:
            state, _ = current_game.get_state_and_reward()
            # my agent turn
            action = agent.choose_action(current_game)
            # print(f"Action: {action}")
            current_game.nimming(action)
            _, reward = current_game.get_state_and_reward()
            # print(f"{state} -> {reward}")
            agent.update_state_history(state, action, reward)
        else:
            action = OPPONENT(current_game)
            current_game.nimming(action)
        turn = not turn
    winner = int(not turn)
    win_logs.append(winner)
    # logging.debug(f"The game ended. Player {winner} wins")
    agent.learn()
    if i % 300 == 0:
        score = evaluate_against(agent.get_strategy(), aggressive, NIM_SIZE=NIM_SIZE_learn)
        scores.append(score)
        print(f"#{i}: {score}")


agent_strategy = agent.get_strategy()


<0 0>(1, 1)


KeyError: (<nim_utils.Nim object at 0x000001445CDBC130>, (1, 1))

In [None]:
plt.scatter(range(len(win_logs)), win_logs, marker='o', linewidths=.2, edgecolors=None)
plt.show()

In [None]:
plt.plot(range(len(scores)), scores)
plt.show()

In [None]:
evaluate_against(agent_strategy, aggressive)

