Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

In [31]:
from itertools import combinations
from collections import namedtuple, defaultdict
import random
from copy import deepcopy

from tqdm.auto import tqdm
import numpy as np

In [32]:
# One row of a state's reward table: a candidate move and its accumulated reward.
Reward = namedtuple('Reward', ['move', 'rew'])

class History:
    """Reward table attached to a single board state.

    The state is identified by the cells taken by X (``_x``) and by O (``_o``).
    ``_tab`` holds one ``Reward`` entry, initialised to 0, for every cell in
    1..9 that neither player has taken.
    """

    def __init__(self, x: [int], o: [int]) -> None:
        # Cells still free in this state: everything in 1..9 not taken by X or O.
        free_cells = set(range(1,10)) - set(x) - set(o)
        # Store private copies so later changes to the caller's collections
        # do not alter the recorded state.
        self._x = set(x)
        self._o = set(o)
        self._tab = []
        for cell in free_cells:
            self._tab.append(Reward(move=cell, rew=0))

In [33]:
# Cell numbers laid out row by row as a 3x3 magic square:
# every row, column and diagonal adds up to 15.
MAGIC = [2, 7, 6, 9, 5, 1, 4, 3, 8]

def print_board(pos):
    """Print the 3x3 board described by *pos* to standard output.

    ``pos[1]`` holds the cells drawn as X and ``pos[0]`` the cells drawn
    as O; any other cell is drawn as a dot. A cell present in both is
    drawn as X. A blank line is printed after the board.
    """
    def symbol(cell):
        # X takes precedence over O, matching the order of the checks.
        if cell in pos[1]:
            return ' X '
        if cell in pos[0]:
            return ' O '
        return ' . '

    for row in range(3):
        cells = MAGIC[3 * row:3 * row + 3]
        print('|'.join(symbol(cell) for cell in cells))
        # Horizontal rule between rows, but not after the last one.
        if row < 2:
            print('-----------')
    print()

def win(elements):
    """Return True if any three of *elements* add up to 15.

    With the cells numbered as in ``MAGIC``, the rows, columns and
    diagonals of the board are the triples summing to 15, so this checks
    whether a player holding *elements* has completed a line.
    """
    for triple in combinations(elements, 3):
        if sum(triple) == 15:
            return True
    return False

In [34]:
def random_player(available: set, state: set):
    """Pick a move for the mostly-random opponent.

    If one of the *available* cells would immediately complete a winning
    line for the player holding *state*, that cell is returned; otherwise
    a cell is chosen uniformly at random.

    Args:
        available: cells that are still free.
        state: cells already held by this player. It is not modified.

    Returns:
        The chosen cell.

    Raises:
        IndexError: if *available* is empty.
    """
    for n in available:
        # Probe the move on a fresh set instead of adding/removing it on the
        # caller's set, so the caller's state can never be corrupted.
        if win(state | {n}):
            return n
    return random.choice(list(available))

In [35]:
# x = {1, 2, 3}
# y = x.copy()
# y.add(5)
# y

In [36]:
def find_state_in_history(history, state: ([int], [int])):
    """Return the first entry of *history* recorded for *state*, or None.

    *state* is a pair ``(o_cells, x_cells)``; an entry matches when its
    ``_x`` equals ``state[1]`` and its ``_o`` equals ``state[0]``.
    """
    matches = (
        entry for entry in history
        if (entry._x, entry._o) == (state[1], state[0])
    )
    return next(matches, None)

def RL_player(available: set, state: tuple, history: list):
    """Choose a move for the learning player from its reward table.

    If *state* has already been recorded in *history*, one of the available
    moves with the highest accumulated reward is returned (ties are broken
    at random). Otherwise a new ``History`` entry for the state is appended
    to *history* and a random available move is returned.

    Args:
        available: cells that are still free.
        state: pair ``(o_cells, x_cells)`` describing the board.
        history: list of ``History`` entries; extended in place when the
            state is seen for the first time.

    Returns:
        The chosen cell.

    Raises:
        IndexError: if *available* is empty.
    """
    found = find_state_in_history(history, state)
    if found is None:
        # First visit: register the state so later rewards can be stored.
        history.append(History(state[1], state[0]))
        return random.choice(list(available))

    # Rank only moves that can actually be played, so the best reward is
    # never taken from an entry that is not selectable.
    candidates = [entry for entry in found._tab if entry.move in available]
    if not candidates:
        # Table and board disagree: fall back to a plain random move.
        return random.choice(list(available))
    best = max(entry.rew for entry in candidates)
    return random.choice([entry.move for entry in candidates if entry.rew == best])
    
def update_rewards(state: ([int], [int]), history, move: int, update: int):
    """Add *update* to the reward stored for *move* in *state*.

    Does nothing when *state* has no entry in *history*. The entry's table
    is modified in place; rows for other moves are left untouched.
    """
    found = find_state_in_history(history, state)
    if found is None:
        return
    # Reward tuples are immutable, so matching rows are replaced by new ones.
    found._tab[:] = [
        Reward(move=entry.move, rew=entry.rew + update) if entry.move == move else entry
        for entry in found._tab
    ]

In [37]:
# Player 1 is the RL player and plays X.
# Player 0 is the random player and plays O.

# Reward tables learned so far, shared by every game.
history = []

def random_game(first: int):
    """Play one game between the RL player and the random player.

    *first* selects who moves first: a truthy value for the RL player,
    0 for the random player. The RL player's last move is rewarded with
    +2 when it wins the game and with -1 when the random player wins
    right after it; the module-level ``history`` is updated accordingly.

    Returns 1 if the RL player wins, 0 if the random player wins and
    ``'-'`` when the board fills up without a winner.
    """
    o_cells, x_cells = set(), set()
    state = (o_cells, x_cells)
    free = set(range(1, 10))
    turn = first
    while free:
        if not turn:
            # Random player (O) to move.
            o_move = random_player(free, o_cells)
            o_cells.add(o_move)
            free.remove(o_move)
            if win(o_cells):
                # Penalise the RL move that allowed this win.
                update_rewards(before_move, history, rl_move, -1)
                return 0
        else:
            # RL player (X) to move.
            rl_move = RL_player(free, state, history)
            # Snapshot of the board the move was chosen from, taken before
            # the move is applied, so the reward lands on the right state.
            before_move = (set(o_cells), set(x_cells))
            x_cells.add(rl_move)
            free.remove(rl_move)
            if win(x_cells):
                update_rewards(before_move, history, rl_move, 2)
                return 1
        turn = 1 - turn
    return '-'

In [38]:
# Outcome counters for the evaluation games.
RL_player_wins = 0
random_player_wins = 0
draw = 0
# Who moves first in the next game; alternates after every game.
first = 0

# Warm-up: play many games only to fill the reward tables.
for _ in range(100000):
    random_game(first)
    first = 1 - first

# Evaluation: play 100 more games and tally the results.
# random_game still updates the reward tables during these games.
for _ in range(100):
    winner = random_game(first)
    first = 1 - first
    if winner == '-':
        draw += 1
    elif winner == 0:
        random_player_wins += 1
    elif winner == 1:
        RL_player_wins += 1

# The counts are interpolated with f-strings; passing them as extra print()
# arguments left the "{}" placeholders unfilled in the output.
print(f"Reinforcement Learning player won: {RL_player_wins} times")
print(f"Random player won: {random_player_wins} times")
print(f"The match ended in draw {draw} times")

Reinforcement Learning player won: {} times 80
Random player won: {} times 10
The match ended in draw {} times 10
