Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

In [8]:
from itertools import combinations
from collections import namedtuple
import random

In [9]:
# One row of a reward table: a move and the reward accumulated for it.
Reward = namedtuple('Reward', 'move rew')


class History:
    """Reward table attached to one game state.

    The state is identified by the cells taken by X (``_x``) and by O
    (``_o``); both are stored as private copies of the sets passed in.
    ``_tab`` maps every cell in 1..9 that neither player holds to a
    ``Reward`` whose initial reward is 0.
    """

    def __init__(self, x: set, o: set) -> None:
        # Cells still free in this state: everything in 1..9 not yet taken.
        free_cells = set(range(1, 10)) - x - o
        self._x = set(x)
        self._o = set(o)
        table = {}
        for cell in free_cells:
            table[cell] = Reward(move=cell, rew=0)
        self._tab = table

In [10]:
# 3x3 magic square laid out row by row: every row, column and diagonal
# sums to 15, which is the property win() tests for.
MAGIC = [2, 7, 6, 9, 5, 1, 4, 3, 8]

def print_board(pos):
    """Print the board as a 3x3 grid followed by a blank line.

    ``pos[1]`` holds the cells drawn as ``X`` and ``pos[0]`` the cells
    drawn as ``O``; every other cell is drawn as ``.``. Cells are the
    numbers stored in ``MAGIC``.
    """
    for row in range(3):
        rendered = []
        for value in MAGIC[row * 3:row * 3 + 3]:
            # X is checked first, exactly like the original if/elif chain.
            mark = 'X' if value in pos[1] else 'O' if value in pos[0] else '.'
            rendered.append(f' {mark} ')
        print('|'.join(rendered))
        if row != 2:
            print('-----------')
    print()

def win(elements, recent_move):
    """Tell whether ``recent_move`` completes a winning triple.

    Returns True when some 3-element combination of ``elements`` contains
    ``recent_move`` and sums to 15, False otherwise.
    """
    for triple in combinations(elements, 3):
        if recent_move in triple and sum(triple) == 15:
            return True
    return False

In [11]:
def random_player(available: set, state: tuple):
    """Pick a move for the player whose marks are stored in ``state[0]``.

    If any available move wins immediately it is returned; otherwise a
    move is drawn uniformly at random from ``available``.

    Args:
        available: cells that are still free.
        state: pair of sets; only ``state[0]`` (this player's cells) is read.

    Returns:
        The chosen cell.

    Raises:
        IndexError: if ``available`` is empty (raised by ``random.choice``).
    """
    own_marks = state[0]
    for n in available:
        # Probe the move on a temporary union instead of adding it to the
        # caller's set and removing it again.
        if win(own_marks | {n}, n):
            return n
    # No immediate win: play randomly.
    return random.choice(list(available))

In [12]:
def find_state_in_history(history, state: tuple):
    """Return the history entry recorded for ``state``, or ``None``.

    Args:
        history: iterable of entries exposing ``_x`` and ``_o`` sets.
        state: pair ``(o_cells, x_cells)``. Note the order: ``state[1]`` is
            compared with an entry's ``_x`` and ``state[0]`` with its ``_o``.

    Returns:
        The first entry whose ``_x`` and ``_o`` both match, else ``None``.
    """
    return next(
        (
            entry
            for entry in history
            if entry._x == state[1] and entry._o == state[0]
        ),
        None,
    )

def RL_player(available: set, state: tuple, history):
    """Pick a move for the learning player (its marks are ``state[1]``).

    If ``state`` is already recorded in ``history``, one of the available
    moves with the highest recorded reward is chosen at random. Otherwise a
    new ``History`` entry for the state is appended to ``history`` and a
    random available move is returned.

    Args:
        available: cells that are still free.
        state: pair ``(o_cells, x_cells)``.
        history: list of ``History`` entries; may be extended in place.

    Returns:
        The chosen cell.

    Raises:
        IndexError: if ``available`` is empty (raised by ``random.choice``).
    """
    found = find_state_in_history(history, state)
    if found is None:
        # Unfamiliar state: start tracking it and explore randomly.
        history.append(History(state[1], state[0]))
        return random.choice(list(available))

    # Familiar state: rank only moves that are actually playable, so the
    # best reward can never belong to a move filtered out afterwards.
    playable = {
        move: entry.rew
        for move, entry in found._tab.items()
        if move in available
    }
    if not playable:
        # The table knows none of the available moves; fall back to random.
        return random.choice(list(available))
    best_reward = max(playable.values())
    best_moves = [move for move, rew in playable.items() if rew == best_reward]
    return random.choice(best_moves)
    
def update_rewards(state: tuple, history, move: int, update: int):
    """Add ``update`` to the reward recorded for ``move`` in ``state``.

    Does nothing when ``state`` has no entry in ``history``.

    Args:
        state: pair ``(o_cells, x_cells)`` identifying the entry to update.
        history: list of ``History`` entries.
        move: the cell whose reward changes.
        update: amount added to the current reward (may be negative).

    Raises:
        KeyError: if the entry exists but has no reward for ``move``.
    """
    found = find_state_in_history(history, state)
    if found is None:
        return
    current = found._tab[move]
    # Reward is an immutable namedtuple, so the entry is replaced, not edited.
    found._tab[move] = Reward(move=move, rew=current.rew + update)

In [13]:
# player1: RL_player plays with X
# player0: random_player plays with O

# Reward tables learned so far; shared by every game played.
history = []

def random_game(first: int):
    """Play one game between the RL player (X) and the random player (O).

    ``first`` selects who opens: a truthy value lets the RL player start,
    a falsy one the random player. When the RL player wins, its winning
    move gets +2 in the state it was played from; when the random player
    wins, the RL player's latest move gets -1.

    Returns 1 if the RL player wins, 0 if the random player wins and
    '-' when the board fills up without a winner.
    """
    o_marks, x_marks = set(), set()
    board = (o_marks, x_marks)
    free = set(range(1, 10))
    turn = first
    while free:
        if turn:
            # RL player's turn.
            rl_move = RL_player(free, board, history)
            # Snapshot of the board before the move: the key used for rewards.
            board_before = (set(o_marks), set(x_marks))
            x_marks.add(rl_move)
            free.remove(rl_move)
            if win(x_marks, rl_move):
                update_rewards(board_before, history, rl_move, 2)
                return 1
        else:
            # Random player's turn.
            reply = random_player(free, board)
            o_marks.add(reply)
            free.remove(reply)
            if win(o_marks, reply):
                # Blame the RL player's most recent move for the loss.
                update_rewards(board_before, history, rl_move, -1)
                return 0
        turn = 1 - turn
    return '-'

In [14]:
# Score counters for the evaluation phase.
RL_player_wins = 0
random_player_wins = 0
draw = 0
# Who opens the next game; it alternates after every game.
first = 0

# Warm-up: fill the reward tables without keeping score.
for _ in range(1_000_000):
    random_game(first)
    first = 1 - first

# Let's play seriously: 100 scored games.
for _ in range(100):
    winner = random_game(first)
    first = 1 - first
    if winner == 1:
        RL_player_wins += 1
    elif winner == 0:
        random_player_wins += 1
    elif winner == '-':
        draw += 1

print(f"Reinforcement Learning player won: {RL_player_wins} times")
print(f"Random player won: {random_player_wins} times")
print(f"The match ended in draw {draw} times")

 . | . | . 
-----------
 . | O | . 
-----------
 . | . | . 

 . | . | . 
-----------
 . | O | . 
-----------
 . | X | . 

 . | . | . 
-----------
 . | O | . 
-----------
 O | X | . 

 . | . | . 
-----------
 . | O | . 
-----------
 O | X | X 

 . | . | . 
-----------
 . | . | . 
-----------
 X | . | . 

 . | O | . 
-----------
 . | . | . 
-----------
 X | . | . 

 . | O | . 
-----------
 X | . | . 
-----------
 X | . | . 

 . | O | O 
-----------
 X | . | . 
-----------
 X | . | . 

 . | . | . 
-----------
 . | . | O 
-----------
 . | . | . 

 X | . | . 
-----------
 . | . | O 
-----------
 . | . | . 

 X | O | . 
-----------
 . | . | O 
-----------
 . | . | . 

 X | O | X 
-----------
 . | . | O 
-----------
 . | . | . 

 X | O | X 
-----------
 . | . | O 
-----------
 . | O | . 

 X | O | X 
-----------
 . | . | O 
-----------
 . | O | X 

 . | . | . 
-----------
 . | . | . 
-----------
 . | X | . 

 . | . | . 
-----------
 O | . | . 
-----------
 . | X | . 

 . | X | . 
-----------
