Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

In [7]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice
from copy import deepcopy

from tqdm.auto import tqdm
import numpy as np

In [8]:
# A game state: `x` and `o` hold the numbers (cells of the MAGIC square)
# taken so far by the first and the second player respectively.
State = namedtuple('State', ['x', 'o'])

In [9]:
# 3x3 magic square stored row by row (2 7 6 / 9 5 1 / 4 3 8): every row,
# column and diagonal sums to 15, so three-in-a-row on the board is the same
# as owning three numbers that add up to 15.
MAGIC = [2, 7, 6, 9, 5, 1, 4, 3, 8]

In [10]:
def print_board(pos):
    """Print the 3x3 board for `pos`, followed by a blank line.

    Cells are visited in MAGIC order, three per row: a cell shows 'X' when
    its number is in `pos.x`, 'O' when it is in `pos.o`, and '.' otherwise.
    """
    for row_start in range(0, 9, 3):
        row = ''
        for number in MAGIC[row_start:row_start + 3]:
            if number in pos.x:
                row += 'X'
            elif number in pos.o:
                row += 'O'
            else:
                row += '.'
        print(row)
    print()

In [11]:
def win(elements):
    """Check whether `elements` is winning.

    A collection wins when any three of its items add up to 15.
    """
    for triple in combinations(elements, 3):
        if sum(triple) == 15:
            return True
    return False

def state_value(pos: State):
    """Evaluate a state from the first player's point of view.

    Returns +1 when `pos.x` is winning, -1 when `pos.o` is winning and 0
    otherwise; `pos.x` is checked first.
    """
    if win(pos.x):
        return 1
    if win(pos.o):
        return -1
    return 0
    
    

In [12]:
def random_game():
    """Play one game with uniformly random moves for both players.

    Returns the list of states reached after every move (each one an
    independent deep copy), ending with the final state of the game.
    """
    board = State(set(), set())
    free = set(range(1, 10))
    history = []

    def play(marks):
        # Pick a random free number for the player owning `marks`, record
        # the resulting state and report whether that player has now won.
        move = choice(list(free))
        marks.add(move)
        history.append(deepcopy(board))
        free.remove(move)
        return win(marks)

    while free:
        # X moves first; the game also ends when X fills the last cell.
        if play(board.x) or not free:
            break
        if play(board.o):
            break
    return history

In [13]:
# Monte Carlo value estimation over random games.
value_dictionary = defaultdict(float)  # state -> running value estimate
hit_state = defaultdict(int)  # state -> number of visits
epsilon = 0.001  # step size of the incremental update below

for steps in tqdm(range(500_000)):
    trajectory = random_game()
    # Every state of the game is credited with the game's final outcome.
    final_reward = state_value(trajectory[-1])
    for state in trajectory:
        # Sets are not hashable, so freeze them to build a dictionary key.
        hashable_state = (frozenset(state.x), frozenset(state.o))
        hit_state[hashable_state] += 1
        # Move the estimate a small step towards the observed reward.
        value_dictionary[hashable_state] += epsilon * (
            final_reward - value_dictionary[hashable_state]
        )

  0%|          | 0/500000 [00:00<?, ?it/s]

In [16]:
# The ten states with the highest estimated value, best first.
sorted(value_dictionary.items(), key=lambda e: e[1], reverse=True)[:10]

[((frozenset({1, 2, 3, 6, 8}), frozenset({4, 5, 7, 9})), 0.9191579346880341),
 ((frozenset({1, 2, 3, 4, 9}), frozenset({5, 6, 7, 8})), 0.9186711800754285),
 ((frozenset({2, 6, 7, 8, 9}), frozenset({1, 3, 4, 5})), 0.9164442602253164),
 ((frozenset({1, 2, 3, 6, 7}), frozenset({4, 5, 8, 9})), 0.9156884847308895),
 ((frozenset({2, 3, 4, 6, 9}), frozenset({1, 5, 7, 8})), 0.9156884847308895),
 ((frozenset({2, 3, 6, 7, 9}), frozenset({1, 4, 5, 8})), 0.9145847241687919),
 ((frozenset({1, 3, 5, 6, 9}), frozenset({2, 4, 7, 8})), 0.9128583491447205),
 ((frozenset({1, 6, 7, 8, 9}), frozenset({2, 3, 4, 5})), 0.9126838040690546),
 ((frozenset({1, 2, 4, 7, 9}), frozenset({3, 5, 6, 8})), 0.9125964004695241),
 ((frozenset({3, 5, 6, 7, 9}), frozenset({1, 2, 4, 8})), 0.9123336643739864)]

In [29]:
# Number of distinct states visited at least once during training.
len(hit_state)

5477

In [19]:
# Exploration rate: chooseAction plays a random move when a uniform draw
# from [0, 1) is <= this value, and the best-valued move otherwise.
EXP_RATE = 0.3

In [22]:
def chooseAction(available_positions, actual_state, value_dictionary, exp_rate=None):
    """Pick X's next move with an epsilon-greedy policy.

    With probability ``exp_rate`` a uniformly random available position is
    returned; otherwise the position whose resulting state has the highest
    entry in ``value_dictionary`` is returned (states missing from the
    dictionary count as 0, ties go to the position iterated last).

    Args:
        available_positions: Iterable of positions that are still free.
        actual_state: Current state; only its ``x`` and ``o`` collections are
            read, and it is never modified.
        value_dictionary: Mapping from ``(frozenset(x), frozenset(o))`` to the
            estimated value of that state.
        exp_rate: Exploration probability. Defaults to the module-level
            ``EXP_RATE`` when omitted.

    Returns:
        The chosen position.

    Raises:
        ValueError: If ``available_positions`` is empty.
    """
    candidates = list(available_positions)
    if not candidates:
        raise ValueError("chooseAction needs at least one available position")
    if exp_rate is None:
        exp_rate = EXP_RATE

    if np.random.uniform(0, 1) <= exp_rate:
        # Explore: take a random action.
        return choice(candidates)

    # Exploit: evaluate the state reached by each candidate move. Only the
    # hashable key is needed, so build it directly instead of deep-copying
    # the whole state for every candidate.
    x_taken = frozenset(actual_state.x)
    o_taken = frozenset(actual_state.o)
    value_max = float('-inf')
    action = None
    for p in candidates:
        value = value_dictionary.get((x_taken | {p}, o_taken))
        if value is None:
            value = 0
        # `>=` keeps the original tie-breaking: the last best candidate wins.
        if value >= value_max:
            value_max = value
            action = p
    return action

In [31]:
# Play a single game: X follows the learned values (epsilon-greedy through
# chooseAction), O answers with uniformly random moves.
state = State(set(), set())
available = set(range(1, 10))
trajectory = []
while available:
    # X's turn.
    x = chooseAction(available, state, value_dictionary)
    available.remove(x)
    state.x.add(x)
    trajectory.append(deepcopy(state))
    # Stop when the board is full or X has completed a line.
    if not available or win(state.x):
        break

    # O's turn.
    o = choice(list(available))
    available.remove(o)
    state.o.add(o)
    trajectory.append(deepcopy(state))
    if win(state.o):
        break

print_board(state)


X..
.X.
OOX
