# Reinforcement Learning


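Solve tic-tac-toe using a [magic square](https://it.wikipedia.org/wiki/Quadrato_magico).
Label the nine cells with the numbers of a 3×3 magic square: every row, column, and diagonal sums to 15, and no other three distinct numbers from 1 to 9 do.
Picking 3 numbers that add up to 15 is therefore equivalent to winning at tic-tac-toe.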


In [66]:
import numpy as np
from collections import namedtuple, defaultdict
from itertools import combinations
from random import choice
from copy import deepcopy
from tqdm.auto import tqdm

In [59]:
# A game position: `x` and `o` hold the magic-square numbers claimed by each side.
# The typename is "Position", so instances print as Position(x=..., o=...).
State = namedtuple("Position", "x o")

In [60]:
# 3x3 magic square stored row by row: the entry at index r * 3 + c labels
# board cell (row r, column c). Every row, column and diagonal sums to 15.
MAGIC = [
    2, 7, 6,
    9, 5, 1,
    4, 3, 8,
]

In [61]:
def print_board(pos):
    """Print *pos* as a 3x3 grid followed by a blank line.

    Each cell is looked up through MAGIC: a cell shows "X" if its number is
    in ``pos.x``, "O" if it is in ``pos.o``, and a space otherwise.
    """
    for row in range(3):
        marks = []
        for col in range(3):
            number = MAGIC[3 * row + col]
            # X is tested first, exactly as a number present in both sets
            # would be rendered.
            mark = "X" if number in pos.x else "O" if number in pos.o else " "
            marks.append(mark)
        print("".join(marks))
    print()

In [62]:
def win(elements):
    """Return True if any three items of *elements* add up to 15.

    With cells numbered by the magic square, such a triple corresponds to a
    completed row, column or diagonal.
    """
    for triple in combinations(elements, 3):
        if sum(triple) == 15:
            return True
    return False


# Score a position from X's point of view
def state_value(position: State):
    """Return 1 if X holds a winning triple, -1 if O does, and 0 otherwise.

    X is checked first, so a position in which both sides hold a winning
    triple is scored 1.
    """
    if win(position.x):
        return 1
    return -1 if win(position.o) else 0

In [73]:
# Play one game with both sides choosing uniformly at random among free numbers
def random_game():
    """Simulate a random game and return the list of positions it visited.

    X moves first and the sides alternate. After every move a snapshot of
    the position is appended to the result. The game stops as soon as the
    side that just moved holds a winning triple, or when no numbers are
    left to pick.
    """
    board = State(set(), set())
    free = set(range(1, 9 + 1))
    history = []
    mover = 0  # index into the State tuple: 0 is X, 1 is O
    while free:
        side = board[mover]
        pick = choice(list(free))
        side.add(pick)
        free.remove(pick)
        # Snapshot with deepcopy: the sets inside `board` keep being mutated.
        history.append(deepcopy(board))
        if win(side):
            break
        mover = 1 - mover
    return history

In [64]:
# Sanity check on a hand-built position: print its value, a blank line,
# then the board itself.
state = State(x={2, 4, 8}, o={6, 9, 5})
print(state_value(state), end="\n\n")
print_board(state)

0

X O
OO 
X X



In [80]:
# Estimate a value for every visited state from random self-play: each state
# of a finished game is nudged toward that game's final reward.
value_dict = defaultdict(float)  # state -> running value estimate
hit_state = defaultdict(int)  # state -> number of times it was visited
epsilon = 0.0001  # constant step size of the incremental update
for steps in tqdm(range(1_000_000)):
    trajectory = random_game()
    final_reward = state_value(trajectory[-1])
    for state in trajectory:
        # Sets are unhashable; freeze both sides to get a dictionary key.
        hashable_state = State(*map(frozenset, state))
        hit_state[hashable_state] += 1
        value_dict[hashable_state] += epsilon * (
            final_reward - value_dict[hashable_state]
        )

100%|██████████| 1000000/1000000 [02:15<00:00, 7402.59it/s]


In [82]:
# Ten states with the highest estimated value, best first.
sorted(
    value_dict.items(),
    key=lambda entry: entry[1],
    reverse=True,
)[:10]

[(Position(x=frozenset({5}), o=frozenset()), 0.49501840788155943),
 (Position(x=frozenset({5}), o=frozenset({9})), 0.4318671341217071),
 (Position(x=frozenset({5}), o=frozenset({3})), 0.42636385247524783),
 (Position(x=frozenset({5}), o=frozenset({1})), 0.4252544914947035),
 (Position(x=frozenset({5}), o=frozenset({7})), 0.42412489867020525),
 (Position(x=frozenset({4, 5, 6, 8, 9}), o=frozenset({1, 2, 3, 7})),
  0.38677566752395254),
 (Position(x=frozenset({1, 2, 5, 7, 8}), o=frozenset({9, 3, 4, 6})),
  0.38659166342461665),
 (Position(x=frozenset({4, 5, 6, 7, 9}), o=frozenset({8, 1, 2, 3})),
  0.3861621057831405),
 (Position(x=frozenset({1, 3, 4, 5, 7}), o=frozenset({8, 9, 2, 6})),
  0.38499464667527583),
 (Position(x=frozenset({2, 3, 4, 6, 9}), o=frozenset({8, 1, 5, 7})),
  0.3845639706747762)]